mirror of
https://www.modelscope.cn/AI-ModelScope/Florence-2-large.git
synced 2026-04-02 21:52:53 +08:00
update_model_init_fp16 (#53)
- update model init with float16 (93d5b247d87e5e0afeafe5b49c4a796aa5ba9e7c) - update model init with float16 (bab1b3cbc340fac15205568422af11476c232442) Co-authored-by: Haiping Wu <haipingwu@users.noreply.huggingface.co>
This commit is contained in:
1
.gitattributes
vendored
1
.gitattributes
vendored
@ -34,4 +34,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|||||||
*.zst filter=lfs diff=lfs merge=lfs -text
|
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||||
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
*bin filter=lfs diff=lfs merge=lfs -text
|
*bin filter=lfs diff=lfs merge=lfs -text
|
||||||
pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
|
|
||||||
|
|||||||
@ -7,11 +7,11 @@
|
|||||||
"AutoConfig": "configuration_florence2.Florence2Config",
|
"AutoConfig": "configuration_florence2.Florence2Config",
|
||||||
"AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration"
|
"AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration"
|
||||||
},
|
},
|
||||||
"bos_token_id": 2,
|
"bos_token_id": 0,
|
||||||
"eos_token_id": 1,
|
"eos_token_id": 2,
|
||||||
"ignore_index": -100,
|
"ignore_index": -100,
|
||||||
"model_type": "florence2",
|
"model_type": "florence2",
|
||||||
"pad_token_id": 0,
|
"pad_token_id": 1,
|
||||||
"projection_dim": 1024,
|
"projection_dim": 1024,
|
||||||
"text_config": {
|
"text_config": {
|
||||||
"vocab_size": 51289,
|
"vocab_size": 51289,
|
||||||
@ -79,7 +79,7 @@
|
|||||||
"image_feature_source": ["spatial_avg_pool", "temporal_avg_pool"]
|
"image_feature_source": ["spatial_avg_pool", "temporal_avg_pool"]
|
||||||
},
|
},
|
||||||
"vocab_size": 51289,
|
"vocab_size": 51289,
|
||||||
"torch_dtype": "float32",
|
"torch_dtype": "float16",
|
||||||
"transformers_version": "4.41.0.dev0",
|
"transformers_version": "4.41.0.dev0",
|
||||||
"is_encoder_decoder": true
|
"is_encoder_decoder": true
|
||||||
}
|
}
|
||||||
@ -2240,6 +2240,10 @@ class Florence2Seq2SeqLMOutput(ModelOutput):
|
|||||||
decoding.
|
decoding.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||||
|
Language modeling loss.
|
||||||
|
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||||
|
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||||
Sequence of hidden-states at the output of the last layer of the decoder of the model.
|
Sequence of hidden-states at the output of the last layer of the decoder of the model.
|
||||||
|
|
||||||
@ -2288,7 +2292,8 @@ class Florence2Seq2SeqLMOutput(ModelOutput):
|
|||||||
|
|
||||||
image_hidden_states of the model produced by the vision encoder
|
image_hidden_states of the model produced by the vision encoder
|
||||||
"""
|
"""
|
||||||
|
loss: Optional[torch.FloatTensor] = None
|
||||||
|
logits: torch.FloatTensor = None
|
||||||
last_hidden_state: torch.FloatTensor = None
|
last_hidden_state: torch.FloatTensor = None
|
||||||
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
|
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
|
||||||
decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
|
decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
|
||||||
@ -2297,6 +2302,7 @@ class Florence2Seq2SeqLMOutput(ModelOutput):
|
|||||||
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
|
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
|
||||||
encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
|
encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
|
||||||
encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
|
encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
|
||||||
|
image_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
|
||||||
|
|
||||||
|
|
||||||
FLORENCE2_START_DOCSTRING = r"""
|
FLORENCE2_START_DOCSTRING = r"""
|
||||||
@ -2527,7 +2533,6 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
|
|||||||
def __init__(self, config: Florence2Config):
|
def __init__(self, config: Florence2Config):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
assert config.vision_config.model_type == 'davit', 'only DaViT is supported for now'
|
assert config.vision_config.model_type == 'davit', 'only DaViT is supported for now'
|
||||||
del config.vision_config.model_type
|
|
||||||
self.vision_tower = DaViT.from_config(config=config.vision_config)
|
self.vision_tower = DaViT.from_config(config=config.vision_config)
|
||||||
# remove unused layers
|
# remove unused layers
|
||||||
del self.vision_tower.head
|
del self.vision_tower.head
|
||||||
@ -2731,6 +2736,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
|
|||||||
image_features = self._encode_image(pixel_values)
|
image_features = self._encode_image(pixel_values)
|
||||||
inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
|
inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
|
||||||
|
|
||||||
|
if inputs_embeds is not None:
|
||||||
attention_mask = attention_mask.to(inputs_embeds.dtype)
|
attention_mask = attention_mask.to(inputs_embeds.dtype)
|
||||||
outputs = self.language_model(
|
outputs = self.language_model(
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
|
|||||||
@ -596,7 +596,7 @@ class Florence2PostProcesser(object):
|
|||||||
{
|
{
|
||||||
'TASK_NAME': 'ocr',
|
'TASK_NAME': 'ocr',
|
||||||
'PATTERN': r'(.+?)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>',
|
'PATTERN': r'(.+?)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>',
|
||||||
'AREA_THRESHOLD': 0.01
|
'AREA_THRESHOLD': 0.00
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
'TASK_NAME': 'phrase_grounding',
|
'TASK_NAME': 'phrase_grounding',
|
||||||
@ -1025,7 +1025,7 @@ class Florence2PostProcesser(object):
|
|||||||
text,
|
text,
|
||||||
pattern=pattern,
|
pattern=pattern,
|
||||||
image_size=image_size,
|
image_size=image_size,
|
||||||
area_threshold=self.parse_tasks_configs[task].get('AREA_THRESHOLD', 0.01),
|
area_threshold=self.parse_tasks_configs[task].get('AREA_THRESHOLD', 0.0),
|
||||||
)
|
)
|
||||||
parsed_dict['ocr'] = instances
|
parsed_dict['ocr'] = instances
|
||||||
elif task == 'phrase_grounding':
|
elif task == 'phrase_grounding':
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user