update_model_init_fp16 (#53)

- update model init with float16 (93d5b247d87e5e0afeafe5b49c4a796aa5ba9e7c)
- update model init with float16 (bab1b3cbc340fac15205568422af11476c232442)

Co-authored-by: Haiping Wu <haipingwu@users.noreply.huggingface.co>
This commit is contained in:
ai-modelscope
2024-07-26 20:46:36 +08:00
parent 92117d39db
commit 064b22a4ff
5 changed files with 393 additions and 46 deletions

1
.gitattributes vendored
View File

@@ -34,4 +34,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text
*bin filter=lfs diff=lfs merge=lfs -text *bin filter=lfs diff=lfs merge=lfs -text
pytorch_model.bin filter=lfs diff=lfs merge=lfs -text

View File

@@ -7,11 +7,11 @@
"AutoConfig": "configuration_florence2.Florence2Config", "AutoConfig": "configuration_florence2.Florence2Config",
"AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration" "AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration"
}, },
"bos_token_id": 2, "bos_token_id": 0,
"eos_token_id": 1, "eos_token_id": 2,
"ignore_index": -100, "ignore_index": -100,
"model_type": "florence2", "model_type": "florence2",
"pad_token_id": 0, "pad_token_id": 1,
"projection_dim": 1024, "projection_dim": 1024,
"text_config": { "text_config": {
"vocab_size": 51289, "vocab_size": 51289,
@@ -79,7 +79,7 @@
"image_feature_source": ["spatial_avg_pool", "temporal_avg_pool"] "image_feature_source": ["spatial_avg_pool", "temporal_avg_pool"]
}, },
"vocab_size": 51289, "vocab_size": 51289,
"torch_dtype": "float32", "torch_dtype": "float16",
"transformers_version": "4.41.0.dev0", "transformers_version": "4.41.0.dev0",
"is_encoder_decoder": true "is_encoder_decoder": true
} }

View File

@@ -2240,6 +2240,10 @@ class Florence2Seq2SeqLMOutput(ModelOutput):
decoding. decoding.
Args: Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model. Sequence of hidden-states at the output of the last layer of the decoder of the model.
@@ -2288,7 +2292,8 @@ class Florence2Seq2SeqLMOutput(ModelOutput):
image_hidden_states of the model produced by the vision encoder image_hidden_states of the model produced by the vision encoder
""" """
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
last_hidden_state: torch.FloatTensor = None last_hidden_state: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
@@ -2297,6 +2302,7 @@ class Florence2Seq2SeqLMOutput(ModelOutput):
encoder_last_hidden_state: Optional[torch.FloatTensor] = None encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
image_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
FLORENCE2_START_DOCSTRING = r""" FLORENCE2_START_DOCSTRING = r"""
@@ -2527,7 +2533,6 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
def __init__(self, config: Florence2Config): def __init__(self, config: Florence2Config):
super().__init__(config) super().__init__(config)
assert config.vision_config.model_type == 'davit', 'only DaViT is supported for now' assert config.vision_config.model_type == 'davit', 'only DaViT is supported for now'
del config.vision_config.model_type
self.vision_tower = DaViT.from_config(config=config.vision_config) self.vision_tower = DaViT.from_config(config=config.vision_config)
# remove unused layers # remove unused layers
del self.vision_tower.head del self.vision_tower.head
@@ -2731,7 +2736,8 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
image_features = self._encode_image(pixel_values) image_features = self._encode_image(pixel_values)
inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds) inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
attention_mask = attention_mask.to(inputs_embeds.dtype) if inputs_embeds is not None:
attention_mask = attention_mask.to(inputs_embeds.dtype)
outputs = self.language_model( outputs = self.language_model(
attention_mask=attention_mask, attention_mask=attention_mask,
labels=labels, labels=labels,

View File

@@ -596,7 +596,7 @@ class Florence2PostProcesser(object):
{ {
'TASK_NAME': 'ocr', 'TASK_NAME': 'ocr',
'PATTERN': r'(.+?)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>', 'PATTERN': r'(.+?)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>',
'AREA_THRESHOLD': 0.01 'AREA_THRESHOLD': 0.00
}, },
{ {
'TASK_NAME': 'phrase_grounding', 'TASK_NAME': 'phrase_grounding',
@@ -1025,7 +1025,7 @@ class Florence2PostProcesser(object):
text, text,
pattern=pattern, pattern=pattern,
image_size=image_size, image_size=image_size,
area_threshold=self.parse_tasks_configs[task].get('AREA_THRESHOLD', 0.01), area_threshold=self.parse_tasks_configs[task].get('AREA_THRESHOLD', 0.0),
) )
parsed_dict['ocr'] = instances parsed_dict['ocr'] = instances
elif task == 'phrase_grounding': elif task == 'phrase_grounding':

File diff suppressed because one or more lines are too long