diff --git a/.gitattributes b/.gitattributes index c9cc3f5..a7402f4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text *bin filter=lfs diff=lfs merge=lfs -text pytorch_model.bin filter=lfs diff=lfs merge=lfs -text +model.safetensors filter=lfs diff=lfs merge=lfs -text diff --git a/configuration_florence2.py b/configuration_florence2.py index 622f749..b886961 100644 --- a/configuration_florence2.py +++ b/configuration_florence2.py @@ -77,7 +77,7 @@ class Florence2VisionConfig(PretrainedConfig): >>> configuration = model.config ```""" - model_type = "florence2_vision" + model_type = "davit" keys_to_ignore_at_inference = ["past_key_values"] def __init__( @@ -327,7 +327,7 @@ class Florence2Config(PretrainedConfig): self.vocab_size = vocab_size self.projection_dim = projection_dim if vision_config is not None: - vision_config = PretrainedConfig(**vision_config) + vision_config = Florence2VisionConfig(**vision_config) self.vision_config = vision_config self.vocab_size = self.vocab_size diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..5065556 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f38ce741c6b71188fe2b3419a55e11917a8a7b321ae2e63c61da0191b0ebad7 +size 1553563458 diff --git a/modeling_florence2.py b/modeling_florence2.py index ccca154..9020854 100644 --- a/modeling_florence2.py +++ b/modeling_florence2.py @@ -26,7 +26,7 @@ import torch.utils.checkpoint as checkpoint from torch.nn import CrossEntropyLoss from collections import OrderedDict from einops import rearrange -from timm.models.layers import DropPath, trunc_normal_ +from timm.layers import DropPath, trunc_normal_ from transformers.modeling_utils import PreTrainedModel from transformers.generation.utils import GenerationMixin @@ -610,29 +610,10 @@ class DaViT(nn.Module): self.avgpool = nn.AdaptiveAvgPool1d(1) self.head = nn.Linear(self.embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity() - self.apply(self._init_weights) - @property def dim_out(self): return self.embed_dims[-1] - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=0.02) - if m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.Conv2d): - nn.init.normal_(m.weight, std=0.02) - for name, _ in m.named_parameters(): - if name in ['bias']: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.weight, 1.0) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1.0) - nn.init.constant_(m.bias, 0) - def forward_features_unpool(self, x): """ forward until avg pooling @@ -1451,6 +1432,17 @@ class Florence2LanguagePreTrainedModel(PreTrainedModel): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.Conv2d): + nn.init.normal_(module.weight, std=0.02) + for name, _ in module.named_parameters(): + if name == "bias": + nn.init.constant_(module.bias, 0) + elif isinstance(module, nn.LayerNorm): + nn.init.constant_(module.weight, 1.0) + nn.init.constant_(module.bias, 0) + elif isinstance(module, nn.BatchNorm2d): + nn.init.constant_(module.weight, 1.0) + nn.init.constant_(module.bias, 0) @property def dummy_inputs(self): @@ -2074,14 +2066,20 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel # Initialize weights and apply final processing self.post_init() + def _tie_weights(self): + if self.config.tie_word_embeddings: + self._tie_or_clone_weights(self.model.encoder.embed_tokens, self.model.shared) + self._tie_or_clone_weights(self.model.decoder.embed_tokens, self.model.shared) + self._tie_or_clone_weights(self.lm_head, self.model.shared) + def get_encoder(self): return self.model.get_encoder() def get_decoder(self): return self.model.get_decoder() - def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding: - new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of) + def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None, **kwargs) -> nn.Embedding: + new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of, **kwargs) self._resize_final_logits_bias(new_embeddings.weight.shape[0]) return new_embeddings @@ -2531,6 +2529,8 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel): FLORENCE2_START_DOCSTRING, ) class Florence2ForConditionalGeneration(Florence2PreTrainedModel): + _tied_weights_keys = ["language_model.encoder.embed_tokens.weight", "language_model.decoder.embed_tokens.weight", "language_model.lm_head.weight"] + def __init__(self, config: Florence2Config): super().__init__(config) assert config.vision_config.model_type == 'davit', 'only DaViT is supported for now' @@ -2545,8 +2545,6 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel): language_model = Florence2LanguageForConditionalGeneration(config=config.text_config) - if language_model._tied_weights_keys is not None: - self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys] self.language_model = language_model self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 @@ -2589,8 +2587,8 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel): def get_input_embeddings(self): return self.language_model.get_input_embeddings() - def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding: - model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of) + def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None, **kwargs) -> nn.Embedding: + model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of, **kwargs) # update vocab size self.config.text_config.vocab_size = model_embeds.num_embeddings self.config.vocab_size = model_embeds.num_embeddings