Update configuration_florence2.py (#62)

- Update configuration_florence2.py (447e712b7902f4ad5279f9906bc16d81c9a58856)

Co-authored-by: zbing <zbing@users.noreply.huggingface.co>
Author: ai-modelscope
Date:   2025-08-05 15:01:02 +08:00
Parent: b7db1d9ea3
Commit: 8216ef0209

4 changed files with 30 additions and 28 deletions

.gitattributes (vendored)

@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *bin filter=lfs diff=lfs merge=lfs -text
 pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
+model.safetensors filter=lfs diff=lfs merge=lfs -text

configuration_florence2.py

@@ -77,7 +77,7 @@ class Florence2VisionConfig(PretrainedConfig):
     >>> configuration = model.config
     ```"""
-    model_type = "florence2_vision"
+    model_type = "davit"
     keys_to_ignore_at_inference = ["past_key_values"]

     def __init__(
@@ -327,7 +327,7 @@ class Florence2Config(PretrainedConfig):
         self.vocab_size = vocab_size
         self.projection_dim = projection_dim
         if vision_config is not None:
-            vision_config = PretrainedConfig(**vision_config)
+            vision_config = Florence2VisionConfig(**vision_config)
         self.vision_config = vision_config
         self.vocab_size = self.vocab_size
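Taken together, the two configuration hunks make the nested vision section load as a typed Florence2VisionConfig instead of a bare PretrainedConfig, and align the class-level model_type with the "davit" value that the modeling code asserts on. A minimal sketch of the difference, assuming a DaViT-style vision dict (the depths field here is hypothetical):

    from transformers import PretrainedConfig

    # Illustrative vision section; "depths" is a hypothetical field.
    vision_dict = {"model_type": "davit", "depths": [1, 1, 9, 1]}

    # Old behaviour: a generic container with no Florence-2 defaults.
    generic = PretrainedConfig(**vision_dict)
    print(type(generic).__name__)  # PretrainedConfig

    # New behaviour (inside Florence2Config.__init__):
    #     vision_config = Florence2VisionConfig(**vision_dict)
    # The typed subclass carries the DaViT defaults, and its class-level
    # model_type = "davit" now agrees with the value shipped in config.json,
    # so the assert in Florence2ForConditionalGeneration passes.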

model.safetensors (new file, stored with Git LFS)
Binary file not shown.

modeling_florence2.py

@@ -26,7 +26,7 @@ import torch.utils.checkpoint as checkpoint
 from torch.nn import CrossEntropyLoss
 from collections import OrderedDict
 from einops import rearrange
-from timm.models.layers import DropPath, trunc_normal_
+from timm.layers import DropPath, trunc_normal_
 from transformers.modeling_utils import PreTrainedModel
 from transformers.generation.utils import GenerationMixin
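The import fix tracks timm's reorganization: DropPath and trunc_normal_ now live in timm.layers, and the old timm.models.layers path is deprecated in recent releases. A small compatibility shim for code that has to run against both generations of timm:

    try:
        # timm >= 0.9 exposes the layer utilities at the top level
        from timm.layers import DropPath, trunc_normal_
    except ImportError:
        # older timm releases only provide the legacy path
        from timm.models.layers import DropPath, trunc_normal_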
@@ -610,29 +610,10 @@ class DaViT(nn.Module):
         self.avgpool = nn.AdaptiveAvgPool1d(1)
         self.head = nn.Linear(self.embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()
-        self.apply(self._init_weights)

     @property
     def dim_out(self):
         return self.embed_dims[-1]

-    def _init_weights(self, m):
-        if isinstance(m, nn.Linear):
-            trunc_normal_(m.weight, std=0.02)
-            if m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.Conv2d):
-            nn.init.normal_(m.weight, std=0.02)
-            for name, _ in m.named_parameters():
-                if name in ['bias']:
-                    nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.weight, 1.0)
-            nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.BatchNorm2d):
-            nn.init.constant_(m.weight, 1.0)
-            nn.init.constant_(m.bias, 0)

     def forward_features_unpool(self, x):
         """
         forward until avg pooling
@@ -1451,6 +1432,17 @@ class Florence2LanguagePreTrainedModel(PreTrainedModel):
             module.weight.data.normal_(mean=0.0, std=std)
             if module.padding_idx is not None:
                 module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.Conv2d):
+            nn.init.normal_(module.weight, std=0.02)
+            for name, _ in module.named_parameters():
+                if name == "bias":
+                    nn.init.constant_(module.bias, 0)
+        elif isinstance(module, nn.LayerNorm):
+            nn.init.constant_(module.weight, 1.0)
+            nn.init.constant_(module.bias, 0)
+        elif isinstance(module, nn.BatchNorm2d):
+            nn.init.constant_(module.weight, 1.0)
+            nn.init.constant_(module.bias, 0)

     @property
     def dummy_inputs(self):
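These two hunks are one refactor: DaViT stops initializing itself eagerly with self.apply(self._init_weights), and the same Conv2d, LayerNorm, and BatchNorm2d rules move into the shared _init_weights hook that transformers drives through post_init(). A toy sketch of the pattern (the class name is illustrative, not from the repo):

    import torch.nn as nn

    class TinyPreTrainedLike(nn.Module):
        """Toy stand-in for a transformers PreTrainedModel subclass."""

        def _init_weights(self, module):
            # one central place for init rules, mirroring the hunk above
            if isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, std=0.02)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        def post_init(self):
            # the real post_init does more bookkeeping; crucially,
            # from_pretrained can skip this work for weights that are
            # about to be overwritten by checkpoint tensors
            self.apply(self._init_weights)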
@@ -2074,14 +2066,20 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()

+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.model.encoder.embed_tokens, self.model.shared)
+            self._tie_or_clone_weights(self.model.decoder.embed_tokens, self.model.shared)
+            self._tie_or_clone_weights(self.lm_head, self.model.shared)

     def get_encoder(self):
         return self.model.get_encoder()

     def get_decoder(self):
         return self.model.get_decoder()

-    def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
-        new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
+    def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None, **kwargs) -> nn.Embedding:
+        new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of, **kwargs)
         self._resize_final_logits_bias(new_embeddings.weight.shape[0])
         return new_embeddings
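Accepting and forwarding **kwargs keeps the override compatible with newer transformers releases, where the base resize_token_embeddings grew extra options such as mean_resizing; a fixed two-argument override would turn those calls into a TypeError. A runnable sketch on a tiny BART model (Florence-2's language model is BART-style; mean_resizing is an assumption about the installed transformers version):

    from transformers import BartConfig, BartForConditionalGeneration

    config = BartConfig(
        vocab_size=1000, d_model=64, encoder_layers=1, decoder_layers=1,
        encoder_attention_heads=2, decoder_attention_heads=2,
        encoder_ffn_dim=128, decoder_ffn_dim=128,
    )
    model = BartForConditionalGeneration(config)

    # Works on any transformers version:
    emb = model.resize_token_embeddings(1024, pad_to_multiple_of=None)
    print(emb.num_embeddings)  # 1024

    # On recent versions the base method also accepts e.g. mean_resizing=False;
    # an override that does not take and forward **kwargs breaks such calls.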
@@ -2531,6 +2529,8 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
     FLORENCE2_START_DOCSTRING,
 )
 class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
+    _tied_weights_keys = ["language_model.encoder.embed_tokens.weight", "language_model.decoder.embed_tokens.weight", "language_model.lm_head.weight"]

     def __init__(self, config: Florence2Config):
         super().__init__(config)
         assert config.vision_config.model_type == 'davit', 'only DaViT is supported for now'
@@ -2545,8 +2545,6 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
         language_model = Florence2LanguageForConditionalGeneration(config=config.text_config)
-        if language_model._tied_weights_keys is not None:
-            self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
         self.language_model = language_model

         self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
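Between this hunk and the _tied_weights_keys attribute added above, the tied-weight bookkeeping moves from __init__, where the key list was derived from the inner model at runtime, to a static class attribute plus an explicit _tie_weights, which is the declaration style transformers expects: from_pretrained consults the key list to exempt aliased tensors from missing-key checks and reties them after loading. A self-contained sketch of the aliasing (all names illustrative):

    import torch.nn as nn

    class TinySharedEmbeddingModel(nn.Module):
        # class-level declaration of which parameters alias the shared weight
        _tied_weights_keys = ["decoder_embed.weight", "lm_head.weight"]

        def __init__(self, vocab=100, dim=16):
            super().__init__()
            self.shared = nn.Embedding(vocab, dim)
            self.decoder_embed = nn.Embedding(vocab, dim)
            self.lm_head = nn.Linear(dim, vocab, bias=False)
            self._tie_weights()

        def _tie_weights(self):
            # point every listed module at the one shared weight tensor
            self.decoder_embed.weight = self.shared.weight
            self.lm_head.weight = self.shared.weight

    m = TinySharedEmbeddingModel()
    assert m.lm_head.weight.data_ptr() == m.shared.weight.data_ptr()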
@@ -2589,8 +2587,8 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
     def get_input_embeddings(self):
         return self.language_model.get_input_embeddings()

-    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
-        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
+    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None, **kwargs) -> nn.Embedding:
+        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of, **kwargs)
         # update vocab size
         self.config.text_config.vocab_size = model_embeds.num_embeddings
         self.config.vocab_size = model_embeds.num_embeddings
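For completeness, a hedged sketch of what the wrapper-level override guarantees, with hypothetical names: the resize flows through the inner language model, and both vocabulary fields in the config are synced to the new embedding count so weights and config cannot drift apart.

    import torch.nn as nn

    class WrapperLike:
        def __init__(self, inner: nn.Embedding, config: dict):
            self.inner, self.config = inner, config

        def resize_token_embeddings(self, new_num_tokens: int, **kwargs) -> nn.Embedding:
            new = nn.Embedding(new_num_tokens, self.inner.embedding_dim)
            n = min(self.inner.num_embeddings, new_num_tokens)
            new.weight.data[:n] = self.inner.weight.data[:n]  # keep existing rows
            self.inner = new
            # mirror the new size into every config field recording the vocab
            self.config["vocab_size"] = new.num_embeddings
            self.config["text_vocab_size"] = new.num_embeddings
            return new

    w = WrapperLike(nn.Embedding(100, 16), {"vocab_size": 100, "text_vocab_size": 100})
    emb = w.resize_token_embeddings(128)
    assert w.config["vocab_size"] == emb.num_embeddings == 128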