mirror of
https://www.modelscope.cn/AI-ModelScope/Florence-2-large.git
synced 2026-04-02 21:52:53 +08:00
Update configuration_florence2.py (#62)
- Update configuration_florence2.py (447e712b7902f4ad5279f9906bc16d81c9a58856) Co-authored-by: zbing <zbing@users.noreply.huggingface.co>
This commit is contained in:
1
.gitattributes
vendored
1
.gitattributes
vendored
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|||||||
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
*bin filter=lfs diff=lfs merge=lfs -text
|
*bin filter=lfs diff=lfs merge=lfs -text
|
||||||
pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
|
pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
|
||||||
|
model.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ class Florence2VisionConfig(PretrainedConfig):
|
|||||||
>>> configuration = model.config
|
>>> configuration = model.config
|
||||||
```"""
|
```"""
|
||||||
|
|
||||||
model_type = "florence2_vision"
|
model_type = "davit"
|
||||||
keys_to_ignore_at_inference = ["past_key_values"]
|
keys_to_ignore_at_inference = ["past_key_values"]
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -327,7 +327,7 @@ class Florence2Config(PretrainedConfig):
|
|||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.projection_dim = projection_dim
|
self.projection_dim = projection_dim
|
||||||
if vision_config is not None:
|
if vision_config is not None:
|
||||||
vision_config = PretrainedConfig(**vision_config)
|
vision_config = Florence2VisionConfig(**vision_config)
|
||||||
self.vision_config = vision_config
|
self.vision_config = vision_config
|
||||||
self.vocab_size = self.vocab_size
|
self.vocab_size = self.vocab_size
|
||||||
|
|
||||||
|
|||||||
BIN
model.safetensors
(Stored with Git LFS)
Normal file
BIN
model.safetensors
(Stored with Git LFS)
Normal file
Binary file not shown.
@@ -26,7 +26,7 @@ import torch.utils.checkpoint as checkpoint
|
|||||||
from torch.nn import CrossEntropyLoss
|
from torch.nn import CrossEntropyLoss
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from einops import rearrange
|
from einops import rearrange
|
||||||
from timm.models.layers import DropPath, trunc_normal_
|
from timm.layers import DropPath, trunc_normal_
|
||||||
|
|
||||||
from transformers.modeling_utils import PreTrainedModel
|
from transformers.modeling_utils import PreTrainedModel
|
||||||
from transformers.generation.utils import GenerationMixin
|
from transformers.generation.utils import GenerationMixin
|
||||||
@@ -610,29 +610,10 @@ class DaViT(nn.Module):
|
|||||||
self.avgpool = nn.AdaptiveAvgPool1d(1)
|
self.avgpool = nn.AdaptiveAvgPool1d(1)
|
||||||
self.head = nn.Linear(self.embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()
|
self.head = nn.Linear(self.embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()
|
||||||
|
|
||||||
self.apply(self._init_weights)
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def dim_out(self):
|
def dim_out(self):
|
||||||
return self.embed_dims[-1]
|
return self.embed_dims[-1]
|
||||||
|
|
||||||
def _init_weights(self, m):
|
|
||||||
if isinstance(m, nn.Linear):
|
|
||||||
trunc_normal_(m.weight, std=0.02)
|
|
||||||
if m.bias is not None:
|
|
||||||
nn.init.constant_(m.bias, 0)
|
|
||||||
elif isinstance(m, nn.Conv2d):
|
|
||||||
nn.init.normal_(m.weight, std=0.02)
|
|
||||||
for name, _ in m.named_parameters():
|
|
||||||
if name in ['bias']:
|
|
||||||
nn.init.constant_(m.bias, 0)
|
|
||||||
elif isinstance(m, nn.LayerNorm):
|
|
||||||
nn.init.constant_(m.weight, 1.0)
|
|
||||||
nn.init.constant_(m.bias, 0)
|
|
||||||
elif isinstance(m, nn.BatchNorm2d):
|
|
||||||
nn.init.constant_(m.weight, 1.0)
|
|
||||||
nn.init.constant_(m.bias, 0)
|
|
||||||
|
|
||||||
def forward_features_unpool(self, x):
|
def forward_features_unpool(self, x):
|
||||||
"""
|
"""
|
||||||
forward until avg pooling
|
forward until avg pooling
|
||||||
@@ -1451,6 +1432,17 @@ class Florence2LanguagePreTrainedModel(PreTrainedModel):
|
|||||||
module.weight.data.normal_(mean=0.0, std=std)
|
module.weight.data.normal_(mean=0.0, std=std)
|
||||||
if module.padding_idx is not None:
|
if module.padding_idx is not None:
|
||||||
module.weight.data[module.padding_idx].zero_()
|
module.weight.data[module.padding_idx].zero_()
|
||||||
|
elif isinstance(module, nn.Conv2d):
|
||||||
|
nn.init.normal_(module.weight, std=0.02)
|
||||||
|
for name, _ in module.named_parameters():
|
||||||
|
if name == "bias":
|
||||||
|
nn.init.constant_(module.bias, 0)
|
||||||
|
elif isinstance(module, nn.LayerNorm):
|
||||||
|
nn.init.constant_(module.weight, 1.0)
|
||||||
|
nn.init.constant_(module.bias, 0)
|
||||||
|
elif isinstance(module, nn.BatchNorm2d):
|
||||||
|
nn.init.constant_(module.weight, 1.0)
|
||||||
|
nn.init.constant_(module.bias, 0)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def dummy_inputs(self):
|
def dummy_inputs(self):
|
||||||
@@ -2074,14 +2066,20 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
|
|||||||
# Initialize weights and apply final processing
|
# Initialize weights and apply final processing
|
||||||
self.post_init()
|
self.post_init()
|
||||||
|
|
||||||
|
def _tie_weights(self):
|
||||||
|
if self.config.tie_word_embeddings:
|
||||||
|
self._tie_or_clone_weights(self.model.encoder.embed_tokens, self.model.shared)
|
||||||
|
self._tie_or_clone_weights(self.model.decoder.embed_tokens, self.model.shared)
|
||||||
|
self._tie_or_clone_weights(self.lm_head, self.model.shared)
|
||||||
|
|
||||||
def get_encoder(self):
|
def get_encoder(self):
|
||||||
return self.model.get_encoder()
|
return self.model.get_encoder()
|
||||||
|
|
||||||
def get_decoder(self):
|
def get_decoder(self):
|
||||||
return self.model.get_decoder()
|
return self.model.get_decoder()
|
||||||
|
|
||||||
def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
|
def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None, **kwargs) -> nn.Embedding:
|
||||||
new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
|
new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of, **kwargs)
|
||||||
self._resize_final_logits_bias(new_embeddings.weight.shape[0])
|
self._resize_final_logits_bias(new_embeddings.weight.shape[0])
|
||||||
return new_embeddings
|
return new_embeddings
|
||||||
|
|
||||||
@@ -2531,6 +2529,8 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
|
|||||||
FLORENCE2_START_DOCSTRING,
|
FLORENCE2_START_DOCSTRING,
|
||||||
)
|
)
|
||||||
class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
|
class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
|
||||||
|
_tied_weights_keys = ["language_model.encoder.embed_tokens.weight", "language_model.decoder.embed_tokens.weight", "language_model.lm_head.weight"]
|
||||||
|
|
||||||
def __init__(self, config: Florence2Config):
|
def __init__(self, config: Florence2Config):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
assert config.vision_config.model_type == 'davit', 'only DaViT is supported for now'
|
assert config.vision_config.model_type == 'davit', 'only DaViT is supported for now'
|
||||||
@@ -2545,8 +2545,6 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
|
|||||||
|
|
||||||
language_model = Florence2LanguageForConditionalGeneration(config=config.text_config)
|
language_model = Florence2LanguageForConditionalGeneration(config=config.text_config)
|
||||||
|
|
||||||
if language_model._tied_weights_keys is not None:
|
|
||||||
self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
|
|
||||||
self.language_model = language_model
|
self.language_model = language_model
|
||||||
|
|
||||||
self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
|
self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
|
||||||
@@ -2589,8 +2587,8 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
|
|||||||
def get_input_embeddings(self):
|
def get_input_embeddings(self):
|
||||||
return self.language_model.get_input_embeddings()
|
return self.language_model.get_input_embeddings()
|
||||||
|
|
||||||
def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
|
def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None, **kwargs) -> nn.Embedding:
|
||||||
model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
|
model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of, **kwargs)
|
||||||
# update vocab size
|
# update vocab size
|
||||||
self.config.text_config.vocab_size = model_embeds.num_embeddings
|
self.config.text_config.vocab_size = model_embeds.num_embeddings
|
||||||
self.config.vocab_size = model_embeds.num_embeddings
|
self.config.vocab_size = model_embeds.num_embeddings
|
||||||
|
|||||||
Reference in New Issue
Block a user