mirror of https://www.modelscope.cn/AI-ModelScope/Florence-2-large.git
synced 2026-04-03 06:02:53 +08:00
Update README.md
.gitattributes (vendored) · 26 lines changed

@@ -1,37 +1,37 @@
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
-*.bin.* filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
-*.zstandard filter=lfs diff=lfs merge=lfs -text
-*.tfevents* filter=lfs diff=lfs merge=lfs -text
-*.db* filter=lfs diff=lfs merge=lfs -text
-*.ark* filter=lfs diff=lfs merge=lfs -text
-**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
-**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
-**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.gguf* filter=lfs diff=lfs merge=lfs -text
-*.ggml filter=lfs diff=lfs merge=lfs -text
-*.llamafile* filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*bin filter=lfs diff=lfs merge=lfs -text
+pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
CODE_OF_CONDUCT.md (new file) · 9 lines

@@ -0,0 +1,9 @@

# Microsoft Open Source Code of Conduct

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).

Resources:

- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
LICENSE (new file) · 21 lines

@@ -0,0 +1,21 @@

MIT License

Copyright (c) Microsoft Corporation.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md · 289 lines changed

@@ -1,47 +1,256 @@

Removed lines are marked with a leading `-`; the remainder is the new README.
-license: Apache License 2.0
-#model-type:
-##e.g. gpt, phi, llama, chatglm, baichuan, etc.
-#- gpt
-#domain:
-##e.g. nlp, cv, audio, multi-modal
-#- nlp
-#language:
-##language-code list: https://help.aliyun.com/document_detail/215387.html?spm=a2c4g.11186623.0.0.9f8d7467kni6Aa
-#- cn
-#metrics:
-##e.g. CIDEr, BLEU, ROUGE, etc.
-#- CIDEr
-#tags:
-##anything custom, including training methods such as pretrained, fine-tuned, instruction-tuned, RL-tuned, etc.
-#- pretrained
-#tools:
-##e.g. vllm, fastchat, llamacpp, AdaSeq, etc.
-#- vllm
-### The contributors of this model have not yet provided a more detailed introduction. Model files and weights can be browsed on the "Model files" page.
-#### You can download the model with the git clone command below, or via the ModelScope SDK
-
-SDK download
-```bash
-#install ModelScope
-pip install modelscope
-```
-```python
-#SDK model download
-from modelscope import snapshot_download
-model_dir = snapshot_download('AI-ModelScope/Florence-2-large')
-```
-Git download
-```
-#Git model download
-git clone https://www.modelscope.cn/AI-ModelScope/Florence-2-large.git
-```
-<p style="color: lightgrey;">If you are a contributor to this model, we invite you to complete the model card promptly, following the <a href="https://modelscope.cn/docs/ModelScope%E6%A8%A1%E5%9E%8B%E6%8E%A5%E5%85%A5%E6%B5%81%E7%A8%8B%E6%A6%82%E8%A7%88" style="color: lightgrey; text-decoration: underline;">model contribution documentation</a>.</p>

---
license: mit
license_link: https://huggingface.co/microsoft/Florence-2-large/resolve/main/LICENSE
pipeline_tag: image-to-text
tags:
- vision
---

# Florence-2: Advancing a Unified Representation for a Variety of Vision Tasks

## Model Summary

This Hub repository contains a Hugging Face `transformers` implementation of the Florence-2 model from Microsoft.

Florence-2 is an advanced vision foundation model that uses a prompt-based approach to handle a wide range of vision and vision-language tasks. Florence-2 can interpret simple text prompts to perform tasks like captioning, object detection, and segmentation. It leverages our FLD-5B dataset, containing 5.4 billion annotations across 126 million images, to master multi-task learning. The model's sequence-to-sequence architecture enables it to excel in both zero-shot and fine-tuned settings, proving to be a competitive vision foundation model.

Resources and Technical Documentation:

+ [Florence-2 technical report](https://arxiv.org/abs/2311.06242)
+ [Jupyter Notebook for inference and visualization of Florence-2-large](https://huggingface.co/microsoft/Florence-2-large/blob/main/sample_inference.ipynb)

| Model | Model size | Model Description |
| ------- | ------------- | ------------- |
| Florence-2-base [[HF]](https://huggingface.co/microsoft/Florence-2-base) | 0.23B | Pretrained model with FLD-5B |
| Florence-2-large [[HF]](https://huggingface.co/microsoft/Florence-2-large) | 0.77B | Pretrained model with FLD-5B |
| Florence-2-base-ft [[HF]](https://huggingface.co/microsoft/Florence-2-base-ft) | 0.23B | Finetuned model on a collection of downstream tasks |
| Florence-2-large-ft [[HF]](https://huggingface.co/microsoft/Florence-2-large-ft) | 0.77B | Finetuned model on a collection of downstream tasks |

## How to Get Started with the Model

Use the code below to get started with the model.

```python
import requests
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)

prompt = "<OD>"

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(text=prompt, images=image, return_tensors="pt")

generated_ids = model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    max_new_tokens=1024,
    num_beams=3,
    do_sample=False,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

parsed_answer = processor.post_process_generation(generated_text, task="<OD>", image_size=(image.width, image.height))

print(parsed_answer)
```
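The quick start above runs on CPU in full precision. Below is a minimal sketch of the same flow on a GPU in half precision, assuming a CUDA device is available; the device and dtype choices here are assumptions, not defaults shipped in this commit.

```python
# Sketch: the quick-start flow above, moved to GPU and float16 when available.
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-large", torch_dtype=torch_dtype, trust_remote_code=True
).to(device)
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
image = Image.open(requests.get(url, stream=True).raw)

# Move the input tensors to the same device/dtype as the model before generating.
inputs = processor(text="<OD>", images=image, return_tensors="pt").to(device, torch_dtype)

generated_ids = model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    max_new_tokens=1024,
    num_beams=3,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
parsed_answer = processor.post_process_generation(
    generated_text, task="<OD>", image_size=(image.width, image.height)
)
print(parsed_answer)
```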
## Tasks

This model can perform a variety of tasks simply by changing the prompt.

First, let's define a function to run a prompt (a `return` is added here so later examples can reuse the parsed result).

<details>
<summary> Click to expand </summary>

```python
import requests

from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
image = Image.open(requests.get(url, stream=True).raw)

def run_example(task_prompt, text_input=None):
    if text_input is None:
        prompt = task_prompt
    else:
        prompt = task_prompt + text_input
    inputs = processor(text=prompt, images=image, return_tensors="pt")
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(generated_text, task=task_prompt, image_size=(image.width, image.height))
    print(parsed_answer)
    return parsed_answer
```
</details>
Here are the tasks `Florence-2` can perform:

<details>
<summary> Click to expand </summary>

### Caption
```python
prompt = "<CAPTION>"
run_example(prompt)
```

### Detailed Caption
```python
prompt = "<DETAILED_CAPTION>"
run_example(prompt)
```

### More Detailed Caption
```python
prompt = "<MORE_DETAILED_CAPTION>"
run_example(prompt)
```

### Caption to Phrase Grounding
The caption-to-phrase-grounding task requires an additional text input, i.e. the caption.

Caption to phrase grounding results format:
{'\<CAPTION_TO_PHRASE_GROUNDING>': {'bboxes': [[x1, y1, x2, y2], ...], 'labels': ['', '', ...]}}
```python
task_prompt = "<CAPTION_TO_PHRASE_GROUNDING>"
results = run_example(task_prompt, text_input="A green car parked in front of a yellow building.")
```

### Object Detection

OD results format:
{'\<OD>': {'bboxes': [[x1, y1, x2, y2], ...], 'labels': ['label1', 'label2', ...]}}

```python
prompt = "<OD>"
run_example(prompt)
```

### Dense Region Caption
Dense region caption results format:
{'\<DENSE_REGION_CAPTION>': {'bboxes': [[x1, y1, x2, y2], ...], 'labels': ['label1', 'label2', ...]}}
```python
prompt = "<DENSE_REGION_CAPTION>"
run_example(prompt)
```

### Region proposal
Region proposal results format:
{'\<REGION_PROPOSAL>': {'bboxes': [[x1, y1, x2, y2], ...], 'labels': ['', '', ...]}}
```python
prompt = "<REGION_PROPOSAL>"
run_example(prompt)
```

### OCR

```python
prompt = "<OCR>"
run_example(prompt)
```

### OCR with Region
OCR with region output format:
{'\<OCR_WITH_REGION>': {'quad_boxes': [[x1, y1, x2, y2, x3, y3, x4, y4], ...], 'labels': ['text1', ...]}}
```python
prompt = "<OCR_WITH_REGION>"
run_example(prompt)
```

For more detailed examples, please refer to the [notebook](https://huggingface.co/microsoft/Florence-2-large/blob/main/sample_inference.ipynb).

</details>
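The linked notebook also visualizes these outputs. Below is a minimal sketch of drawing `<OD>` detections with Pillow; it assumes the `run_example` defined above (including the added `return`) and the documented `{'bboxes': ..., 'labels': ...}` output format.

```python
# Sketch: render the '<OD>' detections returned by run_example onto the image.
from PIL import ImageDraw

parsed_answer = run_example("<OD>")
detections = parsed_answer["<OD>"]  # {'bboxes': [[x1, y1, x2, y2], ...], 'labels': [...]}

annotated = image.copy()
draw = ImageDraw.Draw(annotated)
for (x1, y1, x2, y2), label in zip(detections["bboxes"], detections["labels"]):
    draw.rectangle([x1, y1, x2, y2], outline="red", width=3)
    draw.text((x1, y1), label, fill="red")
annotated.save("od_result.png")
```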
# Benchmarks

## Florence-2 Zero-shot performance

The following table presents the zero-shot performance of generalist vision foundation models on image captioning and object detection evaluation tasks. These models have not been exposed to the training data of the evaluation tasks during their training phase.

| Method | #params | COCO Cap. test CIDEr | NoCaps val CIDEr | TextCaps val CIDEr | COCO Det. val2017 mAP |
|--------|---------|----------------------|------------------|--------------------|-----------------------|
| Flamingo | 80B | 84.3 | - | - | - |
| Florence-2-base | 0.23B | 133.0 | 118.7 | 70.1 | 34.7 |
| Florence-2-large | 0.77B | 135.6 | 120.8 | 72.8 | 37.5 |

The following table continues the comparison with performance on other vision-language evaluation tasks.

| Method | Flickr30k test R@1 | RefCOCO val Accuracy | RefCOCO test-A Accuracy | RefCOCO test-B Accuracy | RefCOCO+ val Accuracy | RefCOCO+ test-A Accuracy | RefCOCO+ test-B Accuracy | RefCOCOg val Accuracy | RefCOCOg test Accuracy | RefCOCO RES val mIoU |
|--------|----------------------|----------------------|-------------------------|-------------------------|-----------------------|--------------------------|--------------------------|-----------------------|------------------------|----------------------|
| Kosmos-2 | 78.7 | 52.3 | 57.4 | 47.3 | 45.5 | 50.7 | 42.2 | 60.6 | 61.7 | - |
| Florence-2-base | 83.6 | 53.9 | 58.4 | 49.7 | 51.5 | 56.4 | 47.9 | 66.3 | 65.1 | 34.6 |
| Florence-2-large | 84.4 | 56.3 | 61.6 | 51.4 | 53.6 | 57.9 | 49.9 | 68.0 | 67.0 | 35.8 |

## Florence-2 finetuned performance

We finetune Florence-2 models on a collection of downstream tasks, resulting in two generalist models, *Florence-2-base-ft* and *Florence-2-large-ft*, that can conduct a wide range of downstream tasks.

The table below compares the performance of specialist and generalist models on various captioning and Visual Question Answering (VQA) tasks. Specialist models are fine-tuned specifically for each task, whereas generalist models are fine-tuned in a task-agnostic manner across all tasks. The symbol "▲" indicates the usage of external OCR as input.

| Method | # Params | COCO Caption Karpathy test CIDEr | NoCaps val CIDEr | TextCaps val CIDEr | VQAv2 test-dev Acc | TextVQA test-dev Acc | VizWiz VQA test-dev Acc |
|----------------|----------|-----------------------------------|------------------|--------------------|--------------------|----------------------|-------------------------|
| **Specialist Models** | | | | | | | |
| CoCa | 2.1B | 143.6 | 122.4 | - | 82.3 | - | - |
| BLIP-2 | 7.8B | 144.5 | 121.6 | - | 82.2 | - | - |
| GIT2 | 5.1B | 145.0 | 126.9 | 148.6 | 81.7 | 67.3 | 71.0 |
| Flamingo | 80B | 138.1 | - | - | 82.0 | 54.1 | 65.7 |
| PaLI | 17B | 149.1 | 127.0 | 160.0▲ | 84.3 | 58.8 / 73.1▲ | 71.6 / 74.4▲ |
| PaLI-X | 55B | 149.2 | 126.3 | 147.0 / 163.7▲ | 86.0 | 71.4 / 80.8▲ | 70.9 / 74.6▲ |
| **Generalist Models** | | | | | | | |
| Unified-IO | 2.9B | - | 100.0 | - | 77.9 | - | 57.4 |
| Florence-2-base-ft | 0.23B | 140.0 | 116.7 | 143.9 | 79.7 | 63.6 | 63.6 |
| Florence-2-large-ft | 0.77B | 143.3 | 124.9 | 151.1 | 81.7 | 73.5 | 72.6 |

| Method | # Params | COCO Det. val2017 mAP | Flickr30k test R@1 | RefCOCO val Accuracy | RefCOCO test-A Accuracy | RefCOCO test-B Accuracy | RefCOCO+ val Accuracy | RefCOCO+ test-A Accuracy | RefCOCO+ test-B Accuracy | RefCOCOg val Accuracy | RefCOCOg test Accuracy | RefCOCO RES val mIoU |
|----------------------|----------|-----------------------|--------------------|----------------------|-------------------------|-------------------------|------------------------|---------------------------|---------------------------|------------------------|-----------------------|------------------------|
| **Specialist Models** | | | | | | | | | | | | |
| SeqTR | - | - | - | 83.7 | 86.5 | 81.2 | 71.5 | 76.3 | 64.9 | 74.9 | 74.2 | - |
| PolyFormer | - | - | - | 90.4 | 92.9 | 87.2 | 85.0 | 89.8 | 78.0 | 85.8 | 85.9 | 76.9 |
| UNINEXT | 0.74B | 60.6 | - | 92.6 | 94.3 | 91.5 | 85.2 | 89.6 | 79.8 | 88.7 | 89.4 | - |
| Ferret | 13B | - | - | 89.5 | 92.4 | 84.4 | 82.8 | 88.1 | 75.2 | 85.8 | 86.3 | - |
| **Generalist Models** | | | | | | | | | | | | |
| UniTAB | - | - | - | 88.6 | 91.1 | 83.8 | 81.0 | 85.4 | 71.6 | 84.6 | 84.7 | - |
| Florence-2-base-ft | 0.23B | 41.4 | 84.0 | 92.6 | 94.8 | 91.5 | 86.8 | 91.7 | 82.2 | 89.8 | 82.2 | 78.0 |
| Florence-2-large-ft | 0.77B | 43.4 | 85.2 | 93.4 | 95.3 | 92.0 | 88.3 | 92.9 | 83.6 | 91.2 | 91.7 | 80.5 |

## BibTex and citation info

```
@article{xiao2023florence,
  title={Florence-2: Advancing a unified representation for a variety of vision tasks},
  author={Xiao, Bin and Wu, Haiping and Xu, Weijian and Dai, Xiyang and Hu, Houdong and Lu, Yumao and Zeng, Michael and Liu, Ce and Yuan, Lu},
  journal={arXiv preprint arXiv:2311.06242},
  year={2023}
}
```
SECURITY.md (new file) · 41 lines

@@ -0,0 +1,41 @@

<!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->

## Security

Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).

If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.

## Reporting Security Issues

**Please do not report security vulnerabilities through public GitHub issues.**

Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).

If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).

You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).

Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:

* Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
* Full paths of source file(s) related to the manifestation of the issue
* The location of the affected source code (tag/branch/commit or direct URL)
* Any special configuration required to reproduce the issue
* Step-by-step instructions to reproduce the issue
* Proof-of-concept or exploit code (if possible)
* Impact of the issue, including how an attacker might exploit the issue

This information will help us triage your report more quickly.

If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.

## Preferred Languages

We prefer all communications to be in English.

## Policy

Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).

<!-- END MICROSOFT SECURITY.MD BLOCK -->
SUPPORT.md (new file) · 25 lines

@@ -0,0 +1,25 @@

# TODO: The maintainer of this repo has not yet edited this file

**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?

- **No CSS support:** Fill out this template with information about how to file issues and get help.
- **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
- **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.

*Then remove this first heading from this SUPPORT.MD file before publishing your repo.*

# Support

## How to file issues and get help

This project uses GitHub Issues to track bugs and feature requests. Please search the existing issues before filing new issues to avoid duplicates. For new issues, file your bug or feature request as a new Issue.

For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER CHANNEL. WHERE WILL YOU HELP PEOPLE?**.

## Microsoft Support Policy

Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
config.json (new file) · 85 lines

@@ -0,0 +1,85 @@

{
  "_name_or_path": "florence2",
  "architectures": [
    "Florence2ForConditionalGeneration"
  ],
  "auto_map": {
    "AutoConfig": "configuration_florence2.Florence2Config",
    "AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration"
  },
  "bos_token_id": 2,
  "eos_token_id": 1,
  "ignore_index": -100,
  "model_type": "florence2",
  "pad_token_id": 0,
  "projection_dim": 1024,
  "text_config": {
    "vocab_size": 51289,
    "activation_dropout": 0.1,
    "activation_function": "gelu",
    "add_bias_logits": false,
    "add_final_layer_norm": false,
    "attention_dropout": 0.1,
    "bos_token_id": 0,
    "classif_dropout": 0.1,
    "classifier_dropout": 0.0,
    "d_model": 1024,
    "decoder_attention_heads": 16,
    "decoder_ffn_dim": 4096,
    "decoder_layerdrop": 0.0,
    "decoder_layers": 12,
    "decoder_start_token_id": 2,
    "dropout": 0.1,
    "early_stopping": true,
    "encoder_attention_heads": 16,
    "encoder_ffn_dim": 4096,
    "encoder_layerdrop": 0.0,
    "encoder_layers": 12,
    "eos_token_id": 2,
    "forced_eos_token_id": 2,
    "forced_bos_token_id": 0,
    "gradient_checkpointing": false,
    "init_std": 0.02,
    "is_encoder_decoder": true,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1,
      "LABEL_2": 2
    },
    "max_position_embeddings": 1024,
    "no_repeat_ngram_size": 3,
    "normalize_before": false,
    "num_hidden_layers": 12,
    "pad_token_id": 1,
    "scale_embedding": false,
    "num_beams": 3
  },
  "vision_config": {
    "model_type": "davit",
    "drop_path_rate": 0.1,
    "patch_size": [7, 3, 3, 3],
    "patch_stride": [4, 2, 2, 2],
    "patch_padding": [3, 1, 1, 1],
    "patch_prenorm": [false, true, true, true],
    "enable_checkpoint": false,
    "dim_embed": [256, 512, 1024, 2048],
    "num_heads": [8, 16, 32, 64],
    "num_groups": [8, 16, 32, 64],
    "depths": [1, 1, 9, 1],
    "window_size": 12,
    "projection_dim": 1024,
    "visual_temporal_embedding": {
      "type": "COSINE",
      "max_temporal_embeddings": 100
    },
    "image_pos_embed": {
      "type": "learned_abs_2d",
      "max_pos_embeddings": 50
    },
    "image_feature_source": ["spatial_avg_pool", "temporal_avg_pool"]
  },
  "vocab_size": 51289,
  "torch_dtype": "float32",
  "transformers_version": "4.41.0.dev0",
  "is_encoder_decoder": true
}
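The `auto_map` block is what routes `AutoConfig` and `AutoModelForCausalLM` to the bundled `configuration_florence2.py` and `modeling_florence2.py`, which is why the README passes `trust_remote_code=True`. A minimal sketch of inspecting the resolved config (not part of the commit):

```python
# Sketch: load and inspect the custom Florence2Config through transformers.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)

print(config.model_type)                # "florence2"
print(config.vision_config.model_type)  # "davit" vision tower
print(config.vision_config.dim_embed)   # [256, 512, 1024, 2048] stage widths
print(config.text_config.d_model)       # 1024, a BART-large-sized encoder-decoder
print(config.text_config.encoder_layers, config.text_config.decoder_layers)  # 12 12
```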
configuration.json (new file) · 1 line

@@ -0,0 +1 @@

{"framework": "pytorch", "task": "image-caption", "allow_remote": true}
configuration_florence2.py (new file) · 340 lines

@@ -0,0 +1,340 @@

# coding=utf-8
# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Florence-2 configuration"""

import warnings
from typing import Optional

from transformers import AutoConfig
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)
class Florence2VisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Florence2VisionModel`]. It is used to instantiate a Florence2VisionModel
    according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Florence2VisionModel architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        drop_path_rate (`float`, *optional*, defaults to 0.1):
            The dropout rate of the drop path layer.
        patch_size (`List[int]`, *optional*, defaults to [7, 3, 3, 3]):
            The patch size of the image.
        patch_stride (`List[int]`, *optional*, defaults to [4, 2, 2, 2]):
            The patch stride of the image.
        patch_padding (`List[int]`, *optional*, defaults to [3, 1, 1, 1]):
            The patch padding of the image.
        patch_prenorm (`List[bool]`, *optional*, defaults to [False, True, True, True]):
            Whether to apply layer normalization before the patch embedding layer.
        enable_checkpoint (`bool`, *optional*, defaults to False):
            Whether to enable checkpointing.
        dim_embed (`List[int]`, *optional*, defaults to [256, 512, 1024, 2048]):
            The dimension of the embedding layer.
        num_heads (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
            The number of attention heads.
        num_groups (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
            The number of groups.
        depths (`List[int]`, *optional*, defaults to [1, 1, 9, 1]):
            The depth of the model.
        window_size (`int`, *optional*, defaults to 12):
            The window size of the model.
        projection_dim (`int`, *optional*, defaults to 1024):
            The dimension of the projection layer.
        visual_temporal_embedding (`dict`, *optional*):
            The configuration of the visual temporal embedding.
        image_pos_embed (`dict`, *optional*):
            The configuration of the image position embedding.
        image_feature_source (`List[str]`, *optional*, defaults to ["spatial_avg_pool", "temporal_avg_pool"]):
            The source of the image feature.

    Example:

    ```python
    >>> from transformers import Florence2VisionConfig, Florence2VisionModel

    >>> # Initializing a Florence2 Vision style configuration
    >>> configuration = Florence2VisionConfig()

    >>> # Initializing a model (with random weights)
    >>> model = Florence2VisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "florence2_vision"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        drop_path_rate=0.1,
        patch_size=[7, 3, 3, 3],
        patch_stride=[4, 2, 2, 2],
        patch_padding=[3, 1, 1, 1],
        patch_prenorm=[False, True, True, True],
        enable_checkpoint=False,
        dim_embed=[256, 512, 1024, 2048],
        num_heads=[8, 16, 32, 64],
        num_groups=[8, 16, 32, 64],
        depths=[1, 1, 9, 1],
        window_size=12,
        projection_dim=1024,
        visual_temporal_embedding=None,
        image_pos_embed=None,
        image_feature_source=["spatial_avg_pool", "temporal_avg_pool"],
        **kwargs,
    ):
        self.drop_path_rate = drop_path_rate
        self.patch_size = patch_size
        self.patch_stride = patch_stride
        self.patch_padding = patch_padding
        self.patch_prenorm = patch_prenorm
        self.enable_checkpoint = enable_checkpoint
        self.dim_embed = dim_embed
        self.num_heads = num_heads
        self.num_groups = num_groups
        self.depths = depths
        self.window_size = window_size
        self.projection_dim = projection_dim
        self.visual_temporal_embedding = visual_temporal_embedding
        self.image_pos_embed = image_pos_embed
        self.image_feature_source = image_feature_source

        super().__init__(**kwargs)
class Florence2LanguageConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Florence2LanguagePreTrainedModel`]. It is used to instantiate a BART
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the BART
    [facebook/bart-large](https://huggingface.co/facebook/bart-large) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 51289):
            Vocabulary size of the Florence2Language model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Florence2LanguageModel`].
        d_model (`int`, *optional*, defaults to 1024):
            Dimensionality of the layers and the pooler layer.
        encoder_layers (`int`, *optional*, defaults to 12):
            Number of encoder layers.
        decoder_layers (`int`, *optional*, defaults to 12):
            Number of decoder layers.
        encoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the decoder.
        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the encoder.
        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for activations inside the fully connected layer.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the classifier.
        max_position_embeddings (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        init_std (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
            for more details.
        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
            for more details.
        scale_embedding (`bool`, *optional*, defaults to `False`):
            Scale embeddings by dividing by sqrt(d_model).
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        num_labels (`int`, *optional*, defaults to 3):
            The number of labels to use in [`Florence2LanguageForSequenceClassification`].
        forced_eos_token_id (`int`, *optional*, defaults to 2):
            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
            `eos_token_id`.

    Example:

    ```python
    >>> from transformers import Florence2LanguageConfig, Florence2LanguageModel

    >>> # Initializing a Florence2 Language style configuration
    >>> configuration = Florence2LanguageConfig()

    >>> # Initializing a model (with random weights)
    >>> model = Florence2LanguageModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "florence2_language"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

    def __init__(
        self,
        vocab_size=51289,
        max_position_embeddings=1024,
        encoder_layers=12,
        encoder_ffn_dim=4096,
        encoder_attention_heads=16,
        decoder_layers=12,
        decoder_ffn_dim=4096,
        decoder_attention_heads=16,
        encoder_layerdrop=0.0,
        decoder_layerdrop=0.0,
        activation_function="gelu",
        d_model=1024,
        dropout=0.1,
        attention_dropout=0.0,
        activation_dropout=0.0,
        init_std=0.02,
        classifier_dropout=0.0,
        scale_embedding=False,
        use_cache=True,
        num_labels=3,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        is_encoder_decoder=True,
        decoder_start_token_id=2,
        forced_eos_token_id=2,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.d_model = d_model
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop
        self.classifier_dropout = classifier_dropout
        self.use_cache = use_cache
        self.num_hidden_layers = encoder_layers
        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True

        super().__init__(
            num_labels=num_labels,
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            decoder_start_token_id=decoder_start_token_id,
            forced_eos_token_id=forced_eos_token_id,
            **kwargs,
        )

        # ensure backward compatibility for BART CNN models
        if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
            self.forced_bos_token_id = self.bos_token_id
            warnings.warn(
                f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
                "The config can simply be saved and uploaded again to be fixed."
            )
class Florence2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate a
    Florence-2 model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`Florence2VisionConfig`, *optional*):
            Custom vision config or dict.
        text_config (`Union[AutoConfig, dict]`, *optional*):
            The config object of the text backbone.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        vocab_size (`int`, *optional*, defaults to 51289):
            Vocabulary size of the Florence2 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`~Florence2ForConditionalGeneration`].
        projection_dim (`int`, *optional*, defaults to 1024):
            Dimension of the multimodal projection space.

    Example:

    ```python
    >>> from transformers import Florence2ForConditionalGeneration, Florence2Config, CLIPVisionConfig, BartConfig

    >>> # Initializing a clip-like vision config
    >>> vision_config = CLIPVisionConfig()

    >>> # Initializing a Bart config
    >>> text_config = BartConfig()

    >>> # Initializing a Florence-2 configuration
    >>> configuration = Florence2Config(vision_config, text_config)

    >>> # Initializing a model from the florence-2 configuration
    >>> model = Florence2ForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "florence2"
    is_composition = False

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        ignore_index=-100,
        vocab_size=51289,
        projection_dim=1024,
        **kwargs,
    ):
        self.ignore_index = ignore_index
        self.vocab_size = vocab_size
        self.projection_dim = projection_dim
        if vision_config is not None:
            vision_config = PretrainedConfig(**vision_config)
        self.vision_config = vision_config

        self.text_config = text_config
        if text_config is not None:
            self.text_config = Florence2LanguageConfig(**text_config)

        super().__init__(**kwargs)
generation_config.json (new file) · 4 lines

@@ -0,0 +1,4 @@

{
  "num_beams": 3,
  "early_stopping": false
}
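These are the decoding defaults that `model.generate` falls back to when a call does not override them (the README examples pass `num_beams=3` explicitly). A minimal sketch of reading them back:

```python
# Sketch: inspect the checkpoint's generation defaults.
from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained("microsoft/Florence-2-large")
print(gen_config.num_beams)       # 3
print(gen_config.early_stopping)  # False
```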
modeling_florence2.py (new file) · 2841 lines

File diff suppressed because it is too large.
preprocessor_config.json (new file) · 39 lines

@@ -0,0 +1,39 @@

{
  "auto_map": {
    "AutoProcessor": "processing_florence2.Florence2Processor"
  },
  "_valid_processor_keys": [
    "images",
    "do_resize",
    "size",
    "resample",
    "do_rescale",
    "rescale_factor",
    "do_normalize",
    "image_mean",
    "image_std",
    "return_tensors",
    "data_format",
    "input_data_format",
    "do_convert_rgb"
  ],
  "do_convert_rgb": null,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "do_center_crop": false,
  "image_processor_type": "CLIPImageProcessor",
  "image_seq_length": 577,
  "image_mean": [0.485, 0.456, 0.406],
  "image_std": [0.229, 0.224, 0.225],
  "processor_class": "Florence2Processor",
  "resample": 3,
  "size": {
    "height": 768,
    "width": 768
  },
  "crop_size": {
    "height": 768,
    "width": 768
  }
}
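The image side of the processor is a stock `CLIPImageProcessor`: bicubic resize (`resample: 3`) to 768×768, rescaling by 1/255, and ImageNet mean/std normalization, with center-cropping disabled. A minimal sketch of that transform in isolation (the blank placeholder image is only for shape-checking):

```python
# Sketch: the image preprocessing this config describes, run standalone.
from PIL import Image
from transformers import CLIPImageProcessor

image_processor = CLIPImageProcessor(
    do_resize=True,
    size={"height": 768, "width": 768},
    resample=3,                        # PIL bicubic
    do_rescale=True,                   # scale pixel values by 1/255
    do_normalize=True,
    image_mean=[0.485, 0.456, 0.406],  # ImageNet statistics
    image_std=[0.229, 0.224, 0.225],
    do_center_crop=False,
)

image = Image.new("RGB", (640, 480))  # placeholder input
pixel_values = image_processor(images=image, return_tensors="pt")["pixel_values"]
print(pixel_values.shape)             # torch.Size([1, 3, 768, 768])
```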
processing_florence2.py (new file) · 1088 lines

File diff suppressed because it is too large.
pytorch_model.bin (new file, stored with Git LFS) · BIN

Binary file not shown.
sample_inference.ipynb (new file) · 919 lines

File diff suppressed because one or more lines are too long.
tokenizer.json (new file) · 1 line

File diff suppressed because one or more lines are too long.
tokenizer_config.json (new file) · 4 lines

@@ -0,0 +1,4 @@

{
  "model_max_length": 1024
}
vocab.json (new file) · 50267 lines

File diff suppressed because it is too large.