Update README.md

2026-05-19 19:32:53 +08:00 · 2024-06-20 10:36:29 +08:00
parent 16042094a1
commit bdfe4cb6aa
18 changed files with 55950 additions and 53 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -1,37 +1,37 @@
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
-*.bin.* filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
-*.zstandard filter=lfs diff=lfs merge=lfs -text
-*.tfevents* filter=lfs diff=lfs merge=lfs -text
-*.db* filter=lfs diff=lfs merge=lfs -text
-*.ark* filter=lfs diff=lfs merge=lfs -text
-**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
-**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
-**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.gguf* filter=lfs diff=lfs merge=lfs -text
-*.ggml filter=lfs diff=lfs merge=lfs -text
-*.llamafile* filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*bin filter=lfs diff=lfs merge=lfs -text
+pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@ -0,0 +1,9 @@
+# Microsoft Open Source Code of Conduct
+
+This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
+
+Resources:
+
+- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
+- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
+- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
--- a/21
+++ b/21
@ -0,0 +1,21 @@
+    MIT License
+
+    Copyright (c) Microsoft Corporation.
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy
+    of this software and associated documentation files (the "Software"), to deal
+    in the Software without restriction, including without limitation the rights
+    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+    copies of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be included in all
+    copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+    SOFTWARE
--- a/README.md
+++ b/README.md
@ -1,47 +1,256 @@
 ---
-license: Apache License 2.0
+license: mit
+license_link: https://huggingface.co/microsoft/Florence-2-large/resolve/main/LICENSE

-#model-type:
-##如 gpt、phi、llama、chatglm、baichuan 等
-#- gpt
-
-#domain:
-##如 nlp、cv、audio、multi-modal
-#- nlp
-
-#language:
-##语言代码列表 https://help.aliyun.com/document_detail/215387.html?spm=a2c4g.11186623.0.0.9f8d7467kni6Aa
-#- cn 
-
-#metrics:
-##如 CIDEr、Blue、ROUGE 等
-#- CIDEr
-
-#tags:
-##各种自定义，包括 pretrained、fine-tuned、instruction-tuned、RL-tuned 等训练方法和其他
-#- pretrained
-
-#tools:
-##如 vllm、fastchat、llamacpp、AdaSeq 等
-#- vllm
+pipeline_tag: image-to-text
+tags:
+- vision
 ---
-### 当前模型的贡献者未提供更加详细的模型介绍。模型文件和权重，可浏览“模型文件”页面获取。
-#### 您可以通过如下git clone命令，或者ModelScope SDK来下载模型

-SDK下载
-```bash
-#安装ModelScope
-pip install modelscope
-```
+# Florence-2: Advancing a Unified Representation for a Variety of Vision Tasks
+
+## Model Summary
+
+This Hub repository contains a HuggingFace's `transformers` implementation of Florence-2 model from Microsoft.
+
+Florence-2 is an advanced vision foundation model that uses a prompt-based approach to handle a wide range of vision and vision-language tasks.  Florence-2 can interpret simple text prompts to perform tasks like captioning, object detection, and segmentation. It leverages our FLD-5B dataset, containing 5.4 billion annotations across 126 million images, to master multi-task learning. The model's sequence-to-sequence architecture enables it to excel in both zero-shot and fine-tuned settings, proving to be a competitive vision foundation model. 
+
+Resources and Technical Documentation:
+ [Florence-2 technical report](https://arxiv.org/abs/2311.06242). 
+ [Jupyter Notebook for inference and visualization of Florence-2-large](https://huggingface.co/microsoft/Florence-2-large/blob/main/sample_inference.ipynb)
+
+| Model   | Model size | Model Description | 
+| ------- | ------------- |   ------------- |  
+| Florence-2-base[[HF]](https://huggingface.co/microsoft/Florence-2-base) | 0.23B | Pretrained model with FLD-5B  
+| Florence-2-large[[HF]](https://huggingface.co/microsoft/Florence-2-large) | 0.77B  | Pretrained model with FLD-5B  
+| Florence-2-base-ft[[HF]](https://huggingface.co/microsoft/Florence-2-base-ft) | 0.23B  | Finetuned model on a colletion of downstream tasks
+| Florence-2-large-ft[[HF]](https://huggingface.co/microsoft/Florence-2-large-ft) | 0.77B | Finetuned model on a colletion of downstream tasks
+ 
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
 ```python
-#SDK模型下载
-from modelscope import snapshot_download
-model_dir = snapshot_download('AI-ModelScope/Florence-2-large')
-```
-Git下载
-```
-#Git模型下载
-git clone https://www.modelscope.cn/AI-ModelScope/Florence-2-large.git
+import requests
+
+from PIL import Image
+from transformers import AutoProcessor, AutoModelForCausalLM 
+
+
+model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)
+processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)
+
+prompt = "<OD>"
+
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
+image = Image.open(requests.get(url, stream=True).raw)
+
+inputs = processor(text=prompt, images=image, return_tensors="pt")
+
+generated_ids = model.generate(
+    input_ids=inputs["input_ids"],
+    pixel_values=inputs["pixel_values"],
+    max_new_tokens=1024,
+    num_beams=3,
+    do_sample=False
+)
+generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+
+parsed_answer = processor.post_process_generation(generated_text, task="<OD>", image_size=(image.width, image.height))
+
+print(parsed_answer)
+
 ```

-<p style="color: lightgrey;">如果您是本模型的贡献者，我们邀请您根据<a href="https://modelscope.cn/docs/ModelScope%E6%A8%A1%E5%9E%8B%E6%8E%A5%E5%85%A5%E6%B5%81%E7%A8%8B%E6%A6%82%E8%A7%88" style="color: lightgrey; text-decoration: underline;">模型贡献文档</a>，及时完善模型卡片内容。</p>
+
+## Tasks
+
+This model is capable of performing different tasks through changing the prompts.
+
+First, let's define a function to run a prompt.
+
+<details>
+<summary> Click to expand </summary>
+
+```python
+import requests
+
+from PIL import Image
+from transformers import AutoProcessor, AutoModelForCausalLM 
+
+
+model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)
+processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)
+
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
+image = Image.open(requests.get(url, stream=True).raw)
+
+def run_example(task_prompt, text_input=None):
+    if text_input is None:
+        prompt = task_prompt
+    else:
+        prompt = task_prompt + text_input
+    inputs = processor(text=prompt, images=image, return_tensors="pt")
+    generated_ids = model.generate(
+      input_ids=inputs["input_ids"],
+      pixel_values=inputs["pixel_values"],
+      max_new_tokens=1024,
+      num_beams=3
+    )
+    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+
+    parsed_answer = processor.post_process_generation(generated_text, task=task_prompt, image_size=(image.width, image.height))
+
+    print(parsed_answer)
+```
+</details>
+
+Here are the tasks `Florence-2` could perform:
+
+<details>
+<summary> Click to expand </summary>
+
+
+
+### Caption
+```python
+prompt = "<CAPTION>"
+run_example(prompt)
+```
+
+### Detailed Caption
+```python
+prompt = "<DETAILED_CAPTION>"
+run_example(prompt)
+```
+
+### More Detailed Caption
+```python
+prompt = "<MORE_DETAILED_CAPTION>"
+run_example(prompt)
+```
+
+### Caption to Phrase Grounding 
+caption to phrase grounding task requires additional text input, i.e. caption. 
+
+Caption to phrase grounding results format: 
+{'\<CAPTION_TO_PHRASE_GROUNDING>': {'bboxes': [[x1, y1, x2, y2], ...], 'labels': ['', '', ...]}}
+```python
+task_prompt = "<CAPTION_TO_PHRASE_GROUNDING>"
+results = run_example(task_prompt, text_input="A green car parked in front of a yellow building.")
+```
+
+### Object Detection
+
+OD results format: 
+{'\<OD>': {'bboxes': [[x1, y1, x2, y2], ...], 
+'labels': ['label1', 'label2', ...]} }
+
+```python
+prompt = "<OD>"
+run_example(prompt)
+```
+
+### Dense Region Caption
+Dense region caption results format: 
+{'\<DENSE_REGION_CAPTION>' : {'bboxes': [[x1, y1, x2, y2], ...], 
+'labels': ['label1', 'label2', ...]} }
+```python
+prompt = "<DENSE_REGION_CAPTION>"
+run_example(prompt)
+```
+
+### Region proposal
+Dense region caption results format: 
+{'\<REGION_PROPOSAL>': {'bboxes': [[x1, y1, x2, y2], ...], 
+'labels': ['', '', ...]}}
+```python
+prompt = "<REGION_PROPOSAL>"
+run_example(prompt)
+```
+
+### OCR 
+
+```python
+prompt = "<OCR>"
+run_example(prompt)
+```
+
+### OCR with Region
+OCR with region output format:
+{'\<OCR_WITH_REGION>': {'quad_boxes': [[x1, y1, x2, y2, x3, y3, x4, y4], ...], 'labels': ['text1', ...]}}
+```python
+prompt = "<OCR_WITH_REGION>"
+run_example(prompt)
+```
+
+for More detailed examples, please refer to [notebook](https://huggingface.co/microsoft/Florence-2-large/blob/main/sample_inference.ipynb)
+</details>
+
+# Benchmarks
+
+## Florence-2 Zero-shot performance
+  
+The following table presents the zero-shot performance of generalist vision foundation models on image captioning and object detection evaluation tasks. These models have not been exposed to the training data of the evaluation tasks during their training phase.  
+  
+| Method | #params | COCO Cap. test CIDEr | NoCaps val CIDEr | TextCaps val CIDEr | COCO Det. val2017 mAP |  
+|--------|---------|----------------------|------------------|--------------------|-----------------------|
+| Flamingo | 80B | 84.3 | - | - | - | 
+| Florence-2-base| 0.23B | 133.0 | 118.7 | 70.1 | 34.7 | 
+| Florence-2-large| 0.77B | 135.6 | 120.8 | 72.8 | 37.5 |
+
+  
+The following table continues the comparison with performance on other vision-language evaluation tasks.  
+  
+| Method | Flickr30k test R@1 | Refcoco val Accuracy | Refcoco test-A Accuracy | Refcoco test-B Accuracy | Refcoco+ val Accuracy | Refcoco+ test-A Accuracy | Refcoco+ test-B Accuracy | Refcocog val Accuracy | Refcocog test Accuracy | Refcoco RES val mIoU |  
+|--------|----------------------|----------------------|-------------------------|-------------------------|-----------------------|--------------------------|--------------------------|-----------------------|------------------------|----------------------|  
+| Kosmos-2 | 78.7 | 52.3 | 57.4 | 47.3 | 45.5 | 50.7 | 42.2 | 60.6 | 61.7 | - |  
+| Florence-2-base | 83.6 | 53.9 | 58.4 | 49.7 | 51.5 | 56.4 | 47.9 | 66.3 | 65.1 | 34.6 |  
+| Florence-2-large | 84.4 | 56.3 | 61.6 | 51.4 | 53.6 | 57.9 | 49.9 | 68.0 | 67.0 | 35.8 |  
+
+
+
+## Florence-2 finetuned performance 
+
+We finetune Florence-2 models with a collection of downstream tasks, resulting two generalist models *Florence-2-base-ft* and *Florence-2-large-ft* that can conduct a wide range of downstream tasks. 
+  
+The table below compares the performance of specialist and generalist models on various captioning and Visual Question Answering (VQA) tasks. Specialist models are fine-tuned specifically for each task, whereas generalist models are fine-tuned in a task-agnostic manner across all tasks. The symbol "▲" indicates the usage of external OCR as input.  
+  
+| Method         | # Params | COCO Caption Karpathy test CIDEr | NoCaps val CIDEr | TextCaps val CIDEr | VQAv2 test-dev Acc | TextVQA test-dev Acc | VizWiz VQA test-dev Acc |  
+|----------------|----------|-----------------------------------|------------------|--------------------|--------------------|----------------------|-------------------------|  
+| **Specialist Models**   |          |                                   |                  |                    |                    |                      |                         |  
+| CoCa           | 2.1B     | 143.6                              | 122.4            | -                  | 82.3               | -                    | -                       |  
+| BLIP-2         | 7.8B     | 144.5                              | 121.6            | -                  | 82.2               | -                    | -                       |  
+| GIT2           | 5.1B     | 145.0                              | 126.9            | 148.6              | 81.7               | 67.3                 | 71.0                    |  
+| Flamingo       | 80B      | 138.1                              | -                | -                  | 82.0               | 54.1                 | 65.7                    |  
+| PaLI           | 17B      | 149.1                              | 127.0            | 160.0▲             | 84.3               | 58.8 / 73.1▲         | 71.6 / 74.4▲            |  
+| PaLI-X         | 55B      | 149.2                              | 126.3            | 147.0 / 163.7▲     | 86.0               | 71.4 / 80.8▲         | 70.9 / 74.6▲            |  
+| **Generalist Models**   |          |                                   |                  |                    |                    |                      |                         |  
+| Unified-IO     | 2.9B     | -                                  | 100.0            | -                  | 77.9               | -                    | 57.4                    |  
+| Florence-2-base-ft | 0.23B  | 140.0                              | 116.7            | 143.9              | 79.7               | 63.6                 | 63.6                    |  
+| Florence-2-large-ft | 0.77B  | 143.3                              | 124.9            | 151.1              | 81.7               | 73.5                 | 72.6                    |  
+  
+  
+| Method               | # Params | COCO Det. val2017 mAP | Flickr30k test R@1 | RefCOCO val Accuracy | RefCOCO test-A Accuracy | RefCOCO test-B Accuracy | RefCOCO+ val Accuracy | RefCOCO+ test-A Accuracy | RefCOCO+ test-B Accuracy | RefCOCOg val Accuracy | RefCOCOg test Accuracy | RefCOCO RES val mIoU |  
+|----------------------|----------|-----------------------|--------------------|----------------------|-------------------------|-------------------------|------------------------|---------------------------|---------------------------|------------------------|-----------------------|------------------------|  
+| **Specialist Models** |          |                       |                    |                      |                         |                         |                        |                           |                           |                        |                       |                        |  
+| SeqTR                | -        | -                     | -                  | 83.7                 | 86.5                    | 81.2                    | 71.5                   | 76.3                      | 64.9                      | 74.9                   | 74.2                  | -                      |  
+| PolyFormer           | -        | -                     | -                  | 90.4                 | 92.9                    | 87.2                    | 85.0                   | 89.8                      | 78.0                      | 85.8                   | 85.9                  | 76.9                   |  
+| UNINEXT              | 0.74B    | 60.6                  | -                  | 92.6                 | 94.3                    | 91.5                    | 85.2                   | 89.6                      | 79.8                      | 88.7                   | 89.4                  | -                      |  
+| Ferret               | 13B      | -                     | -                  | 89.5                 | 92.4                    | 84.4                    | 82.8                   | 88.1                      | 75.2                      | 85.8                   | 86.3                  | -                      |  
+| **Generalist Models** |          |                       |                    |                      |                         |                         |                        |                           |                           |                        |                       |                        |  
+| UniTAB               | -        | -                     | -                  | 88.6                 | 91.1                    | 83.8                    | 81.0                   | 85.4                      | 71.6                      | 84.6                   | 84.7                  | -                      |  
+| Florence-2-base-ft | 0.23B    | 41.4                  | 84.0                | 92.6                 | 94.8                    | 91.5                   | 86.8                   | 91.7                      | 82.2                      | 89.8                   | 82.2                  | 78.0                  |  
+| Florence-2-large-ft| 0.77B    | 43.4                  | 85.2               | 93.4                 | 95.3                    | 92.0                    | 88.3                   | 92.9                      | 83.6                      | 91.2                   | 91.7                  | 80.5                   |  
+  
+
+## BibTex and citation info
+
+```
+@article{xiao2023florence,
+  title={Florence-2: Advancing a unified representation for a variety of vision tasks},
+  author={Xiao, Bin and Wu, Haiping and Xu, Weijian and Dai, Xiyang and Hu, Houdong and Lu, Yumao and Zeng, Michael and Liu, Ce and Yuan, Lu},
+  journal={arXiv preprint arXiv:2311.06242},
+  year={2023}
+}
+```
--- a/SECURITY.md
+++ b/SECURITY.md
@ -0,0 +1,41 @@
+<!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->
+
+## Security
+
+Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
+
+If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
+
+## Reporting Security Issues
+
+**Please do not report security vulnerabilities through public GitHub issues.**
+
+Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
+
+If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
+
+You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 
+
+Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
+
+  * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
+  * Full paths of source file(s) related to the manifestation of the issue
+  * The location of the affected source code (tag/branch/commit or direct URL)
+  * Any special configuration required to reproduce the issue
+  * Step-by-step instructions to reproduce the issue
+  * Proof-of-concept or exploit code (if possible)
+  * Impact of the issue, including how an attacker might exploit the issue
+
+This information will help us triage your report more quickly.
+
+If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
+
+## Preferred Languages
+
+We prefer all communications to be in English.
+
+## Policy
+
+Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
+
+<!-- END MICROSOFT SECURITY.MD BLOCK -->
--- a/SUPPORT.md
+++ b/SUPPORT.md
@ -0,0 +1,25 @@
+# TODO: The maintainer of this repo has not yet edited this file
+
+**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
+
+- **No CSS support:** Fill out this template with information about how to file issues and get help.
+- **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
+- **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.
+
+*Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
+
+# Support
+
+## How to file issues and get help  
+
+This project uses GitHub Issues to track bugs and feature requests. Please search the existing 
+issues before filing new issues to avoid duplicates.  For new issues, file your bug or 
+feature request as a new Issue.
+
+For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 
+FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
+CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
+
+## Microsoft Support Policy  
+
+Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
--- a/config.json
+++ b/config.json
@ -0,0 +1,85 @@
+{
+  "_name_or_path": "florence2",
+  "architectures": [
+    "Florence2ForConditionalGeneration"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_florence2.Florence2Config",
+    "AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration"
+  },
+  "bos_token_id": 2,
+  "eos_token_id": 1,
+  "ignore_index": -100,
+  "model_type": "florence2",
+  "pad_token_id": 0,
+  "projection_dim": 1024,
+  "text_config": {
+      "vocab_size": 51289,
+      "activation_dropout": 0.1,
+      "activation_function": "gelu",
+      "add_bias_logits": false,
+      "add_final_layer_norm": false,
+      "attention_dropout": 0.1,
+      "bos_token_id": 0,
+      "classif_dropout": 0.1,
+      "classifier_dropout": 0.0,
+      "d_model": 1024,
+      "decoder_attention_heads": 16,
+      "decoder_ffn_dim": 4096,
+      "decoder_layerdrop": 0.0,
+      "decoder_layers": 12,
+      "decoder_start_token_id": 2,
+      "dropout": 0.1,
+      "early_stopping": true,
+      "encoder_attention_heads": 16,
+      "encoder_ffn_dim": 4096,
+      "encoder_layerdrop": 0.0,
+      "encoder_layers": 12,
+      "eos_token_id": 2,
+      "forced_eos_token_id": 2,
+      "forced_bos_token_id": 0,
+      "gradient_checkpointing": false,
+      "init_std": 0.02,
+      "is_encoder_decoder": true,
+      "label2id": {
+        "LABEL_0": 0,
+        "LABEL_1": 1,
+        "LABEL_2": 2
+      },
+      "max_position_embeddings": 1024,
+      "no_repeat_ngram_size": 3,
+      "normalize_before": false,
+      "num_hidden_layers": 12,
+      "pad_token_id": 1,
+      "scale_embedding": false,
+      "num_beams": 3
+  },
+  "vision_config": {
+    "model_type": "davit",
+    "drop_path_rate": 0.1,  
+    "patch_size": [7, 3, 3, 3],  
+    "patch_stride": [4, 2, 2, 2],  
+    "patch_padding": [3, 1, 1, 1],  
+    "patch_prenorm": [false, true, true, true],  
+    "enable_checkpoint": false,  
+    "dim_embed": [256, 512, 1024, 2048],  
+    "num_heads": [8, 16, 32, 64],  
+    "num_groups": [8, 16, 32, 64],  
+    "depths": [1, 1, 9, 1],  
+    "window_size": 12,
+    "projection_dim": 1024,
+    "visual_temporal_embedding": {
+        "type": "COSINE",
+        "max_temporal_embeddings": 100
+    },
+    "image_pos_embed": {
+        "type": "learned_abs_2d",
+        "max_pos_embeddings": 50
+    },
+    "image_feature_source": ["spatial_avg_pool", "temporal_avg_pool"]
+  },
+  "vocab_size": 51289,
+  "torch_dtype": "float32",
+  "transformers_version": "4.41.0.dev0",
+  "is_encoder_decoder": true
+}
--- a/configuration.json
+++ b/configuration.json
@ -0,0 +1 @@
+{"framework": "pytorch", "task": "image-caption", "allow_remote": true}
--- a/configuration_florence2.py
+++ b/configuration_florence2.py
@ -0,0 +1,340 @@
+# coding=utf-8
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+""" Florence-2 configuration"""
+
+from typing import Optional
+
+from transformers import AutoConfig
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+class Florence2VisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Florence2VisionModel`]. It is used to instantiate a Florence2VisionModel
+    according to the specified arguments, defining the model architecture. Instantiating a configuration with the 
+    defaults will yield a similar configuration to that of the Florence2VisionModel architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        drop_path_rate (`float`, *optional*, defaults to 0.1):
+            The dropout rate of the drop path layer.
+        patch_size (`List[int]`, *optional*, defaults to [7, 3, 3, 3]):
+            The patch size of the image.
+        patch_stride (`List[int]`, *optional*, defaults to [4, 2, 2, 2]):
+            The patch stride of the image.
+        patch_padding (`List[int]`, *optional*, defaults to [3, 1, 1, 1]):
+            The patch padding of the image.
+        patch_prenorm (`List[bool]`, *optional*, defaults to [false, true, true, true]):
+            Whether to apply layer normalization before the patch embedding layer.
+        enable_checkpoint (`bool`, *optional*, defaults to False):
+            Whether to enable checkpointing.
+        dim_embed (`List[int]`, *optional*, defaults to [256, 512, 1024, 2048]):
+            The dimension of the embedding layer.
+        num_heads (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
+            The number of attention heads.
+        num_groups (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
+            The number of groups.
+        depths (`List[int]`, *optional*, defaults to [1, 1, 9, 1]):
+            The depth of the model.
+        window_size (`int`, *optional*, defaults to 12):
+            The window size of the model.
+        projection_dim (`int`, *optional*, defaults to 1024):
+            The dimension of the projection layer.
+        visual_temporal_embedding (`dict`, *optional*):
+            The configuration of the visual temporal embedding.
+        image_pos_embed (`dict`, *optional*):
+            The configuration of the image position embedding.
+        image_feature_source (`List[str]`, *optional*, defaults to ["spatial_avg_pool", "temporal_avg_pool"]):
+            The source of the image feature.
+    Example:
+
+    ```python
+    >>> from transformers import Florence2VisionConfig, Florence2VisionModel
+
+    >>> # Initializing a Florence2 Vision style configuration
+    >>> configuration = Florence2VisionConfig()
+
+    >>> # Initializing a model (with random weights)
+    >>> model = Florence2VisionModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "florence2_vision"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        drop_path_rate=0.1,
+        patch_size=[7, 3, 3, 3],
+        patch_stride=[4, 2, 2, 2],
+        patch_padding=[3, 1, 1, 1],
+        patch_prenorm=[False, True, True, True],
+        enable_checkpoint=False,
+        dim_embed=[256, 512, 1024, 2048],
+        num_heads=[8, 16, 32, 64],
+        num_groups=[8, 16, 32, 64],
+        depths=[1, 1, 9, 1],
+        window_size=12,
+        projection_dim=1024,
+        visual_temporal_embedding=None,
+        image_pos_embed=None,
+        image_feature_source=["spatial_avg_pool", "temporal_avg_pool"],
+        **kwargs,
+    ):
+        self.drop_path_rate = drop_path_rate
+        self.patch_size = patch_size
+        self.patch_stride = patch_stride
+        self.patch_padding = patch_padding
+        self.patch_prenorm = patch_prenorm
+        self.enable_checkpoint = enable_checkpoint
+        self.dim_embed = dim_embed
+        self.num_heads = num_heads
+        self.num_groups = num_groups
+        self.depths = depths
+        self.window_size = window_size
+        self.projection_dim = projection_dim
+        self.visual_temporal_embedding = visual_temporal_embedding
+        self.image_pos_embed = image_pos_embed
+        self.image_feature_source = image_feature_source
+
+        super().__init__(**kwargs)
+
+
+
+class Florence2LanguageConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Florence2LanguagePreTrainedModel`]. It is used to instantiate a BART
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the BART
+    [facebook/bart-large](https://huggingface.co/facebook/bart-large) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 51289):
+            Vocabulary size of the Florence2Language model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Florence2LanguageModel`].
+        d_model (`int`, *optional*, defaults to 1024):
+            Dimensionality of the layers and the pooler layer.
+        encoder_layers (`int`, *optional*, defaults to 12):
+            Number of encoder layers.
+        decoder_layers (`int`, *optional*, defaults to 12):
+            Number of decoder layers.
+        encoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        classifier_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for classifier.
+        max_position_embeddings (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
+            for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
+            for more details.
+        scale_embedding (`bool`, *optional*, defaults to `False`):
+            Scale embeddings by diving by sqrt(d_model).
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        num_labels (`int`, *optional*, defaults to 3):
+            The number of labels to use in [`Florence2LanguageForSequenceClassification`].
+        forced_eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
+            `eos_token_id`.
+
+    Example:
+
+    ```python
+    >>> from transformers import Florence2LanguageConfig, Florence2LanguageModel
+
+    >>> # Initializing a Florence2 Language style configuration
+    >>> configuration = Florence2LanguageConfig()
+
+    >>> # Initializing a model (with random weights)
+    >>> model = Florence2LangaugeModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "florence2_language"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
+
+    def __init__(
+        self,
+        vocab_size=51289,
+        max_position_embeddings=1024,
+        encoder_layers=12,
+        encoder_ffn_dim=4096,
+        encoder_attention_heads=16,
+        decoder_layers=12,
+        decoder_ffn_dim=4096,
+        decoder_attention_heads=16,
+        encoder_layerdrop=0.0,
+        decoder_layerdrop=0.0,
+        activation_function="gelu",
+        d_model=1024,
+        dropout=0.1,
+        attention_dropout=0.0,
+        activation_dropout=0.0,
+        init_std=0.02,
+        classifier_dropout=0.0,
+        scale_embedding=False,
+        use_cache=True,
+        num_labels=3,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        is_encoder_decoder=True,
+        decoder_start_token_id=2,
+        forced_eos_token_id=2,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.d_model = d_model
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.init_std = init_std
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+        self.classifier_dropout = classifier_dropout
+        self.use_cache = use_cache
+        self.num_hidden_layers = encoder_layers
+        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+
+        super().__init__(
+            num_labels=num_labels,
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            decoder_start_token_id=decoder_start_token_id,
+            forced_eos_token_id=forced_eos_token_id,
+            **kwargs,
+        )
+
+        # ensure backward compatibility for BART CNN models
+        if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
+            self.forced_bos_token_id = self.bos_token_id
+            warnings.warn(
+                f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
+                "The config can simply be saved and uploaded again to be fixed."
+            )
+
+class Florence2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate an
+    Florence-2 model according to the specified arguments, defining the model architecture. 
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vision_config (`Florence2VisionConfig`,  *optional*):
+            Custom vision config or dict
+        text_config (`Union[AutoConfig, dict]`, *optional*):
+            The config object of the text backbone. 
+        ignore_index (`int`, *optional*, defaults to -100):
+            The ignore index for the loss function.
+        vocab_size (`int`, *optional*, defaults to 51289):
+            Vocabulary size of the Florence2model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`~Florence2ForConditionalGeneration`]
+        projection_dim (`int`, *optional*, defaults to 1024):
+            Dimension of the multimodal projection space.
+
+    Example:
+
+    ```python
+    >>> from transformers import Florence2ForConditionalGeneration, Florence2Config, CLIPVisionConfig, BartConfig
+
+    >>> # Initializing a clip-like vision config
+    >>> vision_config = CLIPVisionConfig()
+
+    >>> # Initializing a Bart config
+    >>> text_config = BartConfig()
+
+    >>> # Initializing a Florence-2 configuration
+    >>> configuration = Florence2Config(vision_config, text_config)
+
+    >>> # Initializing a model from the florence-2 configuration
+    >>> model = Florence2ForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "florence2"
+    is_composition = False
+
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        ignore_index=-100,
+        vocab_size=51289,
+        projection_dim=1024,
+        **kwargs,
+    ):
+        self.ignore_index = ignore_index
+        self.vocab_size = vocab_size
+        self.projection_dim = projection_dim
+        if vision_config is not None:
+            vision_config = PretrainedConfig(**vision_config)
+        self.vision_config = vision_config
+        self.vocab_size = self.vocab_size
+
+        self.text_config = text_config
+        if text_config is not None:
+            self.text_config = Florence2LanguageConfig(**text_config)
+
+
+        super().__init__(**kwargs)
+
--- a/generation_config.json
+++ b/generation_config.json
@ -0,0 +1,4 @@
+{
+    "num_beams": 3,
+    "early_stopping": false
+}
--- a/modeling_florence2.py
+++ b/modeling_florence2.py
--- a/preprocessor_config.json
+++ b/preprocessor_config.json
@ -0,0 +1,39 @@
+{
+  "auto_map": {
+    "AutoProcessor": "processing_florence2.Florence2Processor"
+   },
+  "_valid_processor_keys": [
+    "images",
+    "do_resize",
+    "size",
+    "resample",
+    "do_rescale",
+    "rescale_factor",
+    "do_normalize",
+    "image_mean",
+    "image_std",
+    "return_tensors",
+    "data_format",
+    "input_data_format",
+    "do_convert_rgb"
+  ],
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "do_center_crop": false,
+  "image_processor_type": "CLIPImageProcessor",
+  "image_seq_length": 577,
+  "image_mean": [0.485, 0.456, 0.406],
+  "image_std":  [0.229, 0.224, 0.225],
+  "processor_class": "Florence2Processor",
+  "resample": 3,
+  "size": {
+    "height": 768,
+    "width":768 
+  },
+  "crop_size": {
+    "height": 768,
+    "width": 768
+  }
+}
--- a/processing_florence2.py
+++ b/processing_florence2.py
--- a/pytorch_model.bin
+++ b/pytorch_model.bin
--- a/sample_inference.ipynb
+++ b/sample_inference.ipynb
--- a/tokenizer.json
+++ b/tokenizer.json
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
@ -0,0 +1,4 @@
+{
+    "model_max_length": 1024
+}
+
--- a/vocab.json
+++ b/vocab.json
				`@ -0,0 +1 @@`
				`{"framework": "pytorch", "task": "image-caption", "allow_remote": true}`