Update README.md

This commit is contained in:
ai-modelscope
2024-10-24 13:04:22 +08:00
parent eb1df5e44e
commit 6ce0f40291
57 changed files with 1351 additions and 57 deletions

71
.gitattributes vendored
View File

@ -1,38 +1,81 @@
*.7z filter=lfs diff=lfs merge=lfs -text *.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text *.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text *.bin filter=lfs diff=lfs merge=lfs -text
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text *.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text *.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text *.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text *.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text *.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text *.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text *.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text *.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text *.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text *.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text *.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text *.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text *.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text *.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text *.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text *.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text *.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text *.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text *.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text
*.zstandard filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text
*.tfevents* filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text
*.db* filter=lfs diff=lfs merge=lfs -text images/overflow.png filter=lfs diff=lfs merge=lfs -text
*.ark* filter=lfs diff=lfs merge=lfs -text images/show_case/1.png filter=lfs diff=lfs merge=lfs -text
**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text images/show_case/10.png filter=lfs diff=lfs merge=lfs -text
**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text images/show_case/11.png filter=lfs diff=lfs merge=lfs -text
**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text images/show_case/12.png filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text images/show_case/14.png filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text images/show_case/15.png filter=lfs diff=lfs merge=lfs -text
*.gguf* filter=lfs diff=lfs merge=lfs -text images/show_case/16.png filter=lfs diff=lfs merge=lfs -text
*.ggml filter=lfs diff=lfs merge=lfs -text images/show_case/17.png filter=lfs diff=lfs merge=lfs -text
*.llamafile* filter=lfs diff=lfs merge=lfs -text images/show_case/18.png filter=lfs diff=lfs merge=lfs -text
*.pt2 filter=lfs diff=lfs merge=lfs -text images/show_case/19.png filter=lfs diff=lfs merge=lfs -text
images/show_case/2.png filter=lfs diff=lfs merge=lfs -text
images/show_case/21.png filter=lfs diff=lfs merge=lfs -text
images/show_case/22.png filter=lfs diff=lfs merge=lfs -text
images/show_case/23.png filter=lfs diff=lfs merge=lfs -text
images/show_case/24.png filter=lfs diff=lfs merge=lfs -text
images/show_case/26.png filter=lfs diff=lfs merge=lfs -text
images/show_case/27.png filter=lfs diff=lfs merge=lfs -text
images/show_case/28.png filter=lfs diff=lfs merge=lfs -text
images/show_case/29.png filter=lfs diff=lfs merge=lfs -text
images/show_case/3.png filter=lfs diff=lfs merge=lfs -text
images/show_case/30.png filter=lfs diff=lfs merge=lfs -text
images/show_case/31.png filter=lfs diff=lfs merge=lfs -text
images/show_case/32.png filter=lfs diff=lfs merge=lfs -text
images/show_case/34.png filter=lfs diff=lfs merge=lfs -text
images/show_case/35.png filter=lfs diff=lfs merge=lfs -text
images/show_case/36.png filter=lfs diff=lfs merge=lfs -text
images/show_case/38.png filter=lfs diff=lfs merge=lfs -text
images/show_case/39.png filter=lfs diff=lfs merge=lfs -text
images/show_case/4.png filter=lfs diff=lfs merge=lfs -text
images/show_case/40.png filter=lfs diff=lfs merge=lfs -text
images/show_case/41.png filter=lfs diff=lfs merge=lfs -text
images/show_case/42.png filter=lfs diff=lfs merge=lfs -text
images/show_case/43.png filter=lfs diff=lfs merge=lfs -text
images/show_case/44.png filter=lfs diff=lfs merge=lfs -text
images/show_case/45.png filter=lfs diff=lfs merge=lfs -text
images/show_case/46.png filter=lfs diff=lfs merge=lfs -text
images/show_case/47.png filter=lfs diff=lfs merge=lfs -text
images/show_case/48.png filter=lfs diff=lfs merge=lfs -text
images/show_case/49.png filter=lfs diff=lfs merge=lfs -text
images/show_case/50.png filter=lfs diff=lfs merge=lfs -text
images/show_case/6.png filter=lfs diff=lfs merge=lfs -text
images/show_case/7.png filter=lfs diff=lfs merge=lfs -text
images/show_case/8.png filter=lfs diff=lfs merge=lfs -text
images/show_case/9.png filter=lfs diff=lfs merge=lfs -text
diffusion_pytorch_model.safetensors filter=lfs diff=lfs merge=lfs -text

226
README.md
View File

@ -1,47 +1,187 @@
--- ---
license: Apache License 2.0 license: apache-2.0
#model-type:
##如 gpt、phi、llama、chatglm、baichuan 等
#- gpt
#domain:
##如 nlp、cv、audio、multi-modal
#- nlp
#language:
##语言代码列表 https://help.aliyun.com/document_detail/215387.html?spm=a2c4g.11186623.0.0.9f8d7467kni6Aa
#- cn
#metrics:
##如 CIDEr、Blue、ROUGE 等
#- CIDEr
#tags:
##各种自定义,包括 pretrained、fine-tuned、instruction-tuned、RL-tuned 等训练方法和其他
#- pretrained
#tools:
##如 vllm、fastchat、llamacpp、AdaSeq 等
#- vllm
--- ---
### 当前模型的贡献者未提供更加详细的模型介绍。模型文件和权重,可浏览“模型文件”页面获取。 <div style="display: flex; justify-content: center; align-items: center;">
#### 您可以通过如下git clone命令或者ModelScope SDK来下载模型 <img src="./images/images_alibaba.png" alt="alibaba" style="width: 20%; height: auto; margin-right: 5%;">
<img src="./images/images_alimama.png" alt="alimama" style="width: 20%; height: auto;">
</div>
SDK下载 [中文版Readme](./README_ZH.md)
```bash
#安装ModelScope
pip install modelscope
```
```python
#SDK模型下载
from modelscope import snapshot_download
model_dir = snapshot_download('alimama-creative/SDXL-EcomID')
```
Git下载
```
#Git模型下载
git clone https://www.modelscope.cn/alimama-creative/SDXL-EcomID.git
```
<p style="color: lightgrey;">如果您是本模型的贡献者,我们邀请您根据<a href="https://modelscope.cn/docs/ModelScope%E6%A8%A1%E5%9E%8B%E6%8E%A5%E5%85%A5%E6%B5%81%E7%A8%8B%E6%A6%82%E8%A7%88" style="color: lightgrey; text-decoration: underline;">模型贡献文档</a>,及时完善模型卡片内容。</p> EcomID aims to generate customized images from a single reference ID image, ensuring strong semantic consistency while being controlled by keypoints.
This repository provides the EcomID method and model, combining the strengths of [PuLID](https://github.com/ToTheBeginning/PuLID) and [InstantID](https://github.com/instantX-research/InstantID) for better background consistency, facial keypoint control, and realistic facial representation with improved similarity.
# EcomID Overview
## EcomID Structure
<img src="./images/overflow.png" alt="EcomID structure overview" style="width: 100%; height: auto; margin-right: 5%;">
- **IP-Adapter of PuLID**: EcomID incorporates the ID-Encoder and cross-attention components from PuLID, trained with alignment loss.
This method effectively reduces the interference of ID embeddings on text embeddings within the cross-attention part, minimizing disruption to the underlying model's text-to-image capabilities.
- **InstantID's IdentityNet Architecture**: Utilizing **a dataset of 2 million aesthetically pleasing portrait images**, IdentityNet enhances keypoint control, improving ID consistency and facial realism. During training, the IP-adapter is frozen, and only the IdentityNet is trained. Facial landmarks are used as conditional inputs, while face embeddings are integrated into IdentityNet via cross-attention.
# Show Cases
## Comparison with Other Methods
### 1、Preserved Text-to-Image Capability
<table>
<tr>
<th style="width: 28%;">Prompt</th>
<th style="width: 24%;">Reference Image</th>
<th style="width: 24%;">EcomID</th>
<th style="width: 24%;">InstantID</th>
</tr>
<tr>
<td style="font-size: 12px;">girl, white skin, black hair, long wavy hair, <span style="color:red"><strong>in European style living room, Retro tone, decorations</strong></span>, depth of field.</td>
<td><img src="images/show_case/50.png" alt="参考图像" width="100%"></td>
<td><img src="images/show_case/49.png" alt="EcomID图像" width="100%"></td>
<td><img src="images/show_case/48.png" alt="InstantID图像" width="100%"></td>
</tr>
</table>
As shown above, EcomID ***preserves background generation abilities while minimizing stylization, greatly enhancing realism***.
The visualizations highlight more authentic portraits with improved background semantic consistency, showcasing EcomID's advantage in generating realistic images.
### 2、Improved Facial Control and Consistency
<table>
<tr>
<th style="width: 24%;">Prompt</th>
<th style="width: 19%;">Reference Image</th>
<th style="width: 19%;">EcomID</th>
<th style="width: 19%;">InstantID</th>
<th style="width: 19%;">PuLID</th>
</tr>
<tr>
<td style="font-size: 12px;">A close-up portrait of a man standing in the library, holding <span style="color:red"><strong>two smiling toddlers</strong></span> next to him.</td>
<td><img src="images/show_case/20.png" alt="参考图像" width="100%"></td>
<td><img src="images/show_case/17.png" alt="EcomID图像" width="100%"></td>
<td><img src="images/show_case/18.png" alt="InstantID图像" width="100%"></td>
<td><img src="images/show_case/19.png" alt="PuLID图像" width="100%"></td>
</tr>
</table>
As shown above, EcomID employs keypoints as conditional inputs for training, ***allowing for precise adjustments of facial positions, sizes, and orientations***. This capability ensures that the generated portraits are more controllable while further enhancing facial similarity and the overall quality of the images.
### More showcases
EcomID enhances portrait representation, delivering a more authentic and aesthetically pleasing appearance while ensuring semantic consistency and greater internal ID similarity (i.e., traits that do not vary with age, hairstyle, glasses, or other physical changes).
<table>
<tr>
<th style="width: 24%;">Prompt</th>
<th style="width: 19%;">Reference Image</th>
<th style="width: 19%;">EcomID</th>
<th style="width: 19%;">InstantID</th>
<th style="width: 19%;">PuLID</th>
</tr>
<tr>
<td style="font-size: 12px;">A close-up portrait of a <span style="color:red"><strong>little girl with double braids</strong></span>, wearing a white dress, standing on the beach during sunset.</td>
<td><img src="images/show_case/21.png" alt="参考图像" width="100%"></td>
<td><img src="images/show_case/22.png" alt="EcomID图像" width="100%"></td>
<td><img src="images/show_case/23.png" alt="InstantID图像" width="100%"></td>
<td><img src="images/show_case/24.png" alt="PuLID图像" width="100%"></td>
</tr>
<tr>
<td style="font-size: 12px;">A close-up portrait of a <span style="color:red"><strong>very little girl</strong></span> with double braids, wearing <span style="color:red"><strong>a hat</strong></span> and white dress, standing on the beach during sunset.</td>
<td><img src="images/show_case/44.png" alt="参考图像" width="100%"></td>
<td><img src="images/show_case/47.png" alt="EcomID图像" width="100%"></td>
<td><img src="images/show_case/46.png" alt="InstantID图像" width="100%"></td>
<td><img src="images/show_case/45.png" alt="PuLID图像" width="100%"></td>
</tr>
<tr>
<td style="font-size: 12px;">A grizzled detective, <span style="color:red"><strong>fedora</strong></span> casting a shadow over his square jaw, a <span style="color:red"><strong>cigar dangling from his lips</strong></span>, his trench coat evocative of film noir, in a <span style="color:red"><strong>rainy alley</strong></span>.</td>
<td><img src="images/show_case/25.png" alt="参考图像" width="100%"></td>
<td><img src="images/show_case/26.png" alt="EcomID图像" width="100%"></td>
<td><img src="images/show_case/27.png" alt="InstantID图像" width="100%"></td>
<td><img src="images/show_case/28.png" alt="PuLID图像" width="100%"></td>
</tr>
<tr>
<td style="font-size: 12px;">A smiling girl with <span style="color:red"><strong>bangs and long hair</strong></span> in a school uniform stands under cherry trees, holding a book.</td>
<td><img src="images/show_case/29.png" alt="参考图像" width="100%"></td>
<td><img src="images/show_case/30.png" alt="EcomID图像" width="100%"></td>
<td><img src="images/show_case/31.png" alt="InstantID图像" width="100%"></td>
<td><img src="images/show_case/32.png" alt="PuLID图像" width="100%"></td>
</tr>
<tr>
<td style="font-size: 12px;">A <span style="color:red"><strong>very old</strong></span> witch, wearing a black cloak, with a pointed hat, holding a magic wand, against a background of a misty forest.</td>
<td><img src="images/show_case/33.png" alt="参考图像" width="100%"></td>
<td><img src="images/show_case/34.png" alt="EcomID图像" width="100%"></td>
<td><img src="images/show_case/35.png" alt="InstantID图像" width="100%"></td>
<td><img src="images/show_case/36.png" alt="PuLID图像" width="100%"></td>
</tr>
<tr>
<td style="font-size: 12px;">A man clad in cyberpunk fashion: <span style="color:red"><strong>neon accents, reflective sunglasses</strong></span> and a leather jacket with glowing circuit patterns. He stands stoically amidst a soaked cityscape.</td>
<td><img src="images/show_case/37.png" alt="参考图像" width="100%"></td>
<td><img src="images/show_case/38.png" alt="EcomID图像" width="100%"></td>
<td><img src="images/show_case/39.png" alt="InstantID图像" width="100%"></td>
<td><img src="images/show_case/40.png" alt="PuLID图像" width="100%"></td>
</tr>
</table>
### More Base Models, Resolutions, and Styles
<table>
<tr>
<th style="width: 12%;">SDXL models</th>
<th style="width: 24%;">Prompt</th>
<th style="width: 16%;">Reference Image</th>
<th style="width: 16%;">EcomID</th>
<th style="width: 16%;">InstantID</th>
<th style="width: 16%;">PuLID</th>
</tr>
<tr>
<td>sd-xl-base-1.0</td>
<td style="font-size: 12px;">girl, solo, brown hair, holding a little teddy bear on her hands, wearing a school uniform, standing in the library, <span style="color:red"><strong>cartoon style</strong></span>.</td>
<td><img src="images/show_case/1.png" alt="参考图像" width="100%"></td>
<td><img src="images/show_case/2.png" alt="EcomID图像" width="100%"></td>
<td><img src="images/show_case/3.png" alt="InstantID图像" width="100%"></td>
<td><img src="images/show_case/4.png" alt="PuLID图像" width="100%"></td>
</tr>
<tr>
<td>EcomXL</td>
<td style="font-size: 12px;">A close-up portrait of a <span style="color:red"><strong>very little girl</strong></span> with double braids, wearing <span style="color:red"><strong>a hat</strong></span> and white dress, standing on the beach during sunset.</td>
<td><img src="images/show_case/44.png" alt="参考图像" width="100%"></td>
<td><img src="images/show_case/47.png" alt="EcomID图像" width="100%"></td>
<td><img src="images/show_case/46.png" alt="InstantID图像" width="100%"></td>
<td><img src="images/show_case/45.png" alt="PuLID图像" width="100%"></td>
</tr>
<tr>
<td>DreamShaperXL</td>
<td style="font-size: 12px;">solo, looking_at_viewer, smile, brown_hair, upper_body, open_clothes, teeth, open_jacket, black_jacket, blurry_background, realistic</td>
<td><img src="images/show_case/44.png" alt="参考图像" width="100%"></td>
<td><img src="images/show_case/6.png" alt="EcomID图像" width="100%"></td>
<td><img src="images/show_case/7.png" alt="InstantID图像" width="100%"></td>
<td><img src="images/show_case/8.png" alt="PuLID图像" width="100%"></td>
</tr>
<tr>
<td>leosam_xl_v7</td>
<td style="font-size: 12px;">A close-up portrait of a girl, solo, dress, jewelry, beach and sea, pink_dress, realistic.</td>
<td><img src="images/show_case/9.png" alt="参考图像" width="100%"></td>
<td><img src="images/show_case/15.png" alt="EcomID图像" width="100%"></td>
<td><img src="images/show_case/14.png" alt="InstantID图像" width="100%"></td>
<td><img src="images/show_case/16.png" alt="PuLID图像" width="100%"></td>
</tr>
</table>
### Notes
- Unless otherwise specified, the showcases are generated using the base model EcomXL, which is also highly compatible with various other SDXL-based models, such as [leosams-helloworld-xl](https://civitai.com/models/43977/leosams-helloworld-xl), [dreamshaper-xl](https://civitai.com/models/112902/dreamshaper-xl), [stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) and so on.
- It works very well with SDXL Turbo/Lightning, [EcomXL Inpainting ControlNet](https://huggingface.co/alimama-creative/EcomXL_controlnet_inpaint) and [EcomXL Softedge ControlNet](https://huggingface.co/alimama-creative/EcomXL_controlnet_softedge).
# How to use
## ComfyUI
- The EcomID_ComfyUI node has been released: [click here](https://github.com/alimama-creative/SDXL_EcomID_ComfyUI)
# Training Details
The model is trained on 2M Taobao images, where the proportion of human faces is greater than 3%. The images have a resolution greater than 800, and the aesthetic score is above 5.5.
Mixed precision: fp16
Learning rate: 1e-4
Batch size: 2
Image size: 1024x1024

172
README_ZH.md Normal file
View File

@ -0,0 +1,172 @@
<div style="display: flex; justify-content: center; align-items: center;">
<img src="./images/images_alibaba.png" alt="alibaba" style="width: 20%; height: auto; margin-right: 5%;">
<img src="./images/images_alimama.png" alt="alimama" style="width: 20%; height: auto;">
</div>
EcomID 旨在从单个 ID 参考图像生成定制的保 ID 图像,优势在于很强的语义一致性,同时受人脸关键点控制。
此仓库提供了 EcomID 方法和模型,方法上结合了 [PuLID](https://github.com/ToTheBeginning/PuLID) 和 [InstantID](https://github.com/instantX-research/InstantID) 的优点,以获得更好的背景一致性、面部关键点控制、更真实的面部以及更高的相似度。
# EcomID 概述
## EcomID 结构
<img src="./images/overflow.png" alt="EcomID 结构示意图" style="width: 100%; height: auto; margin-right: 5%;">
- **PuLID 的 IP-Adapter**:EcomID 借鉴了 PuLID 的 ID-Encoder 和交叉注意力组件,其使用对齐损失训练而成。
故而该方法有效减少了 ID embedding 对交叉注意力部分的文本 embedding 的干扰,最小化对底层模型文本到图像能力的干扰。
- **InstantID 的 IdentityNet 架构**:利用 *200 万张美观的人像图像数据集*训练了 IdentityNet,增强了关键点控制,提高了 ID 一致性和面部真实感。在训练过程中,IP-adapter 被冻结,只有 IdentityNet 被训练。面部 Keypoint 用作条件输入,同时面部嵌入通过交叉注意力集成到 IdentityNet 中。
# 展示案例
## 与其他方法的比较
### 1、保留文本到图像能力
<table>
<tr>
<th style="width: 28%;">Prompt</th>
<th style="width: 24%;">参考图像</th>
<th style="width: 24%;">EcomID</th>
<th style="width: 24%;">InstantID</th>
</tr>
<tr>
<td>女孩,白皮肤,黑头发,长卷发,<span style="color:red"><strong>在欧洲风格的客厅,复古色调,装饰品</strong></span>,景深。</td>
<td><img src="images/show_case/50.png" alt="参考图像" width="100%"></td>
<td><img src="images/show_case/49.png" alt="EcomID图像" width="100%"></td>
<td><img src="images/show_case/48.png" alt="InstantID图像" width="100%"></td>
</tr>
</table>
如上所示,EcomID ***保留了背景生成能力,同时最大限度地减少了风格化,从而大大增强了真实感***。
如图可见,EcomID 的背景语义一致性得到了改善,且在生成真实图像方面格外有优势。
### 2、改善面部控制和相似度
<table>
<tr>
<th style="width: 24%;">Prompt</th>
<th style="width: 19%;">参考图像</th>
<th style="width: 19%;">EcomID</th>
<th style="width: 19%;">InstantID</th>
<th style="width: 19%;">PuLID</th>
</tr>
<tr>
<td>在图书馆前站着的男人的特写肖像,<span style="color:red"><strong>抱着两个微笑的幼儿</strong></span></td>
<td><img src="images/show_case/20.png" alt="参考图像" width="100%"></td>
<td><img src="images/show_case/17.png" alt="EcomID图像" width="100%"></td>
<td><img src="images/show_case/18.png" alt="InstantID图像" width="100%"></td>
<td><img src="images/show_case/19.png" alt="PuLID图像" width="100%"></td>
</tr>
</table>
如上所示,EcomID 使用关键点作为训练的条件输入,***允许精确调整面部位置、大小和方向***。这种能力确保生成的人像更加可控,同时进一步增强了面部相似性和图像的整体质量。
### 更多案例
EcomID 提高了人像表现,提供了更真实和美观的外观,同时确保语义一致性和更好的内部 ID 相似性(即,不随年龄、发型、眼镜或其他身体变化而变化的特征)。
<table>
<tr>
<th style="width: 24%;">Prompt</th>
<th style="width: 19%;">参考图像</th>
<th style="width: 19%;">EcomID</th>
<th style="width: 19%;">InstantID</th>
<th style="width: 19%;">PuLID</th>
</tr>
<tr>
<td>一个<span style="color:red"><strong>双辫小女孩</strong></span>的特写肖像,穿着白色裙子,傍晚在海滩上。</td>
<td><img src="images/show_case/21.png" alt="参考图像" width="100%"></td>
<td><img src="images/show_case/22.png" alt="EcomID图像" width="100%"></td>
<td><img src="images/show_case/23.png" alt="InstantID图像" width="100%"></td>
<td><img src="images/show_case/24.png" alt="PuLID图像" width="100%"></td>
</tr>
<tr>
<td>一个<span style="color:red"><strong>非常小的女孩</strong></span>,双辫,带着<span style="color:red"><strong>帽子</strong></span>和白色裙子,傍晚在海滩上。</td>
<td><img src="images/show_case/44.png" alt="参考图像" width="100%"></td>
<td><img src="images/show_case/47.png" alt="EcomID图像" width="100%"></td>
<td><img src="images/show_case/46.png" alt="InstantID图像" width="100%"></td>
<td><img src="images/show_case/45.png" alt="PuLID图像" width="100%"></td>
</tr>
<tr>
<td>一个满脸胡茬的侦探,<span style="color:red"><strong>戴着帽子</strong></span>,阴影投在他方形的下巴上,<span style="color:red"><strong>嘴里叼着一根香烟</strong></span>,他的风衣唤起了电影黑色风格,在一个<span style="color:red"><strong>阴雨小巷</strong></span>里。</td>
<td><img src="images/show_case/25.png" alt="参考图像" width="100%"></td>
<td><img src="images/show_case/26.png" alt="EcomID图像" width="100%"></td>
<td><img src="images/show_case/27.png" alt="InstantID图像" width="100%"></td>
<td><img src="images/show_case/28.png" alt="PuLID图像" width="100%"></td>
</tr>
<tr>
<td>一个微笑的女孩,<span style="color:red"><strong>齐刘海和长发</strong></span>,穿着校服,站在樱花树下,手里拿着一本书。</td>
<td><img src="images/show_case/29.png" alt="参考图像" width="100%"></td>
<td><img src="images/show_case/30.png" alt="EcomID图像" width="100%"></td>
<td><img src="images/show_case/31.png" alt="InstantID图像" width="100%"></td>
<td><img src="images/show_case/32.png" alt="PuLID图像" width="100%"></td>
</tr>
<tr>
<td>一个<span style="color:red"><strong>非常老的</strong></span>女巫,穿着黑色斗篷,戴着尖顶帽,手握魔杖,在雾气缭绕的森林背景下。</td>
<td><img src="images/show_case/33.png" alt="参考图像" width="100%"></td>
<td><img src="images/show_case/34.png" alt="EcomID图像" width="100%"></td>
<td><img src="images/show_case/35.png" alt="InstantID图像" width="100%"></td>
<td><img src="images/show_case/36.png" alt="PuLID图像" width="100%"></td>
</tr>
<tr>
<td>一个身穿赛博朋克风格的男人:<span style="color:red"><strong>霓虹配件,反光太阳镜,</strong></span>和带有发光电路图案的皮夹克。他在湿润的城市风貌中冷静地站着。</td>
<td><img src="images/show_case/37.png" alt="参考图像" width="100%"></td>
<td><img src="images/show_case/38.png" alt="EcomID图像" width="100%"></td>
<td><img src="images/show_case/39.png" alt="InstantID图像" width="100%"></td>
<td><img src="images/show_case/40.png" alt="PuLID图像" width="100%"></td>
</tr>
</table>
### 更多基础模型、分辨率和风格
<table>
<tr>
<th style="width: 12%;">SDXL 模型</th>
<th style="width: 24%;">Prompt</th>
<th style="width: 16%;">参考图像</th>
<th style="width: 16%;">EcomID</th>
<th style="width: 16%;">InstantID</th>
<th style="width: 16%;">PuLID</th>
</tr>
<tr>
<td>sd-xl-base-1.0</td>
<td>女孩,单独,棕色头发,手里抱着一个小泰迪熊,穿着校服,站在图书馆里,<span style="color:red"><strong>卡通风格</strong></span></td>
<td><img src="images/show_case/1.png" alt="参考图像" width="100%"></td>
<td><img src="images/show_case/2.png" alt="EcomID图像" width="100%"></td>
<td><img src="images/show_case/3.png" alt="InstantID图像" width="100%"></td>
<td><img src="images/show_case/4.png" alt="PuLID图像" width="100%"></td>
</tr>
<tr>
<td>EcomXL</td>
<td>一个<span style="color:red"><strong>非常小的女孩</strong></span>的特写肖像,双辫,带着<span style="color:red"><strong>帽子</strong></span>和白色裙子,傍晚在海滩上。</td>
<td><img src="images/show_case/44.png" alt="参考图像" width="100%"></td>
<td><img src="images/show_case/47.png" alt="EcomID图像" width="100%"></td>
<td><img src="images/show_case/46.png" alt="InstantID图像" width="100%"></td>
<td><img src="images/show_case/45.png" alt="PuLID图像" width="100%"></td>
</tr>
<tr>
<td>DreamShaperXL</td>
<td>单独,面向观众,微笑,棕色头发,上半身,开衫,牙齿,打开的外套,黑色夹克,模糊背景,真实感</td>
<td><img src="images/show_case/44.png" alt="参考图像" width="100%"></td>
<td><img src="images/show_case/6.png" alt="EcomID图像" width="100%"></td>
<td><img src="images/show_case/7.png" alt="InstantID图像" width="100%"></td>
<td><img src="images/show_case/8.png" alt="PuLID图像" width="100%"></td>
</tr>
<tr>
<td>leosam_xl_v7</td>
<td>一个特写肖像,女孩,单独,裙子,珠宝,海滩和大海,粉色裙子,真实感。</td>
<td><img src="images/show_case/9.png" alt="参考图像" width="100%"></td>
<td><img src="images/show_case/15.png" alt="EcomID图像" width="100%"></td>
<td><img src="images/show_case/14.png" alt="InstantID图像" width="100%"></td>
<td><img src="images/show_case/16.png" alt="PuLID图像" width="100%"></td>
</tr>
</table>
### 注意事项
- 除非特别说明,大部分展示案例使用基础模型 EcomXL 生成,同时 EcomID 与其他基于 SDXL 的模型也高度兼容,例如 [leosams-helloworld-xl](https://civitai.com/models/43977/leosams-helloworld-xl)、[dreamshaper-xl](https://civitai.com/models/112902/dreamshaper-xl)、[stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) 等。
- 它与 SDXL Turbo/Lightning、[EcomXL Inpainting ControlNet](https://huggingface.co/alimama-creative/EcomXL_controlnet_inpaint) 和 [EcomXL Softedge ControlNet](https://huggingface.co/alimama-creative/EcomXL_controlnet_softedge) 的兼容性非常好。
# 如何使用
## ComfyUI
- 已发布 EcomID_ComfyUI 节点:[点击这里](https://github.com/alimama-creative/SDXL_EcomID_ComfyUI)
# 训练细节
该模型在 200 万张淘宝图像上进行训练,其中人脸比例大于 3%。图像分辨率大于 800,且美学评分超过 5.5。
混合精度:fp16
学习率:1e-4
批量大小:2
图像大小:1024x1024

1
configuration.json Normal file
View File

@ -0,0 +1 @@
{"framework": "pytorch", "task": "others", "allow_remote": true}

BIN
diffusion_pytorch_model.safetensors (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/images_alibaba.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 13 KiB

BIN
images/images_alimama.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 30 KiB

BIN
images/overflow.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/1.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/10.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/11.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/12.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/14.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/15.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/16.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/17.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/18.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/19.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/2.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/20.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 455 KiB

BIN
images/show_case/21.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/22.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/23.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/24.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/25.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 853 KiB

BIN
images/show_case/26.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/27.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/28.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/29.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/3.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/30.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/31.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/32.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/33.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 726 KiB

BIN
images/show_case/34.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/35.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/36.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/37.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 355 KiB

BIN
images/show_case/38.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/39.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/4.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/40.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/41.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/42.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/43.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/44.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/45.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/46.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/47.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/48.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/49.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/50.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/6.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/7.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/8.png (Stored with Git LFS) Normal file

Binary file not shown.

BIN
images/show_case/9.png (Stored with Git LFS) Normal file

Binary file not shown.

View File

@ -0,0 +1,800 @@
{
"last_node_id": 15,
"last_link_id": 18,
"nodes": [
{
"id": 1,
"type": "InstantIDModelLoader",
"pos": [
1009.44140625,
227.6875
],
"size": {
"0": 315,
"1": 58
},
"flags": {},
"order": 0,
"mode": 0,
"outputs": [
{
"name": "INSTANTID",
"type": "INSTANTID",
"links": [
3
],
"shape": 3,
"label": "INSTANTID",
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "InstantIDModelLoader"
},
"widgets_values": [
"ip-adapter.bin"
]
},
{
"id": 9,
"type": "EcomID_PulidModelLoader",
"pos": [
999,
320
],
"size": {
"0": 315,
"1": 58
},
"flags": {},
"order": 1,
"mode": 0,
"outputs": [
{
"name": "PULID",
"type": "PULID",
"links": [
4
],
"shape": 3,
"label": "PULID",
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "EcomID_PulidModelLoader"
},
"widgets_values": [
"ip-adapter_pulid_sdxl_fp16.safetensors"
]
},
{
"id": 3,
"type": "PulidEvaClipLoader",
"pos": [
1101,
421
],
"size": {
"0": 210,
"1": 26
},
"flags": {},
"order": 2,
"mode": 0,
"outputs": [
{
"name": "EVA_CLIP",
"type": "EVA_CLIP",
"links": [
5
],
"shape": 3,
"label": "EVA_CLIP",
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "PulidEvaClipLoader"
}
},
{
"id": 10,
"type": "EcomIDFaceAnalysis",
"pos": [
999,
488
],
"size": {
"0": 315,
"1": 58
},
"flags": {},
"order": 3,
"mode": 0,
"outputs": [
{
"name": "FACEANALYSIS",
"type": "FACEANALYSIS",
"links": [
6
],
"shape": 3,
"label": "FACEANALYSIS",
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "EcomIDFaceAnalysis"
},
"widgets_values": [
"CPU"
]
},
{
"id": 2,
"type": "ControlNetLoader",
"pos": [
999,
589
],
"size": {
"0": 315,
"1": 58
},
"flags": {},
"order": 4,
"mode": 0,
"outputs": [
{
"name": "CONTROL_NET",
"type": "CONTROL_NET",
"links": [
7
],
"shape": 3,
"label": "CONTROL_NET",
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "ControlNetLoader"
},
"widgets_values": [
"yinu_instantid.safetensors"
]
},
{
"id": 5,
"type": "CLIPTextEncode",
"pos": [
975,
689
],
"size": {
"0": 380,
"1": 160
},
"flags": {},
"order": 8,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 1,
"label": "clip"
}
],
"outputs": [
{
"name": "CONDITIONING",
"type": "CONDITIONING",
"links": [
9
],
"slot_index": 0,
"label": "CONDITIONING"
}
],
"properties": {
"Node name for S&R": "CLIPTextEncode"
},
"widgets_values": [
"1girl, solo, dress, jewelry, beach, pink_dress, realistic"
]
},
{
"id": 6,
"type": "CLIPTextEncode",
"pos": [
975,
893
],
"size": {
"0": 380,
"1": 160
},
"flags": {},
"order": 9,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 2,
"label": "clip"
}
],
"outputs": [
{
"name": "CONDITIONING",
"type": "CONDITIONING",
"links": [
10
],
"slot_index": 0,
"label": "CONDITIONING"
}
],
"properties": {
"Node name for S&R": "CLIPTextEncode"
},
"widgets_values": [
"bad hand, (worst quality, low quality, nevus, normal quality:1.6), blur skin,nevus,signature, logo, watermark,username,text"
]
},
{
"id": 13,
"type": "VAEDecode",
"pos": [
2113,
437
],
"size": {
"0": 210,
"1": 46
},
"flags": {},
"order": 12,
"mode": 0,
"inputs": [
{
"name": "samples",
"type": "LATENT",
"link": 14,
"label": "samples"
},
{
"name": "vae",
"type": "VAE",
"link": 15,
"label": "vae"
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
16
],
"shape": 3,
"label": "IMAGE",
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "VAEDecode"
}
},
{
"id": 8,
"type": "ApplyEcomIDAdvanced",
"pos": [
1411,
437
],
"size": {
"0": 315,
"1": 402
},
"flags": {},
"order": 10,
"mode": 0,
"inputs": [
{
"name": "instantid_ipa",
"type": "INSTANTID",
"link": 3,
"label": "instantid_ipa"
},
{
"name": "pulid",
"type": "PULID",
"link": 4,
"label": "pulid"
},
{
"name": "eva_clip",
"type": "EVA_CLIP",
"link": 5,
"label": "eva_clip"
},
{
"name": "insightface",
"type": "FACEANALYSIS",
"link": 6,
"label": "insightface"
},
{
"name": "control_net",
"type": "CONTROL_NET",
"link": 7,
"label": "control_net"
},
{
"name": "image",
"type": "IMAGE",
"link": 8,
"label": "image"
},
{
"name": "model",
"type": "MODEL",
"link": 17,
"label": "model"
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 9,
"label": "positive"
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 10,
"label": "negative"
},
{
"name": "image_kps",
"type": "IMAGE",
"link": null,
"label": "image_kps"
},
{
"name": "mask",
"type": "MASK",
"link": null,
"label": "mask"
}
],
"outputs": [
{
"name": "MODEL",
"type": "MODEL",
"links": [
11
],
"shape": 3,
"label": "MODEL",
"slot_index": 0
},
{
"name": "positive",
"type": "CONDITIONING",
"links": [
12
],
"shape": 3,
"label": "positive",
"slot_index": 1
},
{
"name": "negative",
"type": "CONDITIONING",
"links": [
13
],
"shape": 3,
"label": "negative",
"slot_index": 2
}
],
"properties": {
"Node name for S&R": "ApplyEcomIDAdvanced"
},
"widgets_values": [
"fidelity",
0,
1,
0.3,
0.8,
0,
"average"
]
},
{
"id": 15,
"type": "EmptyLatentImage",
"pos": [
1407,
902
],
"size": {
"0": 315,
"1": 106
},
"flags": {},
"order": 5,
"mode": 0,
"outputs": [
{
"name": "LATENT",
"type": "LATENT",
"links": [
18
],
"shape": 3,
"label": "LATENT",
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "EmptyLatentImage"
},
"widgets_values": [
1024,
1024,
1
]
},
{
"id": 11,
"type": "LoadImage",
"pos": [
573,
329
],
"size": [
315,
314
],
"flags": {},
"order": 6,
"mode": 0,
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
8
],
"shape": 3,
"label": "IMAGE",
"slot_index": 0
},
{
"name": "MASK",
"type": "MASK",
"links": null,
"shape": 3,
"label": "MASK"
}
],
"properties": {
"Node name for S&R": "LoadImage"
},
"widgets_values": [
"4 (2).png",
"image"
]
},
{
"id": 4,
"type": "CheckpointLoaderSimple",
"pos": [
564,
739
],
"size": {
"0": 315,
"1": 98
},
"flags": {},
"order": 7,
"mode": 0,
"outputs": [
{
"name": "MODEL",
"type": "MODEL",
"links": [
17
],
"slot_index": 0,
"label": "MODEL"
},
{
"name": "CLIP",
"type": "CLIP",
"links": [
1,
2
],
"slot_index": 1,
"label": "CLIP"
},
{
"name": "VAE",
"type": "VAE",
"links": [
15
],
"slot_index": 2,
"label": "VAE"
}
],
"properties": {
"Node name for S&R": "CheckpointLoaderSimple"
},
"widgets_values": [
"ecomxl.safetensors"
]
},
{
"id": 12,
"type": "KSampler",
"pos": [
1763,
436
],
"size": {
"0": 315,
"1": 262
},
"flags": {},
"order": 11,
"mode": 0,
"inputs": [
{
"name": "model",
"type": "MODEL",
"link": 11,
"label": "model"
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 12,
"label": "positive"
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 13,
"label": "negative"
},
{
"name": "latent_image",
"type": "LATENT",
"link": 18,
"label": "latent_image"
}
],
"outputs": [
{
"name": "LATENT",
"type": "LATENT",
"links": [
14
],
"shape": 3,
"label": "LATENT",
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "KSampler"
},
"widgets_values": [
730401293336759,
"randomize",
25,
3.5,
"dpm_2",
"karras",
1
]
},
{
"id": 14,
"type": "PreviewImage",
"pos": [
2110,
532
],
"size": [
428.19834675017046,
539.6653026265071
],
"flags": {},
"order": 13,
"mode": 0,
"inputs": [
{
"name": "images",
"type": "IMAGE",
"link": 16,
"label": "images"
}
],
"properties": {
"Node name for S&R": "PreviewImage"
},
"widgets_values": [
"0",
"0",
"0"
]
}
],
"links": [
[
1,
4,
1,
5,
0,
"CLIP"
],
[
2,
4,
1,
6,
0,
"CLIP"
],
[
3,
1,
0,
8,
0,
"INSTANTID"
],
[
4,
9,
0,
8,
1,
"PULID"
],
[
5,
3,
0,
8,
2,
"EVA_CLIP"
],
[
6,
10,
0,
8,
3,
"FACEANALYSIS"
],
[
7,
2,
0,
8,
4,
"CONTROL_NET"
],
[
8,
11,
0,
8,
5,
"IMAGE"
],
[
9,
5,
0,
8,
7,
"CONDITIONING"
],
[
10,
6,
0,
8,
8,
"CONDITIONING"
],
[
11,
8,
0,
12,
0,
"MODEL"
],
[
12,
8,
1,
12,
1,
"CONDITIONING"
],
[
13,
8,
2,
12,
2,
"CONDITIONING"
],
[
14,
12,
0,
13,
0,
"LATENT"
],
[
15,
4,
2,
13,
1,
"VAE"
],
[
16,
13,
0,
14,
0,
"IMAGE"
],
[
17,
4,
0,
8,
6,
"MODEL"
],
[
18,
15,
0,
12,
3,
"LATENT"
]
],
"groups": [],
"config": {},
"extra": {
"ds": {
"scale": 0.6830134553650709,
"offset": [
-260.1114479220462,
-82.98489012650744
]
}
},
"version": 0.4
}