一、模型说明

  • 模型名称:Qwen/Qwen2.5-VL-32B-Instruct
  • 模型类型:多模态大模型(支持图片 + 文字对话)
  • 核心依赖类:Qwen2_5_VLForConditionalGeneration
  • 模型大小:bfloat16 精度约 70GB

二、环境安装

pip install torch \
transformers \
modelscope \
pillow \
accelerate \
fastapi \
uvicorn \
python-multipart \
websockets \
requests

三、模型下载(本地离线部署)

方式 1:modelscope 下载

modelscope download --model Qwen/Qwen2.5-VL-32B-Instruct --local_dir /home/models/Qwen2.5-VL-32B-Instruct

方式 2:Git LFS 下载

Bash
git lfs install
git clone https://www.modelscope.cn/Qwen/Qwen2.5-VL-32B-Instruct.git /home/models/Qwen2.5-VL-32B-Instruct

四、本地运行脚本

文件名:qwen_vl_infer.py

"""Single-shot local inference with Qwen2.5-VL: describe one image from disk."""
import os

# ===================== Offline mode (required) =====================
# These must be set BEFORE transformers / huggingface_hub are imported:
# huggingface_hub reads HF_HUB_OFFLINE and HF_HOME once, at import time, so
# assigning them after the import has no effect.
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HOME"] = "/home/models"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch  # noqa: E402  (deliberately imported after the env setup)
from PIL import Image  # noqa: E402
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration  # noqa: E402

# ===================== Model path =====================
MODEL_DIR = "/home/models/Qwen2.5-VL-32B-Instruct"

# ===================== Load processor and model =====================
print("加载处理器...")
processor = AutoProcessor.from_pretrained(
    MODEL_DIR,
    trust_remote_code=True,
    local_files_only=True,  # never fall back to a network download
)

print("加载模型...")
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_DIR,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    local_files_only=True,
)

# ===================== Image input =====================
image_path = "/home/test.jpg"
# convert() returns an independent, fully loaded copy, so the underlying file
# handle can be closed as soon as the with-block exits.
with Image.open(image_path) as source_image:
    image = source_image.convert("RGB")

# ===================== Chat prompt =====================
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "详细描述这张图片里的内容"},
    ]}
]

# ===================== Inference =====================
text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

inputs = processor(
    text=[text],
    images=[image],
    return_tensors="pt",
).to(model.device)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=2048,
        # temperature / top_p are only honoured when sampling is enabled, so
        # request it explicitly instead of relying on the checkpoint default.
        do_sample=True,
        temperature=0.7,
        top_p=0.8,
        use_cache=True,
    )

# generate() returns prompt + completion; drop the prompt tokens before decoding.
prompt_length = inputs.input_ids.shape[1]
response = processor.decode(
    outputs[0][prompt_length:],
    skip_special_tokens=True,
)

# ===================== Output =====================
print("\n===== 模型回答 =====")
print(response)

运行:

Bash
python qwen_vl_infer.py

五、FastAPI 生产级部署

文件名:qwen_vl_api.py

import logging
import os
import sys
import threading
import time
from contextlib import asynccontextmanager
from io import BytesIO

# ===================== Offline deployment (required) =====================
# These must be set BEFORE transformers / huggingface_hub are imported:
# huggingface_hub reads HF_HUB_OFFLINE and HF_HOME once, at import time, so
# assigning them after the import has no effect.
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HOME"] = "/home/models"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch  # noqa: E402  (deliberately imported after the env setup)
from fastapi import FastAPI, File, Form, HTTPException, UploadFile  # noqa: E402
from PIL import Image  # noqa: E402
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration  # noqa: E402

# ===================== Logging =====================
logger = logging.getLogger("qwen_vl_api")
logger.setLevel(logging.INFO)
# Without a handler of its own, INFO records fall through to logging's
# last-resort handler, which only emits WARNING and above, so the startup and
# heartbeat messages would never be printed. Attach a stdout handler once.
if not logger.handlers:
    _log_handler = logging.StreamHandler(sys.stdout)
    _log_handler.setFormatter(
        logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s")
    )
    logger.addHandler(_log_handler)
    # Avoid duplicate lines if the root logger is configured elsewhere.
    logger.propagate = False

# ===================== Model path and lazily loaded singletons =====================
MODEL_DIR = "/home/models/Qwen2.5-VL-32B-Instruct"
processor = None  # assigned by load_model()
model = None  # assigned by load_model()

# ===================== 心跳日志(防止加载时误以为卡死) =====================
def heartbeat(phase, interval=15):
    """Start a daemon thread that periodically logs that *phase* is still running.

    Args:
        phase: Label interpolated into every heartbeat log line.
        interval: Seconds between heartbeat messages. Must be positive.

    Returns:
        A ``threading.Event``; call ``.set()`` on it to stop the heartbeat.

    Raises:
        ValueError: If *interval* is not positive (a zero wait would busy-loop).
    """
    if interval <= 0:
        raise ValueError("interval must be positive")

    stop = threading.Event()

    def run():
        n = 0
        # Event.wait() returns False on timeout (keep beating) and True once
        # the event is set (exit), so no separate sleep/poll is needed.
        while not stop.wait(interval):
            n += 1
            logger.info(f"【心跳】{phase} 运行中 {n}次")

    # Daemon thread: it must never keep the process alive on shutdown.
    t = threading.Thread(target=run, daemon=True)
    t.start()
    return stop

# ===================== 加载模型 =====================
def load_model():
    """Load the processor and the model from ``MODEL_DIR`` into module globals.

    A heartbeat logger runs during each of the two loading phases. It is
    stopped in a ``finally`` block so that a failed load does not leave a
    thread logging "still running" indefinitely.

    Raises:
        Whatever ``from_pretrained`` raises (missing files, out of memory, ...).
    """
    global processor, model

    logger.info("开始加载 Processor")
    stop = heartbeat("加载Processor")
    try:
        processor = AutoProcessor.from_pretrained(
            MODEL_DIR,
            trust_remote_code=True,
            local_files_only=True,  # never fall back to a network download
        )
    finally:
        stop.set()

    logger.info("开始加载模型")
    stop = heartbeat("加载模型")
    try:
        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            MODEL_DIR,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
            local_files_only=True,
        )
    finally:
        stop.set()
    logger.info("模型加载完成")

# ===================== 服务生命周期 =====================
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: load the model before serving requests.

    Everything before ``yield`` runs once at startup; nothing is executed on
    shutdown.
    """
    # Blocking call on purpose: the server should not accept traffic until the
    # model is ready.
    load_model()
    logger.info("服务启动完成")
    yield


app = FastAPI(
    title="Qwen2.5-VL 多模态API",
    lifespan=lifespan,
)

# ===================== 工具函数 =====================
def load_image(raw: bytes):
    """Decode uploaded bytes into an RGB ``PIL.Image``.

    Args:
        raw: The raw bytes of the uploaded file.

    Returns:
        The decoded image converted to RGB mode.

    Raises:
        HTTPException: 400 if the bytes cannot be decoded as an image.
    """
    try:
        return Image.open(BytesIO(raw)).convert("RGB")
    except Exception as exc:
        # ``except Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate; chain the cause so
        # the real decoding error stays visible in server logs.
        raise HTTPException(status_code=400, detail="图片格式错误") from exc

# ===================== 健康检查 =====================
@app.get("/health")
def health():
    """Liveness probe: report that the service is up and whether the model is loaded."""
    model_ready = model is not None
    return {
        "status": "running",
        "model_ready": model_ready,
    }

# ===================== 推理接口 =====================
# Serialises access to the processor and the model. Inference now runs in
# worker threads, and a single model instance should only execute one
# generation at a time.
_INFERENCE_LOCK = threading.Lock()


def _generate_answer(image, prompt: str) -> str:
    """Run one blocking image + text generation and return the decoded answer."""
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": prompt},
        ]}
    ]

    with _INFERENCE_LOCK:
        prompt_text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        inputs = processor(
            text=[prompt_text],
            images=[image],
            return_tensors="pt",
        ).to(model.device)

        # torch.no_grad() is thread-local, so it has to be entered here, in
        # the thread that actually runs generate().
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=2048,
                # temperature / top_p are only honoured when sampling is
                # enabled; same settings as the standalone inference script.
                do_sample=True,
                temperature=0.7,
                top_p=0.8,
                use_cache=True,
            )

        # generate() returns prompt + completion; drop the prompt tokens.
        prompt_length = inputs.input_ids.shape[1]
        return processor.decode(
            outputs[0][prompt_length:],
            skip_special_tokens=True,
        )


@app.post("/predict")
async def predict(
    file: UploadFile = File(...),
    prompt: str = Form(default="请描述图片内容")
):
    """Answer *prompt* about the uploaded image.

    Args:
        file: The uploaded image (multipart form field ``file``).
        prompt: The user question (form field ``prompt``).

    Returns:
        ``{"code": 0, "message": "success", "answer": <generated text>}``.

    Raises:
        HTTPException: 500 if the model is not loaded; 400 (from
            ``load_image``) if the upload is not a decodable image.
    """
    # Local import keeps this block self-contained; fastapi is already a
    # dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    if model is None or processor is None:
        raise HTTPException(status_code=500, detail="模型未加载")

    raw = await file.read()
    image = load_image(raw)

    # generate() can run for a long time. Calling it directly inside this
    # coroutine would block the event loop and stall every other request
    # (including /health), so it is offloaded to a worker thread.
    answer = await run_in_threadpool(_generate_answer, image, prompt)

    return {"code": 0, "message": "success", "answer": answer}

启动服务

uvicorn qwen_vl_api:app --host 0.0.0.0 --port 8002

六、API 测试脚本

文件名:test_qwen_vl_infer.py

Python
"""Smoke test: POST one image and a prompt to the /predict endpoint."""
import requests

url = "http://127.0.0.1:8002/predict"
image_path = "/home/test.jpg"
prompt = "请详细描述这张图片"

with open(image_path, "rb") as f:
    files = {"file": f}
    data = {"prompt": prompt}
    # Generous timeout: a long generation can take minutes.
    res = requests.post(url, files=files, data=data, timeout=300)

# Check the status before parsing: an unhandled server error returns a
# non-JSON body, and res.json() would then fail with a confusing decode error.
if not res.ok:
    raise SystemExit(f"请求失败: HTTP {res.status_code} {res.text}")

print(res.json())

七、生成参数说明(可调优)

参数

说明

max_new_tokens

最大生成长度(默认 2048)

temperature

生成随机性(0.1~1.0)

top_p

核采样

use_cache

加速推理

device_map="auto"

自动分配GPU

bfloat16

精度,省显存

Logo

AtomGit 是由开放原子开源基金会联合 CSDN 等生态伙伴共同推出的新一代开源与人工智能协作平台。平台坚持“开放、中立、公益”的理念,把代码托管、模型共享、数据集托管、智能体开发体验和算力服务整合在一起,为开发者提供从开发、训练到部署的一站式体验。

更多推荐