# Qwen3.5在Transformers库部署推理及ReAct智能体

本是少年

559人浏览 · 2026-03-21 17:13:27

本是少年 · 2026-03-21 17:13:27 发布

0 目录

前言
Transformers库常用AutoModel介绍
- 常见的AutoModel类
使用方式
- 纯文本推理
- 文本图片推理
- ReAct工具执行推理
参考资料

1 前言

Qwen3.5系列发布好些天了，官方的ModelCard只有SGLang、vLLM、KTransformers等框架的推理示例，以及OpenAI库、Agentic智能体的使用方法，但是对于初学者来说，仍然有在Transformers库上加载大模型进行推理和微调训练的学习需求，本文就是为此所撰写的。

2 Transformers库常用AutoModel介绍

初学者常常会对AutoModel、AutoModelForCausalLM等类产生疑惑：它们之间有什么区别？可以这样理解：AutoModel只加载backbone（主干网络），而AutoModelForXXX是在backbone的基础上，针对具体任务增加了相应的任务头并做了进一步处理。语言建模通常分为因果型（Causal LM）和掩码型（Masked LM），类名中的后缀正是为了区分二者；而对于视觉语言模型，AutoModelForCausalLM并不适用，应该使用AutoModelForImageTextToText。

2.1 常见的AutoModel类

类名	描述	适用任务
AutoModel	加载预训练的基础模型，不包含特定任务的头部	特征提取、嵌入生成等
AutoModelForCausalLM	加载带有因果语言建模头部的模型，适用于生成任务	文本生成、文本补全、对话等
AutoModelForMaskedLM	加载带有掩码语言建模头部的模型，适用于填空任务。	命名实体识别、文本填空等
AutoModelForImageTextToText	加载接受图像输入的语言模型，适用于视觉问答，多模态处理任务	图像理解，图像分割，文本生成等

3 使用方式

库引用

import torch
import requests
from PIL import Image
from accelerate import Accelerator
from modelscope import snapshot_download
# Fetch the Qwen3.5-2B snapshot through ModelScope; the returned value is used
# below as the model path passed to from_pretrained().
model_dir = snapshot_download('Qwen/Qwen3.5-2B')
# Device reported by Accelerate; the model and the inputs are moved to it.
device = Accelerator().device

3.1 纯文本推理

from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the causal language model (text-only head), suited to generation tasks.
# `dtype` is the keyword used by the other loading snippets in this article;
# the older `torch_dtype` spelling is deprecated in recent Transformers releases.
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    dtype=torch.float16,  # half precision; torch.bfloat16 is an option on NVIDIA GPUs
    trust_remote_code=True,
).to(device)
# Note: passing device_map="auto" lets the library place the model
# automatically and supports multiple GPUs.

# Load the tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_dir,
    trust_remote_code=True,
)

# Build the conversation: a system prompt followed by the user's question.
messages = [
    {
        # The instruction prompt belongs to the "system" role; sent as an
        # "assistant" turn it would be presented to the model as one of its own
        # earlier replies instead of as an instruction.
        "role": "system",
        "content": [
            {"type": "text", "text": "you are a helpful assistant."},
        ],
    },
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "什么是LLM，什么是VLM，两者有何区别？"},
        ],
    },
]

# Render the conversation into one prompt string via the chat template
# (tokenize=False keeps it as text; the generation prompt is appended).
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Tokenize the prompt into PyTorch tensors and move them to the target device.
inputs = tokenizer(text, return_tensors="pt").to(device)

# Generate with gradient tracking disabled.
with torch.no_grad():
    generated_ids = model.generate(max_new_tokens=1024, **inputs)

# Skip the first `input_len` positions (the prompt length) so that only the
# newly generated part is decoded.
input_len = inputs.input_ids.shape[1]
new_token_ids = generated_ids[0][input_len:]
generated_text = tokenizer.decode(new_token_ids, skip_special_tokens=True)

print(f"模型回复: {generated_text}")

3.2 文本图片推理

3.2.1 导入库

from transformers import AutoModelForImageTextToText, AutoProcessor

3.2.2 加载模型和处理器

# Load the checkpoint as an image-text-to-text (vision-language) model in half
# precision, then move it to the target device.
model = AutoModelForImageTextToText.from_pretrained(model_dir, dtype=torch.float16)
model = model.to(device)

# Create the processor that applies the chat template and decodes the output.
processor = AutoProcessor.from_pretrained(model_dir, trust_remote_code=True)

3.2.3 构建对话消息和推理

# One user turn: an image (given by URL) followed by a text question.
image_item = {"type": "image", "image": "http://images.cocodataset.org/val2017/000000039769.jpg"}
question_item = {"type": "text", "text": "在这张图片里你能看到什么？"}
messages = [{"role": "user", "content": [image_item, question_item]}]

# Let the processor apply the chat template and tokenize in a single call,
# returning a dict of PyTorch tensors that is moved to the target device.
template_options = dict(
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
inputs = processor.apply_chat_template(messages, **template_options).to(device)

# Forward pass: generate with gradient tracking disabled.
with torch.no_grad():
    generated_ids = model.generate(max_new_tokens=200, **inputs)

# Decode only the positions after the prompt length.
input_len = inputs.input_ids.shape[1]
new_token_ids = generated_ids[:, input_len:]
generated_texts = processor.batch_decode(new_token_ids, skip_special_tokens=True)
print(generated_texts)

3.2.4 图像输入的格式

格式类型	示例写法	说明
URL	“http://images.cocodataset.org/val2017/000000039769.jpg”	网络图片链接
本地路径	“/path/to/your/image.jpg”	直接填写本地文件路径（本文示例中未使用file://前缀）
Base64	“/9j/4AAQSkZJRg…”	Base64编码字符串

本地路径

# The image entry holds a local file path instead of a URL.
local_image_item = {"type": "image", "image": "000000039769.jpg"}
question_item = {"type": "text", "text": "在这张图片里你能看到什么？"}
messages = [{"role": "user", "content": [local_image_item, question_item]}]

# Apply the chat template and tokenize in one call; the resulting tensors are
# moved to the target device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(device)

Base64

import base64
from PIL import Image
from io import BytesIO
def pil_to_base64(image, format: str="JPEG"):
    """Encode a PIL image as a base64 string usable as image input for the processor.

    The image is converted to RGB, saved into an in-memory buffer using
    ``format``, and the buffer's bytes are base64-encoded.

    Args:
        image: Object exposing ``convert`` and ``save`` (a PIL ``Image``).
        format: Image format name handed to ``save``.

    Returns:
        The base64 text of the encoded image bytes.
    """
    rgb_image = image.convert("RGB")

    # Serialise to memory only; nothing is written to disk.
    with BytesIO() as stream:
        rgb_image.save(stream, format=format)
        raw_bytes = stream.getvalue()

    return base64.b64encode(raw_bytes).decode("utf-8")
# Read the local image and encode it with pil_to_base64 so the message carries
# the picture inline as a base64 string.
image_format = "JPEG"
image = Image.open("000000039769.jpg")
image_b64_str = pil_to_base64(image, image_format)

base64_image_item = {"type": "image", "image": image_b64_str}
question_item = {"type": "text", "text": "在这张图片里你能看到什么？"}
messages = [{"role": "user", "content": [base64_image_item, question_item]}]

3.3 ReAct工具执行推理

ReAct（Reasoning + Acting）模式，让模型能够：

思考
行动
观察
循环

3.3.2 导入库

import json
import re

import requests
import torch
from accelerate import Accelerator
from modelscope import snapshot_download
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor
# Fetch the Qwen3.5-2B snapshot through ModelScope; the returned value is used
# below as the model path passed to from_pretrained().
model_dir = snapshot_download('Qwen/Qwen3.5-2B')
# Device reported by Accelerate; the model and the inputs are moved to it.
device = Accelerator().device

3.3.3 加载模型与处理器

# Load the vision-language model in half precision and move it to the device.
model = AutoModelForImageTextToText.from_pretrained(model_dir, dtype=torch.float16)
model = model.to(device)

# Processor used to apply the chat template and to decode the generated ids.
processor = AutoProcessor.from_pretrained(model_dir, trust_remote_code=True)

3.3.4 定义工具

# Tool schemas handed to the chat template through its `tools` argument.
# Each entry describes one callable function: its name, a description, and a
# JSON-Schema-style "parameters" object listing the accepted arguments.
TOOLS = [
    {
        # Current temperature: `location` is required, `unit` is optional.
        "type": "function",
        "function": {
            "name": "get_current_temperature",
            "description": "获取指定城市的当前温度",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "城市名称，格式：'城市, 省份, 国家'"
                    },
                    "unit": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "温度单位，默认celsius"
                    }
                },
                "required": ["location"]
            }
        }
    },
    {
        # Forecast for a given day: `location` and `date` are required.
        "type": "function", 
        "function": {
            "name": "get_weather_forecast",
            "description": "获取指定城市指定日期的天气预报",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string"},
                    "date": {"type": "string", "description": "格式：YYYY-MM-DD"},
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
                },
                "required": ["location", "date"]
            }
        }
    }
]

def execute_tool(tool_name: str, tool_args: dict):
    """Return a canned result for the named tool.

    This simulates tool execution; replace it with real API calls in actual
    use. The returned values are hard-coded and echo back the arguments given.

    Args:
        tool_name: Name of the tool to run.
        tool_args: Arguments for the tool. ``location`` is read for both known
            tools, ``date`` for the forecast, and ``unit`` (optional, falling
            back to ``"celsius"``) for the current temperature.

    Returns:
        A dict with the simulated result, or ``{"error": "Unknown tool"}`` when
        ``tool_name`` is not one of the known tools.

    Raises:
        KeyError: If a required argument is missing from ``tool_args``.
    """
    if tool_name == "get_current_temperature":
        location = tool_args["location"]
        unit = tool_args.get("unit", "celsius")
        return {"temperature": 26.5, "location": location, "unit": unit}

    if tool_name == "get_weather_forecast":
        location = tool_args["location"]
        date = tool_args["date"]
        return {"forecast": "晴转多云", "location": location, "date": date}

    return {"error": "Unknown tool"}

def parse_qwen_tool_calls(text: str):
    """Extract the XML-style tool calls from Qwen model output.

    A call has the form::

        <tool_call>
        <function=NAME>
        <parameter=KEY>
        VALUE
        </parameter>
        </function>
        </tool_call>

    Whitespace between the wrapper tags is optional, so a call without any
    ``<parameter>`` element (``<function=NAME>`` directly followed by
    ``</function>``) is recognised as well and yields empty arguments.

    Args:
        text: Decoded model output.

    Returns:
        A list of ``{"name": str, "arguments": dict[str, str]}`` entries in the
        order the calls appear; an empty list when no call is found. Argument
        values are returned as stripped strings.
    """
    # The whole <tool_call><function=NAME>...</function></tool_call> element.
    # \s* (rather than a fixed "\n") keeps parameterless calls matchable.
    call_pattern = r'<tool_call>\s*<function=(\w+)>\s*(.*?)\s*</function>\s*</tool_call>'
    # One <parameter=KEY>\nVALUE\n</parameter> element; VALUE may span lines.
    param_pattern = r'<parameter=(\w+)>\n(.*?)\n</parameter>'

    tool_calls = []
    for func_name, params_block in re.findall(call_pattern, text, re.DOTALL):
        params = re.findall(param_pattern, params_block, re.DOTALL)
        tool_calls.append({
            "name": func_name,
            "arguments": {key: value.strip() for key, value in params},
        })

    return tool_calls

3.3.5 多轮ReAct推理循环

# Multi-round ReAct inference loop
def react_inference(query: str, tools: list, max_rounds: int = 3) -> str:
    """Answer ``query`` with a generate / call-tools / observe loop.

    Each round renders the conversation (together with the tool schemas) through
    the processor's chat template, generates a reply and then either:

    * returns the reply as the final answer when it contains no ``<function=``
      marker, or
    * parses the tool calls, runs them with ``execute_tool`` and appends the
      assistant turn plus one ``tool`` message per result before the next round.

    If parsing or executing the tools fails, the assistant turn and a user
    message describing the error are appended so the model can try again.

    Args:
        query: The user's question.
        tools: Tool schemas passed to the chat template.
        max_rounds: Maximum number of generation rounds.

    Returns:
        The model's final reply, or ``"达到最大推理轮数"`` when every round ended
        in a tool call (or a failed one) without a final answer.
    """
    messages = [
        {"role": "user", "content": [{"type": "text", "text": query}]}
    ]

    for _ in range(max_rounds):
        # 1. Format the conversation and tool schemas into model inputs.
        inputs = processor.apply_chat_template(
            messages,
            tools=tools,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt"
        ).to(device)

        # 2. Generate the next assistant turn.
        with torch.no_grad():
            generate_ids = model.generate(
                **inputs,
                max_new_tokens=1024,
                temperature=0.7,
                top_p=0.8,
                top_k=20
            )

        # 3. Decode only the positions after the prompt length.
        input_len = len(inputs.input_ids[0])
        generated_texts = processor.batch_decode(generate_ids[:, input_len:], skip_special_tokens=True)
        print(generated_texts)
        generated_text = generated_texts[0]

        # 4. No tool-call marker (Qwen format): this is the final answer.
        if '<function=' not in generated_text:
            print(f"最终回复: {generated_text}")
            return generated_text

        assistant_message = {
            "role": "assistant",
            "content": [{"type": "text", "text": generated_text}]
        }

        try:
            # Parse every tool call (the model may request several at once).
            tool_calls = parse_qwen_tool_calls(generated_text)
            if not tool_calls:
                raise ValueError("未解析到有效的工具调用")

            # Run the tools and prepare one `tool` message per result. The
            # history is only extended after everything succeeded, so a failure
            # cannot leave it half-updated.
            tool_messages = []
            for tool_call in tool_calls:
                tool_name = tool_call["name"]
                tool_args = tool_call["arguments"]

                tool_result = execute_tool(tool_name, tool_args)
                print(f"调用工具 [{tool_name}], 参数:{tool_args}, 结果:{tool_result}")

                tool_messages.append({
                    "role": "tool",
                    "name": tool_name,
                    "content": [{
                        "type": "text",
                        "text": json.dumps(tool_result, ensure_ascii=False)
                    }]
                })
        except Exception as e:
            # Deliberately broad: any parsing/tool failure is reported back to
            # the model so it can retry instead of aborting the loop.
            print(f"工具调用解析失败：{e}")
            messages.append(assistant_message)
            messages.append({
                "role": "user",
                "content": [{"type": "text", "text": f"工具调用解析失败：{e}，请重新尝试"}]
            })
            continue

        # 5. Record the tool call and its results, then go to the next round.
        messages.append(assistant_message)
        messages.extend(tool_messages)

    # Reached only when the loop ran out of rounds without a final answer.
    return "达到最大推理轮数"

# Example run: the question asks for both the current temperature and
# tomorrow's weather, which correspond to the two tools declared in TOOLS.
query = "北京现在多少度？明天天气怎么样？"
result = react_inference(query, TOOLS)

结果如下：

['<tool_call>\n<function=get_current_temperature>\n<parameter=location>\n北京, 中国\n</parameter>\n</function>\n</tool_call>\n<tool_call>\n<function=get_weather_forecast>\n<parameter=location>\n北京, 中国\n</parameter>\n<parameter=date>\n2024-01-16\n</parameter>\n</function>\n</tool_call>\n']
调用工具 [get_current_temperature], 参数:{'location': '北京, 中国'}, 结果:{'temperature': 26.5, 'location': '北京, 中国', 'unit': 'celsius'}
调用工具 [get_weather_forecast], 参数:{'location': '北京, 中国', 'date': '2024-01-16'}, 结果:{'forecast': '晴转多云', 'location': '北京, 中国', 'date': '2024-01-16'}
['北京现在温度是 26.5 摄氏度。\n\n明天（1月16日）北京的天气是晴转多云。\n']
最终回复: 北京现在温度是 26.5 摄氏度。

明天（1月16日）北京的天气是晴转多云。

4 参考资料

AtomGit开源社区

AtomGit 是由开放原子开源基金会联合 CSDN 等生态伙伴共同推出的新一代开源与人工智能协作平台。平台坚持“开放、中立、公益”的理念，把代码托管、模型共享、数据集托管、智能体开发体验和算力服务整合在一起，为开发者提供从开发、训练到部署的一站式体验。

更多推荐

2025届最火的六大AI科研方案推荐榜单

AtomGit开源社区

2025届毕业生推荐的五大AI辅助写作网站实测分析

AtomGit开源社区

大模型落地：当AI真正走进工作的日常

大模型正从概念走向行业落地，主要通过四种路径重塑内容生产：1）垂直领域微调让AI掌握专业术语和行业思维；2）提示词工程成为新技能，精准提问能力决定AI产出质量；3）多模态应用打破内容形态壁垒，实现跨媒介内容衍生；4）企业级解决方案构建安全可控的AI工作流体系。这些转变不是替代人力，而是通过"驯化"技术，将AI转化为提升判断力、创造力和协作效率的生产力工具，重新定义人机协作的工作