第42节:Hugging Face Transformers 工具从入门到精通【第七部分:案例实战篇】

【Hugging Face Transformers 工具从入门到精通】全文导读
第一部分:基础入门篇
第二部分:核心API深度实践篇
第三部分:数据处理与训练篇
第四部分:参数高效微调(PEFT)与高级训练篇
第五部分:推理优化与部署篇&第六部分:多模态与进阶应用篇
第七部分:案例实战篇
第八部分:常见问题与调试指南
第七部分:案例实战篇
19. 自然语言处理实战案例
19.1 情感分析:使用 BERT 微调电影评论分类
情感分析是自然语言处理中最经典的任务之一。本例将使用 BERT 模型对 IMDb 电影评论进行二分类(正面/负面)。我们将展示从数据加载、预处理、模型微调到评估的完整流程。
# 文件名:sentiment_analysis_bert.py
# 使用 BERT 微调 IMDb 电影评论情感分类
import torch
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
TrainingArguments,
Trainer,
DataCollatorWithPadding
)
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
# 1. Load the dataset (IMDb movie reviews)
print("加载 IMDb 数据集...")
dataset = load_dataset("imdb")
# Use a small subset for a quick demo (drop shuffle/select to train on everything).
# The IMDb splits are stored grouped by label, so the rows are shuffled before
# slicing; taking the first N rows directly would yield a single-class subset.
train_dataset = dataset["train"].shuffle(seed=42).select(range(2000))
test_dataset = dataset["test"].shuffle(seed=42).select(range(500))
print(f"训练集大小: {len(train_dataset)}")
print(f"测试集大小: {len(test_dataset)}")
# 2. Load the tokenizer and a model with a two-class classification head
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# 3. 数据预处理函数
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences longer than 256 tokens are truncated. No padding is added
    here (``padding=False``), so the returned sequences keep their own
    lengths.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding=False, max_length=256)
    return encoded
print("开始分词...")
# Tokenize each split, drop the raw text column (only model inputs are kept)
# and rename "label" to "labels".
tokenized_train = (
    train_dataset.map(preprocess_function, batched=True)
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
tokenized_test = (
    test_dataset.map(preprocess_function, batched=True)
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
# Return torch tensors when rows are accessed
tokenized_train.set_format("torch")
tokenized_test.set_format("torch")
# Dynamic data collator (pads the sequences inside each batch)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# 4. 定义评估指标
def compute_metrics(eval_pred):
    """Return accuracy and binary F1 for a ``(logits, labels)`` pair.

    The predicted class of each row is the arg-max over the last axis of
    the logits.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predicted_classes),
        "f1": f1_score(labels, predicted_classes, average="binary"),
    }
# 5. Configure the training arguments
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` and `Trainer(tokenizer=...)` to `processing_class` — confirm
# against the installed version.
training_args = TrainingArguments(
    output_dir="./bert_sentiment_model",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    learning_rate=2e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False, # whether to push to the Hugging Face Hub
    report_to="none", # disable wandb/tensorboard logging (optional)
)
# 6. Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
# 7. Train
print("开始训练...")
trainer.train()
# 8. Evaluate
print("评估模型...")
eval_results = trainer.evaluate()
print(f"评估结果: {eval_results}")
# 9. Save the model and tokenizer to the same directory
model.save_pretrained("./bert_sentiment_final")
tokenizer.save_pretrained("./bert_sentiment_final")
print("模型已保存至 ./bert_sentiment_final")
# 10. 推理示例
def predict_sentiment(text):
    """Classify ``text`` and return "正面" (class 1) or "负面" (any other class).

    The input is truncated to 256 tokens, matching the training setup.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
    # After training the model may live on a GPU; the inputs must be moved to
    # the same device or the forward pass fails with a device mismatch.
    inputs = inputs.to(model.device)
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1).item()
    return "正面" if pred == 1 else "负面"


test_text = "This movie was absolutely fantastic! The acting was superb."
print(f"\n测试文本: {test_text}")
print(f"预测情感: {predict_sentiment(test_text)}")
test_text2 = "A complete waste of time. Boring and predictable."
print(f"测试文本: {test_text2}")
print(f"预测情感: {predict_sentiment(test_text2)}")
19.2 命名实体识别(NER):使用 BERT 进行序列标注
命名实体识别(NER)任务是识别文本中的人名、地名、组织名等实体。我们使用 BERT 配合 CoNLL-2003 数据集进行微调,采用 BIO 标注方案。
# 文件名:ner_bert.py
# 使用 BERT 进行命名实体识别
import torch
from transformers import (
AutoTokenizer,
AutoModelForTokenClassification,
TrainingArguments,
Trainer,
DataCollatorForTokenClassification
)
from datasets import load_dataset
import numpy as np
from seqeval.metrics import classification_report, accuracy_score as seq_accuracy
# 1. Load the dataset (CoNLL-2003 English NER)
dataset = load_dataset("conll2003")
print(f"训练集大小: {len(dataset['train'])}")
print(f"验证集大小: {len(dataset['validation'])}")
print(f"测试集大小: {len(dataset['test'])}")
# Build the label mappings from the dataset's own tag names
label_list = dataset["train"].features["ner_tags"].feature.names
print(f"标签列表: {label_list}")
id2label = dict(enumerate(label_list))
label2id = {name: index for index, name in id2label.items()}
# 2. Load the tokenizer and model
model_name = "bert-base-cased" # NER usually uses the cased variant to keep capitalisation
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)
# 3. 数据预处理(需要对齐标签到 token 级别)
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align the word-level tags to sub-word tokens.

    For every token the label is the tag of its source word when the token is
    the first piece of that word, and -100 otherwise (special tokens and
    continuation pieces), so those positions are excluded from the loss.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,  # the input is already a list of words
        max_length=128,
        padding=False,
    )
    aligned_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        # Source-word index of every token (None for special tokens)
        row_word_ids = encoded.word_ids(batch_index=row)
        # Word index of the token that precedes each position
        preceding_ids = [None] + row_word_ids[:-1]
        aligned_labels.append(
            [
                word_tags[word_id] if word_id is not None and word_id != prev_id else -100
                for word_id, prev_id in zip(row_word_ids, preceding_ids)
            ]
        )
    encoded["labels"] = aligned_labels
    return encoded
# Apply the preprocessing to every split (batched=True processes examples in batches)
print("开始分词和对齐标签...")
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
# 4. 定义评估指标(使用 seqeval 库)
def compute_metrics(p):
    """Compute macro-averaged precision/recall/F1 and accuracy with seqeval.

    ``p`` unpacks to ``(logits, label_ids)``. Positions whose gold label is
    -100 are dropped before the id sequences are mapped to tag names.
    """
    logits, gold_ids = p
    predicted_ids = np.argmax(logits, axis=2)
    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])
    report = classification_report(true_labels, true_predictions, output_dict=True)
    macro = report["macro avg"]
    return {
        "precision": macro["precision"],
        "recall": macro["recall"],
        "f1": macro["f1-score"],
        "accuracy": seq_accuracy(true_labels, true_predictions),
    }
# 5. Training arguments
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./bert_ner_model",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    learning_rate=2e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none",
)
# Data collator for token classification (pads each batch dynamically)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
# 6. Create the Trainer (evaluation runs on the validation split)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
# 7. Train
print("开始 NER 微调...")
trainer.train()
# 8. Evaluate
print("评估模型...")
eval_results = trainer.evaluate()
print(f"验证集结果: {eval_results}")
# 9. Save the model and tokenizer to the same directory
model.save_pretrained("./bert_ner_final")
tokenizer.save_pretrained("./bert_ner_final")
print("NER 模型已保存")
# 10. 推理示例
def predict_ner(text):
    """Run named-entity recognition on ``text``.

    Returns a list of ``{"type": ..., "text": ...}`` dicts in order of
    appearance. Predictions are read at word level: each word takes the label
    predicted for its first sub-token (the only position trained with a real
    label), and the entity text is sliced from the original string so that
    spacing between words is preserved.
    """
    encoding = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        return_offsets_mapping=True,
    )
    # Character spans are only needed for decoding; the model does not accept them.
    offsets = encoding.pop("offset_mapping")[0].tolist()
    word_ids = encoding.word_ids(batch_index=0)
    # The model may be on a GPU after training; keep the inputs on the same device.
    encoding = encoding.to(model.device)
    model.eval()
    with torch.no_grad():
        logits = model(**encoding).logits
    token_preds = torch.argmax(logits, dim=2)[0].tolist()

    # Collapse sub-word tokens into words; special tokens have no word id.
    words = []
    for position, word_id in enumerate(word_ids):
        if word_id is None:
            continue
        start, end = offsets[position]
        if words and words[-1]["word_id"] == word_id:
            # Continuation piece: only extend the word's character span.
            words[-1]["end"] = end
        else:
            words.append(
                {
                    "word_id": word_id,
                    "label": id2label[token_preds[position]],
                    "start": start,
                    "end": end,
                }
            )

    # Decode the BIO tags into character spans.
    spans = []
    current = None
    for word in words:
        label = word["label"]
        if label.startswith("B-"):
            if current:
                spans.append(current)
            current = {"type": label[2:], "start": word["start"], "end": word["end"]}
        elif label.startswith("I-"):
            # An I- tag only extends an open entity; a stray I- is ignored.
            if current:
                current["end"] = word["end"]
        else:
            if current:
                spans.append(current)
            current = None
    if current:
        spans.append(current)
    return [{"type": span["type"], "text": text[span["start"]:span["end"]]} for span in spans]


sample_text = "Elon Musk, the CEO of Tesla, was born in South Africa."
entities = predict_ner(sample_text)
print(f"\n文本: {sample_text}")
print(f"识别实体: {entities}")
19.3 问答系统:使用 BERT 进行抽取式问答
抽取式问答任务是从给定段落中提取答案片段。我们使用 SQuAD v2 数据集(其中包含无答案的问题)的一个子集微调 BERT。
# 文件名:qa_bert.py
# 使用 BERT 进行抽取式问答
from transformers import (
AutoTokenizer,
AutoModelForQuestionAnswering,
TrainingArguments,
Trainer,
DefaultDataCollator
)
from datasets import load_dataset
import torch
import numpy as np
# 1. Load the SQuAD data (a small slice of SQuAD v2 for a quick demo)
dataset, validation = (
    load_dataset("squad_v2", split=split_spec)
    for split_spec in ("train[:500]", "validation[:100]")
)
print(f"训练样本数: {len(dataset)}")
print(f"验证样本数: {len(validation)}")
# 2. Load the tokenizer and model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
# 3. 预处理函数:将问题和上下文拼接,并计算答案的开始/结束位置
def preprocess_qa(examples):
    """Tokenize question/context pairs and label every window with its answer span.

    Long contexts are split into overlapping windows (stride 128), so one
    example can produce several features. A feature gets
    ``start_positions = end_positions = 0`` when the example has no answer or
    when the answer is not fully contained in that window.
    """
    questions = [q.strip() for q in examples["question"]]
    tokenized = tokenizer(
        questions,
        examples["context"],
        truncation="only_second",  # truncate the context only
        max_length=384,
        stride=128,  # overlap between consecutive windows
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        # The script batches with DefaultDataCollator, which does not pad, so
        # every feature must already have the same length.
        padding="max_length",
    )
    sample_mapping = tokenized.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized.pop("offset_mapping")
    start_positions = []
    end_positions = []
    for i, offsets in enumerate(offset_mapping):
        answer = examples["answers"][sample_mapping[i]]
        start_token, end_token = 0, 0
        # Offsets of question tokens are relative to the question string, so
        # only context tokens (sequence id 1) may be matched against the
        # answer's character span.
        context_tokens = [idx for idx, seq_id in enumerate(tokenized.sequence_ids(i)) if seq_id == 1]
        if len(answer["answer_start"]) > 0 and context_tokens:
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])
            ctx_first, ctx_last = context_tokens[0], context_tokens[-1]
            # Label the span only when the whole answer lies inside this window.
            if offsets[ctx_first][0] <= start_char and offsets[ctx_last][1] >= end_char:
                idx = ctx_first
                while idx <= ctx_last and offsets[idx][0] <= start_char:
                    idx += 1
                start_token = idx - 1
                idx = ctx_last
                while idx >= ctx_first and offsets[idx][1] >= end_char:
                    idx -= 1
                end_token = idx + 1
        start_positions.append(start_token)
        end_positions.append(end_token)
    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    return tokenized
# Apply the preprocessing; the original columns are dropped because one
# example may expand into several features
print("预处理训练数据...")
tokenized_train = dataset.map(preprocess_qa, batched=True, remove_columns=dataset.column_names)
print("预处理验证数据...")
tokenized_val = validation.map(preprocess_qa, batched=True, remove_columns=validation.column_names)
# 4. Training arguments
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./bert_qa_model",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="no",
    logging_steps=20,
    learning_rate=3e-5,
    report_to="none",
)
# NOTE(review): DefaultDataCollator presumably does not pad — verify that
# preprocess_qa yields features of equal length.
data_collator = DefaultDataCollator()
# 5. 定义评估指标(精确匹配和 F1)
def compute_qa_metrics(eval_pred):
    """Compute start/end position accuracy and span-level exact match.

    ``eval_pred.predictions`` is ``(start_logits, end_logits)`` and
    ``eval_pred.label_ids`` is ``(start_positions, end_positions)``.
    ``exact_match`` counts a feature only when both the predicted start and
    the predicted end equal their labels. Values are returned as plain
    Python floats.
    """
    start_logits, end_logits = eval_pred.predictions
    start_labels = np.asarray(eval_pred.label_ids[0])
    end_labels = np.asarray(eval_pred.label_ids[1])
    start_hits = np.argmax(start_logits, axis=-1) == start_labels
    end_hits = np.argmax(end_logits, axis=-1) == end_labels
    return {
        "start_accuracy": float(start_hits.mean()),
        "end_accuracy": float(end_hits.mean()),
        "exact_match": float((start_hits & end_hits).mean()),
    }
# 6. Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    compute_metrics=compute_qa_metrics,
)
# 7. Train
print("开始问答模型微调...")
trainer.train()
# 8. Save the model and tokenizer to the same directory
model.save_pretrained("./bert_qa_final")
tokenizer.save_pretrained("./bert_qa_final")
print("问答模型已保存")
# 9. 推理函数
def answer_question(question, context):
    """Extract the answer to ``question`` from ``context``.

    Only the first 384 tokens of the question/context pair are considered.
    Returns an empty string when the predicted end precedes the predicted
    start (no valid span).
    """
    # A single window is used at inference time, so no stride/overflow
    # options are passed.
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation="only_second",
        max_length=384,
    ).to(model.device)  # the model may be on a GPU after training
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    start_idx = torch.argmax(outputs.start_logits, dim=1).item()
    end_idx = torch.argmax(outputs.end_logits, dim=1).item()
    if end_idx < start_idx:
        return ""
    # Decode the predicted token span back to text
    answer_tokens = inputs["input_ids"][0][start_idx:end_idx + 1]
    return tokenizer.decode(answer_tokens, skip_special_tokens=True)


# Quick test
context = "Hugging Face was founded in New York City in 2016 by Clément Delangue, Julien Chaumond, and Thomas Wolf."
question = "Where was Hugging Face founded?"
answer = answer_question(question, context)
print(f"\n问题: {question}")
print(f"上下文: {context}")
print(f"答案: {answer}")
19.4 文本摘要:使用 T5/BART 进行生成式摘要
文本摘要需要生成一段简短的文本概括原始内容。我们使用 BART 模型在 CNN/DailyMail 数据集上微调。
# 文件名:summarization_bart.py
# 使用 BART 进行文本摘要微调
from transformers import (
AutoTokenizer,
AutoModelForSeq2SeqLM,
Seq2SeqTrainingArguments,
Seq2SeqTrainer,
DataCollatorForSeq2Seq
)
from datasets import load_dataset
import torch
import evaluate
# 1. Load the dataset (a subset of CNN/DailyMail)
dataset = load_dataset("cnn_dailymail", "3.0.0", split="train[:500]")
validation = load_dataset("cnn_dailymail", "3.0.0", split="validation[:100]")
print(f"训练样本数: {len(dataset)}")
print(f"验证样本数: {len(validation)}")
# 2. Load the BART model and tokenizer
model_name = "facebook/bart-base" # BART-base, about 140M parameters
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# 3. Preprocessing: "article" is the input, "highlights" is the target
max_input_length = 1024
max_target_length = 128
def preprocess_summarization(examples):
    """Tokenize articles as model inputs and highlights as labels.

    Inputs are truncated to ``max_input_length`` tokens and targets to
    ``max_target_length`` tokens; nothing is padded here.
    """
    # Encode the inputs
    model_inputs = tokenizer(
        examples["article"],
        max_length=max_input_length,
        truncation=True,
        padding=False,
    )
    # Encode the targets. `text_target=` replaces the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["highlights"],
        max_length=max_target_length,
        truncation=True,
        padding=False,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
print("预处理数据...")
tokenized_train = dataset.map(preprocess_summarization, batched=True, remove_columns=dataset.column_names)
tokenized_val = validation.map(preprocess_summarization, batched=True, remove_columns=validation.column_names)
# 4. Data collator (dynamic padding)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
# 5. Evaluation metric: ROUGE
rouge = evaluate.load("rouge")
def compute_rouge_metrics(eval_pred):
    """Decode generated ids and labels and return ROUGE-1/2/L scores."""
    # This script does not import numpy at module level, so import it here.
    import numpy as np

    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    pad_id = tokenizer.pad_token_id
    # -100 marks padding/ignored positions and cannot be decoded; map it to
    # the pad token in both the predictions and the labels.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Compute ROUGE
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Report the main scores only
    return {
        "rouge1": result["rouge1"],
        "rouge2": result["rouge2"],
        "rougeL": result["rougeL"],
    }
# 6. Training arguments
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir="./bart_summarization",
    num_train_epochs=3,
    per_device_train_batch_size=4, # adjust to the available GPU memory
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=20,
    learning_rate=3e-5,
    weight_decay=0.01,
    predict_with_generate=True, # generate summaries during evaluation
    generation_max_length=max_target_length,
    report_to="none",
)
# 7. Create the Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_rouge_metrics,
)
# 8. Train
print("开始摘要模型微调...")
trainer.train()
# 9. Evaluate
eval_results = trainer.evaluate()
print(f"验证集 ROUGE: {eval_results}")
# 10. Save the model and tokenizer to the same directory
model.save_pretrained("./bart_summary_final")
tokenizer.save_pretrained("./bart_summary_final")
print("摘要模型已保存")
# 11. 推理示例
def summarize(text, max_length=150):
    """Generate a summary of ``text`` with beam search (4 beams).

    ``max_length`` bounds the length of the generated summary in tokens; the
    input is truncated to ``max_input_length`` tokens.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    ).to(model.device)  # the model may be on a GPU after training
    model.eval()
    with torch.no_grad():
        summary_ids = model.generate(
            **inputs,  # passes the attention mask along with the input ids
            max_length=max_length,
            num_beams=4,
            early_stopping=True,
        )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


sample_article = """The company announced a new AI model that can generate realistic images from text descriptions.
The model, called DALL-E 3, is the successor to DALL-E 2 and features improved understanding of complex prompts.
It will be available to ChatGPT Plus subscribers starting in October. OpenAI says the model is more accurate and
produces higher-quality images with better text rendering."""
print(f"\n原文: {sample_article[:200]}...")
print(f"摘要: {summarize(sample_article)}")
19.5 代码生成:使用 CodeLlama 进行代码补全
CodeLlama 是 Llama 2 的代码优化版本,专为代码生成任务设计。本例展示如何使用 CodeLlama 进行代码补全(使用 7B 模型,需一定 GPU 显存)。
# 文件名:code_generation_codellama.py
# 使用 CodeLlama 进行代码补全
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# 1. Load the model and tokenizer
# Note: CodeLlama-7B needs roughly 14GB of GPU memory in FP16. Use a 4-bit
# quantized variant if memory is insufficient.
model_name = "codellama/CodeLlama-7b-hf" # or "codellama/CodeLlama-7b-Python-hf" for Python
print("加载 CodeLlama 模型...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_load_options = {
    "torch_dtype": torch.float16,
    "device_map": "auto",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_options)
# Fall back to the EOS token when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# 2. 代码补全函数
def complete_code(prompt, max_new_tokens=128, temperature=0.2):
    """Generate a completion for ``prompt`` and return only the new text.

    Args:
        prompt: Source text to continue (truncated to 1024 tokens).
        max_new_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature; a value <= 0 switches to greedy
            decoding, since sampling requires a strictly positive temperature.
    """
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(model.device)
    prompt_length = inputs["input_ids"].shape[1]
    if temperature > 0:
        sampling_options = {"do_sample": True, "temperature": temperature, "top_p": 0.9}
    else:
        sampling_options = {"do_sample": False}
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            repetition_penalty=1.05,
            pad_token_id=tokenizer.pad_token_id,
            **sampling_options,
        )
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Strip the prompt using its decoded form rather than len(prompt): the
    # two differ when the prompt was truncated or does not round-trip
    # exactly through the tokenizer.
    decoded_prompt = tokenizer.decode(outputs[0][:prompt_length], skip_special_tokens=True)
    return generated[len(decoded_prompt):]
# 3. Example: completing a Python function
# The prompts below are Python source handed to the model, so the code inside
# the strings must itself be correctly indented.
prompt1 = """def fibonacci(n):
    \"\"\"Return the nth Fibonacci number.\"\"\"
    if n <= 1:
        return n
    else:
        return fibonacci(n-1) + fibonacci(n-2)
def factorial(n):
    \"\"\"Compute factorial recursively.\"\"\"
"""
result1 = complete_code(prompt1, max_new_tokens=80)
print("=== Python 函数补全 ===")
print(prompt1 + result1)
print()
# 4. Example: generating code from a docstring
prompt2 = "# Write a function that sorts a list of integers using quicksort algorithm\n"
prompt2 += "def quicksort(arr):\n    \"\"\"Sort array in-place using quicksort.\"\"\"\n    "
result2 = complete_code(prompt2, max_new_tokens=150, temperature=0.3)
print("=== Quicksort 实现 ===")
print(prompt2 + result2)
print()
# 5. Example: SQL query generation (natural language to SQL)
prompt3 = "-- Given a table 'employees' with columns: id, name, department, salary\n"
prompt3 += "-- Write a SQL query to find the average salary per department\n"
prompt3 += "SELECT "
result3 = complete_code(prompt3, max_new_tokens=60)
print("=== SQL 查询生成 ===")
print(prompt3 + result3)
# 6. Save the model (if needed)
# model.save_pretrained("./codellama_finetuned")
# tokenizer.save_pretrained("./codellama_finetuned")
20. 大模型全流程微调案例
20.1 基于 Llama 3 / Qwen 的客服指令微调(完整代码实现)
本案例展示如何使用自定义客服对话数据对 Qwen2.5-7B 进行指令微调(SFT)。由于 7B 模型较大,示例使用 4-bit 量化加 LoRA(即 QLoRA)以降低显存需求。
# 文件名:customer_service_sft.py
# 使用 Qwen2.5-7B 和 LoRA 进行客服指令微调
import torch
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
TrainingArguments,
BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
from trl import SFTTrainer
# 1. Prepare the customer-service dialogue data (sample data; replace with
# real data in practice). Each pair is (user instruction, agent reply).
_SAMPLE_DIALOGUES = (
    ("你好,我想查询订单状态", "好的,请提供您的订单号,我会帮您查询。"),
    ("我的订单已经一个星期了还没收到", "非常抱歉给您带来不便。请提供订单号,我会为您核实物流信息并催促发货。"),
    ("如何申请退货?", "您可以在订单页面点击“申请退货”,填写退货原因并提交。我们会在1-3个工作日内审核。"),
    ("商品有质量问题怎么办?", "很抱歉商品给您带来困扰。请提供商品照片和订单号,我们会为您办理换货或退款。"),
    ("你们的客服工作时间是?", "我们的在线客服工作时间是每天9:00-21:00,其余时间您可留言,我们会尽快回复。"),
)
# Repeat the five samples 20 times so there is enough data to train on
train_data = [
    {"instruction": instruction, "output": output}
    for instruction, output in _SAMPLE_DIALOGUES
] * 20
# 格式化数据为 Alpaca 风格
def format_instruction(example):
    """Render an instruction/output record as a single training prompt.

    Accepts either one example (string fields, returns a string) or a batch
    (list fields, returns a list of strings). Depending on the TRL version,
    SFTTrainer calls the formatting function per example or on a whole batch
    and, in the batched case, requires a list back.
    """
    instructions = example["instruction"]
    outputs = example["output"]
    if isinstance(instructions, (list, tuple)):
        return [
            f"### 用户:\n{instruction}\n\n### 客服:\n{output}"
            for instruction, output in zip(instructions, outputs)
        ]
    return f"### 用户:\n{instructions}\n\n### 客服:\n{outputs}"
dataset = Dataset.from_list(train_data)
# 2. Configure 4-bit quantization (saves GPU memory)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
model_name = "Qwen/Qwen2.5-7B" # if 7B is too large, try Qwen2.5-1.5B or 0.5B for testing
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Pad with the EOS token
tokenizer.pad_token = tokenizer.eos_token
# Load the base model (4-bit)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
# 3. Prepare the model for k-bit training
model = prepare_model_for_kbit_training(model)
# 4. Configure LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters() # print the number of trainable parameters
# 5. Training arguments
training_args = TrainingArguments(
    output_dir="./customer_service_lora",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none",
)
# 6. Create the SFTTrainer
# NOTE(review): `tokenizer=` and `max_seq_length=` as constructor arguments
# look like the older TRL SFTTrainer API — confirm against the installed version.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    formatting_func=format_instruction,
    max_seq_length=512,
)
# 7. Train
print("开始客服指令微调...")
trainer.train()
# 8. Save the LoRA adapter and tokenizer to the same directory
model.save_pretrained("./customer_service_adapter")
tokenizer.save_pretrained("./customer_service_adapter")
print("微调完成,适配器已保存")
# 9. 推理测试
def generate_response(user_input):
    """Generate a customer-service reply to ``user_input`` with the fine-tuned model.

    Only the first generated reply is returned: the prompt is stripped and
    the text is cut at the next "### 用户:" marker in case the model
    continues the dialogue on its own.
    """
    prompt = f"### 用户:\n{user_input}\n\n### 客服:\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]
    # The model is still in training mode after trainer.train(); switch to
    # eval mode so dropout is disabled during generation.
    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
        )
    # Decode only the newly generated tokens (everything after the prompt)
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response.split("### 用户:")[0].strip()


# Quick test
test_question = "我的订单一直没发货,能帮我查一下吗?"
print(f"\n用户: {test_question}")
print(f"客服: {generate_response(test_question)}")
20.2 使用 QLoRA 在消费级 GPU 上微调 7B/13B 模型
QLoRA 可以在单张 24GB 显存的 GPU(如 RTX 3090/4090)上微调 7B 甚至 13B 级别的模型。本例以 Qwen2.5-14B 为例演示完整流程(也可替换为 Llama-2-13b-hf)。
# 文件名:qlora_13b_finetune.py
# 使用 QLoRA 微调 13B 模型(例如 Llama-2-13b 或 Qwen-14B)
import torch
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
TrainingArguments,
BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
from trl import SFTTrainer
# 1. Configure 4-bit quantization (the core of QLoRA)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
# 2. Load the 13B-class model (Qwen2.5-14B here; Llama-2-13b-hf also works)
model_name = "Qwen/Qwen2.5-14B" # note: 14B needs roughly 16GB of GPU memory in 4-bit
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Pad with the EOS token
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
# 3. Prepare the model for k-bit training
model = prepare_model_for_kbit_training(model)
# 4. LoRA configuration (a larger rank for the 13B-class model)
lora_config = LoraConfig(
    r=32, # larger rank for more capacity
    lora_alpha=64,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
print(f"可训练参数: {model.num_parameters(only_trainable=True):,}")
# 5. Load the dataset (a public instruction dataset; first 1000 rows)
dataset = load_dataset("timdettmers/openassistant-guanaco", split="train[:1000]")
def formatting_func(example):
    """Normalise "### Human: ...### Assistant: ..." texts for SFT.

    The text is split at the first "### Assistant:" marker and rebuilt with
    the assistant part on its own line. The existing "### Human:" prefix is
    stripped before it is re-added (so it is not duplicated), and a text
    without an assistant marker is returned unchanged instead of raising.
    Accepts one example (returns a string) or a batch (returns a list).
    """

    def _format_one(text):
        human, marker, assistant = text.partition("### Assistant:")
        if not marker:
            return text
        human = human.strip()
        if human.startswith("### Human:"):
            human = human[len("### Human:"):].strip()
        return f"### Human: {human}\n### Assistant: {assistant.strip()}"

    texts = example["text"]
    if isinstance(texts, (list, tuple)):
        return [_format_one(text) for text in texts]
    return _format_one(texts)
# 6. Training arguments (tuned for low GPU memory use)
training_args = TrainingArguments(
    output_dir="./qlora_13b_output",
    per_device_train_batch_size=1, # batch size is usually 1 for a 13B-class model
    gradient_accumulation_steps=8, # effective batch size = 1 * 8 = 8
    num_train_epochs=2,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    gradient_checkpointing=True, # gradient checkpointing to save memory
    optim="paged_adamw_8bit", # paged optimizer
    report_to="none",
)
# 7. Create the SFTTrainer
# NOTE(review): `tokenizer=` and `max_seq_length=` as constructor arguments
# look like the older TRL SFTTrainer API — confirm against the installed version.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    formatting_func=formatting_func,
    max_seq_length=512,
)
# 8. Train
print("开始 QLoRA 微调 13B 模型...")
trainer.train()
# 9. Save the adapter and tokenizer to the same directory
model.save_pretrained("./qlora_13b_adapter")
tokenizer.save_pretrained("./qlora_13b_adapter")
print("QLoRA 微调完成")
# 10. Merge the weights (optional, for faster inference)
# NOTE(review): `model` was loaded in 4-bit here; merging into quantized
# weights may be lossy or unsupported depending on the PEFT/bitsandbytes
# versions — consider reloading the base model in fp16 before merging. Confirm.
merged_model = model.merge_and_unload()
merged_model.save_pretrained("./qlora_13b_merged")
print("合并模型已保存")
20.3 微调后的模型评估与部署上线
微调完成后,需要对模型进行客观评估,然后部署为 API 服务。
# 文件名:evaluate_and_deploy.py
# 评估微调后的模型并部署为 FastAPI 服务
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import json
import numpy as np
from sklearn.metrics import accuracy_score
import requests
# ============================================================
# 1. 评估模型(以客服任务为例)
# ============================================================
def evaluate_model(base_model_name, adapter_path, test_data_path):
    """Load a LoRA-adapted model and measure exact-match accuracy on a test file.

    Args:
        base_model_name: Name or path of the base model.
        adapter_path: Directory containing the LoRA adapter.
        test_data_path: UTF-8 JSON file holding a list of
            ``{"instruction": ..., "output": ...}`` records; only the first
            50 are evaluated.

    Returns:
        The list of generated replies, in input order.
    """
    tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model = PeftModel.from_pretrained(base_model, adapter_path)
    model.eval()
    with open(test_data_path, "r", encoding="utf-8") as f:
        test_data = json.load(f)
    predictions = []
    references = []
    for item in test_data[:50]:  # evaluate at most 50 records
        prompt = f"### 用户:\n{item['instruction']}\n\n### 客服:\n"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            # Greedy decoding keeps the evaluation deterministic; no
            # temperature is passed because it only applies when sampling.
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                do_sample=False,
            )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        if "### 客服:\n" in response:
            response = response.split("### 客服:\n")[-1].strip()
        predictions.append(response)
        references.append(item["output"])
    if not predictions:
        # Nothing to score; avoid dividing by zero.
        print("测试集为空,无法计算准确率")
        return predictions
    # Simple accuracy: the reply must match the reference exactly
    exact_matches = sum(1 for p, r in zip(predictions, references) if p == r)
    accuracy = exact_matches / len(predictions)
    print(f"完全匹配准确率: {accuracy:.2%}")
    return predictions
# Sample test data
test_examples = [
    {"instruction": "如何更改密码?", "output": "您可以在个人设置中点击修改密码,按照提示操作即可。"},
    {"instruction": "你们的客服电话是多少?", "output": "我们的客服热线是 400-123-4567,工作时间 9:00-18:00。"}
]
# Write the file as UTF-8 explicitly: the JSON holds non-ASCII text
# (ensure_ascii=False) and evaluate_model reads it back as UTF-8, so relying
# on the platform default encoding could corrupt the round trip.
with open("test_data.json", "w", encoding="utf-8") as f:
    json.dump(test_examples, f, ensure_ascii=False, indent=2)
# Run the evaluation (requires a trained adapter)
# evaluate_model("Qwen/Qwen2.5-7B", "./customer_service_adapter", "test_data.json")
# ============================================================
# 2. Deploy the model with FastAPI (server side)
# ============================================================
# The string below is the source of a separate deployment.py file. It is kept
# as a literal, so the code inside it must be valid, correctly indented Python.
# `max_tokens` is bounded because it comes from the client.
"""
# 单独创建 deployment.py 文件,内容如下:
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch
import os
app = FastAPI(title="客服助手 API")
class ChatRequest(BaseModel):
    message: str
    max_tokens: int = Field(default=200, ge=1, le=1024)
class ChatResponse(BaseModel):
    reply: str
# 加载模型(启动时加载一次)
BASE_MODEL = "Qwen/Qwen2.5-7B"
ADAPTER_PATH = "./customer_service_adapter"
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
)
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
model.eval()
def generate(message, max_new_tokens=200):
    prompt = f"### 用户:\n{message}\n\n### 客服:\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if "### 客服:\n" in response:
        response = response.split("### 客服:\n")[-1].strip()
    return response
@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
    try:
        reply = generate(request.message, request.max_tokens)
        return ChatResponse(reply=reply)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@app.get("/health")
async def health():
    return {"status": "ok"}
# 启动命令: uvicorn deployment:app --host 0.0.0.0 --port 8000
"""
# ============================================================
# 3. 客户端调用示例
# ============================================================
def call_customer_service_api(message, api_url="http://localhost:8000/chat", timeout=30):
    """Send ``message`` to the chat endpoint and return the reply text.

    Args:
        message: User message to send.
        api_url: URL of the ``/chat`` endpoint.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error; without it the call could block indefinitely.

    Returns:
        The ``reply`` field of the JSON response on HTTP 200, otherwise a
        string of the form ``"Error: <response body>"``.
    """
    response = requests.post(api_url, json={"message": message}, timeout=timeout)
    if response.status_code == 200:
        return response.json()["reply"]
    return f"Error: {response.text}"
# Quick test (requires the server to be running)
# reply = call_customer_service_api("我的订单还没发货")
# print(f"客服: {reply}")
# Print the deployment instructions
print("\n部署说明:")
print("1. 将上面的服务端代码保存为 deployment.py")
print("2. 安装依赖: pip install fastapi uvicorn")
print("3. 启动服务: uvicorn deployment:app --host 0.0.0.0 --port 8000")
print("4. 客户端调用: POST http://localhost:8000/chat 带 JSON 数据")
21. 多模态实战案例
21.1 使用 CLIP 进行零样本图像分类
CLIP 通过对比学习联合训练图像和文本编码器,能够实现零样本分类:即不需要任何训练样本,直接通过计算图像与类别文本描述的相似度进行分类。
# 文件名:clip_zero_shot.py
# 使用 CLIP 进行零样本图像分类
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import requests
# 1. Load the CLIP model
model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)


def _load_image(url, timeout=30):
    """Download ``url`` and return it as a PIL image.

    A timeout keeps the script from hanging on a stalled connection, and a
    non-2xx response raises instead of being handed to the image decoder.
    """
    response = requests.get(url, stream=True, timeout=timeout)
    response.raise_for_status()
    return Image.open(response.raw)


# 2. Load an image (example: a photo of cats)
image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = _load_image(image_url)
# 3. Define the candidate classes
candidate_labels = ["cat", "dog", "car", "bird", "person", "book"]
# Build more natural text descriptions for CLIP
text_descriptions = [f"a photo of a {label}" for label in candidate_labels]
# 4. Preprocess
inputs = processor(text=text_descriptions, images=image, return_tensors="pt", padding=True)
# 5. Inference
with torch.no_grad():
    outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # image-to-text similarity scores
probs = logits_per_image.softmax(dim=-1)  # convert to probabilities
# 6. Print the results
print("零样本分类结果:")
for label, prob in zip(candidate_labels, probs[0]):
    print(f" {label}: {prob.item():.4f}")
predicted_idx = probs.argmax().item()
print(f"\n预测类别: {candidate_labels[predicted_idx]} (置信度: {probs[0][predicted_idx]:.4f})")
# 7. Batch image classification example
image_urls = [
    "http://images.cocodataset.org/val2017/000000039769.jpg",  # cats
    "http://images.cocodataset.org/val2017/000000574769.jpg",  # author's note: "dog? actually a cow" — content not verified
]
images = [_load_image(url) for url in image_urls]
inputs = processor(text=text_descriptions, images=images, return_tensors="pt", padding=True)
with torch.no_grad():
    outputs = model(**inputs)
probs = outputs.logits_per_image.softmax(dim=-1)
for i, img_probs in enumerate(probs):
    pred = candidate_labels[img_probs.argmax().item()]
    print(f"图像 {i+1} 预测: {pred}")
21.2 使用 Whisper 进行语音识别微调
Whisper 是 OpenAI 的语音识别模型,支持多语言。本例以 Common Voice 的一小部分数据为例,展示如何微调 Whisper 的 tiny 版本;实际使用时可替换为自定义音频数据。
# 文件名:whisper_finetune.py
# 使用 Whisper 进行语音识别微调(需要安装 datasets、soundfile、librosa)
import torch
from transformers import (
WhisperProcessor,
WhisperForConditionalGeneration,
Seq2SeqTrainingArguments,
Seq2SeqTrainer,
DataCollatorForSeq2Seq
)
from datasets import load_dataset, Audio
import evaluate
# 1. Load the dataset (a tiny sample of Common Voice)
# For real use, replace this with your own audio dataset; it must contain
# "audio" and "sentence" columns
dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train[:20]")
print(f"数据集大小: {len(dataset)}")
# 2. Load the Whisper processor and model (the tiny checkpoint, for a quick demo)
model_name = "openai/whisper-tiny"
processor = WhisperProcessor.from_pretrained(model_name, language="en", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(model_name)
# Force the language and task
# NOTE(review): newer transformers releases may prefer setting language/task
# on the generation config instead of `forced_decoder_ids` — confirm.
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="en", task="transcribe")
# 3. Preprocessing: convert one example into model inputs and label ids.
def prepare_dataset(batch):
    """Convert one dataset row into Whisper input features and label ids.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries)
    and ``batch["sentence"]``; writes ``batch["input_features"]`` and
    ``batch["labels"]``.

    Returns:
        The same ``batch`` mapping, with the two new keys added.
    """
    audio = batch["audio"]
    # Compute the log-mel spectrogram features from the raw waveform.
    inputs = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt")
    # The processor returns a batch of one; keep the single example.
    batch["input_features"] = inputs.input_features[0]
    # Tokenise the transcript to obtain the label ids.
    batch["labels"] = processor(text=batch["sentence"], return_tensors="pt").input_ids[0]
    return batch
# Have the "audio" column decoded at a 16 kHz sampling rate.
target_audio = Audio(sampling_rate=16000)
dataset = dataset.cast_column("audio", target_audio)

# Run the preprocessing and drop every original column afterwards.
raw_columns = dataset.column_names
dataset = dataset.map(prepare_dataset, remove_columns=raw_columns)

# Train / validation split (80 / 20).
dataset = dataset.train_test_split(test_size=0.2)
train_dataset, eval_dataset = dataset["train"], dataset["test"]
# 4. Data collator
# DataCollatorForSeq2Seq has no `processor` argument and only pads token
# sequences, so it cannot batch Whisper's `input_features`. Audio features and
# text labels need to be padded separately, by the feature extractor and the
# tokenizer respectively.
class SpeechSeq2SeqCollator:
    """Batch examples that carry ``input_features`` and ``labels``."""

    def __init__(self, processor, decoder_start_token_id):
        """Store the processor and the decoder start token id.

        Args:
            processor: Object exposing ``feature_extractor`` and ``tokenizer``.
            decoder_start_token_id: Token id stripped from the front of the
                labels when every sequence in the batch starts with it.
        """
        self.processor = processor
        self.decoder_start_token_id = decoder_start_token_id

    def __call__(self, features):
        """Pad a list of examples into one batch of tensors.

        Args:
            features: List of dicts with ``"input_features"`` and ``"labels"``.

        Returns:
            The padded audio batch with a ``"labels"`` entry added.
        """
        audio_inputs = [{"input_features": f["input_features"]} for f in features]
        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")

        label_inputs = [{"input_ids": f["labels"]} for f in features]
        labels_batch = self.processor.tokenizer.pad(label_inputs, return_tensors="pt")
        # Replace padding with -100 so those positions are ignored by the loss.
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100
        )
        # Drop a leading start token if every label sequence begins with it,
        # to avoid it appearing twice once the decoder inputs are built.
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch


data_collator = SpeechSeq2SeqCollator(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)
# 5. Evaluation metric (WER)
wer_metric = evaluate.load("wer")


def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids).

    Returns:
        ``{"wer": <score>}`` as produced by ``wer_metric.compute``.
    """
    pred_ids = pred.predictions
    label_ids = pred.label_ids
    # -100 marks ignored label positions; swap in the pad token id so the
    # ids can be decoded. Note this modifies ``label_ids`` in place.
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    # Decode both sides to plain text, dropping special tokens.
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}
# 6. Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper_finetuned",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy`; switch if the installed version rejects this name.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    learning_rate=1e-5,
    warmup_steps=10,
    # fp16 mixed precision is rejected without a supported accelerator, so
    # enable it only when CUDA is present and fall back to fp32 otherwise.
    fp16=torch.cuda.is_available(),
    # Evaluate by actually generating text, which the WER metric needs.
    predict_with_generate=True,
    generation_max_length=225,
    report_to="none",
)
# 7. Build the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor,
)

# 8. Train (quick, given how little data is used)
print("开始 Whisper 微调...")
trainer.train()

# 9. Save the model first, then the processor, into the same directory
save_dir = "./whisper_finetuned"
for artifact in (model, processor):
    artifact.save_pretrained(save_dir)
print("微调完成")
# 10. Inference example (needs a separate audio file)
def transcribe_audio(audio_path, model_dir="./whisper_finetuned"):
    """Transcribe one audio file with a saved Whisper checkpoint.

    Args:
        audio_path: Path to the audio file to transcribe.
        model_dir: Directory holding the saved processor and model.
            Defaults to the directory this script saves to.

    Returns:
        The decoded transcription of the first (only) sequence, as a string.
    """
    # Imported here so the rest of the script runs without librosa installed.
    import librosa

    # Distinct local names avoid shadowing the module-level processor/model.
    asr_processor = WhisperProcessor.from_pretrained(model_dir)
    asr_model = WhisperForConditionalGeneration.from_pretrained(model_dir)

    # Load the waveform resampled to 16 kHz.
    audio, sr = librosa.load(audio_path, sr=16000)
    input_features = asr_processor(audio, sampling_rate=sr, return_tensors="pt").input_features

    with torch.no_grad():
        predicted_ids = asr_model.generate(input_features)

    transcription = asr_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription


# Usage example:
# print(transcribe_audio("sample.wav"))
21.3 使用 LLaVA 构建图像理解对话系统
LLaVA 是一种视觉语言模型,可以理解图像并与用户对话。本例展示如何加载 LLaVA 并进行图像对话。
# 文件名:llava_dialogue.py
# 使用 LLaVA 进行图像理解对话
import torch
from transformers import LlavaProcessor, LlavaForConditionalGeneration
from PIL import Image
import requests
# 1. Load the LLaVA model and processor
# Note (from the original author): LLaVA-1.5-7B needs roughly 16 GB of GPU memory.
model_id = "llava-hf/llava-1.5-7b-hf"
processor = LlavaProcessor.from_pretrained(model_id)
load_options = {"torch_dtype": torch.float16, "device_map": "auto"}
model = LlavaForConditionalGeneration.from_pretrained(model_id, **load_options)

# 2. Fetch the image over HTTP and open it from the raw response stream
image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
http_response = requests.get(image_url, stream=True)
image = Image.open(http_response.raw)
# 3. Build the conversation: one user turn with an image slot and a question
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What is happening in this image?"},
        ],
    },
]

# 4. Render the chat template into a prompt and prepare the model inputs
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)

# 5. Generate the answer
# Greedy decoding (do_sample=False) ignores `temperature`, so it is not passed;
# supplying both only triggers a generation-config warning.
with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=200,
        do_sample=False,
    )
response = processor.decode(output_ids[0], skip_special_tokens=True)

print("=== 图像对话 ===")
print(f"用户: {conversation[0]['content'][1]['text']}")
# The decoded text includes the prompt; keep only what follows the last
# "ASSISTANT:" marker.
if "ASSISTANT:" in response:
    response = response.split("ASSISTANT:")[-1].strip()
print(f"助手: {response}")
# 6. Multi-turn example: the earlier exchange is replayed as context and the
# final user turn is the new question.
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "How many cats are in this picture?"},
        ],
    },
    {
        "role": "assistant",
        "content": [{"type": "text", "text": "There are two cats."}],
    },
    {
        "role": "user",
        "content": [{"type": "text", "text": "What color are they?"}],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=100)
response = processor.decode(output_ids[0], skip_special_tokens=True)

# Keep only the text after the last "ASSISTANT:" marker, i.e. the newest reply.
if "ASSISTANT:" in response:
    response = response.split("ASSISTANT:")[-1].strip()

print("\n多轮对话:")
# Print the question from the conversation itself so the two cannot drift apart.
print(f"用户: {conversation[-1]['content'][0]['text']}")
print(f"助手: {response}")

# 7. Save the model (optional)
# model.save_pretrained("./llava_finetuned")
# processor.save_pretrained("./llava_finetuned")
🌟 感谢您耐心阅读到这里!
💡 如果本文对您有所启发,欢迎:
👍 点赞📌 收藏 📤 分享给更多需要的伙伴。
🗣️ 期待在评论区看到您的想法, 共同进步。
🔔 关注我,持续获取更多干货内容~
🤗 我们下篇文章见~
AtomGit 是由开放原子开源基金会联合 CSDN 等生态伙伴共同推出的新一代开源与人工智能协作平台。平台坚持“开放、中立、公益”的理念,把代码托管、模型共享、数据集托管、智能体开发体验和算力服务整合在一起,为开发者提供从开发、训练到部署的一站式体验。
更多推荐



所有评论(0)