多模态 AI API 实战
·
前言
💡 痛点:只懂文本 API?图片理解、语音识别、视频分析怎么做?多模态 API 怎么选?
🎯 解决方案:掌握 多模态 AI API — 从图片理解、到语音处理、再到视频分析。
多模态 AI API 全景:
主流平台多模态能力对比:
| 平台 | 图片理解 | 语音识别 | TTS | 视频理解 | 文档理解 |
|---|---|---|---|---|---|
| OpenAI | ✅ GPT-4V | ✅ Whisper | ✅ TTS | ❌ | ✅ |
| Claude | ✅ Claude 3 | ❌ | ❌ | ❌ | ✅ |
| 通义千问 | ✅ qwen-vl | ✅ Paraformer | ✅ Sambert | ❌ | ✅ |
| 文心 | ✅ ERNIE-ViL | ✅ | ✅ | ❌ | ✅ |
| Google | ✅ Gemini | ✅ | ✅ | ✅ | ✅ |
一、图片理解 API
1.1 OpenAI GPT-4V
# ===== OpenAI GPT-4V 图片理解 =====
from openai import OpenAI
import base64
from PIL import Image
import io
client = OpenAI()
def encode_image_to_base64(image_path: str) -> str:
    """Return the contents of the file at ``image_path`` as base64 text."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
def describe_image(image_path: str):
    """Ask the vision model to describe one local image.

    The image is sent inline as a base64 data URL. The MIME type of that URL
    is guessed from the file extension; when the extension is unknown or not
    an image type, it falls back to ``image/jpeg`` (the value that used to be
    hard-coded).

    Args:
        image_path: Path of the image file on disk.

    Returns:
        The message content of the first completion choice.
    """
    import mimetypes  # local import keeps this snippet self-contained

    mime_type = mimetypes.guess_type(image_path)[0]
    if not (mime_type and mime_type.startswith("image/")):
        mime_type = "image/jpeg"

    base64_image = encode_image_to_base64(image_path)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "描述这张图片"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}"
                        },
                    },
                ],
            }
        ],
        max_tokens=1024,
    )
    return response.choices[0].message.content
def ocr_image(image_path: str):
    """Extract all text visible in one local image using the vision model.

    The image is sent inline as a base64 data URL. The MIME type of that URL
    is guessed from the file extension; when the extension is unknown or not
    an image type, it falls back to ``image/jpeg`` (the value that used to be
    hard-coded).

    Args:
        image_path: Path of the image file on disk.

    Returns:
        The message content of the first completion choice.
    """
    import mimetypes  # local import keeps this snippet self-contained

    mime_type = mimetypes.guess_type(image_path)[0]
    if not (mime_type and mime_type.startswith("image/")):
        mime_type = "image/jpeg"

    base64_image = encode_image_to_base64(image_path)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "提取图片中的所有文字"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}"
                        },
                    },
                ],
            }
        ],
        max_tokens=1024,
    )
    return response.choices[0].message.content
def analyze_multiple_images(image_paths: list):
    """Send several local images in one request and ask for a comparison.

    Each image becomes its own ``image_url`` content part, in the order given.
    The MIME type of every data URL is guessed from that file's extension,
    falling back to ``image/jpeg`` (the previously hard-coded value) when the
    extension is unknown or not an image type.

    Args:
        image_paths: Paths of the image files to compare.

    Returns:
        The message content of the first completion choice.
    """
    import mimetypes  # local import keeps this snippet self-contained

    def _image_part(path: str) -> dict:
        # Build one inline-image content part for the request.
        mime_type = mimetypes.guess_type(path)[0]
        if not (mime_type and mime_type.startswith("image/")):
            mime_type = "image/jpeg"
        return {
            "type": "image_url",
            "image_url": {
                "url": f"data:{mime_type};base64,{encode_image_to_base64(path)}"
            },
        }

    content = [{"type": "text", "text": "比较这些图片的异同"}]
    content.extend(_image_part(path) for path in image_paths)

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": content}],
        max_tokens=1024,
    )
    return response.choices[0].message.content
# Usage example
if __name__ == "__main__":
    # Describe a picture.
    description = describe_image("image.jpg")
    print(f"描述: {description}")

    # Run OCR on a scanned document.
    text = ocr_image("document.jpg")
    print(f"文字: {text}")
1.2 Claude 3 Vision
# ===== Claude 3 图片理解 =====
from anthropic import Anthropic
import base64
client = Anthropic()
def describe_image_claude(image_path: str):
    """Ask Claude to describe one local image.

    The image is sent as a base64 ``image`` content block. ``media_type`` is
    guessed from the file extension so PNG/GIF/WebP files are labelled
    correctly; it falls back to ``image/jpeg`` (the previously hard-coded
    value) when the extension is unknown or not an image type.

    Args:
        image_path: Path of the image file on disk.

    Returns:
        The text of the first content block in the reply.
    """
    import mimetypes  # local import keeps this snippet self-contained

    media_type = mimetypes.guess_type(image_path)[0]
    if not (media_type and media_type.startswith("image/")):
        media_type = "image/jpeg"

    with open(image_path, "rb") as f:
        image_data = base64.b64encode(f.read()).decode('utf-8')

    message = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "描述这张图片"},
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": image_data,
                        },
                    },
                ],
            }
        ],
    )
    return message.content[0].text
def analyze_image_with_context(image_path: str, question: str):
    """Ask Claude a caller-supplied question about one local image.

    The image is sent as a base64 ``image`` content block. ``media_type`` is
    guessed from the file extension so PNG/GIF/WebP files are labelled
    correctly; it falls back to ``image/jpeg`` (the previously hard-coded
    value) when the extension is unknown or not an image type.

    Args:
        image_path: Path of the image file on disk.
        question: The prompt text sent alongside the image.

    Returns:
        The text of the first content block in the reply.
    """
    import mimetypes  # local import keeps this snippet self-contained

    media_type = mimetypes.guess_type(image_path)[0]
    if not (media_type and media_type.startswith("image/")):
        media_type = "image/jpeg"

    with open(image_path, "rb") as f:
        image_data = base64.b64encode(f.read()).decode('utf-8')

    message = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": image_data,
                        },
                    },
                ],
            }
        ],
    )
    return message.content[0].text
# Usage example
if __name__ == "__main__":
    description = describe_image_claude("image.jpg")
    print(f"描述: {description}")
1.3 通义千问 Vision
# ===== 通义千问 图片理解 =====
from dashscope import MultiModalConversation
def _ask_qwen_vl(image_path: str, prompt: str) -> str:
    """Send one image plus a text prompt to qwen-vl-max and return the reply text."""
    import os  # local import keeps this snippet self-contained

    # The DashScope SDK takes local files as file:// URIs; anything that
    # already carries a scheme (http://, https://, file://) is passed through.
    if "://" in image_path:
        image_ref = image_path
    else:
        image_ref = "file://" + os.path.abspath(image_path)

    response = MultiModalConversation.call(
        model="qwen-vl-max",
        messages=[
            {
                "role": "user",
                "content": [
                    {"image": image_ref},
                    {"text": prompt},
                ],
            }
        ],
    )
    if response.status_code != 200:
        raise RuntimeError(
            f"DashScope request failed ({response.code}): {response.message}"
        )

    # Multimodal replies arrive as a list of content parts, not as output.text.
    parts = response.output.choices[0].message.content
    return "".join(part.get("text", "") for part in parts)


def describe_image_qwen(image_path: str):
    """Describe an image with Qwen-VL.

    Args:
        image_path: Local path or URL of the image.

    Returns:
        The model's reply text.

    Raises:
        RuntimeError: If DashScope reports a non-200 status.
    """
    return _ask_qwen_vl(image_path, "描述这张图片")


def ocr_image_qwen(image_path: str):
    """Extract all text from an image with Qwen-VL.

    Args:
        image_path: Local path or URL of the image.

    Returns:
        The model's reply text.

    Raises:
        RuntimeError: If DashScope reports a non-200 status.
    """
    return _ask_qwen_vl(image_path, "提取图片中的所有文字")
# Usage example
if __name__ == "__main__":
    description = describe_image_qwen("image.jpg")
    print(f"描述: {description}")
二、语音识别 API(ASR)
2.1 OpenAI Whisper
# ===== OpenAI Whisper 语音识别 =====
from openai import OpenAI
client = OpenAI()
def transcribe_audio(audio_path: str):
    """Transcribe an audio file with Whisper, using Chinese as the language hint.

    Returns the transcribed text.
    """
    request_options = {"model": "whisper-1", "language": "zh"}
    with open(audio_path, "rb") as stream:
        result = client.audio.transcriptions.create(file=stream, **request_options)
    return result.text
def translate_audio(audio_path: str):
    """Translate the speech in an audio file into English text with Whisper.

    Returns the translated text.
    """
    with open(audio_path, "rb") as stream:
        result = client.audio.translations.create(file=stream, model="whisper-1")
    return result.text
def transcribe_with_timestamps(audio_path: str):
    """Transcribe an audio file and print each segment with its time range.

    Requests the ``verbose_json`` response format so the reply carries
    per-segment timing, prints one ``[start - end]: text`` line per segment,
    and returns the full transcript text.

    Args:
        audio_path: Path of the audio file on disk.

    Returns:
        The full transcribed text.
    """
    with open(audio_path, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            response_format="verbose_json",  # detailed format with segments
        )

    def _field(segment, name):
        # Depending on the SDK version a segment is either a plain dict or an
        # object with attributes; support both instead of assuming a dict.
        if isinstance(segment, dict):
            return segment[name]
        return getattr(segment, name)

    # `segments` may be missing or None; treat that as "no segments".
    for segment in getattr(transcript, "segments", None) or []:
        start = _field(segment, "start")
        end = _field(segment, "end")
        text = _field(segment, "text")
        print(f"[{start:.2f}s - {end:.2f}s]: {text}")
    return transcript.text
# Usage example
if __name__ == "__main__":
    # Transcription
    text = transcribe_audio("audio.mp3")
    print(f"转录: {text}")

    # Translation to English
    translation = translate_audio("chinese_audio.mp3")
    print(f"翻译: {translation}")
2.2 通义 Paraformer
# ===== 通义 Paraformer 语音识别 =====
from dashscope import audio
def _fetch_paraformer_transcripts(result) -> list:
    """Download the transcript entries referenced by a finished Transcription task."""
    import json
    from urllib import request

    if result.status_code != 200:
        raise RuntimeError(
            f"Paraformer transcription failed ({result.code}): {result.message}"
        )

    transcripts = []
    # The task output only lists per-file result URLs; the recognised text
    # lives in the JSON document behind each `transcription_url`.
    for item in result.output["results"]:
        url = item.get("transcription_url")
        if not url:
            continue  # no result document for this file (sub-task did not succeed)
        with request.urlopen(url) as resp:
            payload = json.loads(resp.read().decode("utf-8"))
        transcripts.extend(payload.get("transcripts", []))
    return transcripts


def transcribe_audio_paraformer(audio_url: str, model: str = "paraformer-v2"):
    """Transcribe a publicly reachable audio file with Paraformer.

    Args:
        audio_url: URL of the audio file to transcribe.
        model: Recorded-file recognition model. The default is a file
            transcription model; the ``paraformer-realtime-*`` models belong
            to the streaming recognition API.

    Returns:
        The text of all recognised sentences joined by single spaces.

    Raises:
        RuntimeError: If DashScope reports a non-200 status.
    """
    result = audio.asr.Transcription.call(
        model=model,
        file_urls=[audio_url],
    )
    sentence_texts = [
        sentence["text"]
        for transcript in _fetch_paraformer_transcripts(result)
        for sentence in transcript.get("sentences", [])
    ]
    return " ".join(sentence_texts)


def transcribe_with_punctuation(audio_url: str, model: str = "paraformer-v2"):
    """Transcribe an audio URL and return the first transcript's full text.

    Args:
        audio_url: URL of the audio file to transcribe.
        model: Recorded-file recognition model to use.

    Returns:
        The ``text`` of the first transcript, or an empty string when the
        service returned no transcript.

    Raises:
        RuntimeError: If DashScope reports a non-200 status.
    """
    result = audio.asr.Transcription.call(
        model=model,
        file_urls=[audio_url],
        # NOTE(review): `punctuation` is kept from the original call; confirm
        # against the DashScope docs that the service honours this flag.
        punctuation=True,
    )
    transcripts = _fetch_paraformer_transcripts(result)
    return transcripts[0].get("text", "") if transcripts else ""
# Usage example
if __name__ == "__main__":
    audio_url = "https://example.com/audio.mp3"
    text = transcribe_audio_paraformer(audio_url)
    print(f"转录: {text}")
三、文字转语音 API(TTS)
3.1 OpenAI TTS
# ===== OpenAI TTS 文字转语音 =====
from openai import OpenAI
client = OpenAI()
def text_to_speech(text: str, output_path: str = "output.mp3"):
    """Synthesize ``text`` with the ``tts-1`` model and save it to a file.

    Uses the SDK's streaming-response context manager so the audio is written
    to disk as it arrives and the HTTP response is closed on exit, instead of
    the deprecated ``stream_to_file`` on a non-streaming response.

    Args:
        text: The text to speak.
        output_path: Where to write the audio file.
    """
    with client.audio.speech.with_streaming_response.create(
        model="tts-1",
        voice="alloy",  # alloy/echo/fable/onyx/nova/shimmer
        input=text,
    ) as response:
        response.stream_to_file(output_path)
    print(f"音频已保存到: {output_path}")
def text_to_speech_with_options(
    text: str,
    output_path: str = "output.mp3",
    voice: str = "alloy",
    speed: float = 1.0
):
    """Synthesize ``text`` with a chosen voice and speed and save it to a file.

    Uses the SDK's streaming-response context manager so the audio is written
    to disk as it arrives and the HTTP response is closed on exit, instead of
    the deprecated ``stream_to_file`` on a non-streaming response.

    Args:
        text: The text to speak.
        output_path: Where to write the audio file.
        voice: Name of the voice to use.
        speed: Playback speed multiplier (0.25 - 4.0).
    """
    with client.audio.speech.with_streaming_response.create(
        model="tts-1",
        voice=voice,
        input=text,
        speed=speed,  # 0.25 - 4.0
    ) as response:
        response.stream_to_file(output_path)
    print(f"音频已保存到: {output_path}")
# Usage example
if __name__ == "__main__":
    # Basic TTS
    text_to_speech("你好,世界!", "hello.mp3")

    # TTS with a different voice and a 20% faster speed
    text_to_speech_with_options(
        text="这是一段测试语音。",
        output_path="test.mp3",
        voice="nova",
        speed=1.2,
    )
3.2 通义 Sambert
# ===== 通义 Sambert 文字转语音 =====
from dashscope import audio
def text_to_speech_sambert(text: str, output_path: str = "output.mp3"):
    """Synthesize ``text`` with Sambert and save the audio to ``output_path``.

    Args:
        text: The text to speak.
        output_path: Where to write the audio file. When its extension is
            ``mp3``, ``wav`` or ``pcm`` the same format is requested from the
            service so the file content matches its name.

    Raises:
        RuntimeError: If the service returned no audio data.
    """
    import os  # local import keeps this snippet self-contained

    options = {}
    suffix = os.path.splitext(output_path)[1].lower().lstrip(".")
    if suffix in ("mp3", "wav", "pcm"):
        options["format"] = suffix

    result = audio.tts.SpeechSynthesizer.call(
        model="sambert-zhichu-v1",  # other voices are available as other models
        text=text,
        **options,
    )

    # The synthesis result exposes the audio through get_audio_data(); it is
    # None when synthesis failed.
    audio_bytes = result.get_audio_data()
    if audio_bytes is None:
        raise RuntimeError(
            f"Sambert synthesis returned no audio: {result.get_response()}"
        )

    with open(output_path, "wb") as f:
        f.write(audio_bytes)
    print(f"音频已保存到: {output_path}")
# Usage example
if __name__ == "__main__":
    text_to_speech_sambert("你好,世界!", "hello.mp3")
四、视频理解 API
4.1 OpenAI GPT-4V(视频帧分析)
# ===== 视频理解(帧分析)=====
from openai import OpenAI
import cv2
import base64
import tempfile
client = OpenAI()
def extract_video_frames(video_path: str, num_frames: int = 10) -> list:
    """Sample evenly spaced frames from a video and save them as JPEG files.

    Args:
        video_path: Path of the video file.
        num_frames: How many frames to sample. Non-positive values yield an
            empty list.

    Returns:
        Paths of the temporary ``.jpg`` files that were written, in frame
        order. The list can be shorter than ``num_frames`` when frames cannot
        be read or the video has fewer frames than requested. The caller is
        responsible for deleting the files.
    """
    import os  # local import keeps this snippet self-contained

    if num_frames <= 0:
        return []

    video = cv2.VideoCapture(video_path)
    frames = []
    try:
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        # dict.fromkeys drops duplicate indices (short videos) but keeps order.
        frame_indices = dict.fromkeys(
            int(i * total_frames / num_frames) for i in range(num_frames)
        )
        for idx in frame_indices:
            video.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = video.read()
            if not ret:
                continue
            # Reserve a unique name and close the handle right away so no
            # file descriptor is leaked and OpenCV can open the path itself.
            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
                frame_path = temp_file.name
            if cv2.imwrite(frame_path, frame):
                frames.append(frame_path)
            else:
                os.remove(frame_path)  # do not leave an empty file behind
    finally:
        video.release()
    return frames
def analyze_video(video_path: str, question: str = "描述这个视频"):
    """Answer a question about a video by sending sampled frames to the model.

    Ten frames are extracted with ``extract_video_frames``, sent as inline
    JPEG images together with ``question``, and the temporary frame files are
    deleted afterwards whether or not the request succeeds.

    Args:
        video_path: Path of the video file.
        question: Prompt text sent alongside the frames.

    Returns:
        The message content of the first completion choice.

    Raises:
        ValueError: If no frame could be extracted from the video.
    """
    import os  # local import keeps this snippet self-contained

    frames = extract_video_frames(video_path, num_frames=10)
    try:
        if not frames:
            # Without frames the model would answer from the prompt alone.
            raise ValueError(f"No frames could be extracted from: {video_path}")

        content = [{"type": "text", "text": question}]
        for frame_path in frames:
            with open(frame_path, "rb") as f:
                frame_data = base64.b64encode(f.read()).decode('utf-8')
            content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{frame_data}"
                },
            })

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": content}],
            max_tokens=2048,
        )
        return response.choices[0].message.content
    finally:
        # The frame files were created with delete=False; remove them here.
        for frame_path in frames:
            try:
                os.remove(frame_path)
            except OSError:
                pass  # best-effort cleanup; the file may already be gone
# Usage example
if __name__ == "__main__":
    analysis = analyze_video("video.mp4", "这个视频在做什么?")
    print(f"分析: {analysis}")
4.2 Google Gemini(原生视频理解)
# ===== Google Gemini 视频理解 =====
import google.generativeai as genai
from google.generativeai import GenerativeModel
import os
# 配置 API key
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
def analyze_video_gemini(video_path: str, question: str):
    """Upload a video to Gemini and ask a question about it.

    Args:
        video_path: Path of the video file to upload.
        question: Prompt text sent together with the uploaded video.

    Returns:
        The text of the model's response.

    Raises:
        ValueError: If the service reports that processing the video failed.
    """
    import time  # local import keeps this snippet self-contained

    model = GenerativeModel("gemini-1.5-pro")
    video = genai.upload_file(video_path)

    # Uploaded videos are processed asynchronously; the file can only be used
    # in a prompt once it has left the PROCESSING state.
    while video.state.name == "PROCESSING":
        time.sleep(5)
        video = genai.get_file(video.name)
    if video.state.name == "FAILED":
        raise ValueError(f"Gemini failed to process video: {video_path}")

    response = model.generate_content([video, question])
    return response.text


def video_summarization(video_path: str):
    """Return a summary of the video's main content."""
    return analyze_video_gemini(video_path, "总结这个视频的主要内容")


def extract_video_highlights(video_path: str):
    """Return the key highlights of the video."""
    return analyze_video_gemini(video_path, "这个视频的关键亮点是什么?")
# Usage example
if __name__ == "__main__":
    summary = video_summarization("video.mp4")
    print(f"摘要: {summary}")
五、文档理解 API
5.1 OpenAI PDF 理解
# ===== PDF 理解 =====
from openai import OpenAI
import base64
client = OpenAI()
def encode_pdf_to_base64(pdf_path: str) -> str:
    """Return the contents of the file at ``pdf_path`` as base64 text."""
    with open(pdf_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
def analyze_pdf(pdf_path: str, question: str):
    """Ask the model a question about a local PDF file.

    The PDF is attached as a ``file`` content part (base64 data URL plus file
    name). ``image_url`` parts are for images only, so a PDF sent that way is
    not accepted as a document.

    Args:
        pdf_path: Path of the PDF file on disk.
        question: Prompt text sent alongside the document.

    Returns:
        The message content of the first completion choice.
    """
    import os  # local import keeps this snippet self-contained

    base64_pdf = encode_pdf_to_base64(pdf_path)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {
                        "type": "file",
                        "file": {
                            "filename": os.path.basename(pdf_path),
                            "file_data": f"data:application/pdf;base64,{base64_pdf}",
                        },
                    },
                ],
            }
        ],
        max_tokens=2048,
    )
    return response.choices[0].message.content


def extract_pdf_tables(pdf_path: str):
    """Ask the model to extract every table in the PDF, keeping its structure."""
    return analyze_pdf(pdf_path, "提取文档中的所有表格,保持表格结构")


def pdf_question_answering(pdf_path: str, question: str):
    """Answer ``question`` using the PDF's content as the source."""
    return analyze_pdf(pdf_path, f"根据文档内容回答问题:{question}")
# Usage example
if __name__ == "__main__":
    # Summarize a PDF
    analysis = analyze_pdf("document.pdf", "总结这份文档")
    print(f"分析: {analysis}")

    # Pull the tables out of a report
    tables = extract_pdf_tables("report.pdf")
    print(f"表格: {tables}")
5.2 通义千问 文档理解
# ===== 通义千问 文档理解 =====
from dashscope import MultiModalConversation
def analyze_pdf_qwen(pdf_path: str, question: str):
    """Ask Qwen a question about a PDF through ``MultiModalConversation``.

    Args:
        pdf_path: Path of the PDF document.
        question: Prompt text sent alongside the document.

    Returns:
        The concatenated text parts of the model's reply.

    Raises:
        RuntimeError: If DashScope reports a non-200 status.
    """
    response = MultiModalConversation.call(
        model="qwen-vl-max",
        messages=[
            {
                "role": "user",
                "content": [
                    # NOTE(review): the "pdf" content key is kept from the
                    # original; confirm that qwen-vl-max accepts PDF input in
                    # this form before relying on it.
                    {"pdf": pdf_path},
                    {"text": question},
                ],
            }
        ],
    )
    if response.status_code != 200:
        raise RuntimeError(
            f"DashScope request failed ({response.code}): {response.message}"
        )

    # Multimodal replies arrive as a list of content parts, not as output.text.
    parts = response.output.choices[0].message.content
    return "".join(part.get("text", "") for part in parts)


def extract_text_from_pdf_qwen(pdf_path: str):
    """Ask Qwen to extract all text from the PDF."""
    return analyze_pdf_qwen(pdf_path, "提取文档中的所有文字")


def analyze_pdf_images_qwen(pdf_path: str):
    """Ask Qwen to describe every image contained in the PDF."""
    return analyze_pdf_qwen(pdf_path, "描述文档中的所有图片")
# Usage example
if __name__ == "__main__":
    analysis = analyze_pdf_qwen("document.pdf", "总结这份文档")
    print(f"分析: {analysis}")
六、平台对比
6.1 图片理解对比
# ===== 图片理解对比 =====
# Side-by-side facts about each platform's image-understanding offering,
# keyed by platform name. Values are the figures quoted by the article.
image_understanding_comparison = {
    "OpenAI GPT-4V": {
        "模型": "gpt-4o / gpt-4-turbo",
        "最大图片数": "多张",
        "支持格式": "JPEG/PNG/GIF/WebP",
        "最大大小": "20MB",
        "特性": ["OCR", "目标检测", "图像描述", "视觉推理"],
        "价格": "$0.005/图片",
    },
    "Claude 3": {
        "模型": "claude-3-5-sonnet",
        "最大图片数": "20张",
        "支持格式": "JPEG/PNG/GIF/WebP",
        "最大大小": "5MB/张",
        "特性": ["OCR", "图像描述", "图表理解"],
        "价格": "$0.003/图片",
    },
    "通义千问": {
        "模型": "qwen-vl-max",
        "最大图片数": "多张",
        "支持格式": "JPEG/PNG/BMP",
        "最大大小": "10MB",
        "特性": ["OCR", "图像描述", "中文场景优化"],
        "价格": "¥0.02/图片",
    },
}
6.2 语音识别对比
# ===== 语音识别对比 =====
# Side-by-side facts about each speech-recognition service, keyed by name.
# "准确率" is a rating on a 1-5 scale, 5 being the best.
asr_comparison = {
    "OpenAI Whisper": {
        "模型": "whisper-1",
        "支持语言": "99+ 种",
        "特性": ["多语言", "翻译", "时间戳"],
        "准确率": 5,
        "价格": "$0.006/分钟",
    },
    "通义 Paraformer": {
        "模型": "paraformer-realtime-v1",
        "支持语言": "中文/英文",
        "特性": ["实时", "标点", "说话人识别"],
        "准确率": 5,
        "价格": "¥0.004/分钟",
    },
    "Google Speech-to-Text": {
        "模型": "latest-long",
        "支持语言": "125+ 种",
        "特性": ["实时", "多语言", "自动标点"],
        "准确率": 5,
        "价格": "$0.024/分钟",
    },
}
6.3 文字转语音对比
# ===== 文字转语音对比 =====
# Side-by-side facts about each text-to-speech service, keyed by name.
# "音质" is a rating on a 1-5 scale, 5 being the best.
# Open-ended voice counts are strings ("20+", "400+"): a bare `20+` is not a
# valid Python expression and made the whole literal a syntax error.
tts_comparison = {
    "OpenAI TTS": {
        "模型": "tts-1 / tts-1-hd",
        "音色数": 6,
        "支持语言": "15+ 种",
        "特性": ["多种音色", "语速调节"],
        "音质": 4,
        "价格": "$15.00/1M characters",
    },
    "通义 Sambert": {
        "模型": "sambert-zhichu-v1",
        "音色数": "20+",
        "支持语言": "中文/英文",
        "特性": ["多种音色", "情感调节"],
        "音质": 4,
        "价格": "¥20.00/1M characters",
    },
    "Azure TTS": {
        "模型": "neural",
        "音色数": "400+",
        "支持语言": "140+ 种",
        "特性": ["神经网络", "多种音色", "SSML 支持"],
        "音质": 5,
        "价格": "$16.00/1M characters",
    },
}
七、生产案例
7.1 案例:智能文档分析系统
# ===== 案例:智能文档分析系统 =====
from openai import OpenAI
import base64
import os
from typing import List, Dict
class DocumentAnalyzer:
    """Analyze PDF, image and plain-text files with ``gpt-4o``.

    Each file is routed by extension to a matching prompt, the model is asked
    to answer in JSON mode, and the reply is parsed into a dict.
    """

    # Extensions routed to the image prompt.
    _IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".gif")

    _PDF_PROMPT = (
        "分析这份 PDF 文档,提取以下信息:\n"
        "1. 文档摘要\n"
        "2. 关键要点\n"
        "3. 重要数据\n"
        "4. 结论\n"
        "以 JSON 格式返回。"
    )
    _IMAGE_PROMPT = (
        "分析这张图片,提取以下信息:\n"
        "1. 图片描述\n"
        "2. 关键对象\n"
        "3. 文字内容(如果有)\n"
        "4. 场景分析\n"
        "以 JSON 格式返回。"
    )
    # The text to analyze is appended directly after this header.
    _TEXT_PROMPT_HEADER = (
        "分析以下文本,提取以下信息:\n"
        "1. 文本摘要\n"
        "2. 关键要点\n"
        "3. 重要数据\n"
        "4. 结论\n"
        "以 JSON 格式返回。\n"
        "文本:\n"
    )

    def __init__(self, api_key: str):
        """Create the OpenAI client used for every request."""
        self.client = OpenAI(api_key=api_key)

    def analyze_document(self, file_path: str) -> Dict:
        """Analyze one file, choosing the handler by its extension.

        Supported extensions: ``.pdf``, ``.jpg``/``.jpeg``/``.png``/``.gif``
        and ``.txt`` (matched case-insensitively).

        Raises:
            ValueError: If the extension is not supported.
        """
        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext == ".pdf":
            return self._analyze_pdf(file_path)
        if file_ext in self._IMAGE_EXTENSIONS:
            return self._analyze_image(file_path)
        if file_ext == ".txt":
            return self._analyze_text(file_path)
        raise ValueError(f"不支持的格式: {file_ext}")

    def _request_json(self, content, max_tokens: int) -> Dict:
        """Send one user message in JSON mode and return the parsed reply."""
        import json

        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": content}],
            response_format={"type": "json_object"},
            max_tokens=max_tokens,
        )
        return json.loads(response.choices[0].message.content)

    def _analyze_pdf(self, pdf_path: str) -> Dict:
        """Analyze a PDF: summary, key points, important data, conclusion."""
        base64_pdf = self._encode_file_to_base64(pdf_path)
        content = [
            {"type": "text", "text": self._PDF_PROMPT},
            {
                # PDFs go in a `file` part; `image_url` parts are images only.
                "type": "file",
                "file": {
                    "filename": os.path.basename(pdf_path),
                    "file_data": f"data:application/pdf;base64,{base64_pdf}",
                },
            },
        ]
        return self._request_json(content, max_tokens=2048)

    def _analyze_image(self, image_path: str) -> Dict:
        """Analyze an image: description, key objects, text, scene."""
        import mimetypes

        # Label the data URL with the file's real type; this class accepts
        # png and gif as well, so a fixed image/jpeg label would be wrong.
        mime_type = mimetypes.guess_type(image_path)[0]
        if not (mime_type and mime_type.startswith("image/")):
            mime_type = "image/jpeg"

        base64_image = self._encode_file_to_base64(image_path)
        content = [
            {"type": "text", "text": self._IMAGE_PROMPT},
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{mime_type};base64,{base64_image}"
                },
            },
        ]
        return self._request_json(content, max_tokens=1024)

    def _analyze_text(self, text_path: str) -> Dict:
        """Analyze a UTF-8 text file: summary, key points, data, conclusion."""
        with open(text_path, "r", encoding="utf-8") as f:
            text = f.read()
        return self._request_json(self._TEXT_PROMPT_HEADER + text, max_tokens=1024)

    def _encode_file_to_base64(self, file_path: str) -> str:
        """Return the file's bytes as base64 text."""
        with open(file_path, "rb") as f:
            return base64.b64encode(f.read()).decode('utf-8')

    def batch_analyze(self, file_paths: List[str]) -> List[Dict]:
        """Analyze several files, recording failures instead of raising.

        Returns:
            One dict per input path, in order. Successful entries carry
            ``file``, ``analysis`` and ``success=True``; failed entries carry
            ``file``, ``error`` (the exception text) and ``success=False``.
        """
        results = []
        for file_path in file_paths:
            # Deliberately broad: one bad file must not abort the batch.
            try:
                result = self.analyze_document(file_path)
            except Exception as e:
                results.append({
                    "file": file_path,
                    "error": str(e),
                    "success": False,
                })
            else:
                results.append({
                    "file": file_path,
                    "analysis": result,
                    "success": True,
                })
        return results
# Usage example
if __name__ == "__main__":
    analyzer = DocumentAnalyzer(api_key="your-api-key")

    # Analyze a single document
    result = analyzer.analyze_document("document.pdf")
    print(f"分析结果: {result}")

    # Analyze a batch of documents
    batch_files = ["doc1.pdf", "doc2.jpg", "doc3.txt"]
    results = analyzer.batch_analyze(batch_files)
    print(f"批量结果: {results}")
7.2 案例:多模态客服系统
# ===== 案例:多模态客服系统 =====
from openai import OpenAI
import base64
class MultimodalCustomerService:
    """Customer-service chat that accepts text, image and voice messages.

    The running conversation is kept in ``conversation_history`` as a list of
    ``{"role", "content"}`` messages and is replayed, after the system prompt,
    on every request.
    """

    _SYSTEM_PROMPT = "你是一个智能客服助手,可以处理文本、图片、语音等多种输入。"

    def __init__(self, api_key: str):
        """Create the OpenAI client and start with an empty conversation."""
        self.client = OpenAI(api_key=api_key)
        self.conversation_history = []

    def _complete(self, pending_message: dict) -> str:
        """Send system prompt + history + ``pending_message``; return the reply text."""
        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": self._SYSTEM_PROMPT},
                *self.conversation_history,
                pending_message,
            ],
            max_tokens=1024,
        )
        return response.choices[0].message.content

    def _remember(self, user_content: str, assistant_reply: str) -> None:
        """Append one completed user/assistant exchange to the history."""
        self.conversation_history.append({"role": "user", "content": user_content})
        self.conversation_history.append(
            {"role": "assistant", "content": assistant_reply}
        )

    def handle_text(self, text: str) -> str:
        """Answer a text message and record the exchange.

        The history is only updated after the request succeeds, so a failed
        call does not leave an unanswered user turn behind.
        """
        assistant_reply = self._complete({"role": "user", "content": text})
        self._remember(text, assistant_reply)
        return assistant_reply

    def handle_image(self, image_path: str, question: str = "分析这张图片") -> str:
        """Answer a question about an image and record the exchange.

        The image itself is not stored in the history; a ``[图片] <question>``
        text placeholder is stored instead.
        """
        import mimetypes

        # Label the data URL with the file's real type, defaulting to JPEG.
        mime_type = mimetypes.guess_type(image_path)[0]
        if not (mime_type and mime_type.startswith("image/")):
            mime_type = "image/jpeg"

        base64_image = self._encode_image_to_base64(image_path)
        assistant_reply = self._complete({
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:{mime_type};base64,{base64_image}"
                    },
                },
            ],
        })
        self._remember(f"[图片] {question}", assistant_reply)
        return assistant_reply

    def handle_audio(self, audio_path: str) -> str:
        """Transcribe a voice message with Whisper, then answer it as text."""
        with open(audio_path, "rb") as audio_file:
            transcript = self.client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
            )
        text = transcript.text
        print(f"语音转文字: {text}")
        return self.handle_text(text)

    def _encode_image_to_base64(self, image_path: str) -> str:
        """Return the file's bytes as base64 text."""
        with open(image_path, "rb") as f:
            return base64.b64encode(f.read()).decode('utf-8')

    def reset_conversation(self):
        """Forget the conversation so far."""
        self.conversation_history = []
# Usage example
if __name__ == "__main__":
    service = MultimodalCustomerService(api_key="your-api-key")

    # Text conversation
    reply1 = service.handle_text("你们的营业时间是什么?")
    print(f"回复1: {reply1}")

    # Image analysis
    reply2 = service.handle_image("product.jpg", "这个产品有什么特点?")
    print(f"回复2: {reply2}")

    # Voice message
    reply3 = service.handle_audio("voice_message.mp3")
    print(f"回复3: {reply3}")
八、总结
8.1 核心要点
8.2 选型建议
| 需求 | 推荐方案 | 理由 |
|---|---|---|
| 图片理解 | GPT-4V / Claude 3 | 能力强 |
| OCR | GPT-4V / 通义千问 | 中文优化 |
| 语音识别 | Whisper / Paraformer | 多语言/中文 |
| 文字转语音 | OpenAI TTS / Sambert | 音质好 |
| 视频理解 | Gemini 1.5 Pro | 原生支持 |
| 文档理解 | GPT-4V / 通义千问 | PDF 支持好 |
8.3 最佳实践
| 实践 | 说明 |
|---|---|
| 图片压缩 | 减小文件大小,降低成本 |
| 批量处理 | 使用 Batch API 降低成本 |
| 缓存结果 | 避免重复调用 |
| 错误处理 | 重试机制 + 降级方案 |
| 成本监控 | Token 统计 + 成本追踪 |
本文基于各平台官方文档编写。如有问题欢迎评论区讨论!
AtomGit 是由开放原子开源基金会联合 CSDN 等生态伙伴共同推出的新一代开源与人工智能协作平台。平台坚持“开放、中立、公益”的理念,把代码托管、模型共享、数据集托管、智能体开发体验和算力服务整合在一起,为开发者提供从开发、训练到部署的一站式体验。
更多推荐

所有评论(0)