前言

💡 痛点:只懂文本 API?图片理解、语音识别、视频分析怎么做?多模态 API 怎么选?

🎯 解决方案:掌握多模态 AI API——从图片理解,到语音处理,再到视频分析与文档理解。

多模态 AI API 全景:

多模态 AI API

图片理解

语音处理

视频理解

文档理解

OCR

目标检测

图像描述

语音识别 ASR

文字转语音 TTS

说话人识别

视频摘要

动作识别

场景理解

PDF 理解

表格提取

文档问答

主流平台多模态能力对比:

平台 图片理解 语音识别 TTS 视频理解 文档理解
OpenAI ✅ GPT-4V ✅ Whisper ✅ TTS
Claude ✅ Claude 3
通义千问 ✅ qwen-vl ✅ Paraformer ✅ Sambert
文心 ✅ ERNIE-ViL
Google ✅ Gemini

一、图片理解 API

1.1 OpenAI GPT-4V

# ===== OpenAI GPT-4V 图片理解 =====

from openai import OpenAI
import base64
from PIL import Image
import io

client = OpenAI()

def encode_image_to_base64(image_path: str) -> str:
    """Return the contents of ``image_path`` as a base64 text string.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        The file's bytes, base64-encoded and decoded to ``str`` as UTF-8.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

def describe_image(image_path: str):
    """Describe a local image with the ``gpt-4o`` chat model.

    The file is base64-encoded with ``encode_image_to_base64`` and sent inline
    as a data URL next to a fixed Chinese prompt.

    Args:
        image_path: Path to the image file on disk.

    Returns:
        The message content of the first completion choice.
    """
    import mimetypes  # function-scope import: only this helper needs it

    # Label the data URL with the file's real type (PNG, GIF, WebP, ...)
    # instead of always claiming JPEG. Unknown or non-image extensions keep
    # the previous "image/jpeg" label so existing callers are unaffected.
    guessed_type = mimetypes.guess_type(image_path)[0] or ""
    mime_type = guessed_type if guessed_type.startswith("image/") else "image/jpeg"
    base64_image = encode_image_to_base64(image_path)

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "描述这张图片"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}"
                        }
                    }
                ]
            }
        ],
        max_tokens=1024
    )

    return response.choices[0].message.content

def ocr_image(image_path: str):
    """Extract the text visible in a local image using the ``gpt-4o`` model.

    The file is base64-encoded with ``encode_image_to_base64`` and sent inline
    as a data URL together with a fixed Chinese "extract all text" prompt.

    Args:
        image_path: Path to the image file on disk.

    Returns:
        The message content of the first completion choice.
    """
    import mimetypes  # function-scope import: only this helper needs it

    # Scans and screenshots are frequently PNG; advertise the real type in
    # the data URL and fall back to the previous JPEG label when the
    # extension is unknown or not an image type.
    guessed_type = mimetypes.guess_type(image_path)[0] or ""
    mime_type = guessed_type if guessed_type.startswith("image/") else "image/jpeg"
    base64_image = encode_image_to_base64(image_path)

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "提取图片中的所有文字"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}"
                        }
                    }
                ]
            }
        ],
        max_tokens=1024
    )

    return response.choices[0].message.content

def analyze_multiple_images(image_paths: list):
    """Ask ``gpt-4o`` to compare several local images in a single request.

    Each file is base64-encoded with ``encode_image_to_base64`` and attached
    as its own ``image_url`` part after a fixed Chinese comparison prompt.

    Args:
        image_paths: Paths of the images to compare.

    Returns:
        The message content of the first completion choice.

    Raises:
        ValueError: If ``image_paths`` is empty; a comparison prompt with no
            images attached would only produce a meaningless answer.
    """
    import mimetypes  # function-scope import: only this helper needs it

    if not image_paths:
        raise ValueError("image_paths must contain at least one image")

    def to_image_part(path):
        # Use the file's real MIME type; unknown or non-image extensions keep
        # the previous "image/jpeg" label.
        guessed_type = mimetypes.guess_type(path)[0] or ""
        mime_type = guessed_type if guessed_type.startswith("image/") else "image/jpeg"
        return {
            "type": "image_url",
            "image_url": {
                "url": f"data:{mime_type};base64,{encode_image_to_base64(path)}"
            }
        }

    content = [{"type": "text", "text": "比较这些图片的异同"}]
    content.extend(to_image_part(path) for path in image_paths)

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": content}],
        max_tokens=1024
    )

    return response.choices[0].message.content

# Usage example
if __name__ == "__main__":
    # Describe a photo, then run OCR on a scanned document.
    for label, task, sample_path in (
        ("描述", describe_image, "image.jpg"),
        ("文字", ocr_image, "document.jpg"),
    ):
        print(f"{label}: {task(sample_path)}")

1.2 Claude 3 Vision

# ===== Claude 3 图片理解 =====

from anthropic import Anthropic
import base64

client = Anthropic()

def describe_image_claude(image_path: str):
    """Describe a local image with Claude.

    The file is read, base64-encoded and sent as an ``image`` content block
    next to a fixed Chinese prompt.

    Args:
        image_path: Path to the image file on disk.

    Returns:
        The text of the first content block in the reply.
    """
    import mimetypes  # function-scope import: only this helper needs it

    # The declared media type has to match the actual image bytes, so derive
    # it from the file name instead of always claiming JPEG. Anything outside
    # the image types listed here keeps the previous "image/jpeg" label.
    media_type = mimetypes.guess_type(image_path)[0]
    if media_type not in ("image/jpeg", "image/png", "image/gif", "image/webp"):
        media_type = "image/jpeg"

    with open(image_path, "rb") as f:
        image_data = base64.b64encode(f.read()).decode('utf-8')

    message = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "描述这张图片"},
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": image_data
                        }
                    }
                ]
            }
        ]
    )

    return message.content[0].text

def analyze_image_with_context(image_path: str, question: str):
    """Ask Claude a caller-supplied question about a local image.

    Args:
        image_path: Path to the image file on disk.
        question: Prompt text sent alongside the image.

    Returns:
        The text of the first content block in the reply.
    """
    import mimetypes  # function-scope import: only this helper needs it

    # The declared media type has to match the actual image bytes, so derive
    # it from the file name instead of always claiming JPEG. Anything outside
    # the image types listed here keeps the previous "image/jpeg" label.
    media_type = mimetypes.guess_type(image_path)[0]
    if media_type not in ("image/jpeg", "image/png", "image/gif", "image/webp"):
        media_type = "image/jpeg"

    with open(image_path, "rb") as f:
        image_data = base64.b64encode(f.read()).decode('utf-8')

    message = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": image_data
                        }
                    }
                ]
            }
        ]
    )

    return message.content[0].text

# Usage example
if __name__ == "__main__":
    claude_description = describe_image_claude(image_path="image.jpg")
    print(f"描述: {claude_description}")

1.3 通义千问 Vision

# ===== 通义千问 图片理解 =====

from dashscope import MultiModalConversation

def describe_image_qwen(image_path: str):
    """Describe an image with the ``qwen-vl-max`` model.

    Args:
        image_path: Image reference passed through unchanged in the ``image``
            content part.
            NOTE(review): DashScope examples use an http(s) URL or a
            ``file://`` path here - confirm that bare local paths are accepted.

    Returns:
        The reply text, concatenated from the text parts of the first choice.

    Raises:
        RuntimeError: If the service reports a non-200 status.
    """
    response = MultiModalConversation.call(
        model="qwen-vl-max",
        messages=[
            {
                "role": "user",
                "content": [
                    {"image": image_path},
                    {"text": "描述这张图片"}
                ]
            }
        ]
    )

    # Fail loudly instead of dereferencing an empty output on API errors.
    if response.status_code != 200:
        raise RuntimeError(
            f"DashScope request failed: {response.code} {response.message}"
        )

    # Multimodal replies carry their text in choices[0].message.content (a
    # list of {"text": ...} parts); output.text is not populated for them.
    content = response.output.choices[0].message.content
    if isinstance(content, str):
        return content
    return "".join(part.get("text", "") for part in content)

def ocr_image_qwen(image_path: str):
    """Extract the text visible in an image with the ``qwen-vl-max`` model.

    Args:
        image_path: Image reference passed through unchanged in the ``image``
            content part.
            NOTE(review): DashScope examples use an http(s) URL or a
            ``file://`` path here - confirm that bare local paths are accepted.

    Returns:
        The reply text, concatenated from the text parts of the first choice.

    Raises:
        RuntimeError: If the service reports a non-200 status.
    """
    response = MultiModalConversation.call(
        model="qwen-vl-max",
        messages=[
            {
                "role": "user",
                "content": [
                    {"image": image_path},
                    {"text": "提取图片中的所有文字"}
                ]
            }
        ]
    )

    # Fail loudly instead of dereferencing an empty output on API errors.
    if response.status_code != 200:
        raise RuntimeError(
            f"DashScope request failed: {response.code} {response.message}"
        )

    # Multimodal replies carry their text in choices[0].message.content (a
    # list of {"text": ...} parts); output.text is not populated for them.
    content = response.output.choices[0].message.content
    if isinstance(content, str):
        return content
    return "".join(part.get("text", "") for part in content)

# Usage example
if __name__ == "__main__":
    qwen_description = describe_image_qwen(image_path="image.jpg")
    print(f"描述: {qwen_description}")

二、语音识别 API(ASR)

2.1 OpenAI Whisper

# ===== OpenAI Whisper 语音识别 =====

from openai import OpenAI

client = OpenAI()

def transcribe_audio(audio_path: str):
    """Transcribe a local audio file with ``whisper-1``, hinting Chinese.

    Args:
        audio_path: Path of the audio file, opened in binary mode.

    Returns:
        The ``text`` attribute of the transcription result.
    """
    # "zh" tells the model the spoken language is Chinese.
    request_options = {"model": "whisper-1", "language": "zh"}
    with open(audio_path, "rb") as stream:
        result = client.audio.transcriptions.create(file=stream, **request_options)
    return result.text

def translate_audio(audio_path: str):
    """Send a local audio file to the ``whisper-1`` translations endpoint.

    Args:
        audio_path: Path of the audio file, opened in binary mode.

    Returns:
        The ``text`` attribute of the translation result.
    """
    with open(audio_path, "rb") as stream:
        result = client.audio.translations.create(file=stream, model="whisper-1")
    translated_text = result.text
    return translated_text

def transcribe_with_timestamps(audio_path: str):
    """Transcribe a local audio file and print each segment with timestamps.

    Requests the ``verbose_json`` response format, prints one line per
    segment as ``[start - end]: text`` and returns the full transcript.

    Args:
        audio_path: Path of the audio file, opened in binary mode.

    Returns:
        The ``text`` attribute of the transcription result.
    """
    with open(audio_path, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            response_format="verbose_json"  # includes per-segment timing
        )

    def field(segment, name):
        # Segments are typed objects in current SDK releases but were plain
        # dicts in older ones; support both instead of assuming subscripting.
        if isinstance(segment, dict):
            return segment[name]
        return getattr(segment, name)

    # `segments` can be missing or None (e.g. no speech); treat that as empty.
    for segment in getattr(transcript, "segments", None) or []:
        start = field(segment, "start")
        end = field(segment, "end")
        text = field(segment, "text")
        print(f"[{start:.2f}s - {end:.2f}s]: {text}")

    return transcript.text

# Usage example
if __name__ == "__main__":
    # Speech to text
    transcript_text = transcribe_audio("audio.mp3")
    print(f"转录: {transcript_text}")

    # Speech to English text
    english_text = translate_audio("chinese_audio.mp3")
    print(f"翻译: {english_text}")

2.2 通义 Paraformer

# ===== 通义 Paraformer 语音识别 =====

from dashscope import audio

def transcribe_audio_paraformer(audio_url: str):
    """Transcribe a remote audio file with the Paraformer file-transcription API.

    Args:
        audio_url: Publicly reachable URL of the audio file.

    Returns:
        All recognised sentence texts joined with single spaces.

    Raises:
        RuntimeError: If the transcription task does not finish with status 200.
    """
    import json
    from urllib.request import urlopen

    # NOTE(review): the model was changed from "paraformer-realtime-v1" to
    # "paraformer-v2". The realtime models belong to the streaming
    # Recognition API; Transcription.call takes the recorded-file models.
    # Confirm the exact model name against your DashScope account.
    result = audio.asr.Transcription.call(
        model="paraformer-v2",
        file_urls=[audio_url]
    )

    if result.status_code != 200:
        raise RuntimeError(
            f"Paraformer transcription failed: {result.code} {result.message}"
        )

    # Each result entry only points at a JSON document (transcription_url);
    # the sentences themselves have to be downloaded from there.
    transcripts = []
    for res in result.output["results"]:
        if res.get("subtask_status") != "SUCCEEDED":
            continue
        with urlopen(res["transcription_url"]) as reply:
            payload = json.loads(reply.read().decode("utf-8"))
        for channel in payload.get("transcripts", []):
            for sentence in channel.get("sentences", []):
                transcripts.append(sentence["text"])

    return " ".join(transcripts)

def transcribe_with_punctuation(audio_url: str):
    """Run a Paraformer transcription with the ``punctuation`` flag enabled.

    Args:
        audio_url: URL of the audio file, passed as the only ``file_urls`` entry.

    Returns:
        The first element of ``transcripts`` on the first result entry.

    NOTE(review): ``punctuation=True`` is forwarded as-is, and the return
    value assumes ``results[0]`` exposes a ``transcripts`` attribute; confirm
    both against the DashScope SDK.
    """
    request = {
        "model": "paraformer-realtime-v1",
        "file_urls": [audio_url],
        "punctuation": True,  # ask the service to add punctuation
    }
    outcome = audio.asr.Transcription.call(**request)
    first_result = outcome.output.results[0]
    return first_result.transcripts[0]

# Usage example
if __name__ == "__main__":
    sample_url = "https://example.com/audio.mp3"
    print(f"转录: {transcribe_audio_paraformer(sample_url)}")

三、文字转语音 API(TTS)

3.1 OpenAI TTS

# ===== OpenAI TTS 文字转语音 =====

from openai import OpenAI

client = OpenAI()

def text_to_speech(text: str, output_path: str = "output.mp3"):
    """Synthesise ``text`` with the ``tts-1`` model and save it to a file.

    Uses the fixed voice ``alloy`` and prints the destination path once the
    audio has been written.

    Args:
        text: Text to speak.
        output_path: Destination file for the generated audio.
    """
    # Use the streaming-response helper: calling stream_to_file() on the
    # plain (non-streaming) response is deprecated in the OpenAI SDK because
    # it buffers the whole body instead of streaming it to disk.
    with client.audio.speech.with_streaming_response.create(
        model="tts-1",
        voice="alloy",  # alloy/echo/fable/onyx/nova/shimmer
        input=text
    ) as response:
        response.stream_to_file(output_path)

    print(f"音频已保存到: {output_path}")

def text_to_speech_with_options(
    text: str,
    output_path: str = "output.mp3",
    voice: str = "alloy",
    speed: float = 1.0
):
    """Synthesise ``text`` with ``tts-1`` using a chosen voice and speed.

    Prints the destination path once the audio has been written.

    Args:
        text: Text to speak.
        output_path: Destination file for the generated audio.
        voice: Voice name forwarded to the API.
        speed: Playback speed multiplier forwarded to the API (0.25 - 4.0).
    """
    # Use the streaming-response helper: calling stream_to_file() on the
    # plain (non-streaming) response is deprecated in the OpenAI SDK because
    # it buffers the whole body instead of streaming it to disk.
    with client.audio.speech.with_streaming_response.create(
        model="tts-1",
        voice=voice,
        input=text,
        speed=speed  # 0.25 - 4.0
    ) as response:
        response.stream_to_file(output_path)

    print(f"音频已保存到: {output_path}")

# Usage example
if __name__ == "__main__":
    # Default voice and speed
    text_to_speech(text="你好,世界!", output_path="hello.mp3")

    # Custom voice ("nova") at 1.2x speed
    custom_options = {
        "text": "这是一段测试语音。",
        "output_path": "test.mp3",
        "voice": "nova",
        "speed": 1.2,
    }
    text_to_speech_with_options(**custom_options)

3.2 通义 Sambert

# ===== 通义 Sambert 文字转语音 =====

from dashscope import audio

def text_to_speech_sambert(text: str, output_path: str = "output.mp3"):
    """Synthesise ``text`` with the Sambert ``sambert-zhichu-v1`` voice.

    The audio container is chosen from the extension of ``output_path`` so
    the bytes on disk match the file name. Prints the destination path once
    the audio has been written.

    Args:
        text: Text to speak.
        output_path: Destination file; ``.mp3``, ``.wav`` and ``.pcm`` select
            that format, anything else falls back to ``wav``.

    Raises:
        RuntimeError: If the service returns no audio data.
    """
    import os  # function-scope import: only needed for the extension lookup

    extension = os.path.splitext(output_path)[1].lstrip(".").lower()
    audio_format = extension if extension in ("mp3", "wav", "pcm") else "wav"

    result = audio.tts.SpeechSynthesizer.call(
        model="sambert-zhichu-v1",  # the model name selects the voice
        text=text,
        format=audio_format
    )

    # The SDK exposes the synthesised bytes through get_audio_data(); it is
    # None when synthesis failed, so surface that instead of writing garbage.
    audio_bytes = result.get_audio_data()
    if audio_bytes is None:
        raise RuntimeError(
            f"Sambert synthesis returned no audio: {result.get_response()}"
        )

    with open(output_path, "wb") as f:
        f.write(audio_bytes)

    print(f"音频已保存到: {output_path}")

# Usage example
if __name__ == "__main__":
    text_to_speech_sambert(text="你好,世界!", output_path="hello.mp3")

四、视频理解 API

4.1 OpenAI GPT-4V(视频帧分析)

# ===== 视频理解(帧分析)=====

from openai import OpenAI
import cv2
import base64
import tempfile

client = OpenAI()

def extract_video_frames(video_path: str, num_frames: int = 10) -> list:
    """Sample evenly spaced frames from a video and save them as JPEG files.

    Args:
        video_path: Path of the video to read.
        num_frames: Number of sample positions spread across the video.

    Returns:
        Paths of the temporary ``.jpg`` files that were written, in playback
        order. The caller owns these files and should delete them when done.
        The list is empty when ``num_frames`` is not positive or no frame
        could be read, and may be shorter than ``num_frames`` for short videos.
    """
    import os  # function-scope import: only needed for cleanup on failure

    if num_frames <= 0:
        return []

    video = cv2.VideoCapture(video_path)
    frames = []
    try:
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))

        # Short videos map several sample slots onto the same frame index;
        # drop the repeats (keeping order) so no frame is written twice.
        frame_indices = list(dict.fromkeys(
            int(i * total_frames / num_frames) for i in range(num_frames)
        ))

        for idx in frame_indices:
            video.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = video.read()
            if not ret:
                continue

            # Reserve a unique name, then close the handle right away so it
            # is not leaked and OpenCV can write to the path on every platform.
            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
                frame_path = temp_file.name

            if cv2.imwrite(frame_path, frame):
                frames.append(frame_path)
            else:
                os.remove(frame_path)  # don't leave an empty placeholder behind
    finally:
        # Release the capture even if reading or writing raised.
        video.release()

    return frames

def analyze_video(video_path: str, question: str = "描述这个视频", num_frames: int = 10):
    """Answer a question about a video by sending sampled frames to ``gpt-4o``.

    Frames are extracted with ``extract_video_frames``, attached as inline
    JPEG data URLs after the question, and the temporary frame files are
    removed afterwards.

    Args:
        video_path: Path of the video to analyse.
        question: Prompt text sent ahead of the frames.
        num_frames: Number of frames to sample from the video.

    Returns:
        The message content of the first completion choice.

    Raises:
        ValueError: If no frame could be extracted; sending the question
            without any frames would only yield a fabricated answer.
    """
    import os  # function-scope import: only needed for temp-file cleanup

    frames = extract_video_frames(video_path, num_frames=num_frames)
    if not frames:
        raise ValueError(f"No frames could be extracted from: {video_path}")

    try:
        content = [{"type": "text", "text": question}]

        for frame_path in frames:
            with open(frame_path, "rb") as f:
                frame_data = base64.b64encode(f.read()).decode('utf-8')

            content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{frame_data}"
                }
            })

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": content}],
            max_tokens=2048
        )
    finally:
        # The frames were written with delete=False, so nothing else removes
        # them; clean up whether or not the request succeeded.
        for frame_path in frames:
            try:
                os.remove(frame_path)
            except OSError:
                pass  # best effort: a leftover temp file must not mask the result

    return response.choices[0].message.content

# Usage example
if __name__ == "__main__":
    video_answer = analyze_video(video_path="video.mp4", question="这个视频在做什么?")
    print(f"分析: {video_answer}")

4.2 Google Gemini(原生视频理解)

# ===== Google Gemini 视频理解 =====

import google.generativeai as genai
from google.generativeai import GenerativeModel
import os

# 配置 API key
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

def analyze_video_gemini(video_path: str, question: str):
    """Answer a question about a video with ``gemini-1.5-pro``.

    The video is uploaded through the Files API, polled until it has finished
    processing, and then passed to the model together with the question.

    Args:
        video_path: Path of the local video file to upload.
        question: Prompt text sent alongside the video.

    Returns:
        The ``text`` of the model response.

    Raises:
        RuntimeError: If the uploaded file does not reach the ACTIVE state.
    """
    import time  # function-scope import: only needed for the polling delay

    model = GenerativeModel("gemini-1.5-pro")

    video = genai.upload_file(video_path)

    # Uploaded videos are processed asynchronously; using the file before it
    # is ACTIVE makes generate_content fail, so poll until processing ends.
    while video.state.name == "PROCESSING":
        time.sleep(5)
        video = genai.get_file(video.name)

    if video.state.name != "ACTIVE":
        raise RuntimeError(
            f"Video upload did not become ACTIVE (state: {video.state.name})"
        )

    response = model.generate_content([video, question])

    return response.text

def video_summarization(video_path: str):
    """Summarise a video by calling ``analyze_video_gemini`` with a fixed prompt."""
    summary_prompt = "总结这个视频的主要内容"
    return analyze_video_gemini(video_path, summary_prompt)

def extract_video_highlights(video_path: str):
    """Ask for a video's highlights via ``analyze_video_gemini`` and a fixed prompt."""
    highlights_prompt = "这个视频的关键亮点是什么?"
    return analyze_video_gemini(video_path, highlights_prompt)

# Usage example
if __name__ == "__main__":
    print(f"摘要: {video_summarization('video.mp4')}")

五、文档理解 API

5.1 OpenAI PDF 理解

# ===== PDF 理解 =====

from openai import OpenAI
import base64

client = OpenAI()

def encode_pdf_to_base64(pdf_path: str) -> str:
    """Return the contents of ``pdf_path`` as a base64 text string.

    Args:
        pdf_path: Path of the file to read (opened in binary mode).

    Returns:
        The file's bytes, base64-encoded and decoded to ``str`` as UTF-8.
    """
    with open(pdf_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

def analyze_pdf(pdf_path: str, question: str):
    """Ask ``gpt-4o`` a question about a local PDF document.

    The PDF is base64-encoded with ``encode_pdf_to_base64`` and attached to
    the request as a ``file`` content part.

    Args:
        pdf_path: Path of the PDF file on disk.
        question: Prompt text sent alongside the document.

    Returns:
        The message content of the first completion choice.
    """
    import os  # function-scope import: only needed for the file name

    base64_pdf = encode_pdf_to_base64(pdf_path)

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {
                        # PDFs must be sent as a "file" part: an "image_url"
                        # part only accepts image MIME types, so the API
                        # rejects a data:application/pdf URL there.
                        "type": "file",
                        "file": {
                            "filename": os.path.basename(pdf_path),
                            "file_data": f"data:application/pdf;base64,{base64_pdf}"
                        }
                    }
                ]
            }
        ],
        max_tokens=2048
    )

    return response.choices[0].message.content

def extract_pdf_tables(pdf_path: str):
    """Request all tables from a PDF via ``analyze_pdf`` with a fixed prompt."""
    table_prompt = "提取文档中的所有表格,保持表格结构"
    return analyze_pdf(pdf_path, table_prompt)

def pdf_question_answering(pdf_path: str, question: str):
    """Answer ``question`` from a PDF by wrapping it in a fixed prompt for ``analyze_pdf``."""
    wrapped_question = "根据文档内容回答问题:{}".format(question)
    return analyze_pdf(pdf_path, wrapped_question)

# Usage example
if __name__ == "__main__":
    # Summarise a document
    pdf_summary = analyze_pdf(pdf_path="document.pdf", question="总结这份文档")
    print(f"分析: {pdf_summary}")

    # Pull the tables out of a report
    report_tables = extract_pdf_tables(pdf_path="report.pdf")
    print(f"表格: {report_tables}")

5.2 通义千问 文档理解

# ===== 通义千问 文档理解 =====

from dashscope import MultiModalConversation

def analyze_pdf_qwen(pdf_path: str, question: str):
    """Ask the ``qwen-vl-max`` model a question about a PDF document.

    Args:
        pdf_path: Document reference passed through unchanged in the ``pdf``
            content part.
            NOTE(review): the vision models document ``image``/``video``/
            ``text`` parts; confirm that a ``pdf`` part is accepted, or
            render the pages to images first.
        question: Prompt text sent alongside the document.

    Returns:
        The reply text, concatenated from the text parts of the first choice.

    Raises:
        RuntimeError: If the service reports a non-200 status.
    """
    response = MultiModalConversation.call(
        model="qwen-vl-max",
        messages=[
            {
                "role": "user",
                "content": [
                    {"pdf": pdf_path},
                    {"text": question}
                ]
            }
        ]
    )

    # Fail loudly instead of dereferencing an empty output on API errors.
    if response.status_code != 200:
        raise RuntimeError(
            f"DashScope request failed: {response.code} {response.message}"
        )

    # Multimodal replies carry their text in choices[0].message.content (a
    # list of {"text": ...} parts); output.text is not populated for them.
    content = response.output.choices[0].message.content
    if isinstance(content, str):
        return content
    return "".join(part.get("text", "") for part in content)

def extract_text_from_pdf_qwen(pdf_path: str):
    """Request all text from a PDF via ``analyze_pdf_qwen`` with a fixed prompt."""
    text_prompt = "提取文档中的所有文字"
    return analyze_pdf_qwen(pdf_path, text_prompt)

def analyze_pdf_images_qwen(pdf_path: str):
    """Request descriptions of a PDF's images via ``analyze_pdf_qwen`` with a fixed prompt."""
    images_prompt = "描述文档中的所有图片"
    return analyze_pdf_qwen(pdf_path, images_prompt)

# Usage example
if __name__ == "__main__":
    qwen_summary = analyze_pdf_qwen(pdf_path="document.pdf", question="总结这份文档")
    print(f"分析: {qwen_summary}")

六、平台对比

6.1 图片理解对比

# ===== 图片理解对比 =====

# Image-understanding comparison, keyed by platform name. Every platform row
# lists its values in the same order as the field names passed to zip():
# model, max image count, supported formats, max size, features, price.
image_understanding_comparison = {
    platform: dict(
        zip(("模型", "最大图片数", "支持格式", "最大大小", "特性", "价格"), row)
    )
    for platform, row in (
        (
            "OpenAI GPT-4V",
            (
                "gpt-4o / gpt-4-turbo",
                "多张",
                "JPEG/PNG/GIF/WebP",
                "20MB",
                ["OCR", "目标检测", "图像描述", "视觉推理"],
                "$0.005/图片",
            ),
        ),
        (
            "Claude 3",
            (
                "claude-3-5-sonnet",
                "20张",
                "JPEG/PNG/GIF/WebP",
                "5MB/张",
                ["OCR", "图像描述", "图表理解"],
                "$0.003/图片",
            ),
        ),
        (
            "通义千问",
            (
                "qwen-vl-max",
                "多张",
                "JPEG/PNG/BMP",
                "10MB",
                ["OCR", "图像描述", "中文场景优化"],
                "¥0.02/图片",
            ),
        ),
    )
}

6.2 语音识别对比

# ===== 语音识别对比 =====

# Speech-recognition comparison, keyed by platform name. Every platform row
# lists its values in the same order as the field names passed to zip():
# model, supported languages, features, accuracy rating, price.
# The accuracy rating is on a 1-5 scale, 5 being the highest.
asr_comparison = {
    platform: dict(zip(("模型", "支持语言", "特性", "准确率", "价格"), row))
    for platform, row in (
        (
            "OpenAI Whisper",
            ("whisper-1", "99+ 种", ["多语言", "翻译", "时间戳"], 5, "$0.006/分钟"),
        ),
        (
            "通义 Paraformer",
            (
                "paraformer-realtime-v1",
                "中文/英文",
                ["实时", "标点", "说话人识别"],
                5,
                "¥0.004/分钟",
            ),
        ),
        (
            "Google Speech-to-Text",
            ("latest-long", "125+ 种", ["实时", "多语言", "自动标点"], 5, "$0.024/分钟"),
        ),
    )
}

6.3 文字转语音对比

# ===== 文字转语音对比 =====

# Text-to-speech comparison, keyed by platform name.
# "音色数" (voice count) is an int when the count is exact and a string such
# as "20+" when it is open-ended; a bare `20+` is not valid Python.
# "音质" (audio quality) is a rating on a 1-5 scale, 5 being the highest.
tts_comparison = {
    "OpenAI TTS": {
        "模型": "tts-1 / tts-1-hd",
        "音色数": 6,
        "支持语言": "15+ 种",
        "特性": ["多种音色", "语速调节"],
        "音质": 4,
        "价格": "$15.00/1M characters"
    },
    "通义 Sambert": {
        "模型": "sambert-zhichu-v1",
        "音色数": "20+",
        "支持语言": "中文/英文",
        "特性": ["多种音色", "情感调节"],
        "音质": 4,
        "价格": "¥20.00/1M characters"
    },
    "Azure TTS": {
        "模型": "neural",
        "音色数": "400+",
        "支持语言": "140+ 种",
        "特性": ["神经网络", "多种音色", "SSML 支持"],
        "音质": 5,
        "价格": "$16.00/1M characters"
    }
}

七、生产案例

7.1 案例:智能文档分析系统

# ===== 案例:智能文档分析系统 =====

from openai import OpenAI
import base64
import os
from typing import List, Dict

class DocumentAnalyzer:
    """Analyse PDF, image and plain-text documents with ``gpt-4o`` in JSON mode.

    Each analysis sends one user message containing a fixed Chinese prompt
    plus the document, asks for a JSON object, and returns the parsed result.
    """

    # Image extensions accepted by analyze_document, mapped to the MIME type
    # advertised in the data URL (previously every image was labelled JPEG).
    _IMAGE_MIME_TYPES = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
    }

    _PDF_PROMPT = (
        "分析这份 PDF 文档,提取以下信息:\n"
        "1. 文档摘要\n"
        "2. 关键要点\n"
        "3. 重要数据\n"
        "4. 结论\n"
        "\n"
        "以 JSON 格式返回。"
    )

    _IMAGE_PROMPT = (
        "分析这张图片,提取以下信息:\n"
        "1. 图片描述\n"
        "2. 关键对象\n"
        "3. 文字内容(如果有)\n"
        "4. 场景分析\n"
        "\n"
        "以 JSON 格式返回。"
    )

    _TEXT_PROMPT = (
        "分析以下文本,提取以下信息:\n"
        "1. 文本摘要\n"
        "2. 关键要点\n"
        "3. 重要数据\n"
        "4. 结论\n"
        "\n"
        "以 JSON 格式返回。\n"
        "\n"
        "文本:\n"
    )

    def __init__(self, api_key: str):
        """Create the OpenAI client used for every request.

        Args:
            api_key: API key passed to the ``OpenAI`` client.
        """
        self.client = OpenAI(api_key=api_key)

    def analyze_document(self, file_path: str) -> Dict:
        """Analyse one document, dispatching on its file extension.

        Supported extensions: ``.pdf``, ``.jpg``/``.jpeg``/``.png``/``.gif``
        and ``.txt`` (case-insensitive).

        Args:
            file_path: Path of the document on disk.

        Returns:
            The parsed JSON object produced by the model.

        Raises:
            ValueError: If the extension is not supported.
        """
        file_ext = os.path.splitext(file_path)[1].lower()

        if file_ext == ".pdf":
            return self._analyze_pdf(file_path)
        if file_ext in self._IMAGE_MIME_TYPES:
            return self._analyze_image(file_path)
        if file_ext == ".txt":
            return self._analyze_text(file_path)
        raise ValueError(f"不支持的格式: {file_ext}")

    def _request_json(self, content, max_tokens: int) -> Dict:
        """Send one user message in JSON mode and return the parsed reply.

        Args:
            content: Message content - a string or a list of content parts.
            max_tokens: Completion token limit for the request.

        Returns:
            The reply decoded with ``json.loads``; decoding raises if the
            reply is not valid JSON (for example when cut off by the limit).
        """
        import json  # kept function-scoped, as in the original methods

        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": content}],
            response_format={"type": "json_object"},
            max_tokens=max_tokens
        )
        return json.loads(response.choices[0].message.content)

    def _analyze_pdf(self, pdf_path: str) -> Dict:
        """Analyse a PDF: summary, key points, important data, conclusion."""
        base64_pdf = self._encode_file_to_base64(pdf_path)
        content = [
            {"type": "text", "text": self._PDF_PROMPT},
            {
                # PDFs must be sent as a "file" part: an "image_url" part
                # only accepts image MIME types, so the API rejects a
                # data:application/pdf URL there.
                "type": "file",
                "file": {
                    "filename": os.path.basename(pdf_path),
                    "file_data": f"data:application/pdf;base64,{base64_pdf}"
                }
            }
        ]
        return self._request_json(content, max_tokens=2048)

    def _analyze_image(self, image_path: str) -> Dict:
        """Analyse an image: description, key objects, text, scene."""
        base64_image = self._encode_file_to_base64(image_path)
        file_ext = os.path.splitext(image_path)[1].lower()
        # Unknown extensions keep the previous "image/jpeg" label.
        mime_type = self._IMAGE_MIME_TYPES.get(file_ext, "image/jpeg")
        content = [
            {"type": "text", "text": self._IMAGE_PROMPT},
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{mime_type};base64,{base64_image}"
                }
            }
        ]
        return self._request_json(content, max_tokens=1024)

    def _analyze_text(self, text_path: str) -> Dict:
        """Analyse a UTF-8 text file: summary, key points, data, conclusion."""
        with open(text_path, "r", encoding="utf-8") as f:
            text = f.read()

        return self._request_json(f"{self._TEXT_PROMPT}{text}", max_tokens=1024)

    def _encode_file_to_base64(self, file_path: str) -> str:
        """Return the contents of ``file_path`` as a base64 text string."""
        with open(file_path, "rb") as f:
            return base64.b64encode(f.read()).decode('utf-8')

    def batch_analyze(self, file_paths: List[str]) -> List[Dict]:
        """Analyse several documents, recording failures instead of raising.

        Args:
            file_paths: Paths of the documents to analyse, in order.

        Returns:
            One dict per input path: ``{"file", "analysis", "success": True}``
            on success or ``{"file", "error", "success": False}`` on failure.
        """
        results = []

        for file_path in file_paths:
            # Deliberately broad: one bad file must not abort the batch, and
            # the error text is reported back to the caller.
            try:
                result = self.analyze_document(file_path)
            except Exception as e:
                results.append({
                    "file": file_path,
                    "error": str(e),
                    "success": False
                })
            else:
                results.append({
                    "file": file_path,
                    "analysis": result,
                    "success": True
                })

        return results

# Usage example
if __name__ == "__main__":
    doc_analyzer = DocumentAnalyzer(api_key="your-api-key")

    # Single document
    single_result = doc_analyzer.analyze_document("document.pdf")
    print(f"分析结果: {single_result}")

    # Several documents of different types in one call
    sample_files = ["doc1.pdf", "doc2.jpg", "doc3.txt"]
    batch_results = doc_analyzer.batch_analyze(sample_files)
    print(f"批量结果: {batch_results}")

7.2 案例:多模态客服系统

# ===== 案例:多模态客服系统 =====

from openai import OpenAI
import base64

class MultimodalCustomerService:
    """Customer-service assistant that accepts text, image and audio input.

    Keeps a running ``conversation_history`` of user/assistant turns and
    sends it, prefixed with a fixed system prompt, to ``gpt-4o`` on each turn.
    """

    _SYSTEM_PROMPT = "你是一个智能客服助手,可以处理文本、图片、语音等多种输入。"

    def __init__(self, api_key: str):
        """Create the OpenAI client and start with an empty history.

        Args:
            api_key: API key passed to the ``OpenAI`` client.
        """
        self.client = OpenAI(api_key=api_key)
        self.conversation_history = []

    def _complete(self, messages: list) -> str:
        """Send ``messages`` after the system prompt and return the reply text."""
        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": self._SYSTEM_PROMPT}
            ] + messages,
            max_tokens=1024
        )
        return response.choices[0].message.content

    def _record_turn(self, user_content: str, assistant_reply: str) -> None:
        """Append one completed user/assistant exchange to the history."""
        self.conversation_history.append({
            "role": "user",
            "content": user_content
        })
        self.conversation_history.append({
            "role": "assistant",
            "content": assistant_reply
        })

    @staticmethod
    def _image_mime_type(image_path: str) -> str:
        """Return the image MIME type for ``image_path``, defaulting to JPEG."""
        import mimetypes  # function-scope import: only this helper needs it

        guessed_type = mimetypes.guess_type(image_path)[0] or ""
        return guessed_type if guessed_type.startswith("image/") else "image/jpeg"

    def handle_text(self, text: str) -> str:
        """Answer a text message in the context of the conversation so far.

        Args:
            text: The user's message.

        Returns:
            The assistant's reply, which is also appended to the history.
        """
        user_message = {"role": "user", "content": text}
        # The history is only updated after the request succeeds, so a failed
        # call does not leave an unanswered user turn behind.
        assistant_reply = self._complete(self.conversation_history + [user_message])
        self._record_turn(text, assistant_reply)
        return assistant_reply

    def handle_image(self, image_path: str, question: str = "分析这张图片") -> str:
        """Answer a question about a local image.

        The image itself is sent only with this request; the history stores a
        short ``[图片] <question>`` placeholder instead of the image data.

        Args:
            image_path: Path of the image file on disk.
            question: Prompt text sent alongside the image.

        Returns:
            The assistant's reply, which is also appended to the history.
        """
        base64_image = self._encode_image_to_base64(image_path)
        mime_type = self._image_mime_type(image_path)

        messages = list(self.conversation_history)
        messages.append({
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:{mime_type};base64,{base64_image}"
                    }
                }
            ]
        })

        assistant_reply = self._complete(messages)
        self._record_turn(f"[图片] {question}", assistant_reply)
        return assistant_reply

    def handle_audio(self, audio_path: str) -> str:
        """Transcribe a voice message with ``whisper-1`` and answer it as text.

        Args:
            audio_path: Path of the audio file on disk.

        Returns:
            The assistant's reply to the transcribed text.
        """
        # Step 1: speech to text
        with open(audio_path, "rb") as audio_file:
            transcript = self.client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file
            )

        text = transcript.text
        print(f"语音转文字: {text}")

        # Step 2: handle the transcript like any other text message
        return self.handle_text(text)

    def _encode_image_to_base64(self, image_path: str) -> str:
        """Return the contents of ``image_path`` as a base64 text string."""
        with open(image_path, "rb") as f:
            return base64.b64encode(f.read()).decode('utf-8')

    def reset_conversation(self):
        """Discard the conversation history."""
        self.conversation_history = []

# Usage example
# (The first statement used to be indented by three spaces while the rest of
# the block used four, which made the file fail with an IndentationError.)
if __name__ == '__main__':
    service = MultimodalCustomerService(api_key="your-api-key")

    # Text conversation
    reply1 = service.handle_text("你们的营业时间是什么?")
    print(f"回复1: {reply1}")

    # Image question
    reply2 = service.handle_image("product.jpg", "这个产品有什么特点?")
    print(f"回复2: {reply2}")

    # Voice message
    reply3 = service.handle_audio("voice_message.mp3")
    print(f"回复3: {reply3}")

八、总结

8.1 核心要点

多模态 AI API

图片理解

语音处理

视频理解

文档理解

OpenAI GPT-4V

Claude 3 Vision

通义千问 Vision

Whisper ASR

Paraformer ASR

OpenAI TTS

Sambert TTS

GPT-4V 帧分析

Gemini 原生视频

PDF 理解

表格提取

文档问答

8.2 选型建议

需求 推荐方案 理由
图片理解 GPT-4V / Claude 3 能力强
OCR GPT-4V / 通义千问 中文优化
语音识别 Whisper / Paraformer 多语言/中文
文字转语音 OpenAI TTS / Sambert 音质好
视频理解 Gemini 1.5 Pro 原生支持
文档理解 GPT-4V / 通义千问 PDF 支持好

8.3 最佳实践

实践 说明
图片压缩 减小文件大小,降低成本
批量处理 使用 Batch API 降低成本
缓存结果 避免重复调用
错误处理 重试机制 + 降级方案
成本监控 Token 统计 + 成本追踪

本文基于各平台官方文档编写。如有问题欢迎评论区讨论!

Logo

AtomGit 是由开放原子开源基金会联合 CSDN 等生态伙伴共同推出的新一代开源与人工智能协作平台。平台坚持“开放、中立、公益”的理念,把代码托管、模型共享、数据集托管、智能体开发体验和算力服务整合在一起,为开发者提供从开发、训练到部署的一站式体验。

更多推荐