在日常工作和学习中,我们经常遇到需要从图片、PDF、扫描件中提取文字的场景。市面上的OCR工具要么收费昂贵,要么识别效果不理想。于是我决定自己开发一款免费、高效、不挑格式的桌面OCR工具——TextMiner(文字矿工)。

本文将完整记录从产品规划到代码实现的全过程,并提供完整可运行的源码。


一、产品定位

1.1 一句话介绍

一个不挑格式、不挑网络、无感精准提取任何可见文字的桌面效率工具。

1.2 核心差异化

  • 本地AI优先 + 云端大模型兜底的混合双擎策略
  • 保证免费额度下的最高识别准确率
  • 安装包 < 200MB,轻量级桌面应用

1.3 适用场景

| 用户角色 | 核心痛点 | 解决方案 |
| --- | --- | --- |
| 大学生/研究生 | PDF文献加密不能复制 | 截图即转Word,保留段落缩进 |
| 行政/文员 | 扫描件需要重打成Word | 表格识别还原,自动滤除公章 |
| 程序员/极客 | 代码报错截图无法复制 | 全局热键框选,3秒粘贴 |
| 自媒体编辑 | 海报文案想引用 | 后台静默识别 |

二、技术选型

| 技术栈 | 用途 | 选型理由 |
| --- | --- | --- |
| PySide6 | GUI框架 | Qt的Python绑定,界面美观 |
| PaddleOCR | 本地OCR引擎 | 百度开源,中文识别效果好 |
| PyMuPDF | PDF解析 | 轻量高效,支持文字层提取 |
| python-docx | Word处理 | 读写docx文档 |
| keyboard | 全局热键 | 实现Ctrl+Shift+T截图 |
| Qwen VL API | 云端AI精校 | 置信度低时兜底 |

三、项目结构

textminer/
├── main.py                 # 程序入口
├── requirements.txt        # 依赖包
├── build.py               # 打包脚本
├── core/
│   ├── __init__.py
│   ├── ocr_engine.py      # OCR识别引擎
│   ├── file_parser.py     # 文件解析器
│   └── hotkey.py          # 全局热键管理
├── ui/
│   ├── __init__.py
│   ├── main_window.py     # 主窗口
│   ├── float_ball.py      # 悬浮球组件
│   └── screenshot.py      # 截图工具
├── utils/
│   ├── __init__.py
│   └── config.py          # 配置管理
└── resources/
    └── icon.png           # 程序图标

四、核心代码实现

4.1 环境依赖 (requirements.txt)

paddlepaddle==2.6.2
paddleocr==2.7.3
PySide6==6.6.1
PyMuPDF==1.23.8
python-docx==1.1.0
Pillow==10.1.0
keyboard==0.13.5
pynput==1.7.6
opencv-python-headless==4.6.0.66
requests==2.31.0

4.2 程序入口 (main.py)

"""
TextMiner - 文字矿工
一个不挑格式、不挑网络的桌面OCR效率工具
"""

import sys
import os
from PySide6.QtWidgets import QApplication
from PySide6.QtCore import Qt, QTimer
from PySide6.QtGui import QIcon, QFont

from ui.main_window import MainWindow
from ui.float_ball import FloatBall
from core.hotkey import HotkeyManager
from utils.config import ConfigManager


class TextMinerApp:
    """Top-level application controller.

    Creates the ``QApplication``, the main window, the floating ball and the
    global hotkey manager, connects their signals, and keeps a persisted
    count of successful screenshot recognitions.
    """

    # Recognition count at which the achievement dialog is shown.
    ACHIEVEMENT_THRESHOLD = 100

    def __init__(self):
        self.app = QApplication(sys.argv)
        self.app.setApplicationName("TextMiner")
        self.app.setOrganizationName("TextMiner")

        # Application-wide default font.
        self.app.setFont(QFont("Microsoft YaHei", 10))

        # Configuration store shared with the main window.
        self.config = ConfigManager()

        self.main_window = MainWindow(self.config)

        # Floating ball: click starts a capture, double click opens the window.
        self.float_ball = FloatBall()
        self.float_ball.screenshot_triggered.connect(self.start_screenshot)
        self.float_ball.show_main_triggered.connect(self.show_main_window)

        # Global hotkey that starts a capture from anywhere.
        self.hotkey_manager = HotkeyManager()
        self.hotkey_manager.hotkey_triggered.connect(self.start_screenshot)
        self.hotkey_manager.start()
        # Release the global keyboard hook when the event loop shuts down.
        self.app.aboutToQuit.connect(self.hotkey_manager.stop)

        # Restore the persisted usage counter. The achievement check is
        # deliberately not run here: it compares with ``==``, so running it at
        # start-up would re-show the dialog on every launch for as long as
        # the stored count sits exactly on the threshold.
        self.usage_count = self.config.get("usage_count", 0)

    def start_screenshot(self):
        """Hide the floating ball and start a capture shortly afterwards."""
        self.float_ball.hide()
        # Short delay so the ball has disappeared before the screen is grabbed.
        QTimer.singleShot(100, self._do_screenshot)

    def _do_screenshot(self):
        """Run the capture tool and pass the result to the main window.

        A non-empty capture increments and persists the usage counter and
        then runs the achievement check.
        """
        from ui.screenshot import ScreenshotTool

        try:
            pixmap = ScreenshotTool().capture()

            if pixmap:
                self.main_window.process_screenshot(pixmap)
                self.usage_count += 1
                self.config.set("usage_count", self.usage_count)
                self.check_achievement()
        finally:
            # Always bring the ball back, even if capture/processing raised;
            # otherwise the user loses the only visible entry point.
            self.float_ball.show()

    def show_main_window(self):
        """Show the main window and bring it to the foreground."""
        self.main_window.show()
        self.main_window.raise_()
        self.main_window.activateWindow()

    def check_achievement(self):
        """Show the achievement dialog when the counter hits the threshold."""
        if self.usage_count == self.ACHIEVEMENT_THRESHOLD:
            from PySide6.QtWidgets import QMessageBox
            QMessageBox.information(
                None,
                "🏆 成就解锁",
                "恭喜!您已使用TextMiner识别100次!\n"
                "您已节省了约50页打印纸 🌲"
            )

    def run(self):
        """Show the floating ball and run the Qt event loop.

        Returns:
            The exit code returned by ``QApplication.exec()``.
        """
        self.float_ball.show()
        return self.app.exec()


if __name__ == "__main__":
    # Build the application and hand its event-loop exit code to the shell.
    sys.exit(TextMinerApp().run())

4.3 OCR识别引擎 (core/ocr_engine.py)

"""
OCR识别引擎 - 本地PaddleOCR + 云端备胎策略
"""

import os
import re
from PIL import Image
import numpy as np
import cv2
from PySide6.QtCore import QObject, Signal, QThread


class OCREngine(QObject):
    """OCR engine: local PaddleOCR recognition plus an optional cloud pass.

    Results are delivered through Qt signals rather than return values.
    """

    # Signals
    progress = Signal(str)  # progress message
    finished = Signal(str, float)  # recognized text, confidence
    error = Signal(str)  # error message

    # Fixed confidence emitted for cloud results; the reply is not inspected
    # for a score of its own.
    CLOUD_CONFIDENCE = 0.95

    def __init__(self):
        super().__init__()
        self._ocr = None
        self._initialized = False

    def initialize(self):
        """Load the PaddleOCR model once.

        Returns:
            True when the engine is ready (including when it was already
            loaded), False when loading failed; the failure is also reported
            through the ``error`` signal.
        """
        if self._initialized:
            return True

        try:
            self.progress.emit("正在加载本地OCR引擎...")
            from paddleocr import PaddleOCR

            # Chinese model, CPU only.
            self._ocr = PaddleOCR(
                use_angle_cls=True,
                lang='ch',
                use_gpu=False,  # force CPU mode for better compatibility
                show_log=False,
                use_mp=False
            )
            self._initialized = True
            self.progress.emit("本地OCR引擎就绪")
            return True

        except Exception as e:
            self.error.emit(f"OCR初始化失败: {str(e)}")
            return False

    def recognize(self, image):
        """Recognize the text in an image and emit the result.

        Emits ``finished(text, average_confidence)`` on success (an empty
        string and 0.0 when nothing was detected) or ``error`` on failure.

        Args:
            image: PIL.Image or numpy array. PIL images of any mode are
                accepted; numpy arrays are passed to PaddleOCR unchanged.
        """
        if not self._initialized and not self.initialize():
            return

        try:
            if isinstance(image, Image.Image):
                # Normalise to 3-channel RGB first: grayscale, palette or
                # CMYK images do not have the channel layout RGB2BGR expects.
                rgb = image if image.mode == "RGB" else image.convert("RGB")
                image = cv2.cvtColor(np.array(rgb), cv2.COLOR_RGB2BGR)

            self.progress.emit("正在解析文字...")

            result = self._ocr.ocr(image, cls=True)

            if not result or not result[0]:
                self.finished.emit("", 0.0)
                return

            # Each detected line is read as (box, (text, confidence)).
            texts = []
            confidences = []
            for line in result[0]:
                text, confidence = line[1][0], line[1][1]
                if text and text.strip():
                    texts.append(text)
                    confidences.append(confidence)

            full_text = "\n".join(texts)
            avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0

            self.progress.emit(f"识别完成,置信度: {avg_confidence:.1%}")
            self.finished.emit(full_text, avg_confidence)

        except Exception as e:
            self.error.emit(f"识别失败: {str(e)}")

    @staticmethod
    def _message_text(content):
        """Flatten a chat message ``content`` field into plain text.

        The multimodal endpoint returns ``content`` as a list of parts such
        as ``[{"text": "..."}]``; a plain string is accepted as well.
        """
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            return "".join(
                part.get("text", "") if isinstance(part, dict) else str(part)
                for part in content
            )
        return str(content)

    def enhance_with_cloud(self, image, api_key=None):
        """Re-recognize an image with the cloud model (Qwen VL API).

        The request runs on a background ``QThread``; its progress, result
        and errors are forwarded to this engine's own signals.

        Args:
            image: PIL.Image to send.
            api_key: API key; when None, the ``DASHSCOPE_API_KEY``
                environment variable is used.
        """
        if api_key is None:
            api_key = os.environ.get("DASHSCOPE_API_KEY", "")

        if not api_key:
            self.error.emit("请设置云端API密钥")
            return

        self.progress.emit("正在调用云端AI精校...")

        extract_text = self._message_text
        cloud_confidence = self.CLOUD_CONFIDENCE

        # Runs the HTTP request off the calling thread.
        class CloudWorker(QThread):
            progress = Signal(str)
            finished = Signal(str, float)
            error = Signal(str)

            def __init__(self, image, api_key):
                super().__init__()
                self.image = image
                self.api_key = api_key

            def run(self):
                try:
                    import requests
                    import base64
                    from io import BytesIO

                    # Encode the image as a base64 PNG data URL.
                    buffered = BytesIO()
                    self.image.save(buffered, format="PNG")
                    img_base64 = base64.b64encode(buffered.getvalue()).decode()

                    url = "https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation"
                    headers = {
                        "Authorization": f"Bearer {self.api_key}",
                        "Content-Type": "application/json"
                    }

                    payload = {
                        "model": "qwen-vl-plus",
                        "input": {
                            "messages": [
                                {
                                    "role": "user",
                                    "content": [
                                        {"image": f"data:image/png;base64,{img_base64}"},
                                        {"text": "请精确识别并提取图片中的所有文字内容,保持原有的段落格式和换行。只输出文字内容,不要添加任何解释。"}
                                    ]
                                }
                            ]
                        }
                    }

                    self.progress.emit("云端AI正在处理...")
                    response = requests.post(url, headers=headers, json=payload, timeout=30)

                    if response.status_code == 200:
                        data = response.json()
                        content = data["output"]["choices"][0]["message"]["content"]
                        # ``finished`` is declared Signal(str, float), so the
                        # list-of-parts content must be flattened to a string.
                        self.finished.emit(extract_text(content), cloud_confidence)
                    else:
                        self.error.emit(f"云端API错误: {response.text}")

                except Exception as e:
                    self.error.emit(f"云端调用失败: {str(e)}")

        # Keep a reference on self so the thread object outlives this call.
        self._cloud_worker = CloudWorker(image, api_key)
        self._cloud_worker.progress.connect(self.progress)
        self._cloud_worker.finished.connect(self.finished)
        self._cloud_worker.error.connect(self.error)
        self._cloud_worker.start()


class OCRWorker(QThread):
    """Background thread that runs one ``engine.recognize(image)`` call.

    The engine's ``progress`` / ``finished`` / ``error`` signals are relayed
    to the same-named signals on this worker.
    """
    progress = Signal(str)
    finished = Signal(str, float)
    error = Signal(str)

    def __init__(self, engine, image):
        super().__init__()
        self.engine = engine
        self.image = image

    def run(self):
        """Relay the engine's signals, then recognize the stored image."""
        engine = self.engine
        for signal_name in ("progress", "finished", "error"):
            getattr(engine, signal_name).connect(getattr(self, signal_name))
        engine.recognize(self.image)

4.4 文件解析器 (core/file_parser.py)

"""
文件解析器 - 智能分流处理各类文档
"""

import os
from PySide6.QtCore import QObject, Signal
from PIL import Image


class FileParser(QObject):
    """Route a file to the right extractor based on its extension.

    Every parse ends with exactly one of:
      * ``finished(text, None)``  - text was extracted directly;
      * ``finished("", image)``   - a PIL image that still needs OCR;
      * ``error(message)``        - the file could not be handled.
    """

    progress = Signal(str)
    finished = Signal(str, object)  # text content, image object (if OCR is needed)
    error = Signal(str)

    SUPPORTED_EXTENSIONS = {
        '.txt': 'text',
        '.md': 'text',
        '.py': 'text',
        '.json': 'text',
        '.xml': 'text',
        '.csv': 'text',
        '.jpg': 'image',
        '.jpeg': 'image',
        '.png': 'image',
        '.bmp': 'image',
        '.gif': 'image',
        '.pdf': 'pdf',
        '.docx': 'docx',
    }

    # Encodings tried in order. 'utf-8-sig' strips a leading BOM, 'gb18030'
    # is a superset of GBK/GB2312, and 'latin-1' accepts any byte sequence,
    # so it acts as the last-resort fallback.
    TEXT_ENCODINGS = ('utf-8-sig', 'gbk', 'gb18030', 'latin-1')

    # Resolution used when a PDF page is rendered to an image for OCR.
    PDF_RENDER_DPI = 150

    def parse(self, file_path):
        """Parse ``file_path`` and report the outcome through signals.

        Args:
            file_path: Path of the file; the handler is chosen from its
                lower-cased extension via ``SUPPORTED_EXTENSIONS``.
        """
        ext = os.path.splitext(file_path)[1].lower()

        file_type = self.SUPPORTED_EXTENSIONS.get(ext)
        if file_type is None:
            self.error.emit(f"不支持的文件格式: {ext}")
            return

        # Handlers are named after the type: _parse_text, _parse_image, ...
        handler = getattr(self, f"_parse_{file_type}")
        try:
            handler(file_path)
        except Exception as e:
            self.error.emit(f"解析失败: {str(e)}")

    def _parse_text(self, file_path):
        """Read a plain-text file, trying each candidate encoding in turn."""
        self.progress.emit("正在读取文本文件...")

        for encoding in self.TEXT_ENCODINGS:
            try:
                with open(file_path, 'r', encoding=encoding) as f:
                    content = f.read()
            except UnicodeDecodeError:
                continue
            self.progress.emit("文本读取完成")
            self.finished.emit(content, None)
            return

        # Only reachable if TEXT_ENCODINGS no longer ends with a catch-all.
        self.error.emit("无法识别文件编码")

    def _parse_image(self, file_path):
        """Load an image file and emit it as an RGB image for OCR."""
        self.progress.emit("正在加载图片...")
        # Image.open is lazy and keeps the file open; decode inside a context
        # manager so the handle is released and the pixels are in memory.
        with Image.open(file_path) as source:
            if source.mode in ('RGBA', 'LA', 'P'):
                # Flatten possible transparency onto a white background.
                rgba = source.convert('RGBA')
                image = Image.new('RGB', rgba.size, (255, 255, 255))
                image.paste(rgba, mask=rgba.split()[-1])
            else:
                image = source.convert('RGB')
        self.progress.emit("图片加载完成,准备OCR识别")
        self.finished.emit("", image)

    def _parse_pdf(self, file_path):
        """Extract the text layer of a PDF, or a page image if there is none.

        Pages without a text layer are skipped when other pages do contain
        text, so one scanned or blank page no longer discards the text of
        the whole document. When no page has text, the first page is
        rendered and emitted for OCR.
        """
        try:
            import fitz  # PyMuPDF
            from io import BytesIO

            self.progress.emit("正在解析PDF文件...")
            doc = fitz.open(file_path)
            try:
                page_total = len(doc)
                text_content = []
                skipped_pages = 0

                for page_num, page in enumerate(doc):
                    self.progress.emit(f"正在处理第 {page_num + 1}/{page_total} 页...")

                    page_text = page.get_text()
                    if page_text.strip():
                        text_content.append(page_text)
                    else:
                        skipped_pages += 1

                scanned_image = None
                if not text_content and page_total > 0:
                    # No text anywhere: render the first page for OCR.
                    pix = doc.load_page(0).get_pixmap(dpi=self.PDF_RENDER_DPI)
                    scanned_image = Image.open(BytesIO(pix.tobytes("png")))
                    scanned_image.load()
            finally:
                # Close the document on every path, including exceptions.
                doc.close()

            if text_content:
                if skipped_pages:
                    self.progress.emit(f"已跳过 {skipped_pages} 个无文字层的页面")
                self.progress.emit("PDF文字提取完成")
                self.finished.emit("\n\n".join(text_content), None)
            elif scanned_image is not None:
                self.progress.emit("第1页为扫描件,需OCR识别")
                self.finished.emit("", scanned_image)
            else:
                self.error.emit("PDF无法提取文字")

        except ImportError:
            self.error.emit("请安装PyMuPDF: pip install PyMuPDF")
        except Exception as e:
            self.error.emit(f"PDF解析失败: {str(e)}")

    def _parse_docx(self, file_path):
        """Extract paragraph text, then table rows, from a Word document.

        Table rows are appended after all paragraphs, one line per row with
        non-empty cells joined by " | ".
        """
        try:
            from docx import Document

            self.progress.emit("正在解析Word文档...")
            doc = Document(file_path)

            paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]

            for table in doc.tables:
                for row in table.rows:
                    row_text = [cell.text.strip() for cell in row.cells if cell.text.strip()]
                    if row_text:
                        paragraphs.append(" | ".join(row_text))

            self.progress.emit("Word文档解析完成")
            self.finished.emit("\n".join(paragraphs), None)

        except ImportError:
            self.error.emit("请安装python-docx: pip install python-docx")
        except Exception as e:
            self.error.emit(f"Word解析失败: {str(e)}")

4.5 全局热键 (core/hotkey.py)

"""
全局热键管理
"""

import threading
from PySide6.QtCore import QObject, Signal

try:
    import keyboard
    KEYBOARD_AVAILABLE = True
except ImportError:
    KEYBOARD_AVAILABLE = False


class HotkeyManager(QObject):
    """Register a global hotkey and emit ``hotkey_triggered`` when pressed.

    The hotkey is registered directly with the ``keyboard`` library, so no
    dedicated thread is needed. The previous implementation parked a thread
    in ``keyboard.wait()``, which never returns without an argument, so the
    thread could not be stopped.
    """

    hotkey_triggered = Signal()

    def __init__(self, hotkey="ctrl+shift+t"):
        """
        Args:
            hotkey: Hotkey string in the ``keyboard`` library's syntax.
        """
        super().__init__()
        self.hotkey = hotkey
        self._running = False
        self._thread = None  # kept for backward compatibility; no longer used
        self._handle = None  # value returned by keyboard.add_hotkey

    def start(self):
        """Register the hotkey; a no-op when it is already registered."""
        if not KEYBOARD_AVAILABLE:
            print("警告: keyboard库未安装,热键功能不可用")
            return

        if self._running:
            return

        try:
            self._handle = keyboard.add_hotkey(self.hotkey, self._on_hotkey)
        except Exception as e:
            # Registration failures are reported but must not prevent the
            # application from starting.
            print(f"热键监听错误: {e}")
            return

        self._running = True

    def stop(self):
        """Unregister the hotkey registered by ``start``."""
        self._running = False
        if KEYBOARD_AVAILABLE and self._handle is not None:
            try:
                # Remove only our own hotkey; unhook_all() would also drop
                # hooks that were installed elsewhere.
                keyboard.remove_hotkey(self._handle)
            except (KeyError, ValueError):
                # Already removed; nothing left to clean up.
                pass
            self._handle = None

    def _on_hotkey(self):
        """Callback passed to the keyboard library; re-emits as a Qt signal."""
        self.hotkey_triggered.emit()

4.6 主窗口 (ui/main_window.py)

"""
主窗口
"""

import os
from io import BytesIO
from PySide6.QtWidgets import (
    QMainWindow, QWidget, QVBoxLayout, QHBoxLayout,
    QTextEdit, QPushButton, QLabel, QProgressBar,
    QFileDialog, QMessageBox, QFrame,
    QToolBar, QStatusBar, QInputDialog, QApplication
)
from PySide6.QtCore import Qt, Signal, QThread, QTimer, QBuffer, QByteArray, QIODevice
from PySide6.QtGui import QAction, QDragEnterEvent, QDropEvent, QPixmap

from core.ocr_engine import OCREngine, OCRWorker
from core.file_parser import FileParser


class DropArea(QFrame):
    """Frame that accepts dropped files and emits the first local path."""

    file_dropped = Signal(str)

    # Appearance when nothing is being dragged over the frame.
    _IDLE_STYLE = """
            DropArea {
                border: 2px dashed #ccc;
                border-radius: 8px;
                background: #f9f9f9;
            }
            DropArea:hover {
                border-color: #667eea;
                background: #f0f3ff;
            }
        """

    # Appearance while a drag carrying URLs hovers over the frame.
    _ACTIVE_STYLE = """
                DropArea {
                    border: 2px solid #667eea;
                    border-radius: 8px;
                    background: #e8eeff;
                }
            """

    def __init__(self):
        super().__init__()
        self.setAcceptDrops(True)
        self.setFrameStyle(QFrame.StyledPanel | QFrame.Sunken)
        self.setMinimumHeight(100)
        self.setMaximumHeight(150)

        layout = QVBoxLayout(self)
        layout.setAlignment(Qt.AlignCenter)

        self.label = QLabel("📁 拖拽文件到此处\n支持: PDF, JPG, PNG, Word, TXT")
        self.label.setAlignment(Qt.AlignCenter)
        self.label.setStyleSheet("""
            QLabel {
                color: #666;
                font-size: 14px;
                padding: 20px;
            }
        """)
        layout.addWidget(self.label)

        self.setStyleSheet(self._IDLE_STYLE)

    def dragEnterEvent(self, event: QDragEnterEvent):
        """Accept drags that carry URLs and switch to the highlighted style."""
        if event.mimeData().hasUrls():
            event.acceptProposedAction()
            self.setStyleSheet(self._ACTIVE_STYLE)

    def dragLeaveEvent(self, event):
        """Restore the idle style when the drag leaves the frame."""
        self.setStyleSheet(self._IDLE_STYLE)

    def dropEvent(self, event: QDropEvent):
        """Emit ``file_dropped`` with the first dropped URL that is a local file.

        URLs that do not map to a local path (``toLocalFile()`` returns an
        empty string) are skipped instead of being emitted as "".
        """
        self.setStyleSheet(self._IDLE_STYLE)
        for url in event.mimeData().urls():
            file_path = url.toLocalFile()
            if file_path:
                event.acceptProposedAction()
                self.file_dropped.emit(file_path)
                return


class MainWindow(QMainWindow):
    """Main window: file/screenshot input, OCR result editor and export.

    Files are routed through ``FileParser``; images are recognized by
    ``OCREngine`` on an ``OCRWorker`` thread, and the result can be re-run
    through the cloud model, copied, or exported to TXT/Word.
    """

    # Idle caption of the AI proofreading button.
    AI_BUTTON_TEXT = "🤖 AI精校"
    # Below this average confidence the user is nudged towards AI proofreading.
    LOW_CONFIDENCE_THRESHOLD = 0.85

    def __init__(self, config):
        """
        Args:
            config: Configuration store providing ``get(key, default)`` and
                ``set(key, value)``.
        """
        super().__init__()
        self.config = config
        self.ocr_engine = OCREngine()
        self.file_parser = FileParser()
        self.current_image = None
        self.ocr_worker = None

        self._setup_ui()
        self._setup_connections()

        # Defer model loading until the event loop is running.
        QTimer.singleShot(100, self._init_ocr)

    def _setup_ui(self):
        """Build the widgets, layouts and style sheet."""
        self.setWindowTitle("TextMiner - 文字矿工")
        self.setGeometry(100, 100, 900, 700)

        self.setStyleSheet("""
            QMainWindow { background: #fff; }
            QToolBar {
                background: #f5f5f5;
                border: none;
                border-bottom: 1px solid #ddd;
                spacing: 5px;
                padding: 5px;
            }
            QPushButton {
                background: #667eea;
                color: white;
                border: none;
                padding: 8px 16px;
                border-radius: 4px;
                font-weight: bold;
            }
            QPushButton:hover { background: #764ba2; }
            QPushButton:disabled { background: #ccc; }
            QTextEdit {
                border: 1px solid #ddd;
                border-radius: 4px;
                padding: 10px;
                font-family: 'Consolas', 'Microsoft YaHei', monospace;
                font-size: 12pt;
            }
            QStatusBar { background: #f5f5f5; }
        """)

        central_widget = QWidget()
        self.setCentralWidget(central_widget)

        main_layout = QVBoxLayout(central_widget)
        main_layout.setContentsMargins(10, 10, 10, 10)
        main_layout.setSpacing(10)

        self._create_toolbar()

        self.drop_area = DropArea()
        main_layout.addWidget(self.drop_area)

        self.progress_bar = QProgressBar()
        self.progress_bar.setVisible(False)
        main_layout.addWidget(self.progress_bar)

        self.status_label = QLabel("就绪")
        self.status_label.setStyleSheet("color: #666; padding: 5px;")
        main_layout.addWidget(self.status_label)

        self.editor = QTextEdit()
        self.editor.setPlaceholderText("识别结果将显示在这里...")
        main_layout.addWidget(self.editor)

        button_layout = QHBoxLayout()

        self.ai_enhance_btn = QPushButton(self.AI_BUTTON_TEXT)
        self.ai_enhance_btn.setEnabled(False)
        self.ai_enhance_btn.setToolTip("使用云端AI提升识别准确度")
        button_layout.addWidget(self.ai_enhance_btn)

        button_layout.addStretch()

        self.copy_btn = QPushButton("📋 复制全文")
        self.copy_btn.setEnabled(False)
        button_layout.addWidget(self.copy_btn)

        self.export_txt_btn = QPushButton("💾 导出TXT")
        self.export_txt_btn.setEnabled(False)
        button_layout.addWidget(self.export_txt_btn)

        self.export_docx_btn = QPushButton("📄 导出Word")
        self.export_docx_btn.setEnabled(False)
        button_layout.addWidget(self.export_docx_btn)

        main_layout.addLayout(button_layout)

        self.status_bar = QStatusBar()
        self.setStatusBar(self.status_bar)
        self.confidence_label = QLabel("")
        self.status_bar.addPermanentWidget(self.confidence_label)

    def _create_toolbar(self):
        """Create the toolbar with open / screenshot / clear actions."""
        toolbar = QToolBar()
        self.addToolBar(toolbar)

        open_action = QAction("📂 打开文件", self)
        open_action.triggered.connect(self._open_file)
        toolbar.addAction(open_action)

        screenshot_action = QAction("📸 截图", self)
        screenshot_action.triggered.connect(self._screenshot_from_main)
        toolbar.addAction(screenshot_action)

        toolbar.addSeparator()

        clear_action = QAction("🗑️ 清空", self)
        clear_action.triggered.connect(lambda: self.editor.clear())
        toolbar.addAction(clear_action)

    def _setup_connections(self):
        """Connect parser, engine and button signals to their handlers."""
        self.drop_area.file_dropped.connect(self.process_file)
        self.file_parser.progress.connect(self._update_status)
        self.file_parser.finished.connect(self._on_parse_finished)
        self.file_parser.error.connect(self._on_error)

        # The engine's signals are the single delivery path for local OCR
        # and for cloud proofreading results.
        self.ocr_engine.progress.connect(self._update_status)
        self.ocr_engine.finished.connect(self._on_ocr_finished)
        self.ocr_engine.error.connect(self._on_error)

        self.ai_enhance_btn.clicked.connect(self._ai_enhance)
        self.copy_btn.clicked.connect(self._copy_text)
        self.export_txt_btn.clicked.connect(self._export_txt)
        self.export_docx_btn.clicked.connect(self._export_docx)

    def _init_ocr(self):
        """Load the local OCR engine and report readiness."""
        self._update_status("正在初始化OCR引擎...")
        # Only claim readiness on success; on failure the engine's error
        # signal has already set the status, which must not be overwritten.
        if self.ocr_engine.initialize():
            self._update_status("就绪")

    def _open_file(self):
        """Let the user pick a file and process it."""
        file_path, _ = QFileDialog.getOpenFileName(
            self, "选择文件", "",
            "所有支持格式 (*.pdf *.jpg *.jpeg *.png *.bmp *.txt *.docx);;"
            "PDF文件 (*.pdf);;图片文件 (*.jpg *.png);;文本文件 (*.txt);;Word文档 (*.docx)"
        )
        if file_path:
            self.process_file(file_path)

    def _screenshot_from_main(self):
        """Hide this window, then start a capture shortly afterwards."""
        self.hide()
        QTimer.singleShot(100, self._do_screenshot)

    def _do_screenshot(self):
        """Run the capture tool, process the result and re-show the window."""
        from ui.screenshot import ScreenshotTool

        try:
            pixmap = ScreenshotTool().capture()
            if pixmap:
                self.process_screenshot(pixmap)
        finally:
            # Re-show the window even if the capture raised.
            self.show()

    def _ocr_busy(self):
        """Return True while a previously started OCR worker is running."""
        return self.ocr_worker is not None and self.ocr_worker.isRunning()

    def _reject_if_busy(self):
        """Report and return True when a new job must not start yet.

        Replacing ``self.ocr_worker`` while its thread is still running
        would destroy a live QThread, so new jobs are refused until then.
        """
        if self._ocr_busy():
            self._update_status("上一次识别尚未完成,请稍候")
            return True
        return False

    def _start_ocr(self, image):
        """Run local OCR on ``image`` in a background worker.

        The worker's own signals are not connected: the engine's signals are
        already connected in ``_setup_connections``, and connecting both
        delivered every result and every error dialog twice.
        """
        self.ocr_worker = OCRWorker(self.ocr_engine, image)
        self.ocr_worker.start()

    def _reset_ai_button(self, enabled):
        """Restore the idle caption of the AI button and set its state."""
        self.ai_enhance_btn.setText(self.AI_BUTTON_TEXT)
        self.ai_enhance_btn.setEnabled(enabled)

    def process_file(self, file_path):
        """Clear the current result and parse ``file_path``."""
        if self._reject_if_busy():
            return

        self.editor.clear()
        self.current_image = None
        self.ai_enhance_btn.setEnabled(False)
        self._set_buttons_enabled(False)

        self._update_status(f"正在处理: {os.path.basename(file_path)}")
        self.file_parser.parse(file_path)

    def process_screenshot(self, pixmap):
        """Convert a captured ``QPixmap`` to an RGB PIL image and OCR it."""
        if self._reject_if_busy():
            return

        self.editor.clear()
        self._set_buttons_enabled(False)
        self.ai_enhance_btn.setEnabled(False)

        from PIL import Image

        try:
            # Serialize the pixmap to PNG in memory, then decode with PIL.
            byte_array = QByteArray()
            buffer = QBuffer(byte_array)

            if not buffer.open(QIODevice.WriteOnly):
                self._on_error("无法创建内存缓冲区")
                return

            saved = pixmap.save(buffer, "PNG")
            buffer.close()
            if not saved:
                self._on_error("截图保存失败")
                return

            png_data = bytes(byte_array.data())
            if len(png_data) == 0:
                self._on_error("截图数据为空")
                return

            image = Image.open(BytesIO(png_data))
            image.load()

            if image.mode in ('RGBA', 'LA', 'P'):
                # Flatten onto a white background; the alpha channel is used
                # as the paste mask only for RGBA (as before, LA is pasted
                # without a mask).
                if image.mode == 'P':
                    image = image.convert('RGBA')
                flattened = Image.new('RGB', image.size, (255, 255, 255))
                flattened.paste(
                    image,
                    mask=image.split()[-1] if image.mode == 'RGBA' else None)
                image = flattened
            elif image.mode != 'RGB':
                image = image.convert('RGB')

            self.current_image = image
            self._update_status("正在识别截图...")
            self._start_ocr(image)

        except Exception as e:
            import traceback
            traceback.print_exc()
            self._on_error(f"截图处理失败: {str(e)}")

    def _on_parse_finished(self, text, image):
        """Show parsed text, or start OCR when the parser returned an image."""
        if text:
            self.editor.setPlainText(text)
            self._set_buttons_enabled(True)
            self._update_status("解析完成")
        elif image is not None:
            self.current_image = image
            self._update_status("正在OCR识别...")
            self._start_ocr(image)
        else:
            self._on_error("无法解析文件内容")

    def _on_ocr_finished(self, text, confidence):
        """Show a local or cloud recognition result and its confidence."""
        self.editor.setPlainText(text)
        self._set_buttons_enabled(True)
        # Also clears the "processing" caption set by _ai_enhance.
        self._reset_ai_button(True)

        self.confidence_label.setText(f"置信度: {confidence:.1%}")

        if confidence < self.LOW_CONFIDENCE_THRESHOLD:
            self._update_status("识别完成,置信度较低,建议使用AI精校")
            self.status_bar.showMessage("💡 点击「AI精校」可提升识别准确度", 5000)
        else:
            self._update_status("识别完成")

    def _on_error(self, error_msg):
        """Report an error and return the buttons to a usable state."""
        self._update_status(f"错误: {error_msg}")
        QMessageBox.warning(self, "错误", error_msg)
        # Copy/export only make sense when there is text to act on.
        self._set_buttons_enabled(bool(self.editor.toPlainText()))
        # Leave "processing" state; allow a retry while an image is loaded.
        self._reset_ai_button(self.current_image is not None)

    def _ai_enhance(self):
        """Send the current image to the cloud model for proofreading."""
        if self.current_image is None:
            QMessageBox.information(self, "提示", "请先进行截图或导入图片")
            return

        api_key = self.config.get("dashscope_api_key", "")
        if not api_key:
            api_key, ok = QInputDialog.getText(
                self, "API密钥",
                "请输入阿里云DashScope API Key:\n"
                "(可前往 https://dashscope.console.aliyun.com/ 获取)"
            )
            if not (ok and api_key):
                return
            self.config.set("dashscope_api_key", api_key)

        self._set_buttons_enabled(False)
        self.ai_enhance_btn.setEnabled(False)
        self.ai_enhance_btn.setText("⏳ 处理中...")

        self.ocr_engine.enhance_with_cloud(self.current_image, api_key)

    def _copy_text(self):
        """Copy the editor content to the clipboard."""
        text = self.editor.toPlainText()
        if text:
            QApplication.clipboard().setText(text)
            self.status_bar.showMessage("已复制到剪贴板", 2000)

    def _export_txt(self):
        """Save the editor content as a UTF-8 text file."""
        file_path, _ = QFileDialog.getSaveFileName(
            self, "保存TXT文件", "识别结果.txt", "文本文件 (*.txt)"
        )
        if not file_path:
            return

        try:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(self.editor.toPlainText())
        except OSError as e:
            # e.g. read-only location or file locked by another program.
            QMessageBox.warning(self, "错误", f"保存失败: {e}")
            return
        self.status_bar.showMessage(f"已保存: {file_path}", 3000)

    def _export_docx(self):
        """Save the editor content as a Word document, one paragraph per line."""
        try:
            from docx import Document
        except ImportError:
            QMessageBox.warning(self, "错误", "请安装python-docx: pip install python-docx")
            return

        file_path, _ = QFileDialog.getSaveFileName(
            self, "保存Word文件", "识别结果.docx", "Word文档 (*.docx)"
        )
        if not file_path:
            return

        try:
            doc = Document()
            for line in self.editor.toPlainText().split('\n'):
                doc.add_paragraph(line)
            doc.save(file_path)
        except OSError as e:
            QMessageBox.warning(self, "错误", f"保存失败: {e}")
            return
        self.status_bar.showMessage(f"已保存: {file_path}", 3000)

    def _update_status(self, message):
        """Show ``message`` in both the status label and the status bar."""
        self.status_label.setText(f"📋 {message}")
        self.status_bar.showMessage(message)

    def _set_buttons_enabled(self, enabled):
        """Enable or disable the copy and export buttons together."""
        self.copy_btn.setEnabled(enabled)
        self.export_txt_btn.setEnabled(enabled)
        self.export_docx_btn.setEnabled(enabled)

4.7 悬浮球组件 (ui/float_ball.py)

"""
悬浮球组件
"""

from PySide6.QtWidgets import QWidget, QLabel, QVBoxLayout, QMenu
from PySide6.QtCore import Qt, Signal
from PySide6.QtGui import QMouseEvent, QAction


class FloatBall(QWidget):
    """Always-on-top draggable ball used as the quick entry point.

    Left click emits ``screenshot_triggered``, double click emits
    ``show_main_triggered``, and right click opens a context menu.
    """

    screenshot_triggered = Signal()
    show_main_triggered = Signal()

    # Maximum cursor travel (manhattan distance, pixels) between press and
    # release for the gesture to count as a click rather than a drag.
    CLICK_TOLERANCE = 5

    def __init__(self):
        super().__init__()
        self.setWindowFlags(
            Qt.WindowStaysOnTopHint |
            Qt.FramelessWindowHint |
            Qt.Tool
        )
        self.setAttribute(Qt.WA_TranslucentBackground)

        self._drag_pos = None   # cursor offset from the window's top-left
        self._press_pos = None  # global cursor position at button press
        self._setup_ui()

        # Start near the right edge, vertically centred. The screen origin
        # is included so the position is also right on a non-primary screen.
        screen = self.screen().geometry()
        self.move(screen.x() + screen.width() - 100,
                  screen.y() + screen.height() // 2)

    def _setup_ui(self):
        """Create the round gradient label that forms the ball."""
        self.setFixedSize(56, 56)

        layout = QVBoxLayout(self)
        layout.setContentsMargins(0, 0, 0, 0)

        self.label = QLabel("T")
        self.label.setAlignment(Qt.AlignCenter)
        self.label.setStyleSheet("""
            QLabel {
                background: qlineargradient(x1:0, y1:0, x2:1, y2:1,
                    stop:0 #667eea, stop:1 #764ba2);
                color: white;
                border-radius: 28px;
                font-size: 20px;
                font-weight: bold;
                border: 2px solid rgba(255, 255, 255, 0.3);
            }
            QLabel:hover {
                background: qlineargradient(x1:0, y1:0, x2:1, y2:1,
                    stop:0 #764ba2, stop:1 #667eea);
            }
        """)

        layout.addWidget(self.label)

    def mousePressEvent(self, event: QMouseEvent):
        """Start a drag on left press; open the context menu on right press."""
        if event.button() == Qt.LeftButton:
            press_pos = event.globalPosition().toPoint()
            self._press_pos = press_pos
            self._drag_pos = press_pos - self.frameGeometry().topLeft()
        elif event.button() == Qt.RightButton:
            self._show_context_menu(event.globalPosition().toPoint())

    def mouseMoveEvent(self, event: QMouseEvent):
        """Move the ball with the cursor while the left button is held."""
        # Compare with None explicitly: the offset is a QPoint, and a press
        # exactly on the top-left corner yields the valid offset (0, 0).
        if event.buttons() == Qt.LeftButton and self._drag_pos is not None:
            self.move(event.globalPosition().toPoint() - self._drag_pos)

    def mouseReleaseEvent(self, event: QMouseEvent):
        """Emit ``screenshot_triggered`` when the gesture was a click.

        The travel is measured between the press and release cursor
        positions. Measuring against the window position cannot tell a drag
        from a click, because the window follows the cursor during a drag.
        """
        if event.button() == Qt.LeftButton and self._press_pos is not None:
            travelled = (event.globalPosition().toPoint() - self._press_pos).manhattanLength()
            if travelled < self.CLICK_TOLERANCE:
                self.screenshot_triggered.emit()
        self._drag_pos = None
        self._press_pos = None

    def mouseDoubleClickEvent(self, event: QMouseEvent):
        """Emit ``show_main_triggered`` on a left double click."""
        # NOTE(review): the first click of a double click has already emitted
        # screenshot_triggered from mouseReleaseEvent - confirm whether a
        # click delay is wanted to keep the two gestures apart.
        if event.button() == Qt.LeftButton:
            self.show_main_triggered.emit()

    def _quit_application(self):
        """Quit the whole application rather than just closing the ball."""
        from PySide6.QtWidgets import QApplication

        self.close()
        app = QApplication.instance()
        if app is not None:
            app.quit()

    def _show_context_menu(self, pos):
        """Show the context menu at global position ``pos``."""
        menu = QMenu()

        screenshot_action = QAction("📸 截图识别 (Ctrl+Shift+T)", menu)
        screenshot_action.triggered.connect(self.screenshot_triggered.emit)
        menu.addAction(screenshot_action)

        menu.addSeparator()

        show_main_action = QAction("📝 打开主窗口", menu)
        show_main_action.triggered.connect(self.show_main_triggered.emit)
        menu.addAction(show_main_action)

        menu.addSeparator()

        # Closing this Qt.Tool window alone leaves the process running, so
        # the "quit" entry asks the application itself to exit.
        quit_action = QAction("❌ 退出", menu)
        quit_action.triggered.connect(self._quit_application)
        menu.addAction(quit_action)

        menu.exec(pos)

4.8 截图工具 (ui/screenshot.py)

"""
截图工具
"""

from PySide6.QtWidgets import QWidget, QApplication, QRubberBand
from PySide6.QtCore import Qt, QRect, QPoint, Signal
from PySide6.QtGui import QPainter, QPen, QColor, QPixmap


class ScreenshotTool(QWidget):
    """Full-screen overlay for selecting a region of the primary screen.

    The primary screen is grabbed once in the constructor. The user drags a
    rectangle with the left button; on release the matching region is cropped
    from that grab and emitted via ``screenshot_taken``. A right click closes
    the overlay without emitting anything.
    """

    # Emitted with the cropped pixmap when a large-enough selection is released.
    screenshot_taken = Signal(QPixmap)

    def __init__(self):
        super().__init__()
        self.setWindowFlags(
            Qt.FramelessWindowHint |
            Qt.WindowStaysOnTopHint |
            Qt.Tool
        )
        self.setAttribute(Qt.WA_TranslucentBackground)
        self.setAttribute(Qt.WA_DeleteOnClose)

        # Grab the screen before the overlay is shown so the overlay itself
        # is not part of the captured image.
        screen = QApplication.primaryScreen()
        self.screenshot = screen.grabWindow(0)
        self.setGeometry(screen.geometry())

        self.origin = QPoint()
        # Explicit drag state: testing ``origin.isNull()`` would treat a press
        # at (0, 0) -- the screen's top-left corner -- as "no drag started".
        self._selecting = False
        self.rubber_band = QRubberBand(QRubberBand.Rectangle, self)

        self.showFullScreen()
        self.setCursor(Qt.CrossCursor)

    def paintEvent(self, event):
        """Paint a translucent dark mask with a clear, outlined selection area."""
        painter = QPainter(self)

        painter.setBrush(QColor(0, 0, 0, 100))
        painter.setPen(Qt.NoPen)
        painter.drawRect(self.rect())

        if not self.rubber_band.geometry().isNull():
            # Clear mode erases the mask inside the selection.
            painter.setCompositionMode(QPainter.CompositionMode_Clear)
            painter.drawRect(self.rubber_band.geometry())

            # Back to normal blending to draw the selection outline.
            painter.setCompositionMode(QPainter.CompositionMode_SourceOver)
            pen = QPen(QColor(102, 126, 234), 2)
            painter.setPen(pen)
            painter.setBrush(Qt.NoBrush)
            painter.drawRect(self.rubber_band.geometry())

    def mousePressEvent(self, event):
        """Begin a selection on left press; cancel the tool on right press."""
        if event.button() == Qt.LeftButton:
            self.origin = event.pos()
            self._selecting = True
            self.rubber_band.setGeometry(QRect(self.origin, self.origin))
            self.rubber_band.show()
        elif event.button() == Qt.RightButton:
            self.close()

    def mouseMoveEvent(self, event):
        """Stretch the rubber band from the press point to the cursor."""
        if self._selecting:
            self.rubber_band.setGeometry(QRect(self.origin, event.pos()).normalized())

    def mouseReleaseEvent(self, event):
        """Finish the selection: emit the crop if it is big enough, then close.

        Selections of 10 px or less in either dimension are treated as
        accidental and nothing is emitted.
        """
        if event.button() == Qt.LeftButton:
            self._selecting = False
            rect = self.rubber_band.geometry()
            if rect.width() > 10 and rect.height() > 10:
                cropped = self.screenshot.copy(self._to_pixmap_rect(rect))
                self.screenshot_taken.emit(cropped)
            self.close()

    def _to_pixmap_rect(self, rect):
        """Map a rectangle from widget coordinates to pixels of the grabbed pixmap.

        Widget geometry is expressed in logical coordinates, while
        ``QPixmap.copy`` addresses the pixmap's actual pixels. On a scaled
        display the grab has a device pixel ratio other than 1, so the
        selection must be scaled before cropping; at ratio 1 this is a no-op.
        """
        ratio = self.screenshot.devicePixelRatio()
        return QRect(
            round(rect.x() * ratio),
            round(rect.y() * ratio),
            round(rect.width() * ratio),
            round(rect.height() * ratio),
        )

    def capture(self):
        """Block until the overlay closes and return the selected QPixmap.

        Returns ``None`` when the overlay was closed without a valid
        selection (right click, or a selection that was too small).
        """
        self.result = None

        def on_screenshot(pixmap):
            self.result = pixmap

        self.screenshot_taken.connect(on_screenshot)

        self.show()
        # Pump the event loop by hand until the overlay is no longer visible.
        while self.isVisible():
            QApplication.processEvents()

        return self.result

4.9 配置管理 (utils/config.py)

"""
配置管理
"""

import os
import json
from pathlib import Path


class ConfigManager:
    """JSON-backed key/value settings stored in ``~/.textminer/config.json``.

    The file is read once on construction and rewritten on every ``set``.
    """

    # Used for a first run and whenever the stored file cannot be used.
    _DEFAULTS = {
        "usage_count": 0,
        "dashscope_api_key": "",
    }

    def __init__(self):
        self.config_dir = Path.home() / ".textminer"
        self.config_file = self.config_dir / "config.json"
        self._config = {}
        self._load()

    def _load(self):
        """Populate the in-memory config from disk.

        A missing file is created with the defaults. A file that cannot be
        read, is not valid JSON, or does not contain a JSON object is ignored
        and the defaults are used in memory instead; the file itself is left
        untouched until the next ``set``.
        """
        self.config_dir.mkdir(parents=True, exist_ok=True)

        if not self.config_file.exists():
            self._config = dict(self._DEFAULTS)
            self._save()
            return

        try:
            with open(self.config_file, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            loaded = None

        # Only a JSON object supports the key lookups done by get/set.
        self._config = loaded if isinstance(loaded, dict) else dict(self._DEFAULTS)

    def _save(self):
        """Write the config to disk without ever leaving a half-written file.

        The data is serialised to a sibling temp file first and then moved
        over the real file, so a failure during serialisation or writing
        leaves the previous config intact.

        Raises:
            OSError: if the file cannot be written or replaced.
            TypeError: if a stored value is not JSON-serialisable.
        """
        tmp_file = self.config_file.with_name(self.config_file.name + ".tmp")
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(self._config, f, ensure_ascii=False, indent=2)
        os.replace(tmp_file, self.config_file)

    def get(self, key, default=None):
        """Return the value stored under ``key``, or ``default`` if absent."""
        return self._config.get(key, default)

    def set(self, key, value):
        """Store ``value`` under ``key`` and persist the config immediately."""
        self._config[key] = value
        self._save()

4.10 打包脚本 (build.py)

"""
打包脚本 - 使用PyInstaller生成exe
"""

import os
import sys
import shutil
from pathlib import Path

def build():
    """Package TextMiner into an executable with PyInstaller.

    Removes any previous ``build``/``dist`` directories in the current
    working directory, then runs PyInstaller with the current interpreter.

    Returns:
        The PyInstaller process exit code (0 on success).
    """
    # Local import so this function does not depend on the file's import block.
    import subprocess

    print("开始打包 TextMiner...")

    for dir_name in ['build', 'dist']:
        if os.path.exists(dir_name):
            shutil.rmtree(dir_name)

    # An argument list (no shell) avoids quoting problems; running the module
    # through sys.executable uses the PyInstaller installed in this environment.
    cmd = [
        sys.executable, "-m", "PyInstaller",
        "--name=TextMiner",
        "--windowed",
        "--icon=resources/icon.png",
        # The SRC/DEST separator is platform dependent (';' on Windows, ':' elsewhere).
        f"--add-data=resources{os.pathsep}resources",
        "--hidden-import=paddleocr",
        "--hidden-import=paddle",
        "--hidden-import=sklearn",
        "--hidden-import=scipy",
        "--hidden-import=PySide6.QtNetwork",
        "--collect-all", "paddleocr",
        "--collect-all", "paddle",
        "--noconfirm",
        "main.py",
    ]

    print(f"执行命令: {' '.join(cmd)}")
    returncode = subprocess.call(cmd)
    if returncode != 0:
        # Do not claim success when PyInstaller failed.
        print(f"\n打包失败,退出码: {returncode}")
        return returncode

    print("\n打包完成!")
    print(f"输出目录: {Path.cwd() / 'dist' / 'TextMiner'}")
    return 0


if __name__ == "__main__":
    # Propagate the build result to the calling shell / CI.
    sys.exit(build())

五、使用说明

5.1 安装运行

# 1. 安装依赖
pip install -r requirements.txt

# 2. 运行程序
python main.py

5.2 功能操作

| 操作 | 说明 |
| --- | --- |
| Ctrl+Shift+T | 全局热键,随时截图识别 |
| 悬浮球单击 | 触发截图识别 |
| 悬浮球双击 | 打开主窗口 |
| 悬浮球右键 | 显示菜单 |
| 拖拽文件 | 支持PDF/图片/Word/TXT直接拖入 |
| AI精校 | 置信度低时,调用云端AI提升准确率 |

5.3 打包成EXE

python build.py

六、效果展示

6.1 悬浮球

在这里插入图片描述

6.2 截图识别

在这里插入图片描述

6.3 主界面

在这里插入图片描述

如果你也在为OCR工具烦恼,不妨试试 TextMiner,或者基于源码进行二次开发。如果觉得有用,欢迎在评论区交流讨论!

Logo

AtomGit 是由开放原子开源基金会联合 CSDN 等生态伙伴共同推出的新一代开源与人工智能协作平台。平台坚持“开放、中立、公益”的理念,把代码托管、模型共享、数据集托管、智能体开发体验和算力服务整合在一起,为开发者提供从开发、训练到部署的一站式体验。

更多推荐