TextMiner 文字矿工 - 免费OCR文字识别工具开发全记录(附完整源码)
·
在日常工作和学习中,我们经常遇到需要从图片、PDF、扫描件中提取文字的场景。市面上的OCR工具要么收费昂贵,要么识别效果不理想。于是我决定自己开发一款免费、高效、不挑格式的桌面OCR工具——TextMiner(文字矿工)。
本文将完整记录从产品规划到代码实现的全过程,并提供完整可运行的源码。
一、产品定位
1.1 一句话介绍
一个不挑格式、不挑网络、无感精准提取任何可见文字的桌面效率工具。
1.2 核心差异化
- 本地AI优先 + 云端大模型兜底的混合双擎策略
- 保证免费额度下的最高识别准确率
- 安装包 < 200MB,轻量级桌面应用
1.3 适用场景
| 用户角色 | 核心痛点 | 解决方案 |
|---|---|---|
| 大学生/研究生 | PDF文献加密不能复制 | 截图即转Word,保留段落缩进 |
| 行政/文员 | 扫描件需要重打成Word | 表格识别还原,自动滤除公章 |
| 程序员/极客 | 代码报错截图无法复制 | 全局热键框选,3秒粘贴 |
| 自媒体编辑 | 海报文案想引用 | 后台静默识别 |
二、技术选型
| 技术栈 | 用途 | 选型理由 |
|---|---|---|
| PySide6 | GUI框架 | Qt的Python绑定,界面美观 |
| PaddleOCR | 本地OCR引擎 | 百度开源,中文识别效果好 |
| PyMuPDF | PDF解析 | 轻量高效,支持文字层提取 |
| python-docx | Word处理 | 读写docx文档 |
| keyboard | 全局热键 | 实现Ctrl+Shift+T截图 |
| Qwen VL API | 云端AI精校 | 置信度低时兜底 |
三、项目结构
textminer/
├── main.py # 程序入口
├── requirements.txt # 依赖包
├── build.py # 打包脚本
├── core/
│ ├── __init__.py
│ ├── ocr_engine.py # OCR识别引擎
│ ├── file_parser.py # 文件解析器
│ └── hotkey.py # 全局热键管理
├── ui/
│ ├── __init__.py
│ ├── main_window.py # 主窗口
│ ├── float_ball.py # 悬浮球组件
│ └── screenshot.py # 截图工具
├── utils/
│ ├── __init__.py
│ └── config.py # 配置管理
└── resources/
└── icon.png # 程序图标
四、核心代码实现
4.1 环境依赖 (requirements.txt)
paddlepaddle==2.6.2
paddleocr==2.7.3
PySide6==6.6.1
PyMuPDF==1.23.8
python-docx==1.1.0
Pillow==10.1.0
keyboard==0.13.5
pynput==1.7.6
opencv-python-headless==4.6.0.66
requests==2.31.0
4.2 程序入口 (main.py)
"""
TextMiner - 文字矿工
一个不挑格式、不挑网络的桌面OCR效率工具
"""
import sys
import os
from PySide6.QtWidgets import QApplication
from PySide6.QtCore import Qt, QTimer
from PySide6.QtGui import QIcon, QFont
from ui.main_window import MainWindow
from ui.float_ball import FloatBall
from core.hotkey import HotkeyManager
from utils.config import ConfigManager
class TextMinerApp:
    """Top-level application controller for TextMiner.

    Creates the QApplication, the main window, the floating ball and the
    global hotkey manager, wires their signals together and keeps the
    persistent usage counter up to date.
    """

    # Number of recognitions at which the achievement dialog is shown.
    ACHIEVEMENT_THRESHOLD = 100

    def __init__(self):
        self.app = QApplication(sys.argv)
        self.app.setApplicationName("TextMiner")
        self.app.setOrganizationName("TextMiner")

        # Application-wide default font.
        self.app.setFont(QFont("Microsoft YaHei", 10))

        # Persistent configuration.
        self.config = ConfigManager()

        # Main window (created hidden; shown on demand).
        self.main_window = MainWindow(self.config)

        # Floating ball.
        self.float_ball = FloatBall()
        self.float_ball.screenshot_triggered.connect(self.start_screenshot)
        self.float_ball.show_main_triggered.connect(self.show_main_window)

        # Global hotkey. The signal is relayed through the float ball's own
        # signal with a queued connection, so the screenshot handler always
        # runs on the thread the float ball lives in, whichever thread the
        # hotkey callback fires on.
        self.hotkey_manager = HotkeyManager()
        self.hotkey_manager.hotkey_triggered.connect(
            self.float_ball.screenshot_triggered, Qt.QueuedConnection
        )
        self.hotkey_manager.start()

        # Usage counter. The achievement check is deliberately NOT run here:
        # doing so re-showed the dialog on every launch while the stored
        # count was exactly at the threshold.
        self.usage_count = self.config.get("usage_count", 0)

    def start_screenshot(self):
        """Hide the float ball, then start a capture shortly afterwards."""
        self.float_ball.hide()
        # Give the window system a moment to actually remove the ball so it
        # does not end up inside the captured image.
        QTimer.singleShot(100, self._do_screenshot)

    def _do_screenshot(self):
        """Run the capture overlay and hand the result to the main window."""
        from ui.screenshot import ScreenshotTool

        try:
            pixmap = ScreenshotTool().capture()
            if pixmap:
                self.main_window.process_screenshot(pixmap)
                self.usage_count += 1
                self.config.set("usage_count", self.usage_count)
                self.check_achievement()
        finally:
            # Always bring the ball back, even if processing raised.
            self.float_ball.show()

    def show_main_window(self):
        """Show the main window and bring it to the foreground."""
        self.main_window.show()
        self.main_window.raise_()
        self.main_window.activateWindow()

    def check_achievement(self):
        """Show the achievement dialog when the usage count hits the threshold."""
        if self.usage_count == self.ACHIEVEMENT_THRESHOLD:
            from PySide6.QtWidgets import QMessageBox

            QMessageBox.information(
                None,
                "🏆 成就解锁",
                "恭喜!您已使用TextMiner识别100次!\n"
                "您已节省了约50页打印纸 🌲"
            )

    def run(self):
        """Show the float ball and run the Qt event loop.

        Returns:
            The exit code returned by ``QApplication.exec()``.
        """
        self.float_ball.show()
        try:
            return self.app.exec()
        finally:
            # Release the global hotkey hook before the process exits.
            self.hotkey_manager.stop()


if __name__ == "__main__":
    app = TextMinerApp()
    sys.exit(app.run())
4.3 OCR识别引擎 (core/ocr_engine.py)
"""
OCR识别引擎 - 本地PaddleOCR + 云端备胎策略
"""
import os
import re
from PIL import Image
import numpy as np
import cv2
from PySide6.QtCore import QObject, Signal, QThread
class _CloudWorker(QThread):
    """Background thread that sends one image to the Qwen VL HTTP API."""

    progress = Signal(str)
    # NOTE(review): this declaration shadows QThread's own ``finished`` signal;
    # it is kept because OCREngine forwards it as (text, confidence).
    finished = Signal(str, float)
    error = Signal(str)

    API_URL = "https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation"
    PROMPT = "请精确识别并提取图片中的所有文字内容,保持原有的段落格式和换行。只输出文字内容,不要添加任何解释。"
    # The API reports no confidence value, so a fixed figure is emitted.
    CLOUD_CONFIDENCE = 0.95

    def __init__(self, image, api_key):
        super().__init__()
        self.image = image
        self.api_key = api_key

    @staticmethod
    def _extract_text(content):
        """Flatten the message ``content`` field into a plain string.

        Accepts either a plain string or a list of parts, where each part is
        a ``{"text": ...}`` dict or a string.
        """
        if isinstance(content, str):
            return content
        parts = []
        for item in content or []:
            if isinstance(item, dict):
                piece = item.get("text")
                if piece:
                    parts.append(str(piece))
            elif isinstance(item, str):
                parts.append(item)
        return "".join(parts)

    def run(self):
        """Encode the image, call the API and emit ``finished`` or ``error``."""
        try:
            import base64
            from io import BytesIO

            import requests

            # Image -> base64-encoded PNG.
            buffered = BytesIO()
            self.image.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()

            headers = {
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            }
            payload = {
                "model": "qwen-vl-plus",
                "input": {
                    "messages": [
                        {
                            "role": "user",
                            "content": [
                                {"image": f"data:image/png;base64,{img_base64}"},
                                {"text": self.PROMPT}
                            ]
                        }
                    ]
                }
            }
            self.progress.emit("云端AI正在处理...")
            response = requests.post(self.API_URL, headers=headers, json=payload, timeout=30)
            if response.status_code != 200:
                self.error.emit(f"云端API错误: {response.text}")
                return
            data = response.json()
            content = data["output"]["choices"][0]["message"]["content"]
            self.finished.emit(self._extract_text(content), self.CLOUD_CONFIDENCE)
        except Exception as e:
            # Thread boundary: report every failure through the error signal.
            self.error.emit(f"云端调用失败: {str(e)}")


class OCREngine(QObject):
    """OCR engine: local PaddleOCR recognition with an optional cloud pass.

    Signals:
        progress(str): human-readable progress message.
        finished(str, float): recognised text and its confidence.
        error(str): error message.
    """

    progress = Signal(str)
    finished = Signal(str, float)
    error = Signal(str)

    def __init__(self):
        super().__init__()
        self._ocr = None
        self._initialized = False

    def initialize(self):
        """Load the PaddleOCR model once.

        Returns:
            True if the engine is ready, False if loading failed (the reason
            is emitted through ``error``).
        """
        if self._initialized:
            return True
        try:
            self.progress.emit("正在加载本地OCR引擎...")
            from paddleocr import PaddleOCR

            self._ocr = PaddleOCR(
                use_angle_cls=True,
                lang='ch',
                use_gpu=False,  # force CPU mode
                show_log=False,
                use_mp=False
            )
            self._initialized = True
            self.progress.emit("本地OCR引擎就绪")
            return True
        except Exception as e:
            self.error.emit(f"OCR初始化失败: {str(e)}")
            return False

    def recognize(self, image):
        """Recognise the text in an image and emit the result.

        Args:
            image: a ``PIL.Image`` or a numpy array; arrays are passed to
                PaddleOCR unchanged.

        Emits ``finished(text, avg_confidence)`` on success (empty text and
        0.0 when nothing was found) or ``error(message)`` on failure.
        """
        if not self._initialized and not self.initialize():
            return
        try:
            if isinstance(image, Image.Image):
                # Normalise to 3-channel RGB first: grayscale and palette
                # images would otherwise produce a 2-D array that
                # COLOR_RGB2BGR cannot convert.
                image = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2BGR)

            self.progress.emit("正在解析文字...")
            result = self._ocr.ocr(image, cls=True)
            if not result or not result[0]:
                self.finished.emit("", 0.0)
                return

            # Each entry is indexed as entry[1] == (text, confidence).
            texts = []
            confidences = []
            for entry in result[0]:
                text, confidence = entry[1][0], entry[1][1]
                if text and text.strip():
                    texts.append(text)
                    confidences.append(confidence)

            full_text = "\n".join(texts)
            avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
            self.progress.emit(f"识别完成,置信度: {avg_confidence:.1%}")
            self.finished.emit(full_text, avg_confidence)
        except Exception as e:
            self.error.emit(f"识别失败: {str(e)}")

    def enhance_with_cloud(self, image, api_key=None):
        """Re-recognise an image with the cloud Qwen VL API in a worker thread.

        Args:
            image: ``PIL.Image`` to send.
            api_key: DashScope API key; when None, the ``DASHSCOPE_API_KEY``
                environment variable is used.

        Results arrive asynchronously through this engine's ``progress``,
        ``finished`` and ``error`` signals.
        """
        if api_key is None:
            api_key = os.environ.get("DASHSCOPE_API_KEY", "")
        if not api_key:
            self.error.emit("请设置云端API密钥")
            return
        self.progress.emit("正在调用云端AI精校...")

        # Keep a reference on self so the thread object outlives this call.
        self._cloud_worker = _CloudWorker(image, api_key)
        self._cloud_worker.progress.connect(self.progress)
        self._cloud_worker.finished.connect(self.finished)
        self._cloud_worker.error.connect(self.error)
        self._cloud_worker.start()
class OCRWorker(QThread):
    """Worker thread that runs ``engine.recognize(image)`` off the caller's thread.

    While running, the engine's ``progress`` / ``finished`` / ``error`` signals
    are relayed through this worker's signals of the same names.
    """

    progress = Signal(str)
    # NOTE(review): shadows QThread's own ``finished`` signal; kept because
    # callers connect to it with the (text, confidence) signature.
    finished = Signal(str, float)
    error = Signal(str)

    def __init__(self, engine, image):
        super().__init__()
        self.engine = engine
        self.image = image

    def run(self):
        """Relay the engine's signals for the duration of one recognition."""
        relays = (
            (self.engine.progress, self.progress),
            (self.engine.finished, self.finished),
            (self.engine.error, self.error),
        )
        for source, target in relays:
            source.connect(target)
        try:
            self.engine.recognize(self.image)
        finally:
            # Remove the relays again. Otherwise a finished worker keeps
            # re-emitting every later signal of the shared engine.
            for source, target in relays:
                try:
                    source.disconnect(target)
                except (RuntimeError, TypeError):
                    # Already disconnected; nothing left to undo.
                    pass
4.4 文件解析器 (core/file_parser.py)
"""
文件解析器 - 智能分流处理各类文档
"""
import os
from PySide6.QtCore import QObject, Signal
from PIL import Image
class FileParser(QObject):
    """Route a file to the right extractor based on its extension.

    Signals:
        progress(str): human-readable progress message.
        finished(str, object): extracted text, and an image when the content
            still needs OCR (text is then empty); the image is None otherwise.
        error(str): error message.
    """

    progress = Signal(str)
    finished = Signal(str, object)
    error = Signal(str)

    # Extension -> handler kind; the kind selects the ``_parse_<kind>`` method.
    SUPPORTED_EXTENSIONS = {
        '.txt': 'text',
        '.md': 'text',
        '.py': 'text',
        '.json': 'text',
        '.xml': 'text',
        '.csv': 'text',
        '.jpg': 'image',
        '.jpeg': 'image',
        '.png': 'image',
        '.bmp': 'image',
        '.gif': 'image',
        '.pdf': 'pdf',
        '.docx': 'docx',
    }

    def parse(self, file_path):
        """Parse ``file_path`` and emit ``finished`` or ``error``."""
        ext = os.path.splitext(file_path)[1].lower()
        file_type = self.SUPPORTED_EXTENSIONS.get(ext)
        if file_type is None:
            self.error.emit(f"不支持的文件格式: {ext}")
            return
        handler = getattr(self, f"_parse_{file_type}")
        try:
            handler(file_path)
        except Exception as e:
            self.error.emit(f"解析失败: {str(e)}")

    def _parse_text(self, file_path):
        """Read a plain-text file, trying several encodings in order."""
        self.progress.emit("正在读取文本文件...")
        # 'utf-8-sig' also reads BOM-less UTF-8 and strips a leading BOM.
        # 'latin-1' accepts any byte sequence, so it acts as the last resort.
        encodings = ['utf-8-sig', 'gbk', 'gb2312', 'latin-1']
        for encoding in encodings:
            try:
                with open(file_path, 'r', encoding=encoding) as f:
                    content = f.read()
            except UnicodeDecodeError:
                continue
            self.progress.emit("文本读取完成")
            self.finished.emit(content, None)
            return
        self.error.emit("无法识别文件编码")

    def _parse_image(self, file_path):
        """Load an image file and hand it on for OCR."""
        self.progress.emit("正在加载图片...")
        image = Image.open(file_path)
        # Decode now so read errors surface here rather than later in the
        # OCR thread.
        image.load()
        self.progress.emit("图片加载完成,准备OCR识别")
        self.finished.emit("", image)

    def _parse_pdf(self, file_path):
        """Extract the text layer of a PDF, or render a scanned page for OCR.

        NOTE(review): as soon as a page without text is found, that single
        page is emitted as an image and parsing stops; text collected from
        earlier pages is not emitted.
        """
        try:
            import fitz  # PyMuPDF
        except ImportError:
            self.error.emit("请安装PyMuPDF: pip install PyMuPDF")
            return
        from io import BytesIO

        try:
            self.progress.emit("正在解析PDF文件...")
            text_content = []
            # The context manager closes the document on every exit path.
            with fitz.open(file_path) as doc:
                total = len(doc)
                for page_num, page in enumerate(doc):
                    self.progress.emit(f"正在处理第 {page_num + 1}/{total} 页...")
                    page_text = page.get_text()
                    if page_text.strip():
                        text_content.append(page_text)
                        continue
                    # No text layer: render the page and send it to OCR.
                    pix = page.get_pixmap(dpi=150)
                    image = Image.open(BytesIO(pix.tobytes("png")))
                    image.load()
                    self.progress.emit(f"第{page_num + 1}页为扫描件,需OCR识别")
                    self.finished.emit("", image)
                    return
            if text_content:
                self.progress.emit("PDF文字提取完成")
                self.finished.emit("\n\n".join(text_content), None)
            else:
                self.error.emit("PDF无法提取文字")
        except Exception as e:
            self.error.emit(f"PDF解析失败: {str(e)}")

    def _parse_docx(self, file_path):
        """Extract paragraph text, then table rows, from a Word document."""
        try:
            from docx import Document
        except ImportError:
            self.error.emit("请安装python-docx: pip install python-docx")
            return
        try:
            self.progress.emit("正在解析Word文档...")
            doc = Document(file_path)
            lines = [para.text for para in doc.paragraphs if para.text.strip()]
            for table in doc.tables:
                for row in table.rows:
                    cells = [cell.text.strip() for cell in row.cells if cell.text.strip()]
                    if cells:
                        lines.append(" | ".join(cells))
            self.progress.emit("Word文档解析完成")
            self.finished.emit("\n".join(lines), None)
        except Exception as e:
            self.error.emit(f"Word解析失败: {str(e)}")
4.5 全局热键 (core/hotkey.py)
"""
全局热键管理
"""
import threading
from PySide6.QtCore import QObject, Signal
try:
    import keyboard
    KEYBOARD_AVAILABLE = True
except ImportError:
    KEYBOARD_AVAILABLE = False


class HotkeyManager(QObject):
    """Register a global hotkey and emit ``hotkey_triggered`` when it fires.

    Args:
        hotkey: hotkey description passed to the ``keyboard`` library.
    """

    hotkey_triggered = Signal()

    def __init__(self, hotkey="ctrl+shift+t"):
        super().__init__()
        self.hotkey = hotkey
        self._running = False
        # Kept for backward compatibility; no dedicated thread is spawned any
        # more because the hotkey is registered directly in start().
        self._thread = None
        # Handle returned by keyboard.add_hotkey, used to remove only our hook.
        self._handle = None

    def start(self):
        """Register the hotkey; does nothing if already started."""
        if not KEYBOARD_AVAILABLE:
            print("警告: keyboard库未安装,热键功能不可用")
            return
        if self._running:
            return
        try:
            self._handle = keyboard.add_hotkey(self.hotkey, self._on_hotkey)
        except Exception as e:
            # Registration can fail for an invalid hotkey string or missing
            # OS permissions; the app keeps working without the hotkey.
            print(f"热键监听错误: {e}")
            return
        self._running = True

    def stop(self):
        """Unregister the hotkey registered by start()."""
        if not self._running:
            return
        self._running = False
        handle, self._handle = self._handle, None
        if handle is None:
            return
        try:
            # Remove only this hotkey instead of unhooking every keyboard
            # hook in the process.
            keyboard.remove_hotkey(handle)
        except (KeyError, ValueError):
            # Already removed elsewhere; nothing left to do.
            pass

    def _on_hotkey(self):
        """Callback passed to the keyboard library; forwards as a Qt signal."""
        self.hotkey_triggered.emit()
4.6 主窗口 (ui/main_window.py)
"""
主窗口
"""
import os
from io import BytesIO
from PySide6.QtWidgets import (
QMainWindow, QWidget, QVBoxLayout, QHBoxLayout,
QTextEdit, QPushButton, QLabel, QProgressBar,
QFileDialog, QMessageBox, QFrame,
QToolBar, QStatusBar, QInputDialog, QApplication
)
from PySide6.QtCore import Qt, Signal, QThread, QTimer, QBuffer, QByteArray, QIODevice
from PySide6.QtGui import QAction, QDragEnterEvent, QDropEvent, QPixmap
from core.ocr_engine import OCREngine, OCRWorker
from core.file_parser import FileParser
class DropArea(QFrame):
    """Frame that accepts dropped files and emits the first local file path."""

    file_dropped = Signal(str)

    # Style while idle (with a hover highlight).
    _IDLE_STYLE = """
        DropArea {
            border: 2px dashed #ccc;
            border-radius: 8px;
            background: #f9f9f9;
        }
        DropArea:hover {
            border-color: #667eea;
            background: #f0f3ff;
        }
    """
    # Style while a drag carrying URLs hovers over the frame.
    _ACTIVE_STYLE = """
        DropArea {
            border: 2px solid #667eea;
            border-radius: 8px;
            background: #e8eeff;
        }
    """

    def __init__(self):
        super().__init__()
        self.setAcceptDrops(True)
        self.setFrameStyle(QFrame.StyledPanel | QFrame.Sunken)
        self.setMinimumHeight(100)
        self.setMaximumHeight(150)

        layout = QVBoxLayout(self)
        layout.setAlignment(Qt.AlignCenter)
        self.label = QLabel("📁 拖拽文件到此处\n支持: PDF, JPG, PNG, Word, TXT")
        self.label.setAlignment(Qt.AlignCenter)
        self.label.setStyleSheet("""
            QLabel {
                color: #666;
                font-size: 14px;
                padding: 20px;
            }
        """)
        layout.addWidget(self.label)
        self.setStyleSheet(self._IDLE_STYLE)

    def dragEnterEvent(self, event: QDragEnterEvent):
        """Accept drags that carry URLs and switch to the active style."""
        if event.mimeData().hasUrls():
            event.acceptProposedAction()
            self.setStyleSheet(self._ACTIVE_STYLE)
        else:
            event.ignore()

    def dragLeaveEvent(self, event):
        """Restore the idle style when the drag leaves the frame."""
        self.setStyleSheet(self._IDLE_STYLE)

    def dropEvent(self, event: QDropEvent):
        """Emit the first dropped URL that maps to a local file path."""
        self.setStyleSheet(self._IDLE_STYLE)
        for url in event.mimeData().urls():
            file_path = url.toLocalFile()
            # toLocalFile() returns an empty string for non-local URLs.
            if file_path:
                event.acceptProposedAction()
                self.file_dropped.emit(file_path)
                return
        event.ignore()
class MainWindow(QMainWindow):
    """Main window: file/screenshot input, result editor and export actions.

    Args:
        config: configuration object providing ``get(key, default)`` and
            ``set(key, value)``.
    """

    _AI_BUTTON_TEXT = "🤖 AI精校"

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.ocr_engine = OCREngine()
        self.file_parser = FileParser()
        self.current_image = None
        self.ocr_worker = None
        self._setup_ui()
        self._setup_connections()
        # Defer model loading until after the window has been constructed.
        QTimer.singleShot(100, self._init_ocr)

    # ------------------------------------------------------------------ UI

    def _setup_ui(self):
        """Build the widgets and layouts."""
        self.setWindowTitle("TextMiner - 文字矿工")
        self.setGeometry(100, 100, 900, 700)
        self.setStyleSheet("""
            QMainWindow { background: #fff; }
            QToolBar {
                background: #f5f5f5;
                border: none;
                border-bottom: 1px solid #ddd;
                spacing: 5px;
                padding: 5px;
            }
            QPushButton {
                background: #667eea;
                color: white;
                border: none;
                padding: 8px 16px;
                border-radius: 4px;
                font-weight: bold;
            }
            QPushButton:hover { background: #764ba2; }
            QPushButton:disabled { background: #ccc; }
            QTextEdit {
                border: 1px solid #ddd;
                border-radius: 4px;
                padding: 10px;
                font-family: 'Consolas', 'Microsoft YaHei', monospace;
                font-size: 12pt;
            }
            QStatusBar { background: #f5f5f5; }
        """)

        central_widget = QWidget()
        self.setCentralWidget(central_widget)
        main_layout = QVBoxLayout(central_widget)
        main_layout.setContentsMargins(10, 10, 10, 10)
        main_layout.setSpacing(10)

        self._create_toolbar()

        self.drop_area = DropArea()
        main_layout.addWidget(self.drop_area)

        self.progress_bar = QProgressBar()
        self.progress_bar.setVisible(False)
        main_layout.addWidget(self.progress_bar)

        self.status_label = QLabel("就绪")
        self.status_label.setStyleSheet("color: #666; padding: 5px;")
        main_layout.addWidget(self.status_label)

        self.editor = QTextEdit()
        self.editor.setPlaceholderText("识别结果将显示在这里...")
        main_layout.addWidget(self.editor)

        button_layout = QHBoxLayout()
        self.ai_enhance_btn = QPushButton(self._AI_BUTTON_TEXT)
        self.ai_enhance_btn.setEnabled(False)
        self.ai_enhance_btn.setToolTip("使用云端AI提升识别准确度")
        button_layout.addWidget(self.ai_enhance_btn)
        button_layout.addStretch()

        self.copy_btn = QPushButton("📋 复制全文")
        self.export_txt_btn = QPushButton("💾 导出TXT")
        self.export_docx_btn = QPushButton("📄 导出Word")
        for button in (self.copy_btn, self.export_txt_btn, self.export_docx_btn):
            button.setEnabled(False)
            button_layout.addWidget(button)
        main_layout.addLayout(button_layout)

        self.status_bar = QStatusBar()
        self.setStatusBar(self.status_bar)
        self.confidence_label = QLabel("")
        self.status_bar.addPermanentWidget(self.confidence_label)

    def _create_toolbar(self):
        """Create the toolbar with open / screenshot / clear actions."""
        toolbar = QToolBar()
        self.addToolBar(toolbar)

        open_action = QAction("📂 打开文件", self)
        open_action.triggered.connect(self._open_file)
        toolbar.addAction(open_action)

        screenshot_action = QAction("📸 截图", self)
        screenshot_action.triggered.connect(self._screenshot_from_main)
        toolbar.addAction(screenshot_action)

        toolbar.addSeparator()

        clear_action = QAction("🗑️ 清空", self)
        clear_action.triggered.connect(lambda: self.editor.clear())
        toolbar.addAction(clear_action)

    def _setup_connections(self):
        """Wire parser, engine and button signals to their handlers."""
        self.drop_area.file_dropped.connect(self.process_file)

        self.file_parser.progress.connect(self._update_status)
        self.file_parser.finished.connect(self._on_parse_finished)
        self.file_parser.error.connect(self._on_error)

        # The engine's signals are the single source of OCR results, for
        # both the local worker and the cloud pass.
        self.ocr_engine.progress.connect(self._update_status)
        self.ocr_engine.finished.connect(self._on_ocr_finished)
        self.ocr_engine.error.connect(self._on_error)

        self.ai_enhance_btn.clicked.connect(self._ai_enhance)
        self.copy_btn.clicked.connect(self._copy_text)
        self.export_txt_btn.clicked.connect(self._export_txt)
        self.export_docx_btn.clicked.connect(self._export_docx)

    # --------------------------------------------------------------- input

    def _init_ocr(self):
        """Load the local OCR engine and report readiness."""
        self._update_status("正在初始化OCR引擎...")
        # NOTE(review): this call runs in the same thread as the UI.
        if self.ocr_engine.initialize():
            # Only claim readiness on success, so a failure message emitted
            # by the engine is not overwritten.
            self._update_status("就绪")

    def _open_file(self):
        """Ask for a file and process it."""
        file_path, _ = QFileDialog.getOpenFileName(
            self, "选择文件", "",
            "所有支持格式 (*.pdf *.jpg *.jpeg *.png *.bmp *.gif *.txt *.md *.py *.json *.xml *.csv *.docx);;"
            "PDF文件 (*.pdf);;图片文件 (*.jpg *.jpeg *.png *.bmp *.gif);;"
            "文本文件 (*.txt *.md *.py *.json *.xml *.csv);;Word文档 (*.docx)"
        )
        if file_path:
            self.process_file(file_path)

    def _screenshot_from_main(self):
        """Hide the window, then start a capture shortly afterwards."""
        self.hide()
        QTimer.singleShot(100, self._do_screenshot)

    def _do_screenshot(self):
        """Run the capture overlay and process the selected region."""
        from ui.screenshot import ScreenshotTool

        try:
            pixmap = ScreenshotTool().capture()
            if pixmap:
                self.process_screenshot(pixmap)
        finally:
            # Always restore the window, even if the capture raised.
            self.show()

    def process_file(self, file_path):
        """Reset the view and parse ``file_path``."""
        self.editor.clear()
        self.current_image = None
        self._reset_ai_button(enabled=False)
        self._set_buttons_enabled(False)
        self._update_status(f"正在处理: {os.path.basename(file_path)}")
        self.file_parser.parse(file_path)

    def process_screenshot(self, pixmap):
        """Convert a captured QPixmap to a PIL image and start OCR on it."""
        self.editor.clear()
        self._set_buttons_enabled(False)
        self._reset_ai_button(enabled=False)
        from PIL import Image

        try:
            # QPixmap -> PNG bytes in memory -> PIL image.
            byte_array = QByteArray()
            buffer = QBuffer(byte_array)
            if not buffer.open(QIODevice.WriteOnly):
                self._on_error("无法创建内存缓冲区")
                return
            saved = pixmap.save(buffer, "PNG")
            buffer.close()
            if not saved:
                self._on_error("截图保存失败")
                return
            png_data = bytes(byte_array.data())
            if not png_data:
                self._on_error("截图数据为空")
                return
            image = Image.open(BytesIO(png_data))
            image.load()
            self._start_ocr(self._flatten_to_rgb(image), "正在识别截图...")
        except Exception as e:
            import traceback
            traceback.print_exc()
            self._on_error(f"截图处理失败: {str(e)}")

    @staticmethod
    def _flatten_to_rgb(image):
        """Return ``image`` as RGB, compositing transparency onto white."""
        from PIL import Image

        if image.mode in ('RGBA', 'LA', 'P'):
            rgba = image.convert('RGBA')
            background = Image.new('RGB', rgba.size, (255, 255, 255))
            background.paste(rgba, mask=rgba.getchannel('A'))
            return background
        if image.mode != 'RGB':
            return image.convert('RGB')
        return image

    def _start_ocr(self, image, status_message):
        """Start a background OCR run unless one is still in progress."""
        if self.ocr_worker is not None and self.ocr_worker.isRunning():
            # Replacing a running QThread object would destroy it mid-run.
            self._update_status("上一次识别尚未完成,请稍候")
            return
        self.current_image = image
        self._update_status(status_message)
        # Results arrive through the engine's own signals, connected once in
        # _setup_connections. Wiring the worker's relayed signals as well
        # made every result and error be handled twice.
        self.ocr_worker = OCRWorker(self.ocr_engine, image)
        self.ocr_worker.start()

    # ------------------------------------------------------------ handlers

    def _on_parse_finished(self, text, image):
        """Show parsed text, or start OCR when the parser returned an image."""
        if text:
            self.editor.setPlainText(text)
            self._set_buttons_enabled(True)
            self._update_status("解析完成")
        elif image is not None:
            self._start_ocr(image, "正在OCR识别...")
        else:
            self._on_error("无法解析文件内容")

    def _on_ocr_finished(self, text, confidence):
        """Show the OCR result and its confidence."""
        self.editor.setPlainText(text)
        self._set_buttons_enabled(True)
        self._reset_ai_button(enabled=True)
        self.confidence_label.setText(f"置信度: {confidence:.1%}")
        if confidence < 0.85:
            self._update_status("识别完成,置信度较低,建议使用AI精校")
            self.status_bar.showMessage("💡 点击「AI精校」可提升识别准确度", 5000)
        else:
            self._update_status("识别完成")

    def _on_error(self, error_msg):
        """Report an error and restore the controls to a usable state."""
        self._update_status(f"错误: {error_msg}")
        # Restore the AI button first so the UI is not left showing
        # "处理中" while the modal dialog is open.
        self._reset_ai_button(enabled=self.current_image is not None)
        QMessageBox.warning(self, "错误", error_msg)
        # Copy/export only make sense when there is text to act on.
        self._set_buttons_enabled(bool(self.editor.toPlainText()))

    def _ai_enhance(self):
        """Send the current image to the cloud API for a second pass."""
        if self.current_image is None:
            QMessageBox.information(self, "提示", "请先进行截图或导入图片")
            return
        # Same environment-variable fallback that the engine uses.
        api_key = (self.config.get("dashscope_api_key", "")
                   or os.environ.get("DASHSCOPE_API_KEY", ""))
        if not api_key:
            api_key, ok = QInputDialog.getText(
                self, "API密钥",
                "请输入阿里云DashScope API Key:\n"
                "(可前往 https://dashscope.console.aliyun.com/ 获取)"
            )
            api_key = api_key.strip()
            if not (ok and api_key):
                return
            self.config.set("dashscope_api_key", api_key)
        self._set_buttons_enabled(False)
        self.ai_enhance_btn.setEnabled(False)
        self.ai_enhance_btn.setText("⏳ 处理中...")
        self.ocr_engine.enhance_with_cloud(self.current_image, api_key)

    def _copy_text(self):
        """Copy the editor text to the clipboard."""
        text = self.editor.toPlainText()
        if text:
            QApplication.clipboard().setText(text)
            self.status_bar.showMessage("已复制到剪贴板", 2000)

    def _export_txt(self):
        """Save the editor text as a UTF-8 text file."""
        file_path, _ = QFileDialog.getSaveFileName(
            self, "保存TXT文件", "识别结果.txt", "文本文件 (*.txt)"
        )
        if not file_path:
            return
        try:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(self.editor.toPlainText())
        except OSError as e:
            QMessageBox.warning(self, "错误", f"保存失败: {e}")
            return
        self.status_bar.showMessage(f"已保存: {file_path}", 3000)

    def _export_docx(self):
        """Save the editor text as a Word document, one paragraph per line."""
        try:
            from docx import Document
        except ImportError:
            QMessageBox.warning(self, "错误", "请安装python-docx: pip install python-docx")
            return
        file_path, _ = QFileDialog.getSaveFileName(
            self, "保存Word文件", "识别结果.docx", "Word文档 (*.docx)"
        )
        if not file_path:
            return
        try:
            doc = Document()
            for line in self.editor.toPlainText().split('\n'):
                doc.add_paragraph(line)
            doc.save(file_path)
        except OSError as e:
            QMessageBox.warning(self, "错误", f"保存失败: {e}")
            return
        self.status_bar.showMessage(f"已保存: {file_path}", 3000)

    # ------------------------------------------------------------- helpers

    def _update_status(self, message):
        """Show ``message`` in both the status label and the status bar."""
        self.status_label.setText(f"📋 {message}")
        self.status_bar.showMessage(message)

    def _set_buttons_enabled(self, enabled):
        """Enable or disable the copy/export buttons together."""
        self.copy_btn.setEnabled(enabled)
        self.export_txt_btn.setEnabled(enabled)
        self.export_docx_btn.setEnabled(enabled)

    def _reset_ai_button(self, enabled):
        """Restore the AI button's caption and set its enabled state."""
        self.ai_enhance_btn.setText(self._AI_BUTTON_TEXT)
        self.ai_enhance_btn.setEnabled(enabled)
4.7 悬浮球组件 (ui/float_ball.py)
"""
悬浮球组件
"""
from PySide6.QtWidgets import QWidget, QLabel, QVBoxLayout, QMenu
from PySide6.QtCore import Qt, Signal
from PySide6.QtGui import QMouseEvent, QAction
class FloatBall(QWidget):
    """Always-on-top draggable ball.

    Single click emits ``screenshot_triggered``, double click emits
    ``show_main_triggered``, right click opens a context menu.
    """

    screenshot_triggered = Signal()
    show_main_triggered = Signal()

    # Maximum pointer travel (Manhattan distance, px) still counted as a click.
    _CLICK_TOLERANCE = 5

    def __init__(self):
        # Local imports: the module header does not import these names.
        from PySide6.QtCore import QTimer
        from PySide6.QtWidgets import QApplication

        super().__init__()
        self.setWindowFlags(
            Qt.WindowStaysOnTopHint |
            Qt.FramelessWindowHint |
            Qt.Tool
        )
        self.setAttribute(Qt.WA_TranslucentBackground)
        self._drag_pos = None      # pointer offset from the window's top-left
        self._press_global = None  # global pointer position at press time

        # A single click is acted on only after the double-click interval has
        # passed, so a double click does not also start a screenshot.
        self._click_timer = QTimer(self)
        self._click_timer.setSingleShot(True)
        self._click_timer.setInterval(QApplication.doubleClickInterval())
        self._click_timer.timeout.connect(self.screenshot_triggered)

        self._setup_ui()

        # Start near the right edge, vertically centred on the usable area.
        area = self.screen().availableGeometry()
        self.move(area.right() - 100, area.top() + area.height() // 2)

    def _setup_ui(self):
        """Build the round gradient label."""
        self.setFixedSize(56, 56)
        layout = QVBoxLayout(self)
        layout.setContentsMargins(0, 0, 0, 0)
        self.label = QLabel("T")
        self.label.setAlignment(Qt.AlignCenter)
        self.label.setStyleSheet("""
            QLabel {
                background: qlineargradient(x1:0, y1:0, x2:1, y2:1,
                    stop:0 #667eea, stop:1 #764ba2);
                color: white;
                border-radius: 28px;
                font-size: 20px;
                font-weight: bold;
                border: 2px solid rgba(255, 255, 255, 0.3);
            }
            QLabel:hover {
                background: qlineargradient(x1:0, y1:0, x2:1, y2:1,
                    stop:0 #764ba2, stop:1 #667eea);
            }
        """)
        layout.addWidget(self.label)

    def mousePressEvent(self, event: QMouseEvent):
        """Start a drag on left press; open the menu on right press."""
        if event.button() == Qt.LeftButton:
            pointer = event.globalPosition().toPoint()
            self._press_global = pointer
            self._drag_pos = pointer - self.frameGeometry().topLeft()
        elif event.button() == Qt.RightButton:
            self._show_context_menu(event.globalPosition().toPoint())

    def mouseMoveEvent(self, event: QMouseEvent):
        """Move the ball with the pointer while the left button is held."""
        if (event.buttons() & Qt.LeftButton) and self._drag_pos is not None:
            self.move(event.globalPosition().toPoint() - self._drag_pos)

    def mouseReleaseEvent(self, event: QMouseEvent):
        """Treat a release close to the press position as a click."""
        if event.button() == Qt.LeftButton and self._press_global is not None:
            # Compare press and release positions. Comparing against the
            # window position is always ~0 because the window follows the
            # pointer, which made every drag count as a click.
            travelled = (event.globalPosition().toPoint() - self._press_global).manhattanLength()
            if travelled < self._CLICK_TOLERANCE:
                self._click_timer.start()
        self._drag_pos = None
        self._press_global = None

    def mouseDoubleClickEvent(self, event: QMouseEvent):
        """Cancel the pending single click and request the main window."""
        if event.button() == Qt.LeftButton:
            self._click_timer.stop()
            self.show_main_triggered.emit()

    def _show_context_menu(self, pos):
        """Show the context menu at global position ``pos``."""
        from PySide6.QtWidgets import QApplication

        menu = QMenu(self)

        # Signal-to-signal connections drop the ``checked`` argument of
        # QAction.triggered, which the zero-argument signals do not take.
        screenshot_action = QAction("📸 截图识别 (Ctrl+Shift+T)", menu)
        screenshot_action.triggered.connect(self.screenshot_triggered)
        menu.addAction(screenshot_action)
        menu.addSeparator()

        show_main_action = QAction("📝 打开主窗口", menu)
        show_main_action.triggered.connect(self.show_main_triggered)
        menu.addAction(show_main_action)
        menu.addSeparator()

        # Quit the whole application; closing only this tool window would
        # leave the process running.
        quit_action = QAction("❌ 退出", menu)
        quit_action.triggered.connect(QApplication.instance().quit)
        menu.addAction(quit_action)

        menu.exec(pos)
4.8 截图工具 (ui/screenshot.py)
"""
截图工具
"""
from PySide6.QtWidgets import QWidget, QApplication, QRubberBand
from PySide6.QtCore import Qt, QRect, QPoint, Signal
from PySide6.QtGui import QPainter, QPen, QColor, QPixmap
class ScreenshotTool(QWidget):
    """Full-screen overlay for selecting a rectangular region of the screen.

    The primary screen is grabbed once at construction; the selected region
    is cropped from that frozen image and emitted via ``screenshot_taken``.
    """

    screenshot_taken = Signal(QPixmap)

    def __init__(self):
        super().__init__()
        self.setWindowFlags(
            Qt.FramelessWindowHint |
            Qt.WindowStaysOnTopHint |
            Qt.Tool
        )
        self.setAttribute(Qt.WA_TranslucentBackground)
        self.setAttribute(Qt.WA_DeleteOnClose)

        screen = QApplication.primaryScreen()
        self.screenshot = screen.grabWindow(0)
        self.setGeometry(screen.geometry())

        self.origin = QPoint()
        self._selecting = False  # True between left press and release
        self._loop = None        # nested event loop used by capture()
        self.result = None
        self.rubber_band = QRubberBand(QRubberBand.Rectangle, self)

        self.showFullScreen()
        self.setCursor(Qt.CrossCursor)

    def paintEvent(self, event):
        """Paint the dimming mask with a clear, outlined selection hole."""
        painter = QPainter(self)
        painter.setBrush(QColor(0, 0, 0, 100))
        painter.setPen(Qt.NoPen)
        painter.drawRect(self.rect())

        selection = self.rubber_band.geometry()
        if not selection.isNull():
            # Punch a transparent hole for the selected region...
            painter.setCompositionMode(QPainter.CompositionMode_Clear)
            painter.drawRect(selection)
            # ...then outline it.
            painter.setCompositionMode(QPainter.CompositionMode_SourceOver)
            painter.setPen(QPen(QColor(102, 126, 234), 2))
            painter.setBrush(Qt.NoBrush)
            painter.drawRect(selection)

    def mousePressEvent(self, event):
        """Left press starts a selection; right press cancels."""
        if event.button() == Qt.LeftButton:
            self.origin = event.pos()
            self._selecting = True
            self.rubber_band.setGeometry(QRect(self.origin, self.origin))
            self.rubber_band.show()
            self.update()
        elif event.button() == Qt.RightButton:
            self.close()

    def mouseMoveEvent(self, event):
        """Grow the selection while dragging and repaint the mask."""
        # An explicit flag is used because a press at (0, 0) leaves
        # ``origin`` null, which would look like "no selection started".
        if self._selecting:
            self.rubber_band.setGeometry(QRect(self.origin, event.pos()).normalized())
            self.update()

    def mouseReleaseEvent(self, event):
        """Crop the selection, emit it if large enough, then close."""
        if event.button() == Qt.LeftButton:
            self._selecting = False
            rect = self.rubber_band.geometry()
            if rect.width() > 10 and rect.height() > 10:
                # The widget works in logical pixels while QPixmap.copy()
                # takes device pixels, so scale by the pixmap's ratio.
                dpr = self.screenshot.devicePixelRatio()
                source = QRect(
                    round(rect.x() * dpr),
                    round(rect.y() * dpr),
                    round(rect.width() * dpr),
                    round(rect.height() * dpr),
                )
                self.screenshot_taken.emit(self.screenshot.copy(source))
            self.close()

    def closeEvent(self, event):
        """Wake up a pending capture() call when the overlay closes."""
        if self._loop is not None:
            self._loop.quit()
        super().closeEvent(event)

    def capture(self):
        """Block until the user selects a region or cancels.

        Returns:
            The cropped ``QPixmap``, or None if the selection was cancelled
            or too small.
        """
        # Local import: the module header does not import QEventLoop.
        from PySide6.QtCore import QEventLoop

        self.result = None

        def on_screenshot(pixmap):
            self.result = pixmap

        self.screenshot_taken.connect(on_screenshot)
        self.show()
        # A nested event loop waits for the close without the constant CPU
        # load of polling processEvents().
        self._loop = QEventLoop()
        self._loop.exec()
        self._loop = None
        return self.result
4.9 配置管理 (utils/config.py)
"""
配置管理
"""
import os
import json
from pathlib import Path
class ConfigManager:
    """JSON-backed key/value configuration stored in ``~/.textminer/config.json``."""

    # Values written when no configuration file exists yet.
    DEFAULTS = {
        "usage_count": 0,
        "dashscope_api_key": "",
    }

    def __init__(self):
        self.config_dir = Path.home() / ".textminer"
        self.config_file = self.config_dir / "config.json"
        self._config = {}
        self._load()

    def _load(self):
        """Load the configuration, creating it with defaults if missing.

        An unreadable file, or one that does not hold a JSON object, yields
        an empty configuration instead of raising.
        """
        self.config_dir.mkdir(parents=True, exist_ok=True)
        if not self.config_file.exists():
            self._config = dict(self.DEFAULTS)
            self._save()
            return
        try:
            with open(self.config_file, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            loaded = None
        # get() relies on dict semantics, so reject lists, strings, etc.
        self._config = loaded if isinstance(loaded, dict) else {}

    def _save(self):
        """Write the configuration atomically.

        The data goes to a sibling temp file that then replaces the real
        file, so a failure midway never truncates the existing file.
        """
        tmp_file = self.config_file.with_name(self.config_file.name + ".tmp")
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(self._config, f, ensure_ascii=False, indent=2)
        os.replace(tmp_file, self.config_file)

    def get(self, key, default=None):
        """Return the value stored under ``key``, or ``default`` if absent."""
        return self._config.get(key, default)

    def set(self, key, value):
        """Store ``value`` under ``key`` and persist immediately.

        Raises:
            TypeError: if ``value`` is not JSON-serialisable (the file on
                disk is left untouched).
            OSError: if the file cannot be written.
        """
        self._config[key] = value
        self._save()
4.10 打包脚本 (build.py)
"""
打包脚本 - 使用PyInstaller生成exe
"""
import os
import sys
import shutil
from pathlib import Path
def build():
    """Build the TextMiner executable with PyInstaller.

    Removes any previous ``build`` / ``dist`` directories in the current
    working directory, then runs PyInstaller on ``main.py``.

    Returns:
        PyInstaller's exit code (0 on success).
    """
    # Local import: the module header does not import subprocess.
    import subprocess

    print("开始打包 TextMiner...")
    for dir_name in ('build', 'dist'):
        if os.path.exists(dir_name):
            shutil.rmtree(dir_name)

    # An argument list with no shell avoids quoting problems. os.pathsep
    # gives the --add-data separator for the current platform (';' on
    # Windows, ':' elsewhere).
    cmd = [
        sys.executable, "-m", "PyInstaller",
        "--name=TextMiner",
        "--windowed",
        "--icon=resources/icon.png",
        f"--add-data=resources{os.pathsep}resources",
        "--hidden-import=paddleocr",
        "--hidden-import=paddle",
        "--hidden-import=sklearn",
        "--hidden-import=scipy",
        "--hidden-import=PySide6.QtNetwork",
        "--collect-all", "paddleocr",
        "--collect-all", "paddle",
        "--noconfirm",
        "main.py",
    ]
    print(f"执行命令: {' '.join(cmd)}")
    completed = subprocess.run(cmd)
    if completed.returncode != 0:
        # Do not report success when PyInstaller failed.
        print(f"\n打包失败,退出码: {completed.returncode}")
        return completed.returncode
    print("\n打包完成!")
    print(f"输出目录: {Path.cwd() / 'dist' / 'TextMiner'}")
    return 0


if __name__ == "__main__":
    sys.exit(build())
五、使用说明
5.1 安装运行
# 1. 安装依赖
pip install -r requirements.txt
# 2. 运行程序
python main.py
5.2 功能操作
| 操作 | 说明 |
|---|---|
| Ctrl+Shift+T | 全局热键,随时截图识别 |
| 悬浮球单击 | 触发截图识别 |
| 悬浮球双击 | 打开主窗口 |
| 悬浮球右键 | 显示菜单 |
| 拖拽文件 | 支持PDF/图片/Word/TXT直接拖入 |
| AI精校 | 置信度低时,调用云端AI提升准确率 |
5.3 打包成EXE
python build.py
六、效果展示
6.1 悬浮球

6.2 截图识别

6.3 主界面
如果你也在为OCR工具烦恼,不妨试试 TextMiner,或者基于源码进行二次开发。如果觉得有用,欢迎在评论区交流讨论!
AtomGit 是由开放原子开源基金会联合 CSDN 等生态伙伴共同推出的新一代开源与人工智能协作平台。平台坚持“开放、中立、公益”的理念,把代码托管、模型共享、数据集托管、智能体开发体验和算力服务整合在一起,为开发者提供从开发、训练到部署的一站式体验。
更多推荐

所有评论(0)