PaddleOCR:查看已标注数据中的错误信息
·
按单个字符计数
import os
import json
from collections import Counter
# Count how often each individual character appears in the labelled transcriptions.
label_dir = "/Users/thy/Downloads/chinese20240613"


def count_characters(label_path, *, encoding="utf-8", echo=True):
    """Count every single character found in the transcriptions of a label file.

    Each non-blank line is split on tabs and its second field is parsed as a
    JSON array of objects; the ``"transcription"`` value of every object is
    broken into characters, and each character is tallied.

    Args:
        label_path: Path of the label file to read.
        encoding: Text encoding used to open the file. Defaults to UTF-8
            instead of the platform default so non-ASCII labels decode the
            same way everywhere (presumably the labels are UTF-8 -- confirm
            for your data).
        echo: When true, print the list of transcriptions of each record as
            it is read.

    Returns:
        A ``collections.Counter`` mapping each character to its number of
        occurrences, in first-seen order.

    Raises:
        IndexError: A non-blank line has no tab-separated second field.
        json.JSONDecodeError: The second field is not valid JSON.
        KeyError: An annotation object has no ``"transcription"`` key.
    """
    char_counts = Counter()
    with open(label_path, encoding=encoding) as f:
        for line in f:
            line = line.strip("\r\n")
            # A blank (e.g. trailing) line has no second field; skip it
            # instead of failing on the index lookup below.
            if not line.strip():
                continue
            annotations = json.loads(line.split("\t")[1])
            # Collect every transcription text of this record.
            transcriptions = [item["transcription"] for item in annotations]
            if echo:
                print(transcriptions)
            for trans in transcriptions:
                # Updating a Counter with a string tallies it character by
                # character; an empty transcription contributes nothing.
                char_counts.update(trans)
    return char_counts


if __name__ == "__main__":
    char_counts = count_characters(os.path.join(label_dir, "Label.txt"))
    # Print the result.
    for char, count in char_counts.items():
        print(f"字符 '{char}' 出现了 {count} 次")
    # char_counts is a Counter (a dict subclass), so it can be used directly
    # wherever a dictionary of the results is needed.
    print(char_counts)
字符串出现的次数(按整条转录文本计数)
import os
import json
from collections import Counter
# Count how often each whole transcription string appears (not single characters).
label_dir = "/Users/thy/Downloads/chinese20240613"


def count_transcriptions(label_path, *, encoding="utf-8", echo=True):
    """Count how many times each whole transcription string occurs in a label file.

    Each non-blank line is split on tabs and its second field is parsed as a
    JSON array of objects; the ``"transcription"`` value of every object is
    tallied as one unit, without being split into characters.

    Args:
        label_path: Path of the label file to read.
        encoding: Text encoding used to open the file. Defaults to UTF-8
            instead of the platform default so non-ASCII labels decode the
            same way everywhere (presumably the labels are UTF-8 -- confirm
            for your data).
        echo: When true, print the list of transcriptions of each record as
            it is read.

    Returns:
        A ``collections.Counter`` mapping each transcription string to its
        number of occurrences, in first-seen order.

    Raises:
        IndexError: A non-blank line has no tab-separated second field.
        json.JSONDecodeError: The second field is not valid JSON.
        KeyError: An annotation object has no ``"transcription"`` key.
    """
    string_counts = Counter()
    with open(label_path, encoding=encoding) as f:
        for line in f:
            line = line.strip("\r\n")
            # A blank (e.g. trailing) line has no second field; skip it
            # instead of failing on the index lookup below.
            if not line.strip():
                continue
            annotations = json.loads(line.split("\t")[1])
            # Collect every transcription text of this record.
            transcriptions = [item["transcription"] for item in annotations]
            if echo:
                print(transcriptions)
            # Updating with the list tallies each string as a whole.
            string_counts.update(transcriptions)
    return string_counts


if __name__ == "__main__":
    char_counts = count_transcriptions(os.path.join(label_dir, "Label.txt"))
    # The Counter's keys are exactly the distinct strings that were seen.
    print("出现的字符串:", set(char_counts))
    # Print the result.
    for char, count in char_counts.items():
        print(f"字符 '{char}' 出现了 {count} 次")
    # char_counts is a Counter (a dict subclass), so it can be used directly
    # wherever a dictionary of the results is needed.
    print(char_counts)
AtomGit 是由开放原子开源基金会联合 CSDN 等生态伙伴共同推出的新一代开源与人工智能协作平台。平台坚持“开放、中立、公益”的理念,把代码托管、模型共享、数据集托管、智能体开发体验和算力服务整合在一起,为开发者提供从开发、训练到部署的一站式体验。
更多推荐



所有评论(0)