pubmed批量下载文献 傻瓜操作
·
1. 在 PubMed 上勾选需要的文献,点击【Save】
2. 如图选择(Format 选择 PubMed),点击【Create file】
3. 浏览器会自动下载一个 txt 文件
4. 接下来运行下面的代码,代码里自带“文件选择器”:
import requests
import time
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import xlwt
import requests
import datetime
import tkinter.filedialog
from tkinter.ttk import Progressbar
import tkinter as tk
import os
from tkinter import *
from tkinter import scrolledtext
# ---- Output folder for the downloaded PDFs and the Excel report.
# Built with os.path.join instead of a "\p..." literal, which relied on an
# invalid escape sequence being passed through unchanged.
desktoppath = os.path.join(os.path.expanduser("~"), 'Desktop')
out_path = os.path.join(desktoppath, "pubmed下载文献2")
# Create the folder (and a missing Desktop folder) if needed; no error when it
# already exists.
os.makedirs(out_path, exist_ok=True)

# Shared state filled by ChooseTxt() and read by Download().
# The lists are parallel: index i describes the i-th record of the chosen file.
pmc_list = []     # PMC id per record, or the marker "缺失pmc" when the record has none
pmc_success = []  # only the PMC ids that were actually found
date_list = []    # publication year per record
title_list = []   # article title per record
PDF_title = []    # "<year>_<title>", used as the PDF file name
def _parse_pubmed_records(lines):
    """Parse the lines of a PubMed-format export into per-article records.

    In this format every field starts with a tag padded to four characters,
    followed by ``- `` and the value (``TI  - Some title``). Continuation
    lines start with whitespace and belong to the preceding field. Records
    start with a ``PMID`` field and are separated by blank lines.

    Args:
        lines: Iterable of text lines (an open text file works).

    Returns:
        A list of ``(year, title, pmc)`` tuples in file order. A value that
        is absent from a record is returned as an empty string.
    """
    records = []
    current = None  # fields collected for the record being read
    field = None    # tag that following continuation lines belong to

    def flush():
        nonlocal current
        if current is not None:
            records.append((current["date"], current["title"], current["pmc"]))
        current = None

    for raw in lines:
        line = raw.rstrip("\r\n")
        if not line.strip():
            # A blank line ends the current record.
            flush()
            field = None
            continue

        if not line[0].isspace() and line[4:5] == "-":
            # Tagged line: the tag sits in columns 0-3, the dash in column 4.
            field = line[:4].strip()
            value = line[5:].strip()
            if field == "PMID":
                # Tolerate exports without blank separator lines.
                flush()
            if current is None:
                current = {"date": "", "title": "", "pmc": ""}
            if field == "DP" and not current["date"]:
                # "2020 Jan 5" -> keep only the year token.
                tokens = value.split()
                current["date"] = tokens[0] if tokens else ""
            elif field == "TI":
                current["title"] = value
            elif field == "PMC" and not current["pmc"]:
                current["pmc"] = value
        elif field == "TI" and current is not None:
            # Wrapped title: join with a space so words are not glued together.
            current["title"] = (current["title"] + " " + line.strip()).strip()

    flush()
    return records


def ChooseTxt():
    """Ask for a PubMed export file and load its records into the shared lists.

    Clears and refills the module-level lists ``pmc_list``, ``pmc_success``,
    ``date_list``, ``title_list`` and ``PDF_title``, reports what was found in
    ``text1`` and shows the download button ``button1``. The button is
    enabled only when at least one record was recognised.
    """
    for shared in (pmc_list, pmc_success, PDF_title, date_list, title_list):
        shared.clear()

    # Open the dialog in the user's Downloads folder, where the browser
    # normally saves the export.
    downloadpath = os.path.join(os.path.expanduser("~"), "Downloads")
    filename = tk.filedialog.askopenfilename(
        title='选择一个txt文件',
        filetypes=[('txt', '*.txt'), ('All Files', '*')],
        initialdir=downloadpath,
    )

    if not filename:
        # Dialog cancelled: nothing to download.
        button1.config(state=tk.DISABLED)
        text1.delete('1.0', "end")
        text1.insert("1.0", "文件为空,请重新选择")
        text1.insert("1.0", "\n")
        text1.update()
        return

    text1.delete('1.0', "end")
    text1.insert('1.0', "txt文件路径已定位: " + filename + " ")
    text1.insert("1.0", "\n")
    text1.update()

    try:
        # utf-8-sig also accepts files that start with a byte-order mark.
        with open(filename, encoding='utf-8-sig') as f:
            records = _parse_pubmed_records(f)
    except (OSError, UnicodeDecodeError) as exc:
        button1.config(state=tk.DISABLED)
        text1.insert("1.0", "无法读取文件: " + str(exc) + "\n")
        text1.update()
        return

    # New text is always inserted at the top of the box, so the most recent
    # record is shown first.
    for date, title, pmc in records:
        date = date or "unknown"
        title = title or "untitled"
        date_list.append(date)
        title_list.append(title)
        PDF_title.append(date + "_" + title)
        text1.insert("1.0", "发表年份: " + date + "\n")
        if pmc:
            pmc_list.append(pmc)
            pmc_success.append(pmc)
            text1.insert("1.0", " √有pmc号" + "\n" + title + "\n")
        else:
            # Keep pmc_list aligned with the other lists; Download() skips
            # entries carrying this marker.
            pmc_list.append("缺失pmc")
            text1.insert("1.0", " XXX缺pmc号" + "\n" + title + "\n")
        text1.insert("1.0", "-"*100 + "\n")
    text1.update()

    text1.insert("1.0", "-"*100 + "\n")
    text1.insert("1.0", str(len(pmc_success)) + " 篇文献有pmc号,可供下载" + "\n")
    text1.insert("1.0", str(len(pmc_list)) + " 篇文献被识别" + "\n")
    text1.insert("1.0", "-"*100 + "\n")
    text1.update()

    button1.pack()
    # Re-enable the button: it is left disabled after a cancelled dialog and
    # after a finished download.
    button1.config(state=tk.NORMAL if pmc_list else tk.DISABLED)
# ---- Download the articles
# Shared fake_useragent generator; Download() takes the User-Agent header value from it.
UA = UserAgent()
def _safe_filename(name, max_length=150):
    """Return *name* turned into a valid Windows file name (without extension).

    Characters that Windows forbids in file names and control characters are
    replaced by ``_``. The result is trimmed, cut to *max_length* characters
    and stripped of trailing dots/spaces. An empty result becomes
    ``"untitled"``.
    """
    illegal = '\\/:*?"<>|'
    cleaned = "".join(
        "_" if (ch in illegal or ord(ch) < 32) else ch for ch in name
    )
    cleaned = cleaned.strip()[:max_length].rstrip(" .")
    return cleaned or "untitled"


def _write_report(failures, successes):
    """Write the xls download report into ``out_path`` and return its path.

    Args:
        failures: List of ``(title, link_or_reason)`` tuples.
        successes: List of titles that were downloaded or already present.
    """
    wb = xlwt.Workbook()
    ws = wb.add_sheet('Sheet1')

    # Bold header row
    header_style = xlwt.XFStyle()
    header_font = xlwt.Font()
    header_font.bold = True
    header_style.font = header_font
    ws.write(0, 0, "Download", header_style)
    ws.write(0, 1, "Article", header_style)
    ws.write(0, 2, 'URL', header_style)  # write(row, column, content)

    # Style of the "failed" marker: colour index 4, Times New Roman
    failed_style = xlwt.XFStyle()
    failed_font = xlwt.Font()
    failed_font.colour_index = 4
    failed_font.name = 'Times New Roman'
    failed_style.font = failed_font

    # Underlined style for hyperlinks
    link_style = xlwt.XFStyle()
    link_font = xlwt.Font()
    link_font.underline = True
    link_style.font = link_font

    row = 0
    for title, link in failures:
        row += 1
        ws.write(row, 0, "失败", failed_style)
        ws.write(row, 1, title)
        if link.startswith("http"):
            ws.write(row, 2, xlwt.Formula(f'HYPERLINK("{link}")'), link_style)
        else:
            # Not a URL (e.g. the "missing PMC id" reason): store as plain text.
            ws.write(row, 2, link)
    for title in successes:
        # Successful rows continue directly below the failed ones.
        row += 1
        ws.write(row, 0, "下载成功")
        ws.write(row, 1, title)

    ws.col(1).width = 40000
    ws.col(2).width = 15000

    # The current time makes every report file name unique.
    timeNow = datetime.datetime.now().strftime("%Y-%m-%d %H时%M分%S秒")
    excel_path = os.path.join(out_path, f'.pubmed下载文献报告{timeNow}.xls')
    wb.save(excel_path)
    return excel_path


def Download():  # download action bound to the "start" and "retry" buttons
    """Download the PDF of every record in ``pmc_list`` and write a report.

    For each PMC id the article page is fetched, the PDF link is taken from
    the second element with class ``int-view``, and the PDF is saved into
    ``out_path`` under a sanitised ``PDF_title``. Records without a PMC id and
    failed downloads are listed as failures; files that already exist count
    as successes. Progress is shown in ``p_bar`` and ``text1``.
    """
    from urllib.parse import urljoin

    timeout = 30  # seconds per request, so a stalled connection cannot hang the GUI forever

    # root.update() below lets Tk process clicks, so block every button that
    # could start a second run or swap the lists while this one is working.
    for button in (button1, button2, button3):
        button.config(state=tk.DISABLED)

    os.makedirs(out_path, exist_ok=True)  # the folder may have been removed meanwhile

    total = len(pmc_list)
    p_bar['maximum'] = total

    failures = []   # (title, link or reason)
    successes = []  # titles

    for done, (pmc_num, titlename) in enumerate(zip(pmc_list, PDF_title)):
        p_bar['value'] = done
        root.update()
        text1.insert("1.0", "-"*100 + "\n")
        text1.update()

        if pmc_num == "缺失pmc":
            failures.append((titlename, "缺失pmc号"))
            continue

        # Sanitise only the file name: touching the whole path would also
        # remove the colon of the drive letter.
        filepath = os.path.join(out_path, _safe_filename(titlename) + ".pdf")

        # Skip files that are already there to avoid downloading them twice.
        if os.path.exists(filepath):
            text1.insert("1.0", " 该Pdf已存在: " + titlename + "\n")
            text1.update()
            successes.append(titlename)
            continue

        text1.insert("1.0", " Trying: " + titlename + "\n")
        text1.update()

        url = 'https://www.ncbi.nlm.nih.gov/pmc/articles/' + pmc_num + "/"
        link = url  # reported on failure until the real PDF link is known
        try:
            headers = {'User-Agent': UA.ie}
            page = requests.get(url, headers=headers, timeout=timeout)
            page.raise_for_status()
            soup = BeautifulSoup(page.content, 'lxml')
            anchors = soup.find_all(attrs={'class': "int-view"})
            href = anchors[1].get('href')
            if not href:
                raise ValueError("PDF link not found on the article page")
            link = urljoin(url, href)
            print(link)
            text1.insert("1.0", " Downloading..." + "\n")
            text1.insert("1.0", link + "\n")
            text1.insert("1.0", "!! 如果下载失败,手动复制粘贴这个网址到浏览器即可下载成功:" + "\n")
            text1.update()

            response = requests.get(link, headers=headers, timeout=timeout)
            response.raise_for_status()
            # An error or challenge page must not be stored as a ".pdf":
            # a retry would then treat it as already downloaded.
            if b"%PDF" not in response.content[:1024]:
                raise ValueError("response is not a PDF")
            # Write to a temporary name first so an interrupted write never
            # leaves a truncated file under the final name.
            tmp_path = filepath + ".part"
            with open(tmp_path, 'wb') as pdf_file:
                pdf_file.write(response.content)
            os.replace(tmp_path, filepath)
        except Exception as exc:  # per-article boundary: record it and go on
            print("无法下载: " + titlename)
            text1.insert("1.0", " X 下载失败: " + titlename + " (" + str(exc) + ")\n")
            text1.update()
            failures.append((titlename, link))
        else:
            text1.insert("1.0", " √ 下载成功:" + "\n" + titlename + "\n")
            text1.update()
            successes.append(titlename)

    p_bar['value'] = total

    text1.insert("1.0", "成功了 " + str(len(successes)) + " 篇\n")
    text1.insert("1.0", "失败了 " + str(len(failures)) + " 篇\n")
    try:
        excel_path = _write_report(failures, successes)
    except OSError as exc:
        text1.insert("1.0", "无法保存excel文件: " + str(exc) + "\n")
    else:
        text1.insert("1.0", f"已经保存excel文件,文件路径为:{excel_path}\n")
    text1.update()

    button1.config(state=tk.DISABLED)  # keep the start button greyed out
    button2.config(state=tk.NORMAL)    # allow choosing another file
    text1.insert("1.0", "\n" + "----------下载完成,右上角点击关闭即可,文件存放在文件夹【" + out_path + "】---------- ")
    text1.update()
    # Offer a retry of the same list
    button3.pack()
    button3.config(state=tk.NORMAL)
# ---- Main window
root = tk.Tk()
root.title("pubmed文献下载器")
root.geometry('1280x700')
root['bg'] = 'white'  # window background colour

# Scrollable text box used as the message/log area
text1 = tk.scrolledtext.ScrolledText(
    root,
    width=160,
    height=20,
    bg="white",
    fg="black",
    font=('微软雅黑', 12),
    padx=10,
    pady=10,
)
text1.pack()

# Usage instructions shown at start-up; every entry ends with a newline.
_usage_lines = (
    "请点击下方正中的‘选择文件’按钮,选择从Pubmed上下载的txt格式文件",
    "--------------------------------------------------------------------------",
    "下载方法:",
    "① pubmed选好文献后,选择【save】",
    "②【selection】选择自己合适的 - all results/selection ",
    "③【format】选择’Pubmed‘!!",
    "④ 点击【create file】下载为txt格式",
)
text1.insert("1.0", "".join(entry + "\n" for entry in _usage_lines))

# Progress bar: starts at zero, with its maximum taken from the current
# length of pmc_list.
p_bar = Progressbar(root, length=1250)
p_bar.pack(pady=10)
p_bar.configure(maximum=len(pmc_list), value=0)

# Label text of the download button
var = tk.StringVar(value="开始下载")
var2 = tk.StringVar()

_button_font = ('微软雅黑', 16)
button3 = tk.Button(root, text="重试", command=Download, padx=20,
                    font=_button_font, fg='blue')
button2 = tk.Button(root, text="选择文件", command=ChooseTxt, padx=20,
                    font=_button_font)
button2.pack()
# button1 is created here but not packed; ChooseTxt() packs it once a file
# has been chosen.
button1 = tk.Button(root, textvariable=var, command=Download, padx=20,
                    font=_button_font, fg='dark red')

root.mainloop()
5.运行代码,就会出现一个框,点击【选择文件】,选择刚下好的txt
6.点击【开始下载】
下载完成后:
注:
由于本人技术有限,代码里还有不少冗余的部分,如果有路过的大神,欢迎提出修改意见,虚心向您请教。谢谢!
本文的方法是基于 PMC 号下载的;基于 Sci-Hub 的下载方法可参考我之前发表的文章。
更多推荐
已为社区贡献1条内容
所有评论(0)