1. 在pubmed上勾选文献,点击【save】

 2. 在弹出的选项中，【Selection】选择所需范围，【Format】选择 PubMed，然后点击【Create file】

 3. 浏览器会自动下载一个 txt 文件

 4.接下来运行代码,代码里自带“文件选择器”

import requests
import time
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import xlwt
import requests
import datetime
import tkinter.filedialog
from tkinter.ttk import Progressbar
import tkinter as tk
import os
from tkinter import *
from tkinter import scrolledtext

#---- Where the downloaded PDFs are saved
desktoppath = os.path.join(os.path.expanduser("~"), 'Desktop')
# Output folder on the desktop. Built with os.path.join instead of a
# "\pubmed..." literal, which only worked because "\p" is an unrecognised
# (and deprecated) escape sequence.
out_path = os.path.join(desktoppath, "pubmed下载文献2")
# Create the folder (and a missing Desktop folder) if it is not there yet.
os.makedirs(out_path, exist_ok=True)

# Parallel lists filled by ChooseTxt() and read by Download(); index i of
# each list refers to the same article of the selected export file.
pmc_list = []      # PMC id of each article, or "缺失pmc" when it has none
pmc_success = []   # PMC ids of the articles that do have one
date_list = []     # publication year of each article
title_list = []    # title of each article
PDF_title = []     # "<year>_<title>", used as the PDF file name

def _first_token(text):
    """Return the first whitespace-separated token of *text*, or ""."""
    tokens = text.split()
    return tokens[0] if tokens else ""


def _parse_pubmed_records(lines):
    """Extract (year, title, pmc) tuples from a PubMed-format export.

    The format puts a tag in the first four columns followed by "-", and
    continues long values on following lines that start with six spaces.
    Records are separated by blank lines; a new "PMID" line also starts a
    new record.

    Args:
        lines: iterable of text lines (an open file works).

    Returns:
        A list with one ``(year, title, pmc)`` tuple per record that has at
        least one of the DP / TI / PMC fields. A missing field is returned
        as an empty string; wrapped title lines are joined with one space.
    """
    wanted = ("DP", "TI", "PMC")
    records = []
    fields = {}
    open_tag = None  # wanted tag whose value may continue on the next line

    def finish_record():
        if fields:
            records.append((
                _first_token(fields.get("DP", "")),
                fields.get("TI", ""),
                _first_token(fields.get("PMC", "")),
            ))
            fields.clear()

    for raw_line in lines:
        line = raw_line.rstrip("\r\n")
        if not line.strip():
            # Blank line: end of the current record.
            finish_record()
            open_tag = None
        elif line.startswith("      "):
            # Continuation of the previous field; only kept for wanted tags.
            if open_tag is not None:
                fields[open_tag] = (fields[open_tag] + " " + line.strip()).strip()
        elif line[4:5] == "-":
            tag = line[:4].strip()
            if tag == "PMID":
                finish_record()
            if tag in wanted and tag not in fields:
                fields[tag] = line[5:].strip()
                open_tag = tag
            else:
                open_tag = None
        else:
            open_tag = None
    finish_record()
    return records


def ChooseTxt():
    """Let the user pick a PubMed export file and load its records.

    Clears the shared lists (pmc_list, pmc_success, date_list, title_list,
    PDF_title), asks for a txt file, parses it and refills the lists so that
    index i of every list describes the same article. Progress is written to
    text1 (newest message on top). The download button is shown afterwards
    and enabled only when at least one article was recognised.
    """
    for shared in (pmc_list, pmc_success, PDF_title, date_list, title_list):
        shared.clear()

    # Start the dialog in the user's Downloads folder, where the browser
    # normally saves the export.
    downloadpath = os.path.join(os.path.expanduser("~"), "Downloads")
    filename = tk.filedialog.askopenfilename(
        title='选择一个txt文件',
        filetypes=[('txt', '*.txt'), ('All Files', '*')],
        initialdir=downloadpath,
    )
    if not filename:
        # Dialog cancelled (an empty string or empty tuple is returned).
        button1.config(state=tk.DISABLED)  # grey out the download button
        text1.delete('1.0', "end")
        text1.insert("1.0", "文件为空,请重新选择")
        text1.insert("1.0", "\n")
        text1.update()
        return

    text1.delete('1.0', "end")
    text1.insert('1.0', "txt文件路径已定位:  " + filename + "          ")
    text1.insert("1.0", "\n")
    text1.update()

    # utf-8-sig also accepts plain UTF-8 and drops a leading BOM, which
    # would otherwise hide the first tag of the file.
    with open(filename, encoding='utf-8-sig') as f:
        records = _parse_pubmed_records(f)

    for date, title, pmc in records:
        text1.insert("1.0", "发表年份: " + date + "\n")
        date_list.append(date)
        title_list.append(title)
        if pmc:
            pmc_list.append(pmc)
            pmc_success.append(pmc)
            text1.insert("1.0", " √有pmc号" + "\n" + title + "\n")
        else:
            # Placeholder keeps pmc_list aligned with the other lists.
            pmc_list.append("缺失pmc")
            text1.insert("1.0", " XXX缺pmc号" + "\n" + title + "\n")
        # File name used for the PDF of this article.
        PDF_title.append(date + "_" + title)
        text1.insert("1.0", "-"*100 + "\n")
        text1.update()

    number_success = str(len(pmc_success))
    number = str(len(pmc_list))
    text1.insert("1.0", "-"*100 + "\n")
    text1.insert("1.0", number_success + " 篇文献有pmc号,可供下载" + "\n")
    text1.insert("1.0", number + " 篇文献被识别" + "\n")
    text1.insert("1.0", "-"*100 + "\n")
    text1.update()

    button1.pack()
    # The button may have been disabled by a cancelled dialog or by a
    # finished download; re-enable it whenever there is something to fetch.
    button1.config(state=tk.NORMAL if pmc_list else tk.DISABLED)



#---- Download the articles
UA = UserAgent()

# Characters Windows does not allow in file names; article titles often
# contain ":" or "?".
_INVALID_FILENAME_CHARS = '\\/:*?"<>|'
# Seconds to wait for the server before a request is abandoned.
_REQUEST_TIMEOUT = 30


def _pdf_path_for(titlename):
    """Return the PDF path inside out_path for *titlename*.

    Only the title is cleaned of illegal characters, so the drive letter of
    out_path keeps its colon and ".pdf" is appended exactly once.
    """
    cleaned = "".join(ch for ch in titlename if ch not in _INVALID_FILENAME_CHARS)
    return os.path.join(out_path, cleaned + ".pdf")


def _write_report(failures, failure_urls, successes):
    """Write the success/failure report as an .xls file and return its path.

    Failed articles come first, each with a hyperlink built from the entry
    of *failure_urls* at the same position; successful ones follow.
    """
    wb = xlwt.Workbook()
    ws = wb.add_sheet('Sheet1')

    # Bold header row.
    header_style = xlwt.XFStyle()
    header_font = xlwt.Font()
    header_font.bold = True
    header_style.font = header_font
    ws.write(0, 0, "Download", header_style)
    ws.write(0, 1, "Article", header_style)
    ws.write(0, 2, 'URL', header_style)  # write(row, column, content)

    # Style of the "failed" marker: colour index 4, Times New Roman.
    failed_style = xlwt.XFStyle()
    failed_font = xlwt.Font()
    failed_font.colour_index = 4
    failed_font.name = 'Times New Roman'
    failed_style.font = failed_font

    # Underlined style for the link column.
    link_style = xlwt.XFStyle()
    link_font = xlwt.Font()
    link_font.underline = True
    link_style.font = link_font

    row = 0
    for failed_title, failed_url in zip(failures, failure_urls):
        row += 1
        ws.write(row, 0, "失败", failed_style)
        ws.write(row, 1, failed_title)
        ws.write(row, 2, xlwt.Formula(f'HYPERLINK("{failed_url}")'), link_style)
    for done_title in successes:
        row += 1
        ws.write(row, 0, "下载成功")
        ws.write(row, 1, done_title)

    ws.col(1).width = 40000
    ws.col(2).width = 15000

    # The timestamp makes every report file name unique. The file name itself
    # (including its leading dot) is unchanged from the previous version.
    timeNow = datetime.datetime.now().strftime("%Y-%m-%d %H时%M分%S秒")
    excel_path = os.path.join(out_path, f'.pubmed下载文献报告{timeNow}.xls')
    wb.save(excel_path)
    return excel_path


def Download():
    """Download the PDF of every article loaded by ChooseTxt().

    For each entry of pmc_list / PDF_title the PMC article page is fetched,
    the PDF link is read from it and the PDF is saved into out_path. Articles
    without a PMC id, or whose download fails, are recorded as failures; PDFs
    that already exist are counted as successes. Progress goes to text1 and
    p_bar, and an .xls report is written at the end.
    """
    # All buttons are disabled while running: root.update() below processes
    # clicks, so an enabled button could start a second, nested download.
    button1.config(state=tk.DISABLED)
    button2.config(state=tk.DISABLED)
    button3.config(state=tk.DISABLED)

    total = len(pmc_list)
    p_bar['maximum'] = total

    # Collected for the report.
    FAILURE = []
    FAILURE_URL = []
    SUCCESS = []

    try:
        for index, (pmc_num, titlename) in enumerate(zip(pmc_list, PDF_title)):
            p_bar['value'] = index
            root.update()
            text1.insert("1.0", "-"*100 + "\n")
            text1.update()

            filepath = _pdf_path_for(titlename)
            print(filepath)

            if pmc_num == "缺失pmc":
                FAILURE.append(titlename)
                FAILURE_URL.append("缺失pmc号")
                continue

            # Skip files that are already there, to avoid downloading twice.
            if os.path.exists(filepath):
                text1.insert("1.0", "  该Pdf已存在: " + titlename + "\n")
                text1.update()
                SUCCESS.append(titlename)
                continue

            text1.insert("1.0", "  Trying:  " + titlename + "\n")
            text1.update()

            url = 'https://www.ncbi.nlm.nih.gov/pmc/articles/' + pmc_num + "/"
            headers = {'User-Agent': UA.ie}
            # URL reported on failure: the article page until the PDF link
            # itself is known.
            failed_url = url
            try:
                page = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
                page.raise_for_status()
                BsBOJ = BeautifulSoup(page.content, 'lxml')
                # The second element with class "int-view" is taken as the
                # PDF link.
                links = BsBOJ.find_all(attrs={'class': "int-view"})
                download_url = "https://www.ncbi.nlm.nih.gov" + links[1].get('href')
                failed_url = download_url
                print(download_url)
                text1.insert("1.0", "  Downloading..." + "\n")
                text1.insert("1.0", download_url + "\n")
                text1.insert("1.0", "!! 如果下载失败,手动复制粘贴这个网址到浏览器即可下载成功:" + "\n")
                text1.update()

                r = requests.get(download_url, headers=headers, timeout=_REQUEST_TIMEOUT)
                r.raise_for_status()
                # An error or challenge page must not be stored as a PDF, or
                # it would be reported as a success and skipped next time.
                if b"%PDF" not in r.content[:1024]:
                    raise ValueError("response is not a PDF document")
                with open(filepath, 'wb') as f:
                    f.write(r.content)
                text1.insert("1.0", " √ 下载成功:" + "\n" + titlename + "\n")
                text1.update()
                SUCCESS.append(titlename)
            except Exception as e:
                # Deliberately broad: one broken article must not stop the
                # rest of the batch. The cause is printed, not discarded.
                print("无法下载: " + titlename, e)
                text1.insert("1.0", " X 下载失败: " + titlename + "\n")
                text1.update()
                FAILURE.append(titlename)
                FAILURE_URL.append(failed_url)

        p_bar['value'] = total

        excel_path = _write_report(FAILURE, FAILURE_URL, SUCCESS)
        text1.insert("1.0", "成功了 " + str(len(SUCCESS)) + " 篇\n")
        text1.insert("1.0", "失败了 " + str(len(FAILURE)) + " 篇\n")
        text1.insert("1.0", f"已经保存excel文件,文件路径为:{excel_path}\n")
        # Show the real output folder name instead of a hard-coded one.
        text1.insert("1.0", "\n" + "----------下载完成,右上角点击关闭即可,文件存放在桌面文件夹【"
                     + os.path.basename(out_path) + "】---------- ")
        text1.update()
    finally:
        # Restore the controls even if something unexpected went wrong:
        # the start button stays grey, "choose file" and "retry" are usable.
        button1.config(state=tk.DISABLED)
        button2.config(state=tk.NORMAL)
        button3.pack()
        button3.config(state=tk.NORMAL)



# Main window
root = tk.Tk()
root.title("pubmed文献下载器")
root.geometry('1280x700')
root.configure(bg='white')  # window background colour

# Scrollable message area; it starts with the usage instructions.
text1 = scrolledtext.ScrolledText(
    root,
    width=160,
    height=20,
    bg="white",
    fg="black",
    font=('微软雅黑', 12),
    padx=10,
    pady=10,
)
text1.pack()
_intro_lines = [
    "请点击下方正中的‘选择文件’按钮,选择从Pubmed上下载的txt格式文件",
    "--------------------------------------------------------------------------",
    "下载方法:",
    "① pubmed选好文献后,选择【save】",
    "②【selection】选择自己合适的 - all results/selection ",
    "③【format】选择’Pubmed‘!!",
    "④ 点击【create file】下载为txt格式",
]
text1.insert("1.0", "\n".join(_intro_lines) + "\n")

# Progress bar; maximum follows the number of loaded articles, value starts at 0.
p_bar = Progressbar(root, length=1250)
p_bar.pack(pady=10)
p_bar.configure(maximum=len(pmc_list), value=0)

# Label of the start button.
var = tk.StringVar(value="开始下载")
var2 = tk.StringVar()

# "Retry" button: created here, packed by Download() when a run has finished.
button3 = tk.Button(
    root, text="重试", command=Download, padx=20, font=('微软雅黑', 16), fg='blue'
)

# "Choose file" button: the only button visible at start-up.
button2 = tk.Button(
    root, text="选择文件", command=ChooseTxt, padx=20, font=('微软雅黑', 16)
)
button2.pack()

# Start button: created here, packed by ChooseTxt() after a file was parsed.
button1 = tk.Button(
    root, textvariable=var, command=Download, padx=20, font=('微软雅黑', 16), fg='dark red'
)

root.mainloop()













5. 运行代码后会弹出一个窗口，点击【选择文件】，选择刚下载好的 txt 文件

 

 6.点击【开始下载】

 下载完成后:

注:

由于本人水平有限，代码中还有不少冗余的部分。如果有路过的大神，欢迎提出修改意见，虚心向您请教。谢谢！

本文的下载方式基于 PMC 号；基于 Sci-Hub 的下载方式可参考我之前发表的文章。

Logo

旨在为数千万中国开发者提供一个无缝且高效的云端环境,以支持学习、使用和贡献开源项目。

更多推荐