Selenium 模拟登录 + Tesseract-OCR 验证码截取识别：教务网站爬虫与 cookies 获取

yym68686

396人浏览 · 2020-09-04 17:23:12

yym68686 · 2020-09-04 17:23:12 发布

需要注意的基本都在注释里了
文章最后是我参考的文章

import datetime                                                   # current date for the query form
import json                                                       # parse / serialise cookies
import os                                                         # create the cookie directory

import pytesseract
import requests
from bs4 import BeautifulSoup                                     # note the capital B and S in the class name
from fake_useragent import UserAgent
from lxml import etree
from PIL import Image, ImageEnhance
from selenium import webdriver                                    # browser automation for the simulated login
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.common.by import By                       # locator strategies for DOM elements
from selenium.webdriver.support import expected_conditions as EC  # conditions to wait on while the page loads
from selenium.webdriver.support.ui import WebDriverWait           # wait for the page to finish loading


# --- Script configuration (the starred strings are placeholders to fill in) ---

# Address of the academic-affairs website (the login page is opened from here).
url = '***************************************************'
# Address of the free-classroom search page.
urlsearch = '*****************************'
# Local file in which the session cookies are cached between runs.
cookiesUrl = "***"
# Local file the captcha screenshot is saved to.
imageUrl = "****"

# Building to search. 9 selects Building 7; set it to 13 for Building 2.
buildingID = 9


# Step 1: check whether the cookies cached on disk still belong to a
# logged-in session, so the browser-based login can be skipped when they do.
try:
    with open(cookiesUrl, 'r', encoding='utf-8') as f:
        listCookies = json.load(f)
except (FileNotFoundError, ValueError):
    # No cache yet (first run) or the file is not valid JSON: continue with
    # no cookies. The probe request below is then sent without a session and
    # the login-page check decides whether a fresh login is needed.
    listCookies = []
cookie = [item["name"] + "=" + item["value"] for item in listCookies]
# Keep the space after ';' -- per the original author's note the cookie
# header is not accepted without it.
cookiestr = '; '.join(cookie)

headers = {
    'cookie': cookiestr,
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41'
}

# Probe the search page with the cached cookies. A timeout is set because
# requests otherwise waits indefinitely on an unresponsive server.
html = requests.get(url=urlsearch, headers=headers, timeout=30)
demo = html.text
soup = BeautifulSoup(demo, "html.parser")

# A password input in the response means we were served the login page.
soo = soup.find('input', id='password')
# An "errorMove" link means the site intercepted the request.
soo2 = soup.find("a", id="errorMove")

if soo is not None or soo2 is not None:
    # The cached cookies were rejected (login page or interception page was
    # returned), so log in through a real browser and capture fresh cookies.
    broswer = webdriver.Chrome()
    try:
        broswer.get(url)

        # The captcha is rendered at a different position on the first page
        # load than after a failed attempt, so two crop boxes are needed.
        first_attempt = True
        while True:
            # Wait until the account input area is present in the DOM.
            WebDriverWait(broswer, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'user')))
            user = broswer.find_element(By.XPATH, '//input[@id="username"]')             # user name field
            pwd = broswer.find_element(By.XPATH, '//input[@id="password"]')              # password field
            invaild = broswer.find_element(By.XPATH, '//input[@id="captcha_response"]')  # captcha field

            # Screenshot the whole page, then crop out the captcha.
            # Boxes are hand-measured pixel coordinates: (left, top, right, bottom).
            broswer.save_screenshot(imageUrl)
            if first_attempt:
                box = (675, 338, 759, 369)
            else:
                box = (675, 376, 760, 406)
            with Image.open(imageUrl) as screenshot:
                captcha = screenshot.crop(box)

            # Greyscale + contrast boost to improve the OCR hit rate. The
            # processed captcha is also written to disk for inspection.
            captcha = ImageEnhance.Contrast(captcha.convert('L')).enhance(2.0)
            captcha.save(imageUrl)

            # "--psm 7" tells tesseract to treat the image as one text line.
            raw_text = pytesseract.image_to_string(captcha, config=' --psm 7')
            # Drop all whitespace: tesseract appends a newline / form feed,
            # and typing those into the input field is never intended.
            text = ''.join(raw_text.split())

            # Fill in the form. Fields are cleared first so that leftovers
            # from a previous attempt or browser autofill are not appended to.
            for field, value in ((user, '你的用户名'), (pwd, '你的密码'), (invaild, text)):
                field.click()
                field.clear()
                field.send_keys(value)

            try:
                broswer.find_element(By.XPATH, '//*[@id="loginForm"]/table/tbody/tr[5]/td/input').click()
            except WebDriverException:
                # Best effort, as in the original: if the submit button
                # cannot be found or clicked, fall through to the check below.
                pass

            first_attempt = False
            try:
                # This element is the login error message; while it is
                # present the attempt failed and the loop retries.
                broswer.find_element(By.XPATH, '//*[@id="messages16741228231"]/div/div/span[2]')
            except NoSuchElementException:
                break  # no error message -> treated as a successful login

        # Logged in: grab the session cookies before the browser is closed.
        listCookies = broswer.get_cookies()
    finally:
        # Always close the browser, even if the login raised.
        broswer.quit()

    # Cache the cookies on disk for the next run. The directory is derived
    # from cookiesUrl so that it always matches where the file is written.
    cookie_dir = os.path.dirname(cookiesUrl)
    if cookie_dir:
        os.makedirs(cookie_dir, exist_ok=True)
    with open(cookiesUrl, 'w', encoding='utf-8') as f:
        json.dump(listCookies, f)

    # Rebuild the Cookie header value from the fresh cookies.
    cookie = [item["name"] + "=" + item["value"] for item in listCookies]
    cookiestr = '; '.join(cookie)


# Lookup table indexed by buildingID: the value is the first character of
# the room names in that building (index 9 -> "7", index 13 -> "2").
a = [0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 2]

# The search form needs today's date to list the rooms that are free today.
today = datetime.date.today()
headers = {
    'Cookie': cookiestr,
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41'
}
data = {
    'classroom.type.id': '2',  # multimedia classroom
    'classroom.campus.id': '',
    'classroom.building.id': buildingID,
    'seats': '',
    'classroom.name': '',
    'cycleTime.cycleCount': '1',
    'cycleTime.cycleType': '1',
    'cycleTime.dateBegin': today,
    'cycleTime.dateEnd': today,
    'roomApplyTimeType': '1',
    'timeBegin': '14:00',
    'timeEnd': '18:00'
}
response = requests.post(url=urlsearch, data=data, headers=headers, timeout=1000)
response.encoding = 'utf-8'
demo = response.text
soup = BeautifulSoup(demo, "html.parser")

# The free classrooms are listed as rows of this table body.
soo = soup.find('tbody', id='grid15320024301_data')
if soo is None:
    # Fail with a readable message instead of an AttributeError further down.
    raise SystemExit(
        "Free-classroom table not found in the response; "
        "the session may be invalid or the page layout may have changed."
    )

room_prefix = str(a[buildingID])
for row in soo.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) < 2:
        continue  # not a data row; the room name lives in the second cell
    room = cells[1].text
    # To restrict the output further by the second character of the room
    # name, additionally require: room[1] < '3'.
    if room.startswith(room_prefix):
        print(room)  # one free classroom per line

References

AtomGit开源社区

AtomGit 是由开放原子开源基金会联合 CSDN 等生态伙伴共同推出的新一代开源与人工智能协作平台。平台坚持“开放、中立、公益”的理念，把代码托管、模型共享、数据集托管、智能体开发体验和算力服务整合在一起，为开发者提供从开发、训练到部署的一站式体验。

更多推荐

1.8B 体积、33 种语言互译｜腾讯混元 HY-MT1.5-1.8B 多语言机器翻译模型上线

在跨语言交流日益频繁的今天，阅读外语菜单、处理多语言邮件、与不同语言背景的人沟通，已经成为很多人日常工作与生活的一部分。过去，这类需求往往依赖联网翻译工具，而如今，—— 一部设备即可支持的相互翻译。当 AI 不再只是“逐字直译”，而是开始理解语境、风格与语言之间的细微差异，机器翻译就真正具备了今天为大家介绍一款高质量、多语言、支持端侧部署的机器翻译模型 ——，现已上线 AtomGit AI 社区，