因为本人十分爱忘记做作业,因此,想通过爬虫,爬取超星的学习通作业时间,并且进行定时提醒。
爬取过程
发送邮件
(对于部分模块,有些博客写的非常好,我就不进行详述,但是会提供链接)
一文搞懂HTTP协议(带图文)
cookie和session的区别
session和cookies的区别
验证码的原理及作用
简单来说(以超星学习为例子):
使用的chrome浏览器,打开F12就可以了解整个页面的转换过程。主要分为:
打开F12后主要页面如下:
在最后的图中的formdata就是本次登录的信息(账号,密码(base64加密),验证码等)
#coding=UTF-8 #File name :爬虫超星 #Author:龙文汉 #Data:2020.10.16 #Description:使用爬虫爬取超星的作业详情,获取作业的截至时间 import time import json import requests from lxml import etree import matplotlib.pyplot as plt import matplotlib.image as mpimg import os import base64 import datetime import smtplib from email.header import Header from email.mime.text import MIMEText from email.mime.image import MIMEImage from email.mime.multipart import MIMEMultipart import threading import sys import muggle_ocr class PaChaongxin(): #main_function def __init__(self,username,password,emai): self.username = username self.password = password self.to = emai self.code = 0 self.code_status = None #验证码正确还是失败 self.user_pas_status = None #账户名和密码 self.sender_mail = 'xxxx@xxx'#发送者邮件 self.sender_pass = 'xxxxxxx' # 邮箱的stm密码 self.session = requests.session() self.header = { 'User-Agent': 'xxxxxxxx'#自己的user_agent } def User_Pas(self): #输入账号,密码 # self.username = input("请输入学号:") # self.password = input("请输入密码:") # self.to = input("请输入邮箱:") #self.username = xxxxxxx #self.password = 'xxxxx' #self.to = 'xxxxxx' return def Get_code(self): #获取验证码,以及携带的cookies code_url = 'https://passport2.chaoxing.com/num/code'#超星验证码网址 path_path = 'vari_code.png' code_response = self.session.get(code_url) #保存验证码 img = open(path_path,'wb') img.write(code_response.content) img.close() #显示验证码,并且初始化,人为输入,不适用识别程序 # img_open = Image.open('vari_code.png') # img = mpimg.imread('vari_code.png',0) # plt.imshow(img) # 显示图片 # plt.axis('off') # 不显示坐标轴 # plt.show() # self.code = input("请输入验证码:") print("befor:",self.code) self.code = self.Code_Verifed() print("after:", self.code) os.remove(path_path) def Load_Page(self): #使用session进入登录界面验证 self.password = base64.b64encode(self.password.encode("utf-8")) # 被编码的参数必须是二进制数据 param = { 'fid': 'xxxx', 'uname': self.username, 'numcode': self.code, 'password': self.password, 'refer': 'http%3A%2F%2Fi.chaoxing.com', 't': 'true' } header = { 'Accept': 'application/json, text/javascript, */*; q=0.01', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7', 'Connection': 'keep-alive', 'Content-Length': '109', 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 'Host': 'passport2.chaoxing.com', 'Origin': 'https://passport2.chaoxing.com', 'Referer': 'https://passport2.chaoxing.com/login?loginType=3&newversion=true&fid=-1&refer=http%3A%2F%2Fi.chaoxing.com', 'Sec-Fetch-Dest': 'empty', 'Sec-Fetch-Mode': 'cors', 'Sec-Fetch-Site': 'same-origin', 'User-Agent': 'xxxxxxxx',#注意自己更改 'X-Requested-With': 'XMLHttpRequest', } load_url = 'https://passport2.chaoxing.com/unitlogin?' load_response = self.session.post(load_url,headers=header,data=param) load_response_msg = json.loads(load_response.text) print(load_response_msg,load_response_msg.keys()) if 'mes' in load_response_msg.keys(): if load_response_msg['mes'] == '验证码错误': print('验证码错误') self.code_status = True while(self.code_status): self.Get_code() param['numcode'] = self.code load_response = self.session.post(load_url, headers=header, data=param) load_response_msg = json.loads(load_response.text) if 'mes' not in load_response_msg.keys() or load_response_msg['mes'] != '验证码错误': self.code_status = False if 'mes' in load_response_msg.keys() and load_response_msg['mes'] == '用户名或密码错误': print('用户名或密码错误') self.user_pas_status = True while(self.user_pas_status): self.User_Pas() self.Get_code() param['numcode'] = self.code param['uname'] = self.username param['password'] = self.password load_response = self.session.post(load_url, headers=header, data=param) load_response_msg = json.loads(load_response.text) if 'mes' not in load_response_msg.keys() and load_response_msg['mes'] != '用户名或密码错误': self.code_status = False #上面返回的应该是登录成功,接下来,带着新的cookies访问主页 self_page_url = 'http://i.mooc.chaoxing.com' self_page_response = self.session.get(url=self_page_url,headers = self.header) #个人空间的主页面,这里直接提取课程的部分,要在左侧的按钮里找到相对应的连接, #该课程的页面直接镶嵌在本页面,所以带着session直接访问也可以 # self_page_html = etree.HTML(self_page.text) # return self_page_html def Get_Class_View(self): #进入所有课程的界面 class_view_url = 'http://mooc1-2.chaoxing.com/visit/courses' class_view = self.session.get(class_view_url,headers = self.header) class_view_html = etree.HTML(class_view.text) #直接返回整个页面的编码,方便后续的查找 return class_view_html def Go_to_work(self,Singel_Class_Url_after): #通过外面直接传来的课程网址,直接跳转 #进入单个课程的界面的作业模块 Singel_Class_Url = 'https://mooc1-2.chaoxing.com'+Singel_Class_Url_after single_class_response = self.session.get(Singel_Class_Url,headers=self.header) single_class_page = etree.HTML(single_class_response.text) url_after = single_class_page.xpath("/html/body/div[4]/div/div/div[2]/ul/li[6]/a/@data") if len(url_after) == 0: open_zuoye_url = 'https://mooc1-2.chaoxing.com' + single_class_page.xpath("/html/body/div[2]/div/div/div[2]/ul/li[6]/a/@data")[0] else: open_zuoye_url = 'https://mooc1-2.chaoxing.com' + single_class_page.xpath("/html/body/div[4]/div/div/div[2]/ul/li[6]/a/@data")[0] work_xml_response = self.session.get(open_zuoye_url,headers=self.header) work_xml = etree.HTML(work_xml_response.text) #这里的open_zuoye直接转到了作业的界面 #调用每个作业的函数,方便多线程 single_class_text = self.Get_work_time(work_xml) return single_class_text def Get_work_time(self,work_xml): # 以上就是关于页面跳转的函数,接下来就是作业的截取 #作业的信息提取 work_num = len(work_xml.xpath('//*[@id="RightCon"]/div/div/div[2]/ul/li')) class_name = work_xml.xpath('/html/body/div[2]/div/h1/span[1]/@title')[0] #邮件的内容:[[课程],[作业名称,截至时间,剩余时间]*n]的列表 #无作业的课程直接返回 if work_num == 0: return None work_text_info = []#有作业的课程 work_text_info.append(class_name)#添加课程名 #对每个项目进行整理 for i in range(1,work_num+1): work_name = work_xml.xpath('//*[@id="RightCon"]/div/div/div[2]/ul/li['+str(i)+']/div[1]/p/a/text()')[0].strip() work_status = work_xml.xpath('/html/body/div[3]/div[1]/div/div/div/div[2]/ul/li['+str(i)+']/div[1]/span[3]/strong/text()')[0].strip() work_end_time = work_xml.xpath('//*[@id="RightCon"]/div/div/div[2]/ul/li['+str(i)+']/div[1]/span[2]/text()') # print(work_end_time) if len(work_end_time) == 0:#无截至日期的课程 break else: work_end_time = work_end_time[0] print(class_name,work_name,work_end_time) if work_status != '已完成' and work_status != "待批阅":#存在未完成的课程 work_text_info.append([]) #增加作业名 work_text_info[-1].append(work_name) #使用datatime,计算时间差 current_time = time.strftime("%Y-%m-%d %H:%M", time.localtime()) #转化位datatime的格式 current_time_tran = datetime.datetime.strptime(current_time ,"%Y-%m-%d %H:%M") work_end_time_tran = datetime.datetime.strptime(work_end_time ,"%Y-%m-%d %H:%M") time_mul = work_end_time_tran - current_time_tran time_mul_day = time_mul.days if time_mul_day <= 1: time_m, time_s = divmod(time_mul.seconds, 60) time_h, time_m = divmod(time_m, 60) lea_time = '0天:'+str(time_h)+'小时:'+str(time_m)+'分钟' #增加剩余时间 work_text_info[-1].append(lea_time) else: lea_time = str(time_mul_day)+'天' work_text_info[-1].append(lea_time) print(class_name,"已经结束") #返回单课程的作业信息 return work_text_info def Code_Verifed(self): sdk = muggle_ocr.SDK(model_type=muggle_ocr.ModelType.Captcha) with open(r'vari_code.png', 'rb') as f: captcha_bytes = f.read() code = sdk.predict(image_bytes=captcha_bytes) return code def send_email_by_qq(self,text): # 利用邮箱发邮件提醒 # 设置总的邮件体对象,对象类型为mixed msg_root = MIMEMultipart('mixed') # 邮件添加的头尾信息等 msg_root['From'] = 'xxxx@xxxxx<xxxx@xxxx>' msg_root['To'] = self.to # 邮件的主题,显示在接收邮件的预览页面 subject = '快到作业截止时间了!' msg_root['subject'] = Header(subject, 'utf-8') # 构造文本内容 text_inf = "未完成作业总览:\n" for i in range(0,len(text)): text_inf += text[i][0]+"\n"#添加标题 for x in range(1,len(text[i])): text_inf += '\t\t' text_inf += str(text[i][x]) text_inf += "\n\n" text_sub = MIMEText(text_inf, 'plain', 'utf-8') print(text_inf) msg_root.attach(text_sub) # # 构造超文本 # url = "https://blog.csdn.net/chinesepython" # html_info = """ # <p>点击以下链接,你会去向一个更大的世界</p> # <p><a href="%s">click me</a></p> # <p>i am very galsses for you</p> # """% url # html_sub = MIMEText(html_info, 'html', 'utf-8') # # 如果不加下边这行代码的话,上边的文本是不会正常显示的,会把超文本的内容当做文本显示 # html_sub["Content-Disposition"] = 'attachment; filename="csdn.html"' # # 把构造的内容写到邮件体中 # msg_root.attach(html_sub) # # 构造图片 # image_file = open(r'D:\python_files\images\test.png', 'rb').read() # image = MIMEImage(image_file) # image.add_header('Content-ID', '<image1>') # # 如果不加下边这行代码的话,会在收件方方面显示乱码的bin文件,下载之后也不能正常打开 # image["Content-Disposition"] = 'attachment; filename="red_people.png"' # msg_root.attach(image) # # 构造附件 # txt_file = open(r'D:\python_files\files\hello_world.txt', 'rb').read() # txt = MIMEText(txt_file, 'base64', 'utf-8') # txt["Content-Type"] = 'application/octet-stream' # #以下代码可以重命名附件为hello_world.txt # txt.add_header('Content-Disposition', 'attachment', filename='hello_world.txt') # msg_root.attach(txt) try: sftp_obj = smtplib.SMTP_SSL('smtp.qq.com', 465) sftp_obj.login(self.sender_mail, self.sender_pass) sftp_obj.sendmail(self.sender_mail, self.to, msg_root.as_string()) sftp_obj.quit() print('sendemail successful!') except Exception as e: print('sendemail failed next is the reason') print(e) def Begin(self): self.User_Pas()#获取信息 self.Get_code()#获取验证码以及cookies信息,用session进行保存 self.Load_Page()#进入个人中心,更新cookies class_view_html = self.Get_Class_View()#进入所有作业的单页面,并且返该界面 len_class = len(class_view_html.xpath('/html/body/div/div[2]/div[3]/ul/li'))#计算一共有多少个课程 all_text = []#总的邮件信息 for i in range(1,len_class):#开始遍历每门课 print('开始第',i) Singel_Class_Url_after = class_view_html.xpath('/html/body/div/div[2]/div[3]/ul/li['+str(i)+']/div[2]/h3/a/@href')[0] # print(Singel_Class_Url_after) # all_text.append([threading.Thread(target=self.Go_to_work,args=(Singel_Class_Url_after)).start()]) work_text = self.Go_to_work(Singel_Class_Url_after) if work_text == None or len(work_text) <= 1: continue else: all_text.append(work_text) #将没有作业的课程删除 return all_text if __name__ == '__main__': #while 1: #time_list = time.strftime("%H:%M:%S", time.localtime()).split(":") # if time_list[0] == "08" and time_list[1] == "00" and time_list[2] == "00": # pachong = PaChaongxin(201821094053,"Mmgh774109","1766446371@qq.com") # work_email = pachong.Begin() # pachong.send_email_by_qq(work_email) # pachong = PaChaongxin(201821094049,"wcsba8102","2651447403@qq.com") # work_email = pachong.Begin() # pachong.send_email_by_qq(work_email) pachong = PaChaongxin(学号,密码,邮箱) work_email = pachong.Begin() pachong.send_email_by_qq(work_email) pachong = PaChaongxin(xxxxxxx,"xxxxxxx","xxxxx@xxxxx") work_email = pachong.Begin() pachong.send_email_by_qq(work_email)
果然任务驱动加上兴趣的学习,能更加扩展知识面,本次实例,都了解了:爬虫知识,网络知识,密码学,云服务器的使用,邮件知识,python的使用。总体来说,受益匪浅。希望大家能共同进步
PS:如有需要Python学习资料的小伙伴可以加点击下方链接自行获取
联系客服