# Flat script: drive Chrome through a BrowserMob proxy, search m.lagou.com for
# a job keyword, dump the recorded HAR to lagoujob.json and pretty-print the
# JSON returned by the search API.

# Used to pause while the page's background requests complete.
import time
# Captured response bodies are JSON text; round-tripping them through a Python
# dict lets us re-serialise them with indentation for readability.
import json

from browsermobproxy import Server
from selenium import webdriver
# Chrome launch options, used to route the browser through the proxy.
from selenium.webdriver.chrome.options import Options

# Raw strings keep the Windows backslashes literal.
BMPserver = Server(r'D:\Apython\browsermob-proxy-2.1.4\bin\browsermob-proxy.bat')
BMPserver.start()
brosver = None
try:
    BMPproxy = BMPserver.create_proxy()

    chrome_options = Options()
    # HTTPS traffic passes through the intercepting proxy, so Chrome would
    # otherwise stop on a certificate warning page.
    chrome_options.add_argument('--ignore-certificate-errors')
    # BMPproxy.proxy is the proxy's "host:port" address (e.g. localhost:8081).
    chrome_options.add_argument('--proxy-server={}'.format(BMPproxy.proxy))

    # NOTE: executable_path and find_element_by_class_name are Selenium 3 era
    # APIs; recent Selenium 4 releases removed them in favour of Service(...)
    # and find_element(By.CLASS_NAME, ...).
    # executable_path may be omitted when chromedriver can be found on PATH
    # (for example when it sits next to python.exe).
    brosver = webdriver.Chrome(executable_path=r'D:\Apython\chromedriver.exe',
                               options=chrome_options)
    url = 'https://m.lagou.com/search.html'

    # Start a fresh HAR recording named 'lagou'. captureContent is needed so
    # that response bodies are recorded; the loop below reads them from
    # ['response']['content']['text'].
    BMPproxy.new_har('lagou', options={'captureContent': True})

    # Load the search page in the proxied browser.
    brosver.get(url)
    # Search input box (class name "inputer").
    searchBox = brosver.find_element_by_class_name('inputer')
    # Search button (class name "search").
    button = brosver.find_element_by_class_name('search')
    # Type the job keyword and submit the search.
    searchBox.send_keys('数据分析')
    button.click()
    # Fixed wait so the search request can finish and be recorded.
    time.sleep(3)

    # Everything the proxy recorded since new_har(), as a Python dict.
    result = BMPproxy.har
    result_json = json.dumps(result, indent=4)
    with open('lagoujob.json', 'w', encoding='utf-8', errors='ignore') as f:
        f.write(result_json)

    for entry in result['log']['entries']:
        entry_url = entry['request']['url']
        # The data API is recognised by its query string, which is the
        # percent-encoded form of city=全国&positionName=数据分析.
        if 'city=%E5%85%A8%E5%9B%BD&positionName=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90' not in entry_url:
            continue
        # Body of the API response as recorded in the HAR.
        _response = entry['response']
        _content = _response['content'].get('text')
        if not _content:
            # No body was captured for this entry; nothing to print.
            continue
        # Parse the JSON text into a dict, then re-serialise it indented.
        content_dict = json.loads(_content)
        content_json = json.dumps(content_dict, indent=4)
        print(content_json)
finally:
    # Always release the browser and the proxy server process, even when a
    # step above fails, so the next run does not find them still running.
    if brosver is not None:
        brosver.quit()
    BMPserver.stop()
import json
# The value passed to json.loads must be a JSON string. JSON requires double
# quotes around keys and string values, so the Python literal is delimited
# with single quotes to avoid ending it early.
html_json = '{"code": "\u884c\u52a8\u4ee3\u53f7\uff1a\u5929\u738b\u76d6\u5730\u864e"}'
html_dict = json.loads(html_json)
print(html_dict)
# Prints: {'code': '行动代号:天王盖地虎'}
import time
import json
from browsermobproxy import Server
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
def _matching_payloads(har, url_marker):
    """Yield the indented JSON body of each HAR entry whose URL contains *url_marker*.

    Entries without a captured text body are skipped. A body that is not valid
    JSON raises the error from ``json.loads``.
    """
    for entry in har['log']['entries']:
        if url_marker not in entry['request']['url']:
            continue
        body = entry['response']['content'].get('text')
        if not body:
            continue
        # Parse the JSON text into a dict, then re-serialise it indented.
        yield json.dumps(json.loads(body), indent=4)


def main():
    """Search m.lagou.com through a recording proxy and print the search API's JSON.

    Starts a BrowserMob proxy server, launches Chrome routed through it, runs
    a search for the keyword '数据分析', then prints every recorded response
    whose URL matches the search API query string. The browser and the proxy
    server are shut down on exit, including when a step fails.
    """
    # Percent-encoded form of city=全国&positionName=数据分析.
    api_marker = 'city=%E5%85%A8%E5%9B%BD&positionName=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90'

    # Start the proxy server (raw string keeps the Windows backslashes literal).
    server = Server(r'D:\Apython\browsermob-proxy-2.1.4\bin\browsermob-proxy.bat')
    server.start()
    browser = None
    try:
        proxy = server.create_proxy()

        # Launch Chrome routed through the proxy.
        options = Options()
        # HTTPS traffic passes through the intercepting proxy, so Chrome would
        # otherwise stop on a certificate warning page.
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--proxy-server={}'.format(proxy.proxy))
        # NOTE: executable_path and find_element_by_class_name are Selenium 3
        # era APIs; recent Selenium 4 releases removed them.
        browser = webdriver.Chrome(executable_path=r'D:\Apython\chromedriver.exe',
                                   options=options)

        # captureContent is needed so that response bodies are recorded.
        proxy.new_har('lagou', options={'captureContent': True})

        browser.get('https://m.lagou.com/search.html')
        # Search input box (class "inputer") and search button (class "search").
        search_box = browser.find_element_by_class_name('inputer')
        button = browser.find_element_by_class_name('search')
        # Type the keyword and submit the search.
        search_box.send_keys('数据分析')
        button.click()
        # Fixed wait so the search request can finish and be recorded.
        time.sleep(3)

        for payload in _matching_payloads(proxy.har, api_marker):
            print(payload)
    finally:
        if browser is not None:
            browser.quit()
        server.stop()


if __name__ == '__main__':
    main()
# (Web page footer text, not part of the script: "联系客服" / "Contact customer service")