本文的文字及图片来源于网络,仅供学习、交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理。
作者: Star_Zhao
PS:如有需要Python学习资料的小伙伴可以加点击下方链接自行获取http://t.cn/A6Zvjdun
本次爬取自如网房源信息所用到的知识点:
分析目标站点
获取单页源码
# -*- coding: utf-8 -*-import requestsimport timefrom requests.exceptions import RequestExceptiondef get_one_page(page): try: url = 'http://hz.ziroom.com/z/nl/z2.html?p=' str(page) headers = { 'Referer':'http://hz.ziroom.com/', 'Upgrade-Insecure-Requests':'1', 'User-Agent':'Mozilla/5.0(WindowsNT6.3;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/68.0.3440.106Safari/537.36' } res = requests.get(url,headers=headers) if res.status_code == 200: print(res.text) except RequestException: return Nonedef main(): page = 1 get_one_page(page)if __name__ == '__main__': main() time.sleep(1)
解析单页源码
将获取的源码保存到当前文件夹下的'result.html'中, 然后通过XPath对其进行相应内容的提取, 当然你也可以使用某些在线工具.
from lxml import etree#解析html文档html = etree.parse('./resul.html',etree.HTMLParser())results = html.xpath('//ul[@id='houseList']/li')for result in results[1:]: title = result.xpath('./div/h3/a/text()')[0][5:] if len(result.xpath('./div/h3/a/text()')[0]) >5 else '' location = result.xpath('./div/h4/a/text()')[0].replace('[','').replace(']','') area = ' '.join(result.xpath('./div/div/p[1]/span/text()')).replace(' ','',1) #使用join方法将列表中的内容以' '字符连接 nearby = result.xpath('./div/div/p[2]/span/text()')[0] print(title) print(location) print(area) print(nearby)
解析源代码
from lxml import etreedef parse_one_page(sourcehtml): '''解析单页源码''' contentTree = etree.HTML(sourcehtml) #解析源代码 results = contentTree.xpath('//ul[@id='houseList']/li') #利用XPath提取相应内容 for result in results[1:]: title = result.xpath('./div/h3/a/text()')[0][5:] if len(result.xpath('./div/h3/a/text()')[0]) > 5 else '' location = result.xpath('./div/h4/a/text()')[0].replace('[', '').replace(']', '') area = ' '.join(result.xpath('./div/div/p[1]/span/text()')).replace(' ', '', 1) # 使用join方法将列表中的内容以' '字符连接 nearby = result.xpath('./div/div/p[2]/span/text()')[0] yield { 'title': title, 'location': location, 'area': area, 'nearby': nearby }def main(): page = 1 html = get_one_page(page) print(type(html)) parse_one_page(html) for item in parse_one_page(html): print(item)if __name__ == '__main__': main() time.sleep(1)
获取多个页面
def parse_one_page(sourcehtml): '''解析单页源码''' contentTree = etree.HTML(sourcehtml) #解析源代码 results = contentTree.xpath('//ul[@id='houseList']/li') #利用XPath提取相应内容 for result in results[1:]: title = result.xpath('./div/h3/a/text()')[0][5:] if len(result.xpath('./div/h3/a/text()')[0]) > 5 else '' location = result.xpath('./div/h4/a/text()')[0].replace('[', '').replace(']', '') area = ' '.join(result.xpath('./div/div/p[1]/span/text()')).replace(' ', '', 1) # 使用join方法将列表中的内容以' '字符连接 #nearby = result.xpath('./div/div/p[2]/span/text()')[0].strip()这里需要加判断, 改写为下句 nearby = result.xpath('./div/div/p[2]/span/text()')[0].strip() if len(result.xpath('./div/div/p[2]/span/text()'))>0 else '' yield { 'title': title, 'location': location, 'area': area, 'nearby': nearby } print(nearby) #yield {'pages':pages}def get_pages(): '''得到总页数''' page = 1 html = get_one_page(page) contentTree = etree.HTML(html) pages = int(contentTree.xpath('//div[@class='pages']/span[2]/text()')[0].strip('共页')) return pagesdef main(): pages = get_pages() print(pages) for page in range(1,pages 1): html = get_one_page(page) for item in parse_one_page(html): print(item)if __name__ == '__main__': main() time.sleep(1)
存储到MongoDB中
需确保MongoDB已启动服务, 否则必然会存储失败
def save_to_mongodb(result): '''存储到MongoDB中''' # 创建数据库连接对象, 即连接到本地 client = pymongo.MongoClient(host='localhost') # 指定数据库,这里指定ziroom db = client.iroomz # 指定表的名称, 这里指定roominfo db_table = db.roominfo try: #存储到数据库 if db_table.insert(result): print('---存储到数据库成功---',result) except Exception: print('---存储到数据库失败---',result)
1 # -*- coding: utf-8 -*- 2 3 import requests 4 import time 5 import pymongo 6 from lxml import etree 7 from requests.exceptions import RequestException 8 def get_one_page(page): 9 '''获取单页源码'''10 try:11 url = 'http://hz.ziroom.com/z/nl/z2.html?p=' str(page)12 headers = {13 'Referer':'http://hz.ziroom.com/',14 'Upgrade-Insecure-Requests':'1',15 'User-Agent':'Mozilla/5.0(WindowsNT6.3;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/68.0.3440.106Safari/537.36'16 }17 res = requests.get(url,headers=headers)18 if res.status_code == 200:19 return res.text20 return None21 except RequestException:22 return None23 def parse_one_page(sourcehtml):24 '''解析单页源码'''25 contentTree = etree.HTML(sourcehtml) #解析源代码26 results = contentTree.xpath('//ul[@id='houseList']/li') #利用XPath提取相应内容27 for result in results[1:]:28 title = result.xpath('./div/h3/a/text()')[0][5:] if len(result.xpath('./div/h3/a/text()')[0]) > 5 else ''29 location = result.xpath('./div/h4/a/text()')[0].replace('[', '').replace(']', '')30 area = ' '.join(result.xpath('./div/div/p[1]/span/text()')).replace(' ', '', 1) # 使用join方法将列表中的内容以' '字符连接31 #nearby = result.xpath('./div/div/p[2]/span/text()')[0].strip()这里需要加判断, 改写为下句32 nearby = result.xpath('./div/div/p[2]/span/text()')[0].strip() if len(result.xpath('./div/div/p[2]/span/text()'))>0 else ''33 data = {34 'title': title,35 'location': location,36 'area': area,37 'nearby': nearby38 }39 save_to_mongodb(data)40 #yield {'pages':pages}41 def get_pages():42 '''得到总页数'''43 page = 144 html = get_one_page(page)45 contentTree = etree.HTML(html)46 pages = int(contentTree.xpath('//div[@class='pages']/span[2]/text()')[0].strip('共页'))47 return pages48 def save_to_mongodb(result):49 '''存储到MongoDB中'''50 # 创建数据库连接对象, 即连接到本地51 client = pymongo.MongoClient(host='localhost')52 # 指定数据库,这里指定ziroom53 db = client.iroomz54 # 指定表的名称, 这里指定roominfo55 db_table = db.roominfo56 try:57 #存储到数据库58 if db_table.insert(result):59 print('---存储到数据库成功---',result)60 except Exception:61 print('---存储到数据库失败---',result)62 63 def main():64 pages = get_pages()65 print(pages)66 for page in range(1,pages 1):67 html = get_one_page(page)68 parse_one_page(html)69 70 if __name__ == '__main__':71 main()72 time.sleep(1)
在第三步中XPath使用注意事项
title = result.xpath('./div/h3/a/text()')此处的点'.'不能忘记, 它表示当前节点, 如果不加'.', '/'就表示从根节点开始选取
在第四步获取多个页面时出现索引超出范围错误
nearby = result.xpath('./div/div/p[2]/span/text()')[0].strip()IndexError: list index out of range
造成这种错误原因有两种:
因为这里的nearby的index是0, 排除第一种情况, 那么这里就是空行了, 加句if判断就可以解决
nearby = result.xpath('./div/div/p[2]/span/text()')[0].strip()#改写以后:nearby = result.xpath('./div/div/p[2]/span/text()')[0].strip() if len(result.xpath('./div/div/p[2]/span/text()'))>0 else ''
以上主要是对爬虫过程学习的总结, 若有不对的地方, 还请指正, 谢谢!
联系客服