今天做了一个简单的功能:从网页(HTML)中抓取内容,提取出有用的数据,并把这些数据生成 Word 文档。
生成 Word 文档用到了 python-docx 模块,所以首先要在 Windows 上安装这个模块。
1、在python官网上找到https://bootstrap.pypa.io/ez_setup.py,把代码保存到本地执行:python ez_setup.py
2、下载python-docx (https://pypi.python.org/pypi/python-docx/0.7.4),下载完成后解压并进入到XXX\python-docx-0.7.4 目录,安装python-docx : python setup.py install
python-docx就安装成功了,可以用它来操作word文档了,word文档的生成参考的这里https://python-docx.readthedocs.org/en/latest/index.html
HTML 的解析用的是 sgmllib 里的 SGMLParser,网页内容的获取用的是 urllib、urllib2。
# -*- coding: cp936 -*-
# NOTE: this script targets Python 2 -- sgmllib and urllib2 do not exist in
# Python 3.
from sgmllib import SGMLParser
import os
import sys
import urllib
import urllib2
from docx import Document
from docx.shared import Inches
import time


# Collect the URLs that need to be parsed.
class GetUrl(SGMLParser):
    """Collect the link targets found inside the marker <div> of a page.

    Capturing is switched on by a <div> that carries the attribute value
    "ChairmanCont Bureau" and switched off again by the next </div>
    (any </div>, including that of a nested <div>).
    """

    def __init__(self):
        SGMLParser.__init__(self)
        self.start = False   # True while inside the marker <div>
        self.urlArr = []     # href values collected so far

    def start_div(self, attr):
        for name, value in attr:
            if value == "ChairmanCont Bureau":  # fixed value taken from the page's JS
                self.start = True

    def end_div(self):
        self.start = False

    def start_a(self, attr):
        # Only the href attribute is a link target.  Collecting every
        # attribute value (class, target, title, ...) would later hand
        # non-URLs to urllib2.urlopen().
        if self.start:
            for name, value in attr:
                if name == "href":
                    self.urlArr.append(value)

    def getUrlArr(self):
        """Return the list of collected URLs (the live list, not a copy)."""
        return self.urlArr


# Parse the URLs collected above and extract the useful data.
class getManInfo(SGMLParser):
    """Extract one record per <dl> found inside the "SpeakerInfo" <div>.

    A record is a list holding, in document order, every attribute value of
    each <img> and the text of each <p> seen inside the <dl>.  Paragraph text
    is decoded from UTF-8; <img> attribute values are stored as received.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        self.start = False   # True while inside the "SpeakerInfo" <div>
        self.p = False       # True while inside a <p> that belongs to a <dl>
        self.dl = False      # True while inside a <dl> of the marker <div>
        self.manInfo = []    # finished records, one list per <dl>
        self.subInfo = []    # record currently being built

    def start_div(self, attr):
        for name, value in attr:
            if value == "SpeakerInfo":  # fixed value taken from the page's JS
                self.start = True

    def end_div(self):
        self.start = False

    def start_p(self, attr):
        if self.dl:
            self.p = True

    def end_p(self):
        self.p = False

    def start_img(self, attr):
        # Every attribute value of the <img> is recorded, not only src.
        if self.dl:
            for name, value in attr:
                self.subInfo.append(value)

    def handle_data(self, data):
        if self.p:
            self.subInfo.append(data.decode('utf-8'))

    def start_dl(self, attr):
        if self.start:
            self.dl = True

    def end_dl(self):
        # Runs for every </dl>, so a <dl> outside the marker <div>
        # contributes an empty record.
        self.manInfo.append(self.subInfo)
        self.subInfo = []
        self.dl = False

    def getManInfo(self):
        """Return the list of finished records (the live list, not a copy)."""
        return self.manInfo


urlSource = "http://www.XXX"
sourceData = urllib2.urlopen(urlSource).read()
# Driver script: crawl the person pages linked from the index page and write
# one Word section per person.  Relies on GetUrl, getManInfo and sourceData
# defined earlier in this file.  Python 2 only (urllib2, str.decode).

# time.time() is used for the elapsed-time reports because time.clock()
# returns CPU time on non-Windows platforms, which excludes network waits.
startTime = time.time()

# ---- get urls ----
getUrl = GetUrl()
getUrl.feed(sourceData)
urlArr = getUrl.getUrlArr()
getUrl.close()
print("get url use:" + str(time.time() - startTime))

startTime = time.time()

# ---- get maninfos ----
manInfos = getManInfo()
for url in urlArr:  # one url, one person
    data = urllib2.urlopen(url).read()
    manInfos.feed(data)
infos = manInfos.getManInfo()
manInfos.close()
print("get maninfos use:" + str(time.time() - startTime))

startTime = time.time()

# ---- word ----
saveFile = os.path.join(os.getcwd(), "xxx.docx")
imgDir = os.path.join(os.getcwd(), "imgs")  # scratch directory for downloads
doc = Document()

# word title
doc.add_heading("HEAD".decode('gbk'), 0)
p = doc.add_paragraph("HEADCONTENT:".decode('gbk'))

# write info: item 0 of a record is the image URL, item 1 becomes the
# heading, every further item becomes a bullet paragraph.
for infoArr in infos:
    for i, info in enumerate(infoArr):
        if i == 0:
            # img url: name the temp file "<parent path segment>.<extension>"
            suffix = info.split('.')[-1]
            pathParts = info.split('/')
            if len(pathParts) > 1:
                preffix = pathParts[-2]
            else:
                preffix = pathParts[-1]
            imgFile = os.path.join(imgDir, preffix + "." + suffix)
            if not os.path.exists(imgDir):
                os.mkdir(imgDir)
            # A picture is best-effort: a failed download or insert is
            # reported and skipped so the rest of the document is still
            # written.
            try:
                imgData = urllib2.urlopen(info).read()
                with open(imgFile, 'wb') as f:
                    f.write(imgData)
                doc.add_picture(imgFile, width=Inches(1.25))
            except Exception as err:
                print(err)
            finally:
                # Never leave the temporary image behind.
                if os.path.exists(imgFile):
                    os.remove(imgFile)
        elif i == 1:
            doc.add_heading(info + ":", level=1)
        else:
            doc.add_paragraph(info, style='ListBullet')

doc.save(saveFile)
print("word use:" + str(time.time() - startTime))
以上就是本文关于 Python 从 HTML 提取数据并生成 Word 文档实例解析的全部内容,希望对大家有所帮助。感兴趣的朋友可以继续参阅本站其他相关专题,如有不足之处,欢迎留言指出。感谢朋友们对本站的支持!
联系客服