python解析html提取数据，并生成word文档实例解析

简介

今天我做了一个简单的小功能：抓取网页内容，解析html提取出需要的数据，并把这些数据生成word文档。

生成word文档用到了python-docx模块，所以首先要安装这个模块（下面以windows环境为例）。

安装

1、在python官网上找到https://bootstrap.pypa.io/ez_setup.py，把代码保存到本地执行：python ez_setup.py

2、下载python-docx (https://pypi.python.org/pypi/python-docx/0.7.4)，下载完成后解压并进入到XXX\python-docx-0.7.4 目录，执行以下命令安装python-docx：python setup.py install

python-docx就安装成功了，可以用它来操作word文档了，word文档的生成参考的这里https://python-docx.readthedocs.org/en/latest/index.html

html的解析用的是sgmllib里的SGMLParser，url内容的获取用的是urllib、urllib2

实现代码

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

# -*- coding: cp936 -*-

from sgmllib import SGMLParser

import os

import sys

import urllib

import urllib2

from docx import Document

from docx.shared import Inches

import time

##获取要解析的url

class GetUrl(SGMLParser):

def __init__(self):

SGMLParser.__init__(self)

self.start=False

self.urlArr=[]

def start_div(self,attr):

for name,value in attr:

if value=="ChairmanCont Bureau":#页面js中的固定值

self.start=True

def end_div(self):

self.start=False

def start_a(self,attr):

if self.start:

for name,value in attr:

self.urlArr.append(value)

def getUrlArr(self):

return self.urlArr

##解析上面获取的url，获取有用数据

class getManInfo(SGMLParser):

def __init__(self):

SGMLParser.__init__(self)

self.start=False

self.p=False

self.dl=False

self.manInfo=[]

self.subInfo=[]

def start_div(self,attr):

for name,value in attr:

if value=="SpeakerInfo":#页面js中的固定值

self.start=True

def end_div(self):

self.start=False

def start_p(self,attr):

if self.dl:

self.p=True

def end_p(self):

self.p=False

def start_img(self,attr):

if self.dl:

for name,value in attr:

self.subInfo.append(value)

def handle_data(self,data):

if self.p:

self.subInfo.append(data.decode('utf-8'))

def start_dl(self,attr):

if self.start:

self.dl=True

def end_dl(self):

self.manInfo.append(self.subInfo)

self.subInfo=[]

self.dl=False

def getManInfo(self):

return self.manInfo

# Index page that lists the people to scrape (placeholder URL).
urlSource = "http://www.XXX"

sourceData = urllib2.urlopen(urlSource).read()

startTime = time.clock()

## Step 1: collect the per-person page URLs from the index page.
getUrl = GetUrl()
getUrl.feed(sourceData)
urlArr = getUrl.getUrlArr()
getUrl.close()

print("get url use:" + str((time.clock() - startTime)))

startTime = time.clock()

## Step 2: download every person page and extract its records.
manInfos = getManInfo()
for url in urlArr:  # one url, one person
    data = urllib2.urlopen(url).read()
    manInfos.feed(data)
infos = manInfos.getManInfo()
manInfos.close()

print("get maninfos use:" + str((time.clock() - startTime)))

startTime = time.clock()

## Step 3: write the Word document into the current working directory.
workDir = os.getcwd()
saveFile = os.path.join(workDir, "xxx.docx")
# Scratch directory for downloaded pictures; created on first use.
imgDir = os.path.join(workDir, "imgs")

doc = Document()

# Document title and introduction (placeholder text).
doc.add_heading("HEAD".decode('gbk'), 0)
p = doc.add_paragraph("HEADCONTENT:".decode('gbk'))

# Each record is laid out as: [image url, heading text, bullet text, ...].
for infoArr in infos:
    for i, info in enumerate(infoArr):
        if i == 0:  # image url
            imgFile = None
            try:
                # Temp file name: <second-to-last path segment>.<extension>
                suffix = info.split('.')[-1]
                preffix = info.split('/')[-2]
                imgFile = os.path.join(imgDir, preffix + "." + suffix)
                if not os.path.exists(imgDir):
                    os.mkdir(imgDir)
                # The download is inside the try block so that one broken
                # image is reported and skipped instead of aborting the run.
                imgData = urllib2.urlopen(info).read()
                with open(imgFile, 'wb') as f:
                    f.write(imgData)
                doc.add_picture(imgFile, width=Inches(1.25))
            except Exception as err:
                # Best effort: a missing picture must not lose the document.
                print(err)
            finally:
                # Remove the temp file even when add_picture failed.
                if imgFile is not None and os.path.exists(imgFile):
                    os.remove(imgFile)
        elif i == 1:
            doc.add_heading(info + ":", level=1)
        else:
            doc.add_paragraph(info, style='ListBullet')

doc.save(saveFile)

print("word use:" + str((time.clock() - startTime)))

总结

以上就是本文关于python解析html提取数据，并生成word文档实例解析的全部内容，希望对大家有所帮助。感兴趣的朋友可以继续参阅本站其他相关专题，如有不足之处，欢迎留言指出。感谢朋友们对本站的支持！

本站仅提供存储服务，所有内容均由用户发布，如发现有害或侵权内容，请点击举报。