打开APP
userphoto
未登录

开通VIP,畅享免费电子书等14项超值服务

开通VIP
Python|想做QQ群大数据分析,这个采集源码或许能帮助你解决问题

说到引流推广的社群渠道,第一个想到的就是QQ群。没错,今天我们就来实现一个QQ群爬虫!

批量抓取 QQ 群信息,包括群名称、群号、群人数、群主、地域、分类、标签、群简介等内容,返回 XLS(X) / CSV / JSON 结果文件。

使用到的第三方模块:

bottle

requests

simplejson

pyexcel-xls

unicodecsv

在QQ群爬虫类 class QQGroups 中实现各种方法

几个核心方法的代码:

获取二维码

def getQRCode(self):
    """Start a new session and request a login QR code.

    Two requests are made on ``self.sess``:

    1. The ptlogin2 login page is fetched and searched for the ptlogin
       script version, which is stored on ``self.js_ver`` when found.
    2. The QR code is requested from ``ptqrshow`` with the login page
       set as ``Referer``.

    On success the outgoing ``response`` is given a PNG content type and
    no-cache headers.

    Returns:
        The object returned by ``self.sess.get`` for the QR code request,
        or ``None`` if any step raised an exception.
    """
    self.newSession()
    try:
        url = 'http://ui.ptlogin2.qq.com/cgi-bin/login'
        params = {
            'appid': '715030901',
            'daid': '73',
            'pt_no_auth': '1',
            's_url': sourceURL
        }
        # NOTE(review): if self.sess is a requests session, timeout is in
        # seconds, so 1000 is roughly 16 minutes -- confirm this is intended.
        resp = self.sess.get(url, params=params, timeout=1000)

        # Search the decoded text rather than the raw bytes: a str pattern
        # applied to bytes raises TypeError on Python 3, which previously
        # was swallowed and left js_ver unset.
        # NOTE(review): assumes resp exposes ``.text`` (requests.Response).
        pattern = r'imgcache\.qq\.com/ptlogin/ver/(\d+)/js'
        match = re.search(pattern, resp.text)
        if match:
            self.js_ver = match.group(1)

        self.sess.headers.update({'Referer': url})

        url = 'http://ptlogin2.qq.com/ptqrshow'
        params = {
            'appid': '715030901',
            'e': '2',
            'l': 'M',
            's': '3',
            'd': '72',
            'v': '4',
            't': '%.17f' % (random()),
            'daid': '73'
        }
        resp = self.sess.get(url, params=params, timeout=1000)

        response.set_header('Content-Type', 'image/png')
        response.add_header('Cache-Control', 'no-cache, no-store')
        response.add_header('Pragma', 'no-cache')
    except Exception:
        # Best effort: any failure is reported to the caller as None.
        # ``Exception`` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        resp = None
    return resp

扫描二维码登录

def qrLogin(self):
    """Poll the QR-code login endpoint once and report the result as JSON.

    The ``pt_login_sig`` and ``qrsig`` cookies of ``self.sess`` are
    required; when either is missing no request is sent and the status
    stays at ``-1``.

    Status codes, derived from the text found in the server reply:

    * ``0`` -- QR code not yet expired
    * ``1`` -- QR code authentication in progress
    * ``2`` -- login succeeded
    * ``3`` -- QR code expired
    * ``-1`` -- not polled, reply not recognised, or the request failed

    Returns:
        A JSON string with the keys ``status``, ``time`` and ``errorMsg``.
        JSON/no-cache headers are also set on the outgoing ``response``.
    """
    cookies = self.sess.cookies.get_dict()
    login_sig = cookies.get('pt_login_sig', '')
    qrsig = cookies.get('qrsig', '')

    status = -1
    errorMsg = ''

    # Checked in this order; the first marker present in the reply wins.
    markers = (
        (u'二维码未失效', 0),
        (u'二维码认证中', 1),
        (u'登录成功', 2),
        (u'二维码已失效', 3),
    )

    if login_sig and qrsig:
        url = 'http://ptlogin2.qq.com/ptqrlogin'
        params = {
            'u1': sourceURL,
            'ptqrtoken': self.genqrtoken(qrsig),
            'ptredirect': '1',
            'h': '1',
            't': '1',
            'g': '1',
            'from_ui': '1',
            'ptlang': '2052',
            'action': '0-0-%d' % (time() * 1000),
            'js_ver': self.js_ver,
            'js_type': '1',
            'login_sig': login_sig,
            'pt_uistyle': '40',
            'aid': '715030901',
            'daid': '73'
        }
        poll = None
        try:
            poll = self.sess.get(url, params=params, timeout=1000)
            body = poll.content
            # Decode once so the marker checks compare text with text on
            # both Python 2 and Python 3.
            # NOTE(review): assumes the endpoint replies in UTF-8.
            if isinstance(body, bytes):
                body = body.decode('utf-8', 'replace')
            for marker, code in markers:
                if marker in body:
                    status = code
                    break
            else:
                # Unrecognised reply: hand the raw text back to the caller.
                # (The previous ``str(result.text)`` always raised, because
                # ``result`` was the content itself and has no ``.text``.)
                errorMsg = body
        except Exception:
            # If a reply was received, report its HTTP status; otherwise
            # leave errorMsg empty.
            if poll is not None:
                errorMsg = str(poll.status_code)

    loginResult = {
        'status': status,
        'time': time(),
        'errorMsg': errorMsg,
    }
    payload = json.dumps(loginResult)

    response.set_header('Content-Type', 'application/json; charset=UTF-8')
    response.add_header('Cache-Control', 'no-cache; must-revalidate')
    response.add_header('Expires', '-1')
    return payload

QQ群搜索

def qqunSearch(self, request):
    """Export QQ group search results in the format chosen in the form.

    Form fields read from ``request.forms``:

    * ``sort`` -- read but not used in this excerpt
    * ``pn``   -- converted to ``int``; not used in this excerpt
    * ``ft``   -- output format: ``xls``, ``xlsx``, ``csv`` or ``json``
    * ``kw``   -- search keyword; empty or missing redirects to ``/qqun``

    Returns:
        The file contents for the requested format (for ``xlsx``, the
        value returned by ``static_file``). Any other ``ft`` value falls
        through and returns ``None``.

    NOTE(review): the code that performs the search and appends rows to
    ``groups`` / ``gListRaw`` is not present in this excerpt. As written,
    ``groups`` only ever holds the header row, so the "no results"
    redirect below is always taken.
    """
    sort = request.forms.get('sort')
    pn = int(request.forms.get('pn'))
    ft = request.forms.get('ft')
    # A missing 'kw' field is treated like an empty keyword instead of
    # crashing on None.strip().
    kw = (request.forms.get('kw') or '').strip()
    if not kw:
        redirect('/qqun')

    self.sess.headers.update({'Referer': sourceURL})
    skey = self.sess.cookies.get_dict().get('skey', '')

    # Header row; data rows are expected to follow in the same column order.
    groups = [(u'群名称', u'群号', u'群人数', u'群上限',
               u'群主', u'地域', u'分类', u'标签', u'群简介')]
    gListRaw = []

    # Only the header row is present -> nothing to export.
    if len(groups) == 1:
        redirect('/qqun')

    # The keyword comes from the client and ends up in a file path and in
    # a quoted header value, so path separators, quotes and line breaks
    # are neutralised.
    base = kw.replace(' ', '_')
    for unsafe in '\\/:*?"<>|\r\n':
        base = base.replace(unsafe, '_')

    buf = BytesIO()

    if ft == 'xls':
        sheet = pe.Sheet(groups)
        buf = sheet.save_to_memory('xls', buf)
        response.set_header('Content-Type', 'application/vnd.ms-excel')
        filename = base + '.xls'
        response.add_header(
            'Content-Disposition',
            'attachment; filename="%s"' % filename
        )
        return buf.getvalue()

    elif ft == 'xlsx':
        import tempfile
        import xlsxwriter

        filename = base + '.xlsx'
        workbook = xlsxwriter.Workbook(
            tempfile.gettempdir() + '/' + filename)
        worksheet = workbook.add_worksheet()
        # One worksheet row per record, one cell per field.
        for row, record in enumerate(groups):
            for col, value in enumerate(record):
                worksheet.write(row, col, value)
        workbook.close()
        return static_file(
            filename,
            root=tempfile.gettempdir(),
            download=filename,
            mimetype='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        )

    elif ft == 'csv':
        # NOTE(review): the ``encoding`` keyword is not accepted by the
        # standard csv module; presumably ``csv`` is unicodecsv here.
        writer = csv.writer(buf, dialect='excel', encoding='utf-8')
        writer.writerows(groups)
        response.set_header('Content-Type', 'text/csv; charset=UTF-8')
        filename = base + '.csv'
        response.add_header(
            'Content-Disposition',
            'attachment; filename="%s"' % filename
        )
        return buf.getvalue()

    elif ft == 'json':
        # Serialise to a string directly: json.dump() into a BytesIO
        # fails on Python 3 because it writes text, not bytes.
        payload = json.dumps(gListRaw, indent=4, sort_keys=True)
        response.set_header(
            'Content-Type', 'application/json; charset=UTF-8')
        filename = base + '.json'
        response.add_header(
            'Content-Disposition',
            'attachment; filename="%s"' % filename
        )
        return payload

关注私信0920获取该源码

本站仅提供存储服务,所有内容均由用户发布,如发现有害或侵权内容,请点击举报
打开APP,阅读全文并永久保存 查看更多类似文章
猜你喜欢
类似文章
宝藏B站UP主,视频弹幕尽收囊中!
爬虫爬取代理ip
JavaWeb springMvc文件下载,亲测可用
北邮人论坛爬虫实操:session替代cookies
PHP导入Excel
App爬虫篇 - 破解移动端登录,助力 Python 爬虫
更多类似文章 >>
生活服务
热点新闻
分享 收藏 导长图 关注 下载文章
绑定账号成功
后续可登录账号畅享VIP特权!
如果VIP功能使用有故障,
可点击这里联系客服!

联系客服