Python 爬虫 beta 版之抓取知乎单页面

PHP中文网 • 2025年2月27日 19:23:49 • 编程技术 • 阅读 2

之前用 Python 写过爬虫，帮运营人员抓取京东的商品品牌及分类；这次同样用 Python 实现一个简单的单页面抓取版本，后续再补充完善。

# -*- coding: UTF-8 -*-
"""Collect the title, description and answers of a single Zhihu question page.

The script downloads one question page, prints its title and description,
then follows the "expand" link of every answer summary and prints that
answer's vote count and text.

Written for Python 3: text is handled as ``str`` throughout and is never
re-encoded to bytes before printing.
"""
import sys
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# ------ Zhihu answer collection ------

# Base that the answer links found on the question page are resolved against.
BASE_URL = 'http://www.zhihu.com/'

# Seconds to wait for the server; without a timeout requests.get() can block
# indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 10


def _tag_text(tag):
    """Return the text inside *tag*, or '' when *tag* is None.

    ``tag.string`` is None when the element has more than one child, so in
    that case every nested string is concatenated instead.
    """
    if tag is None:
        return ''
    if tag.string is not None:
        return tag.string
    return ''.join(tag.strings)


def get_content(url, data=None):
    """Fetch *url* and return the parsed ``<body>`` element.

    Args:
        url: Address of the page to download.
        data: Unused; kept so existing callers that pass it keep working.

    Returns:
        The ``body`` attribute of the parsed document (None if the document
        has no body).
    """
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        # 'sdch' is intentionally not advertised: requests cannot decode it,
        # so a server honouring it would hand back an unreadable body.
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.235'
    }
    req = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
    req.encoding = 'utf-8'
    # Build the BeautifulSoup document and hand back only its body.
    bs = BeautifulSoup(req.text, "html.parser")
    return bs.body


def get_title(html_text):
    """Return the question title, or '' when the title element is missing."""
    data = html_text.find('span', {'class': 'zm-editable-content'})
    return _tag_text(data)


def get_question_content(html_text):
    """Print the question description prefixed with '内容：'."""
    data = html_text.find('div', {'class': 'zm-editable-content'})
    print('内容：' + _tag_text(data))


def get_answer_agree(body):
    """Print the vote count found in *body*, prefixed with '点赞数：'."""
    agree = body.find('span', {'class': 'count'})
    print('点赞数：' + _tag_text(agree))


def get_response(html_text):
    """Print the URL, vote count and text of every answer on the page.

    Each answer summary carries an "expand" link; the linked page is
    downloaded and its vote count and answer body are printed. Summaries
    without a usable link (missing, or a ``javascript:`` pseudo-link) are
    skipped.
    """
    summaries = html_text.find_all('div', {'class': 'zh-summary summary clearfix'})
    for summary in summaries:
        # Locate the link that leads to the full answer.
        answerhref = summary.find('a', {'class': 'toggle-expand'})
        href = answerhref.get('href') if answerhref is not None else None
        if not href or href.startswith('javascript'):
            continue

        # urljoin avoids the double slash that plain concatenation produces
        # for root-relative hrefs and leaves absolute hrefs untouched.
        url = urljoin(BASE_URL, href)
        print(url)

        body = get_content(url)
        if body is None:
            continue
        get_answer_agree(body)
        answer = body.find('div', {'class': 'zm-editable-content clearfix'})
        print(_tag_text(answer))


def main():
    """Scrape the sample question and print everything that was collected."""
    html_text = get_content('https://www.zhihu.com/question/43879769')
    print("标题：" + get_title(html_text))
    get_question_content(html_text)
    print('')
    get_response(html_text)


# Guarded so that importing this module does not trigger network requests.
if __name__ == '__main__':
    main()

登录后复制

输出结果：