Crawling All Zhihu Questions and Answers with Scrapy
Simulated Login
Zhihu requires you to log in before you can browse the site.
So the first step in crawling Zhihu is to simulate a login; here we use Selenium for that.
The start_requests function is the entry point of a Scrapy spider, so the simulated login belongs there. We override start_requests:
def start_requests(self):
    import os
    import time
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    # Switch the Windows console to UTF-8 so Chinese output is not garbled
    os.system('chcp 65001')
    # Launch Chrome with remote debugging enabled so Selenium can attach to it.
    # (The path to chrome.exe in front of the flags was truncated in the original and must be supplied.)
    os.popen(' --remote-debugging-port=9222 --user-data-dir="C:\\selenum\\AutomationProfile"')

    chrome_options = Options()
    chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
    browser = webdriver.Chrome(
        executable_path='F:\\BaiduNetdiskDownload\\',  # chromedriver path (file name truncated in the original)
        options=chrome_options)

    browser.get("https://www.zhihu.com/signin")
    browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input").send_keys("username")
    time.sleep(3)
    browser.find_element_by_css_selector(".SignFlow-password input").send_keys("password")
    browser.find_element_by_css_selector(".Button.SignFlow-submitButton").click()
    time.sleep(10)

    Cookies = browser.get_cookies()
    print("----", Cookies)
    cookie_dict = {}
    for cookie in Cookies:
        cookie_dict[cookie['name']] = cookie['value']
    browser.close()
    return [scrapy.Request(url=self.start_urls[0], dont_filter=True, headers=self.headers, cookies=cookie_dict)]  # callback defaults to the parse function
After the simulated login we read the cookies from the browser and set COOKIES_ENABLED to True in settings.py. That way we only need to attach the cookies to the first request; every later request will carry them automatically.
COOKIES_ENABLED = True
Also remember to send proper request headers, otherwise Zhihu will identify the requests as coming from a crawler.
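For reference, the self.headers used in these requests can simply be a dict defined on the spider class. The following is a minimal sketch; the concrete values (host, referer, User-Agent string) are illustrative placeholders rather than the original project's settings.

# Minimal sketch of the headers dict referenced as self.headers above.
# The values below are illustrative placeholders, not taken from the original project.
headers = {
    "HOST": "www.zhihu.com",
    "Referer": "https://www.zhihu.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0 Safari/537.36",
}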
Crawling All Questions
Since Zhihu provides no entry point that lists every question, we use a depth-first crawling strategy.
We use filter to drop URLs that do not start with https; of the URLs that remain, any in the /question/*** format is submitted to the downloader, and the others are parsed further and followed.
# requires `import re` and `from urllib import parse` at the top of the spider module
def parse(self, response):
    '''
    Extract all URLs from the HTML page and follow them for further crawling.
    If a URL is in the /question/*** format, submit it to the downloader.
    '''
    all_urls = response.xpath('//a/@href').extract()
    all_urls = [parse.urljoin(response.url, url) for url in all_urls]
    all_urls = filter(lambda x: x.startswith('https'), all_urls)  # drop URLs that do not start with https
    for url in all_urls:
        match_obj = re.match(r'(.*zhihu.com/question/(\d+))(/|$).*', url)
        if match_obj:
            # The URL is in question format: download it and parse the page
            request_url = match_obj.group(1)
            request_id = match_obj.group(2)
            yield scrapy.Request(request_url, headers=self.headers, meta={"zhihuid": request_id}, callback=self.parse_question)
        else:
            # Not a question URL: follow the link further
            yield scrapy.Request(url, headers=self.headers, callback=self.parse)
Once we have a question URL, we parse out the fields we need:
def parse_question(self, response):
    zhihu_id = response.meta.get("zhihuid", "")
    item_loader = ArticleItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_xpath('title', '//h1[@class="QuestionHeader-title"]/text()')
    item_loader.add_xpath('content', '//div[@class="QuestionRichText QuestionRichText--expandable QuestionRichText--collapsed"]//span[@class="RichText ztext"]/text()')
    item_loader.add_value('url', response.url)
    item_loader.add_value('zhihu_id', zhihu_id)
    item_loader.add_xpath('answer_num', '//h4[@class="List-headerText"]/span/text()')
    item_loader.add_xpath('comments_num', '//div[@class="QuestionHeader-Comment"]/button/text()')
    item_loader.add_xpath('watch_user_num', '//div[@class="NumberBoard-item"]//strong[@class="NumberBoard-itemValue"]/text()')
    item_loader.add_xpath('topics', '//div[@class="Tag QuestionTopic"]//div[@class="Popover"]//text()')  # the class names in the XPath must be written out in full
    question_item = item_loader.load_item()
    yield question_item
At the same time, define the corresponding item in items.py:
# requires `import re`, `import scrapy`, ItemLoader and the TakeFirst/MapCompose/Join processors
class ArticleItemLoader(ItemLoader):
    # Custom ItemLoader subclass
    default_output_processor = TakeFirst()  # default output_processor: take the first element of the extracted list


def get_nums(value):
    # Extract the leading number from a string
    match_re = re.match(r".*?(\d+).*", value)
    if match_re:
        nums = int(match_re.group(1))
    else:
        nums = 0
    return nums


class ZhihuQuestionItem(scrapy.Item):
    # Item for a Zhihu question
    zhihu_id = scrapy.Field()
    topics = scrapy.Field(
        output_processor=Join(",")
    )
    url = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    answer_num = scrapy.Field()
    comments_num = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    watch_user_num = scrapy.Field()
    crawl_time = scrapy.Field()
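As a quick, standalone illustration of how these processors behave (this snippet is not part of the project; depending on your Scrapy version the processors are imported from scrapy.loader.processors or from itemloaders.processors):

# Standalone sketch showing how the processors used above transform extracted values.
# Assumes get_nums is the helper defined above.
from scrapy.loader.processors import MapCompose, TakeFirst, Join  # or: from itemloaders.processors import ...

print(MapCompose(get_nums)(["23 comments"]))   # input_processor: runs get_nums on each value -> [23]
print(TakeFirst()(["first", "second"]))        # default output_processor: keeps the first non-empty value -> 'first'
print(Join(",")(["Python", "Scrapy"]))         # topics output_processor -> 'Python,Scrapy'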
Crawling All Answers to a Question
First, let's analyze the API.
Open the browser's developer tools, then click "View all answers" on the page or keep scrolling down.
In the Network panel you can spot a suspicious-looking request.
Click through to its response and you will find that it returns a block of JSON.
The payload consists of two parts: data and paging.
Inside paging there are is_end and next: is_end tells us whether the current URL is the last request, and next is the URL of the next request.
With these two fields we can conveniently collect all the answers.
The crawl logic is therefore: check the value of is_end, and if it is False, keep requesting the URL in next.
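For orientation, the JSON returned by this endpoint roughly has the shape sketched below; the values are made up for illustration, and only the keys this crawler relies on are shown.

# Rough, illustrative shape of the answers API response (made-up values, trimmed to the keys we use).
example_response = {
    "data": [
        {"id": 1234567890, "content": "<p>answer body ...</p>", "voteup_count": 42},
        # ... one entry per answer on this page ...
    ],
    "paging": {
        "is_end": False,  # becomes True on the last page
        "next": "https://www.zhihu.com/api/v4/questions/<question_id>/answers?...&offset=5",  # <question_id> is a placeholder
    },
}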
Next, let's look at the parameter structure of the answers URL:
include is fixed, limit caps how many answers are returned per request, and offset is the offset into the result set.
With the analysis done, we can write the code:
start_answer_url = 'https://www.zhihu.com/api/v4/questions/{0}/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics&limit={1}&offset={2}&platform=desktop&sort_by=default'
# requires `import json` and `import datetime` at the top of the spider module
def parse_question(self, response):
    zhihu_id = response.meta.get("zhihuid", "")
    # ... the question-parsing code shown above is omitted here ...
    yield scrapy.Request(self.start_answer_url.format(zhihu_id, 5, 0), headers=self.headers, callback=self.parse_answer)

def parse_answer(self, response):
    ans_json = json.loads(response.text)
    is_end = ans_json['paging']['is_end']
    next_answer_url = ans_json['paging']['next']
    for answer in ans_json['data']:
        answer_item = ZhihuAnswerItem()
        answer_item['zhihu_id'] = answer['id']
        answer_item['url'] = answer['url']
        answer_item['question'] = answer['question']['title']
        answer_item['author_id'] = answer['author']['id'] if 'id' in answer['author'] else None
        answer_item['content'] = answer['content'] if 'content' in answer else None
        answer_item['praise_num'] = answer['voteup_count']
        answer_item['comments_num'] = answer['comment_count']
        answer_item['create_time'] = answer['created_time']
        answer_item['update_time'] = answer['updated_time']
        answer_item['crawl_time'] = datetime.datetime.now()
        yield answer_item

    if not is_end:
        yield scrapy.Request(next_answer_url, headers=self.headers, callback=self.parse_answer)
After parsing the question, we request the initial answers URL, start_answer_url, and then parse the answers in parse_answer.
Again, remember to define the corresponding item in items.py:
class ZhihuAnswerItem(scrapy.Item):
    # Item for a Zhihu answer
    zhihu_id = scrapy.Field()
    url = scrapy.Field()
    question = scrapy.Field()
    author_id = scrapy.Field()
    content = scrapy.Field()
    praise_num = scrapy.Field()
    comments_num = scrapy.Field()
    create_time = scrapy.Field()
    update_time = scrapy.Field()
    crawl_time = scrapy.Field()
Saving the Data to MySQL