Solving the HTTP 521 Status Code When Scraping Web Pages with Python
# Project scenario: Python 3.8
# Problem description:
When using a Python crawler to fetch the detail pages linked from a site's list page, the detail-page HTML that comes back is abnormally short: only a fraction of the expected data is returned.
# Cause analysis:
Crawling the target site too frequently triggered its anti-crawler protection. Instead of the real page, the server answers with the non-standard status code 521 and a body containing only obfuscated JavaScript. A real browser executes that script, which sets a clearance cookie (`__jsl_clearance`) and reloads the page; a plain crawler never executes it, so it only ever sees the short challenge page.
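You can verify that this is what is happening by inspecting the first response directly. A minimal check, using the mafengwo detail page that appears throughout this post:

```python
import requests

r = requests.get('https://www.mafengwo.cn/poi/5423409.html')
print(r.status_code)  # 521 when the anti-crawler challenge fires
print(len(r.text))    # suspiciously small: the body is only the challenge JS
print(r.text[:200])   # typically a single <script>...</script> block
```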
# Solution:
Method 1:
Switch to a different VPN, that is, run the program from a machine with a different IP address.
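Changing VPNs (or machines) only matters because it changes the IP the requests come from. If a proxy is available, the same effect can be achieved in code; a minimal sketch, where the proxy address is a placeholder you must replace with one you actually have:

```python
import requests

# Placeholder proxy address: substitute a proxy you actually control.
proxies = {
    'http': 'http://127.0.0.1:7890',
    'https': 'http://127.0.0.1:7890',
}
r = requests.get('https://www.mafengwo.cn/poi/5423409.html',
                 proxies=proxies, timeout=5)
print(r.status_code)
```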
Method 2:
Copy the request headers of the target page (from the browser's developer tools) and add them to the code.
Adjust them according to the target site; a requests sketch follows, and Code One below does the same with urllib.
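A trimmed-down sketch of Method 2 with requests (the header values are examples, and the Cookie is a placeholder you must copy from your own browser session):

```python
import requests

# Headers pasted from the browser's developer tools (Network tab).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cookie': 'paste-your-browser-cookie-here',  # placeholder
}
r = requests.get('https://www.mafengwo.cn/poi/5423409.html', headers=headers)
print(r.status_code)
```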
Method 3:
Request the target detail page twice: the first request only collects the anti-crawler cookies, and the second request sends them back to get the real page.

Code One:

```python
import urllib.request
import urllib.error


def askURL(url):
    head = {  # simulate browser request headers
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # Cookie copied from the browser; truncated in the original post, so
        # paste your own value here.
        'Cookie': 'mfw_uuid=61dc38ef-2c67-45ce-ed26-c30fa04f2418; oad_n=...',
        'Host': 'www.mafengwo.cn',
        'Upgrade-Insecure-Requests': '1',
        # Edge version number truncated in the original post.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072',
    }
    # The User-Agent tells the server what kind of machine and browser we are
    # (in essence, what level of content we can accept).
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
```
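A quick call showing what Code One does when the challenge fires (same URL as throughout the post):

```python
html = askURL('https://www.mafengwo.cn/poi/5423409.html')
# If the anti-crawler challenge fires, urlopen raises an HTTPError, so the
# function prints 521 and returns an empty string instead of the page.
print(len(html))
```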
Code Two:

```python
import execjs
import requests
import re

head = {  # simulate browser request headers
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'www.mafengwo.cn',
    'Upgrade-Insecure-Requests': '1',
    # Edge version number truncated in the original post.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072',
}

url = 'https://www.mafengwo.cn/poi/5423409.html'


# First request: returns the 521 challenge page; collect its cookies and the
# obfuscated <script> body.
def get_521_content(url, head):
    req = requests.get(url, headers=head)
    cookies = req.cookies
    cookies = '; '.join(['='.join(item) for item in cookies.items()])
    txt_521 = req.text
    txt_521 = ''.join(re.findall('<script>(.*?)</script>', txt_521))
    return (txt_521, cookies)


# Rewrite the challenge JS so that it returns the cookie string instead of
# assigning document.cookie, then execute it to obtain __jsl_clearance.
def fixed_fun(function):
    func_return = function.replace('eval', 'return')
    content = execjs.compile(func_return)
    req = requests.get(url, headers=head)
    evaled_func = ''.join(re.findall('<script>(.*?)</script>', req.text))
    # NOTE: the literal strings below must match the challenge JS verbatim;
    # adjust them whenever the site changes its script.
    mode_func = evaled_func.replace('while(window._phantom||window.__phantomas){};', ''). \
        replace('document.cookie=', 'return').replace(';if((function(){try{return !!window.addEventListener;}', ''). \
        replace("catch(e){return false;}})()){document.addEventListener('DOMContentLoaded',l,false);}", ''). \
        replace("else{document.attachEvent('onreadystatechange',l);}", '').replace(
        r"setTimeout('location.href=location.pathname+location.search.replace(/[\?|&]captcha-challenge/,\'\')',1500);", '')
    content = execjs.compile(mode_func)
    cookies = content.call('l')
    __jsl_clearance = cookies.split(';')[0]
    return __jsl_clearance


# Merge the two cookie fragments (key=value strings) into one dict.
def cookie_dict(js, id):
    dict = {}
    js = js.split('=')
    id = id.split('=')
    dict[js[0]] = js[1]
    dict[id[0]] = id[1]
    return dict


if __name__ == '__main__':
    func = get_521_content(url, head)
    content = func[0]
    cookie_id = func[1]
    cookie_js = fixed_fun(func[0])
    dicted_cookie = cookie_dict(cookie_js, cookie_id)  # built but unused; the Cookie header below carries the same values
    head['Cookie'] = cookie_id + ';' + cookie_js  # second request carries both cookies
    req = requests.get(url, headers=head)
    print(req.status_code)
```
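Both Code Two and Code Three below rely on the PyExecJS package (`import execjs`) to run the challenge script outside a browser, which in turn needs a local JavaScript runtime such as Node.js. A quick sanity check that the wiring works:

```python
import execjs

# Compile a JS function and call it from Python; prints 3 when a JS runtime
# (e.g. Node.js) is installed and visible to PyExecJS.
ctx = execjs.compile("function add(a, b) { return a + b; }")
print(ctx.call('add', 1, 2))
```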
Code Three:

```python
# resource: blog.csdn.net/qq_41879417/article/details/101701120
# -*- coding: utf-8 -*-
# @Time : 2022/1/16 9:11
# @Author : sherlock
# @File : creeper_2_521.py
# @Project : creeper

import execjs
import re
import requests

url = 'https://www.mafengwo.cn/poi/5423409.html'

head = {  # simulate browser request headers
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'www.mafengwo.cn',
    'Upgrade-Insecure-Requests': '1',
    # Edge version number truncated in the original post.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072',
}


# First request: if the site answers 521, extract the challenge JS from the
# response and compute the __jsl_clearance cookie from it.
def get_521_content(url):
    req = requests.get(url, headers=head, timeout=5)
    print(req.status_code, req.text)
    if req.status_code == 521:
        cookies = dict(req.cookies.items())
        print(cookies)
        js_con = ''.join(re.findall('<script>(.*?)</script>', req.text))
        if js_con:
            __jsl_clearance = fixed_fun(js_con, url)
            if __jsl_clearance:
                key, value = __jsl_clearance.split('=')
                cookies[key] = value
                return cookies


# Execute the challenge JS to obtain the __jsl_clearance key=value pair.
def fixed_fun(js_con, url):  # js_con: the JS returned by the first request
    func_return = js_con.replace('eval(', 'return(')
    print('After replacing eval with return: ', func_return)
    content = execjs.compile(func_return)
    fn = js_con.split('=')[0].split(' ')[1]
    evaled_func = content.call(fn)
    print('After the first JS execution: ', evaled_func)
    fn = evaled_func.split('=')[0].split(' ')[1]  # name of the dynamically generated function
    aa = evaled_func.split("<a href=\\'/\\'>")  # contents of the <a> tag
    aa = aa[1].split("</a>")[0] if len(aa) >= 2 else ''
    # NOTE: the literal strings below must match the challenge JS verbatim;
    # adjust them whenever the site changes its script.
    mode_func = evaled_func. \
        replace(
        "setTimeout('location.href=location.pathname+location.search.replace(/[\\?|&]captcha-challenge/,\\'\\')',1500);document.cookie=",
        'return'). \
        replace(';if((function(){try{return !!window.addEventListener;}', ''). \
        replace(
        "}catch(e){return false;}})()){document.addEventListener('DOMContentLoaded'," + fn + ",false)}else{document.attachEvent('onreadystatechange'," + fn + ")}",
        ''). \
        replace(
        "if((function(){try{return !!window.addEventListener;}catch(e){return false;}})()){document.addEventListener('DOMContentLoaded'," + fn + ",false)}else{document.attachEvent('onreadystatechange'," + fn + ")}",
        ''). \
        replace("return'__jsl_clearance", "var window={};return '__jsl_clearance"). \
        replace(
        "var " + fn + "=document.createElement('div');" + fn + ".innerHTML='<a href=\\'/\\'>" + aa + "</a>';" + fn + "=" + fn + ".firstChild.href",
        "var " + fn + "='" + url + "'")
    print('JS after the second round of replacements:', mode_func)
    try:
        content = execjs.compile(mode_func)
        cookies = content.call(fn)
        __jsl_clearance = cookies.split(';')[0]
        print(__jsl_clearance)
        return __jsl_clearance
    except Exception:
        print('JS execution error:', mode_func)
        return None


# Second request for the detail page, carrying the decoded cookies.
def con_spider(cookies, url):
    response = requests.get(url, headers=head, cookies=cookies, timeout=5)
    if response.status_code == 200:
        response.encoding = 'utf-8'
        print(response.status_code)
        print(response.text)
        return response
    else:
        print('Unexpected status code on the second request:', response.status_code)
        return None


if __name__ == "__main__":
    cookies = get_521_content(url)
    con_spider(cookies, url)
```

Reference: www.cnblogs.com/gongs/p/10524710.html
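Not from the original post, but a natural refinement: once the clearance cookie has been computed, a `requests.Session` will carry it on every subsequent request automatically. A minimal sketch, assuming `head`, `url`, and `get_521_content` from Code Three are in scope:

```python
import requests

session = requests.Session()
session.headers.update(head)         # reuse the browser-like headers
cookies = get_521_content(url)       # solve the 521 challenge once
if cookies:
    session.cookies.update(cookies)  # __jsl_clearance now persists
    # every later detail page rides on the same clearance cookie
    page = session.get(url, timeout=5)
    print(page.status_code, len(page.text))
```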