# Python爬虫之淘宝数据爬取(商品名称,价格,图片,销量)
# 代码详细注释,仅供交流与参考,不作商业用途
# 代码参考北京理工大学嵩天老师
# 图片爬虫app
import json
import os
import re

import requests  # third-party HTTP library
def getHTMLText(url, headers=None):
    """Fetch *url* and return the response body as text.

    Args:
        url: Absolute URL to request.
        headers: Optional dict of extra HTTP headers (for example a
            ``cookie`` or ``user-agent``) sent with the request.

    Returns:
        The decoded response body, or ``""`` when the request fails, times
        out, or the server answers with an HTTP error status.
    """
    try:
        # 30-second timeout so a stalled connection cannot hang the script.
        r = requests.get(url, headers=headers, timeout=30)
        # Turn 4xx/5xx answers into an exception; 200 passes through.
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        # Best-effort fetch: callers treat "" as "nothing to parse".
        return ""
def parsePage(ilt, html):
    """Extract goods from a search-result page and append them to *ilt*.

    The page embeds its data as JSON text; this function pulls the string
    values stored under the ``view_price``, ``raw_title``, ``pic_url`` and
    ``view_sales`` keys and pairs them up by position.

    Args:
        ilt: List that receives one ``[price, title, image, sales]`` row
            (all strings) per product. It is modified in place.
        html: Page source to scan.

    Returns:
        None. If the four keys occur a different number of times, only as
        many rows as the rarest key allows are appended.
    """
    prices = _json_string_values("view_price", html, r"[\d.]*")
    titles = _json_string_values("raw_title", html)
    images = _json_string_values("pic_url", html)
    sales = _json_string_values("view_sales", html)
    # zip() stops at the shortest list, so a missing field can no longer
    # raise IndexError half-way through the page.
    for price, title, image, sold in zip(prices, titles, images, sales):
        ilt.append([price, title, image, sold])


def _json_string_values(key, html, value_pattern=r'(?:[^"\\]|\\.)*'):
    """Return the decoded string values of every ``"key":"..."`` pair in *html*."""
    # The default value pattern skips over backslash escapes, so an escaped
    # quote (\") inside a value does not end the match early.
    pattern = '"%s":"(%s)"' % (key, value_pattern)
    return [_decode_json_string(raw) for raw in re.findall(pattern, html)]


def _decode_json_string(raw):
    """Decode the JSON escapes in *raw* (the text between the quotes)."""
    # json.loads replaces the former eval(): it only ever produces a string
    # and cannot execute code found in the scraped page.
    try:
        return json.loads('"' + raw + '"')
    except ValueError:
        # Not valid JSON string content; keep the text as it was scraped.
        return raw
def printGoodsList(ilt, root="/Users/ljf/Desktop/爬⾍学习/爬取的图⽚//"):
    """Print the goods as a table and save each product image to disk.

    Args:
        ilt: List of ``[price, title, image, sales]`` rows. ``image`` is
            expected to be a protocol-relative URL (``//host/path``); a URL
            that already carries a scheme is used unchanged.
        root: Directory the images are written to, one file per row named
            ``<row number>.jpg``. The default is the original author's
            personal folder, so most callers should pass their own path.

    Returns:
        None. A row whose image cannot be downloaded or written prints
        "爬取失败" and the loop carries on with the next row.
    """
    tplt = "{:4}\t{:8}\t{:16}\t{:128}\t{:16}"
    print(tplt.format("序号", "价格", "商品名称", "图⽚链接", "销量"))
    for count, g in enumerate(ilt, start=1):
        # Scraped image links omit the scheme; complete them so they can be
        # both displayed and fetched.
        image_url = g[2] if "://" in g[2] else "http:" + g[2]
        print(tplt.format(count, g[0], g[1], image_url, g[3]))
        path = os.path.join(root, str(count) + ".jpg")
        try:
            # Creates missing parent directories too and is a no-op when the
            # folder already exists.
            os.makedirs(root, exist_ok=True)
            r = requests.get(image_url, timeout=30)
            r.raise_for_status()
            with open(path, "wb") as f:
                f.write(r.content)
        except (requests.RequestException, OSError):
            print("爬取失败")
    print("")
def main(goods="⽆⼈机", depth=1, cookie=None):
    """Search Taobao for *goods*, then print the results and save the images.

    Args:
        goods: Search keyword.
        depth: Number of result pages to fetch; each page is requested with
            an offset of 44 items from the previous one.
        cookie: Value of the ``cookie`` request header copied from a
            logged-in browser session. When omitted it is read from the
            ``TAOBAO_COOKIE`` environment variable.

    The cookie is deliberately not stored in the source: it is a personal
    session credential, and per the original author's note the search
    cannot be scraped without supplying your own.
    """
    if cookie is None:
        cookie = os.environ.get("TAOBAO_COOKIE", "")
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
    }
    if cookie:
        headers["cookie"] = cookie
    else:
        print("No cookie supplied (set TAOBAO_COOKIE or pass cookie=...); "
              "the search is unlikely to return any goods.")

    search_url = "https://s.taobao.com/search"
    infoList = []
    for i in range(depth):
        try:
            # params= lets requests URL-encode the keyword; "s" is the item
            # offset of the requested page (44 items per page).
            html = requests.get(
                search_url,
                params={"q": goods, "s": 44 * i},
                headers=headers,
                timeout=30,
            )
            html.raise_for_status()
        except requests.RequestException as err:
            # Skip the page that failed but keep fetching the others.
            print("爬取失败", err)
            continue
        parsePage(infoList, html.text)
    printGoodsList(infoList)
# Run the scraper only when executed as a script, so importing this module
# (for reuse or testing) does not trigger network requests.
if __name__ == "__main__":
    main()