使用selenium用ISBN在京东上批量爬取书籍信息
首先读取 .xls 文件,然后根据表格里的ISBN在京东上挨个搜索,再把需要的信息从网页上提取出来保存在另一个文件里。每次运行 .py 文件后打开浏览器会弹出登录页面(30s),在此期间手动登录,30秒后开始爬取。
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Batch-scrape book info (ISBN, publish date, edition) from jd.com by ISBN.

Reads ISBNs from an .xls sheet, searches each on jd.com with selenium,
and writes the extracted fields to answer.xls.
"""
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import json
from selenium.webdriver.common.keys import Keys
from lxml import etree
import xlrd
import xlwt
import datetime
from time import sleep

# Headless options kept for reference; the manual-login step needs a
# visible browser, so they stay disabled.
# options = webdriver.ChromeOptions()
# options.add_argument('--headless')
# options.add_argument('--no-sandbox')
# options.add_argument('--disable-gpu')
# options.add_argument('--disable-dev-shm-usage')
# driver = webdriver.Chrome(chrome_options=options)

# Shared result dict with placeholder defaults; singleData() overwrites the
# fields it finds.  tDict is bound too so the main loop has a fallback value.
data_dict = tDict = {'ISBN': '0000000000000', '出版时间': '0000-00-00', '版次': '1'}
driver = webdriver.Chrome()
def test01_login():
    """Open the JD login page, wait 30 s for a manual login, then save cookies.

    Cookies are dumped to cookies.json so singleData() can restore the
    logged-in session before searching.
    """
    # NOTE(review): the original created a second local webdriver here,
    # shadowing the module-level one, so the login landed in a browser the
    # scraper never used.  Reuse the shared global driver instead.
    driver.get(
        "https://passport.jd.com/new/login.aspx?ReturnUrl=https%3A%2F%2Fwww.jd.com%2F")
    sleep(30)  # manual-login window
    cookies = driver.get_cookies()
    # persist the session cookies to a file
    with open("cookies.json", "w") as f:
        json.dump(cookies, f)
def singleData(para):
try:
<('www.jd/')
# 加载 cookies信息
with open("", "r") as f:
cookies = json.load(f)
for cookie in cookies:
driver.add_cookie(cookie)
driver.find_element_by_id("key").send_keys(para)
driver.find_element_by_xpath('//*[@id="search"]/div/div[2]/button/i').click()
sleep(3)
html = driver.page_source
h = etree.HTML(html)
# 在搜索到的结果中仅取⼀条链接
<("https:" + h.xpath('//*[@id="J_goodsList"]/ul/li[1]/div/div[1]/a/@href')[0])
html = driver.page_source
h = etree.HTML(html)
# 获得所爬信息
list = h.xpath('//div/ul[@class="parameter2 p-parameter-list"]/li/text()')
for li in list:
if li.lstrip().startswith('ISBN'):  # 消去左边的空格,然后检测以“ISBN”开头的⼀条
data_dict["ISBN"] = li
if li.lstrip().startswith('出版时间'):
data_dict["出版时间"] = li
if li.lstrip().startswith('版次'):
data_dict["版次"] = li
# driver.close()
return data_dict
except Exception as e:
# error occurred, log 'e', etc.
with open("", "a", encoding="utf-8") as f:
f.write(str(e) + "\n")
f.close()
# --- main: read ISBNs from table.xls, scrape each one, write answer.xls ---
readbook = xlrd.open_workbook(r'table.xls')
SheetOfInput = readbook.sheet_by_name('Sheet1')
nrows = SheetOfInput.nrows  # number of rows to process
writebook = xlwt.Workbook(encoding="utf8")  # output workbook
SheetOfOutput = writebook.add_sheet('test')  # output sheet

test01_login()  # manual login + cookie capture (30 s window)

for gi in range(0, nrows):
    try:
        # ISBN lives in column index 4 of row gi
        lng = SheetOfInput.cell(gi, 4).value
        tDict = singleData(lng)
        SheetOfOutput.write(gi, 0, tDict["ISBN"])
        SheetOfOutput.write(gi, 1, tDict["出版时间"])
        SheetOfOutput.write(gi, 2, tDict["版次"])
        # save after every row so a crash mid-run loses nothing
        writebook.save('answer.xls')
        print('tDict["ISBN"] = %s, tDict["出版时间"] = %s, tDict["版次"] = %s, gi = %d. '
              % (tDict["ISBN"], tDict["出版时间"], tDict["版次"], gi))
    except Exception as e:
        # log and continue with the next ISBN
        with open("errors.log", "a", encoding="utf-8") as f:
            f.write(str(e) + "\n")

driver.quit()
# 定义一个爬虫函数,针对单条isbn进行爬取,返回一个字典
# 打开table,读取isbn号,
# 调用定义的函数,然后将返回的字典写入table

发表评论