Python爬取网易云音乐评论及相关信息
urllib
requests
正则表达式
爬取网易云音乐评论及相关信息
urllib了解
requests了解
正则表达式
爬取网易云音乐评论及相关信息
1、分析网易云页面
2、获取加密的参数 params 和 encSecKey
url编码和utf8区别
3、爬取网易云音乐评论及相关信息
1、分析网易云音乐页面
2、获取加密的参数 params 和 encSecKey
3、爬取网易云音乐评论及相关信息(代码有冗余)
(1)使用User Agent和代理IP隐藏身份之为何要设置User Agent
import random

# Pool of desktop Chrome User-Agent strings to rotate through.
agents = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]

# When scraping comments for many songs, a different User-Agent can be picked
# at random for each request. random.choice() returns one element directly,
# so no list-to-string conversion is needed.
header = {'User-Agent': random.choice(agents)}
print(header)
(2)爬取指定一首歌的热评
注意:分析页面发现,热评只在每一首歌的首页,有15条。
代码说明:代码中的url和data参数值复制自上面图中圈出的部分。
# -*-coding:utf-8-*-
"""
Fetch the 15 hot comments of one specific NetEase Cloud Music song.

Dated 2018-06-26.
"""
import json
import urllib.parse
import urllib.request


def get_hotComments():
    """Print the hot comments of one hard-coded song.

    Sends a POST request with captured form values to the song's comment
    endpoint, parses the JSON reply and prints every entry of its
    ``hotComments`` list as a numbered line.

    Raises:
        urllib.error.URLError: if the request fails.
        KeyError: if the reply has no ``hotComments`` key.
    """
    # Comment endpoint of the song.
    # NOTE(review): the scheme and the ".com" suffix were missing from the
    # source text; "http://" is a reconstruction -- confirm (https may be
    # required).
    url = 'http://music.163.com/weapi/v1/resource/comments/R_SO_4_862102137?csrf_token='
    header = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
    # POST form data, copied from a captured browser request.
    # FIXME(review): both values are TRUNCATED in the source text (the line
    # was cut off and the closing quotes were lost). Only the visible
    # prefixes are kept here; paste the complete values from a fresh
    # capture before running.
    data = {
        'params': 'LPkOcWb/uz2Nj6xw+RFhGJ1PkFi4+lh4agK+1jRGmjMAiOcJ5RHxQBbZa+aME54AUdi21JkqLu/yeHjjIaLQJ4wzqiuzrzYUKciRCqmCDX9z',
        'encSecKey': '5627cc7941cf4cbd59b13668efe38a622ed0889d33cdcf603d18b025eb34ac434c882ed5ad16ca06e88e40a8b91de455483d0b88b6b46',
    }
    postdata = urllib.parse.urlencode(data).encode('utf8')  # form-encode the body
    request = urllib.request.Request(url, headers=header, data=postdata)
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(request) as resp:
        response = resp.read().decode('utf8')
    json_dict = json.loads(response)
    hot_comment = json_dict['hotComments']  # list of hot-comment objects
    for num, item in enumerate(hot_comment, start=1):
        print('第%d条评论:' % num + item['content'])


if __name__ == '__main__':
    get_hotComments()
代码输出,如下图:有完整15条数据,截图范围有限,显示6条。
(3)爬取网易云音乐199首热歌榜每首歌的评论数据
分析问题,要获取一首歌曲的页面,
代码说明1:如果导入from Crypto.Cipher import AES提示错误No module named Crypto.Cipher,请参考相关文章(原文链接已丢失;通常安装 pycryptodome 即可解决)。
代码说明2:当歌曲评论不足指定页数时,代码可以选择跳过或者break。
# -*- coding:utf-8 -*-
"""
爬取⽹易云⾳乐热歌榜的最新评论,指定页数的所有评论,⽐如前10页
2018年6⽉26⽇
"""
import base64
import json
import os
import random
import re
import time
import urllib.parse
import urllib.request

import requests
from Crypto.Cipher import AES
# Pool of desktop Chrome User-Agent strings to rotate through.
agents = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]

# Request headers sent with every comment request (see get_json).
# The User-Agent is drawn once, when this module is loaded.
# NOTE(review): the ".com" suffix and the URL scheme were missing from the
# source text; the values below are reconstructed -- confirm whether the
# site expects "http://" or "https://".
headers = {
    'Host': 'music.163.com',
    'Origin': 'http://music.163.com',
    'Referer': 'http://music.163.com/song?id=28793052',
    'User-Agent': random.choice(agents),
}
# Apart from the first parameter, the parameters below are fixed values that
# can be reused as-is.
# In the first parameter, offset is (comment page - 1) * 20, and total is
# "true" for the first page and "false" for every other page.
# First parameter (built per page in get_params), e.g.:
# first_param = '{rid:"", offset:"0", total:"true", limit:"20", csrf_token:""}'
# Second parameter
second_param = "010001"
# Third parameter
# FIXME(review): this literal is TRUNCATED in the source text (the closing
# quote and the rest of the value were lost). Only the visible prefix is
# kept; restore the full value from the original article before relying on it.
third_param = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf6"
# Fourth parameter; get_params uses it as the key of its first AES pass.
forth_param = "0CoJUm6Qyw8W8jud"
# Build the encrypted "params" form value.
def get_params(page):
    """Return the encrypted ``params`` value for one comment page.

    Args:
        page: 1-based comment page number.

    Returns:
        The Base64 text produced by encrypting the request descriptor with
        AES_encrypt twice: first with ``forth_param`` as key, then with a
        key of sixteen ``'F'`` characters.
    """
    iv = "0102030405060708"
    first_key = forth_param
    second_key = 16 * 'F'
    # Only the first page asks for the total; later pages pass an offset of
    # 20 comments per page.
    if page == 1:
        offset, total = '0', 'true'
    else:
        offset, total = str((page - 1) * 20), 'false'
    first_param = '{rid:"", offset:"%s", total:"%s", limit:"20", csrf_token:""}' % (offset, total)
    h_encText = AES_encrypt(first_param, first_key, iv)
    return AES_encrypt(h_encText, second_key, iv)
# Return the "encSecKey" form value.
def get_encSecKey():
    """Return the fixed ``encSecKey`` string sent with every request.

    FIXME(review): the literal below is TRUNCATED in the source text (its
    closing quote was lost and the following ``return`` was fused onto the
    same line). Only the visible prefix is kept; restore the complete value
    from the original article -- the server presumably will not accept the
    shortened key.
    """
    encSecKey = "257348aecb5e556c066de214e531faadd1c55d814f9be95fd06d6bff9f4c7a41f831f6394d5a3fd2e3881736d94a02ca919d952872e7d0a50e"
    return encSecKey
# Encryption step (the original comment called this "decryption").
def AES_encrypt(text, key, iv):
    """Encrypt *text* with AES in CBC mode and return it as Base64 text.

    Args:
        text: plaintext, ``str`` or ``bytes``.
        key: AES key, ``str`` or ``bytes``.
        iv: CBC initialisation vector, ``str`` or ``bytes``.

    Returns:
        The ciphertext, Base64-encoded and decoded to ``str``.
    """
    def _as_bytes(value):
        # Hand bytes to the cipher: str input is encoded as UTF-8.
        return value.encode('utf-8') if isinstance(value, str) else value

    data = _as_bytes(text)
    # Pad to a multiple of 16 bytes; each padding byte holds the pad length.
    # The length is measured in bytes so non-ASCII text is padded correctly.
    pad = 16 - len(data) % 16
    data += bytes([pad]) * pad
    encryptor = AES.new(_as_bytes(key), AES.MODE_CBC, _as_bytes(iv))
    encrypt_text = base64.b64encode(encryptor.encrypt(data))
    # b64encode returns bytes; callers expect str.
    return str(encrypt_text, encoding="utf-8")
# Fetch the comment JSON text.
def get_json(url, params, encSecKey):
    """POST the encrypted form to *url* and return the reply body as text.

    Args:
        url: comment endpoint to post to.
        params: value for the ``params`` form field.
        encSecKey: value for the ``encSecKey`` form field.

    Returns:
        The response body decoded as UTF-8 (not parsed).
    """
    data = {
        "params": params,
        "encSecKey": encSecKey
    }
    response = requests.post(url, headers=headers, data=data)
    return response.content.decode('utf-8')  # decode the raw bytes
# Fetch the names and ids of all songs on the hot-song chart.
def get_all_hotSong():
    """Scrape the hot-song chart page for song names and ids.

    Returns:
        A tuple ``(hot_song_name, hot_song_id)`` of two lists of strings,
        both in page order.

    Raises:
        urllib.error.URLError: if the page cannot be fetched.
        IndexError: if the hidden song list is not found in the page.
    """
    # Hot-song chart page.
    # NOTE(review): the scheme and the ".com" suffix were missing from the
    # source text; "http://" is a reconstruction -- confirm (https may be
    # required).
    url = 'http://music.163.com/discover/toplist?id=3778678'
    header = {'User-Agent': random.choice(agents)}  # random User-Agent per call
    request = urllib.request.Request(url=url, headers=header)
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(request) as resp:
        html = resp.read().decode('utf8')
    # First pass: isolate the hidden <ul> that holds every chart entry.
    pat1 = r'<ul class="f-hide"><li><a href="/song\?id=\d*?">.*</a></li></ul>'
    result = re.compile(pat1).findall(html)
    if not result:
        raise IndexError('hot-song list not found in the chart page')
    result = result[0]  # findall returns a list; take the first match
    pat2 = r'<li><a href="/song\?id=\d*?">(.*?)</a></li>'  # captures song names
    pat3 = r'<li><a href="/song\?id=(\d*?)">.*?</a></li>'  # captures song ids
    hot_song_name = re.compile(pat2).findall(result)
    hot_song_id = re.compile(pat3).findall(result)
    return hot_song_name, hot_song_id
# 抓取某⼀⾸歌的前page页评论
def get_all_comments(hot_song_id, page, hot_song_name, hot_song_order):  # hot_song_order为了给⽂件命名添加⼀个编号    all_comments_list = []  # 存放所有评论
url = 'music.163/weapi/v1/resource/comments/R_SO_4_' + hot_song_id + '?csrf_token='# 歌评url
dir = os.getcwd() + '\\Comments\\'
if not ists(dir):    # 判断当前路径是否存在,没有则创建new⽂件夹
os.makedirs(dir)
num = 0
f = open(dir + str(hot_song_order) + ' ' + hot_song_name + '.txt', 'w', encoding='utf-8')
# ' '是为了防⽌⽂件名也是数字混合,加个空格分隔符,写⼊⽂件, a 追加
for i in range(page):  # 逐页抓取
# print(url, i)
params = get_params(i+1)
encSecKey = get_encSecKey()
json_text = get_json(url, params, encSecKey)
# print(json_text)