51job_view.py quest
# Fill header cells of row 0 for columns 2 through 11. Each title's column
# is its position in the tuple plus two.
_header_titles = (
    '公司名称',
    '公司地点',
    '公司性质',
    '薪资',
    '学历要求',
    '⼯作经验',
    '公司规模',
    '公司类型',
    '公司福利',
    '发布时间',
)
for _offset, _title in enumerate(_header_titles):
    sheet1.write(0, _offset + 2, _title)
import urllib.request  # provides urlopen for the detail-page requests below

number = 1  # sheet row (and running record number) of the next saved posting
item = input('请输⼊岗位关键词(与⼤数据相关职业):')

# Detail-page patterns, compiled once instead of once per posting.
# NOTE(review): the literals reaching this file had their `&nbsp;` entities
# flattened to spaces (one survived as `&nbs p;`) and one `class ="at"` had a
# stray space; they are restored here on that evidence -- confirm against the
# live page markup.
company_re = re.compile(
    r'<div class="com_tag">.*?<p class="at" title="(.*?)"><span class="i_flag">'
    r'.*?<p class="at" title="(.*?)">.*?<p class="at" title="(.*?)">.*?',
    re.S,
)
# The `|` characters below are unescaped, so this is an alternation of four
# branches; findall() therefore yields one tuple per matched branch, which is
# what the job_need[1][0] / job_need[2][0] lookups further down rely on.
job_need_re = re.compile(
    r'<p class="msg ltype".*?>.*?&nbsp;&nbsp;<span>|</span>&nbsp;&nbsp;(.*?)'
    r'&nbsp;&nbsp;<span>|</span>&nbsp;&nbsp;(.*?)'
    r'&nbsp;&nbsp;<span>|</span>&nbsp;&nbsp;.*?</p>',
    re.S,
)
welfare_re = re.compile(r'<span class="sp4">(.*?)</span>', re.S)

# Fields read from each listing tuple `row` and from its detail page:
#   row[0] job title      row[1] job URL        row[2] company name
#   row[4] location       row[5] salary         row[6] publish date
#   company[0][0] company nature, company[0][1] size, company[0][2] type
#   job_need[1][0] work experience, job_need[2][0] education
#   welfare: list of benefit labels
for j in range(1, 10000):  # page range; adjust the upper bound as needed
    try:
        print("正在爬取第" + str(j) + "页数据...")
        html = getfront(j, item)  # raw HTML of listing page j
        for row in getInformation(html):
            try:
                url1 = row[1]
                # Close the HTTP response deterministically.
                with urllib.request.urlopen(url1) as response:
                    res1 = response.read().decode('gbk')
                company = company_re.findall(res1)
                job_need = job_need_re.findall(res1)
                welfare = welfare_re.findall(res1)
                # Every index is evaluated here, before any cell is written,
                # so a posting with missing fields is skipped without leaving
                # a half-filled row behind.
                print(row[0], row[2], row[4], row[5], company[0][0],
                      job_need[2][0], job_need[1][0], company[0][1],
                      company[0][2], welfare, row[6])
                sheet1.write(number, 0, number)
                sheet1.write(number, 1, row[0])
                sheet1.write(number, 2, row[2])
                sheet1.write(number, 3, row[4])
                sheet1.write(number, 4, company[0][0])
                sheet1.write(number, 5, row[5])
                sheet1.write(number, 6, job_need[2][0])
                sheet1.write(number, 7, job_need[1][0])
                sheet1.write(number, 8, company[0][1])
                sheet1.write(number, 9, company[0][2])
                sheet1.write(number, 10, "  ".join(str(w) for w in welfare))
                sheet1.write(number, 11, row[6])
                number += 1
                # Save after every row so progress survives an interruption.
                excel1.save("51job.xls")
                # Pause between postings so bulk crawling is not mistaken for
                # an attack and the IP banned.
                time.sleep(0.3)
            except Exception as err:
                # Best effort: report and skip this posting. Unlike a bare
                # `except:`, this lets Ctrl-C stop the crawl.
                print("skipped posting:", err)
    except Exception as err:
        print("skipped page", j, ":", err)
51job_view2.py
import pandas as pd    #pandas 是对表格数据模型在python上的模拟,它有简单的像SQL 对数据的处理,能够⽅便的在python上实现
import re              #正则表达式
# Load the scraped table and drop incomplete rows.
# NOTE(review): `data` is not defined in the visible part of this script --
# presumably the crawler's spreadsheet is loaded just above; confirm.
result = pd.DataFrame(data)
# axis=0 drops rows (axis=1 would drop columns); how='any' drops a row as soon
# as one value is missing, whereas how='all' requires every value to be missing.
a = result.dropna(axis=0, how='any')
pd.set_option('display.max_rows', None)  # print every row, no truncation

# Keep only postings whose title mentions '数据'. Boolean masks are used
# instead of positional range(len(...)) lookups, which miss rows whose index
# label is >= the row count once dropna() has removed rows.
a = a[a['职位'].astype(str).str.contains('数据', regex=False)]

# A head count (text containing '人') in the education column means the row
# was scraped incorrectly, so such rows are removed.
# NOTE(review): the match strings in this section use the ordinary ideographs
# 人 / 月 rather than their compatibility-radical look-alikes, which
# presumably never occur in scraped text -- confirm against the data.
a = a[~a['学历要求'].astype(str).str.contains('人', regex=False)].copy()

_SALARY_NUMBER = re.compile(r'\d*\.?\d+')


def _to_wan_per_month(text):
    """Return *text* with its salary range expressed in 万/月.

    '万/年' ranges are divided by 12 and '千/月' ranges by 10, each bound
    formatted to two decimals. Anything else (non-strings, other units,
    fewer than two numbers) is returned unchanged.
    """
    if not isinstance(text, str):
        return text
    if '万/年' in text:
        divisor = 12
    elif '千/月' in text:
        divisor = 10
    else:
        return text
    bounds = _SALARY_NUMBER.findall(text)
    if len(bounds) < 2:
        return text
    low = format(float(bounds[0]) / divisor, '.2f')
    high = format(float(bounds[1]) / divisor, '.2f')
    return low + '-' + high + '万/月'


# Write the converted salaries back into the frame. The earlier code assigned
# into a character of the salary string, which always raises and was silently
# swallowed, so no conversion ever took effect.
a['薪资'] = a['薪资'].map(_to_wan_per_month)

# Save to a second Excel file; index=False omits the row labels
# (the default, True, would write them).
a.to_excel('51job2.xls', sheet_name='Job', index=False)
>>>>>>>>>>>>>>>>>>###
import re  # regular expressions
from collections import Counter  # frequency counting for the chart helpers

import matplotlib.pyplot as plt  # data visualisation
import pandas as pd  # tabular data handling with SQL-like operations
from pyecharts import Funnel,Pie,Geo  # pyecharts generates Echarts charts
# Funnel (funnel chart), Pie (pie chart), Geo (geographic coordinate system)
# NOTE(review): `file` is not defined in the visible part of this script --
# presumably the cleaned spreadsheet is loaded just above; confirm.
f = pd.DataFrame(file)
pd.set_option('display.max_rows', None)  # print every row, no truncation
add = f['公司地点']  # company locations
sly = f['薪资']  # salary ranges
edu = f['学历要求']  # education requirements
exp = f['⼯作经验']  # work-experience requirements
address = []
salary = []
education = []
experience = []
# Walk the four columns in lockstep. A row that cannot be parsed is skipped
# from the point of failure on (best effort), so `address` may hold entries
# for rows whose salary could not be parsed.
for place, pay, degree, years in zip(add, sly, edu, exp):
    try:
        # Keep only the part of the location before the first '-'.
        address.append(place.split('-')[0])
        bounds = re.findall(r'\d*\.?\d+', pay)
        # [lowest, highest] salary of the range
        salary.append([float(bounds[0]), float(bounds[1])])
        education.append(degree)
        experience.append(years)
    except (AttributeError, TypeError, IndexError, ValueError):
        continue
min_s = [pair[0] for pair in salary]  # lowest salary per parsed row
max_s = [pair[1] for pair in salary]  # highest salary per parsed row
# If matplotlib cannot render the Chinese labels, a CJK-capable font has to be
# configured first (no such configuration is present in this script).
# Relate work experience to salary and plot the mean minimum salary.
my_df = pd.DataFrame({'experience':experience,'min_salay': min_s,'max_salay': max_s})
data1 = my_df.groupby('experience').mean()['min_salay'].plot(kind='line')
plt.show()
# Relate education to salary and plot the mean minimum salary.
my_df2 = pd.DataFrame({'education':education,'min_salay': min_s,'max_salay': max_s})
data2 = my_df2.groupby('education').mean()['min_salay'].plot(kind='line')
plt.show()
# Count how many postings list each education requirement.
def get_edu(list):
    """Return a dict mapping each distinct value in *list* to its count.

    The parameter keeps its original name, which shadows the ``list``
    builtin inside this function, so that existing callers are unaffected.
    An empty input yields an empty dict.
    """
    return dict(Counter(list))
# Tally the education requirements and feed the tally to a rose-type pie.
dir1 = get_edu(education)
attr, value = dir1.keys(), dir1.values()
pie = Pie("学历要求")
_pie_options = dict(
    center=[50, 50],
    is_random=False,
    radius=[35, 75],
    rosetype='radius',
    is_legend_show=False,
    is_label_show=True,
    legend_orient='vertical',
)
pie.add("", attr, value, **_pie_options)
# Count how many postings each region has.
def get_address(list):
    """Return a dict mapping each distinct region in *list* to its count.

    The '异地招聘' entry is dropped from the result; its absence is
    tolerated rather than raising ``KeyError``. The parameter keeps its
    original name, which shadows the ``list`` builtin inside this function.
    """
    address2 = Counter(list)
    address2.pop('异地招聘', None)
    # Other names that are invalid or unknown to the map package can be
    # removed here in the same way.
    return dict(address2)
dir2 = get_address(address)
# Plot the per-region posting counts on a geographic coordinate chart.
geo = Geo("⼤数据⼈才需求分布图", title_color="#2E2E2E",
          title_text_size=24, title_top=20, title_pos="center",
          width=1300, height=600)
attr2 = dir2.keys()
value2 = dir2.values()
geo.add("", attr2, value2, type="effectScatter", is_random=True,
        visual_range=[0, 1000], maptype='china', symbol_size=8,
        effect_scale=5, is_visualmap=True)
# Count how many postings list each work-experience requirement.
def get_experience(list):
    """Return a dict mapping each distinct value in *list* to its count.

    The parameter keeps its original name, which shadows the ``list``
    builtin inside this function, so that existing callers are unaffected.
    An empty input yields an empty dict.
    """
    return dict(Counter(list))
# Tally the work-experience requirements and draw the tally as a funnel chart.
dir3 = get_experience(experience)
attr3, value3 = dir3.keys(), dir3.values()
funnel = Funnel("⼯作经验漏⽃图", title_pos='center')
_funnel_options = dict(
    is_label_show=True,
    label_pos="inside",
    label_text_color="#fff",
    legend_orient='vertical',
    legend_pos='left',
)
funnel.add("", attr3, value3, **_funnel_options)
环境版本
python  3.6
urllib3  1.24.3
xlwt  1.3.0
pandas  0.25.3
pyecharts  0.1.94

发表评论