A web crawler (also known as a web spider or web robot, and in the FOAF community more often called a Web scutter) is a program or script that automatically fetches information from the World Wide Web according to a set of rules. Less common names include ant, automatic indexer, emulator, and worm.

The script below is a small, complete example: it crawls Python job postings from 51job, normalizes the salary figures, tallies job titles and description keywords, loads the results into MySQL, and renders a word cloud.
Everything the script needs up front: requests and BeautifulSoup for crawling, jieba for Chinese word segmentation, and wordcloud/matplotlib for visualization.

```python
from pprint import pprint
import csv
from collections import Counter

import requests
from bs4 import BeautifulSoup
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud


class JobSpider:
    """Crawl Python job postings from 51job and analyze them."""

    def __init__(self):
        self.company = []   # one dict per posting: href, post, locate, salary
        self.text = ""      # concatenated job-description text
        self.headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        }
```
The crawl entry point walks 13 pages of search results and pulls the link, title, location, and salary out of each listing row:

```python
    def job_spider(self):
        """Crawler entry point."""
        url = ("http://search.51job.com/list/010000%252C020000%252C030200%252C040000,"
               "000000,0000,00,9,99,Python,2,{}.html?"
               "lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99"
               "&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0"
               "&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00"
               "&from=&welfare=")
        urls = [url.format(p) for p in range(1, 14)]
        for url in urls:
            r = requests.get(url, headers=self.headers).content.decode('gbk')
            bs = (BeautifulSoup(r, 'lxml')
                  .find("div", class_="dw_table")
                  .find_all("div", class_="el"))
            for b in bs:
                try:
                    href, post = b.find('a')['href'], b.find('a')['title']
                    locate = b.find('span', class_='t3').text
                    salary = b.find('span', class_='t4').text
                    self.company.append(
                        {'href': href, 'post': post, 'locate': locate, 'salary': salary})
                except Exception:
                    pass  # header and ad rows lack these tags; skip them
```
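51job has redesigned its pages since this was written, so the selectors above are tied to the markup of the time. The parse pattern itself is easy to verify offline; here is a minimal sketch against an invented HTML snippet (the markup below is illustrative, not 51job's real page):

```python
from bs4 import BeautifulSoup

html = """
<div class="dw_table">
  <div class="el">
    <a href="http://example.com/job/1" title="Python 开发工程师"></a>
    <span class="t3">上海-浦东新区</span>
    <span class="t4">1.5-2.5万/月</span>
  </div>
</div>
"""

rows = BeautifulSoup(html, 'lxml').find("div", class_="dw_table").find_all("div", class_="el")
for row in rows:
    a = row.find('a')
    print(a['href'], a['title'],
          row.find('span', class_='t3').text,
          row.find('span', class_='t4').text)
```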
Each posting's detail page holds the full job description. `post_require` fetches them all, strips the page's widget text, and saves the combined text for segmentation:

```python
    def post_require(self):
        """Fetch and save the job description of every posting."""
        for c in self.company:
            r = requests.get(c.get('href'), headers=self.headers).content.decode('gbk')
            bs = BeautifulSoup(r, 'lxml').find('div', class_="bmsg job_msg inbox").text
            # Strip the page's "report" (举报) and "share" (分享) widget text.
            s = bs.replace("举报", "").replace("分享", "").replace("\t", "").strip()
            self.text += s
        with open(r".\data\post_require.txt", "w+", encoding="utf-8") as f:
            f.write(self.text)
```
With the raw text on disk, `post_desc_counter` segments it with jieba (a custom user dictionary keeps domain terms from being split apart) and counts word frequencies:

```python
    def post_desc_counter(self):
        """Word-frequency statistics over the job descriptions."""
        with open(r".\data\post_require.txt", "r", encoding="utf-8") as f:
            post = f.read()
        # thulac would work as a segmenter too:
        #   thu = thulac.thulac(seg_only=True); thu.cut(post, text=True)
        jieba.load_userdict(r".\data\user_dict.txt")
        seg_list = jieba.cut(post, cut_all=False)
        counter = dict()
        for seg in seg_list:
            counter[seg] = counter.get(seg, 0) + 1
        counter_sort = sorted(counter.items(), key=lambda value: value[1], reverse=True)
        pprint(counter_sort)
        with open(r".\data\post_pre_desc_counter.csv", "w+", encoding="utf-8", newline="") as f:
            csv.writer(f).writerows(counter_sort)
```
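For what it's worth, the manual dict tally above can be expressed directly with `collections.Counter`, which the script already imports; a minimal sketch:

```python
from collections import Counter

import jieba

text = "熟悉 Python,熟悉 Linux 开发环境"
counter = Counter(jieba.cut(text, cut_all=False))
print(counter.most_common(3))
```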
Two smaller dumps follow the same pattern: one tally over job titles, and one export of the raw (salary, title, location) triples:

```python
    def post_counter(self):
        """Count postings by job title."""
        lst = [c.get('post') for c in self.company]
        counter_most = Counter(lst).most_common()
        pprint(counter_most)
        with open(r".\data\post_pre_counter.csv", "w+", encoding="utf-8", newline="") as f:
            csv.writer(f).writerows(counter_most)

    def post_salary_locate(self):
        """Dump the rough posting info: salary, title, and location."""
        lst = [(c.get('salary'), c.get('post'), c.get('locate')) for c in self.company]
        pprint(lst)
        with open(r".\data\post_salary_locate.csv", "w+", encoding="utf-8", newline="") as f:
            csv.writer(f).writerows(lst)
```
Salaries on 51job come in three formats: 万/月 (10k RMB per month), 万/年 (10k RMB per year), and 千/月 (1k RMB per month). `post_salary` normalizes them all to 万/月, collapsing each "low-high" range into the single value low + 0.4 × (high − low):

```python
    def post_salary(self):
        """Normalize all salaries to 万/month (10k RMB per month)."""
        month, year, thousand = [], [], []
        with open(r".\data\post_salary_locate.csv", "r", encoding="utf-8") as f:
            for row in csv.reader(f):
                if "万/月" in row[0]:
                    month.append((row[0][:-3], row[2], row[1]))
                elif "万/年" in row[0]:
                    year.append((row[0][:-3], row[2], row[1]))
                elif "千/月" in row[0]:
                    thousand.append((row[0][:-3], row[2], row[1]))
        calc = []
        # Collapse each "low-high" range to low + 0.4 * (high - low), then
        # convert yearly figures (/12) and 千/month figures (/10) to 万/month.
        for m in month:
            low, high = (float(x) for x in m[0].split("-"))
            calc.append((round((high - low) * 0.4 + low, 1), m[1], m[2]))
        for y in year:
            low, high = (float(x) for x in y[0].split("-"))
            calc.append((round(((high - low) * 0.4 + low) / 12, 1), y[1], y[2]))
        for t in thousand:
            low, high = (float(x) for x in t[0].split("-"))
            calc.append((round(((high - low) * 0.4 + low) / 10, 1), t[1], t[2]))
        pprint(calc)
        with open(r".\data\post_salary.csv", "w+", encoding="utf-8", newline="") as f:
            csv.writer(f).writerows(calc)
```
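Two quick sanity checks of that weighting, runnable on their own:

```python
# "1.5-2.5万/月": already in 万/month
low, high = 1.5, 2.5
print(round((high - low) * 0.4 + low, 1))         # 1.9

# "10-20万/年": a yearly figure, so divide by 12 months
low, high = 10.0, 20.0
print(round(((high - low) * 0.4 + low) / 12, 1))  # 1.2
```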
`post_salary_counter` then counts how often each normalized salary value occurs:

```python
    def post_salary_counter(self):
        """Frequency of each normalized salary value."""
        with open(r".\data\post_salary.csv", "r", encoding="utf-8") as f:
            lst = [row[0] for row in csv.reader(f)]
        counter = Counter(lst).most_common()
        pprint(counter)
        with open(r".\data\post_salary_counter1.csv", "w+", encoding="utf-8", newline="") as f:
            csv.writer(f).writerows(counter)
```
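That counter file lends itself to a quick distribution plot. A minimal sketch (not part of the original script) using the matplotlib already imported above:

```python
import csv

import matplotlib.pyplot as plt

with open(r".\data\post_salary_counter1.csv", encoding="utf-8") as f:
    rows = sorted((float(r[0]), int(r[1])) for r in csv.reader(f) if r)

salaries = [str(r[0]) for r in rows]
counts = [r[1] for r in rows]
plt.bar(range(len(counts)), counts, tick_label=salaries)
plt.xlabel("salary (10k RMB / month)")
plt.ylabel("postings")
plt.show()
```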
`world_cloud` turns the description word counts into a word cloud. The default WordCloud font has no Chinese glyphs, so a CJK font such as Microsoft YaHei must be supplied:

```python
    def world_cloud(self):
        """Render a word cloud from the description word counts."""
        counter = {}
        # Reads post_desc_counter.csv; post_desc_counter() writes
        # post_pre_desc_counter.csv, so this expects a cleaned-up copy
        # saved under that name.
        with open(r".\data\post_desc_counter.csv", "r", encoding="utf-8") as f:
            for row in csv.reader(f):
                counter[row[0]] = int(row[1])
        pprint(counter)
        # font_path must point at a font that covers Chinese glyphs.
        wordcloud = WordCloud(font_path=r".\font\msyh.ttf",
                              max_words=100, height=600,
                              width=1200).generate_from_frequencies(counter)
        plt.imshow(wordcloud)
        plt.axis('off')
        plt.show()
        wordcloud.to_file(r".\images\worldcloud.jpg")
```
Finally, the normalized rows can be loaded into MySQL for ad-hoc queries:

```python
    def insert_into_db(self):
        """Insert the normalized data into MySQL.

        create table jobpost(
            j_salary float(3, 1),
            j_locate text,
            j_post text
        );
        """
        import pymysql

        conn = pymysql.connect(host="localhost", port=3306, user="root",
                               passwd="0303", db="chenx", charset="utf8")
        cur = conn.cursor()
        sql = "insert into jobpost(j_salary, j_locate, j_post) values(%s, %s, %s)"
        with open(r".\data\post_salary.csv", "r", encoding="utf-8") as f:
            for row in csv.reader(f):
                try:
                    cur.execute(sql, (row[0], row[1], row[2]))
                    conn.commit()
                except Exception as e:
                    print(e)
        cur.close()
        conn.close()
```
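Committing row by row is safe but slow. pymysql's `executemany` sends the whole batch with a single commit; a sketch against the same jobpost table:

```python
import csv

import pymysql

conn = pymysql.connect(host="localhost", port=3306, user="root",
                       passwd="0303", db="chenx", charset="utf8")
with conn.cursor() as cur, open(r".\data\post_salary.csv", encoding="utf-8") as f:
    rows = [tuple(row) for row in csv.reader(f) if row]
    cur.executemany(
        "insert into jobpost(j_salary, j_locate, j_post) values(%s, %s, %s)", rows)
conn.commit()
conn.close()
```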
The main block runs the steps one at a time; uncomment them in pipeline order:

```python
if __name__ == "__main__":
    spider = JobSpider()
    # spider.job_spider()
    # spider.post_require()
    # spider.post_salary_locate()
    # spider.post_salary()
    # spider.insert_into_db()
    # spider.post_salary_counter()
    # spider.post_counter()
    # spider.post_desc_counter()
    # spider.world_cloud()
```

