My background is in statistics, so I figured I'd scrape Zhilian Zhaopin's statistics-related job postings. Turns out those positions are in strong demand...
I use requests and bs4 here to fetch and parse the pages, and the data goes into a MySQL database. The code is a bit messy; bear with it for now, and I'll tidy it up when I have time...
import requests
from bs4 import BeautifulSoup
import re
import time
import datetime
import MySQLdb

# Fetch a search-results page and hand it off to the parser
def get_save_url(url, headers):
    resp = requests.get(url, headers=headers)
    if resp.status_code == 200:
        print("Opened page " + url)
        soup = BeautifulSoup(resp.text, 'lxml')
        pagenum = url.split("_")[-2]
        print("Scraping job listings from page " + pagenum)
        get_post_fact_city_num(soup, resp.text)

# Parse the page: collect post names, companies, cities, headcounts,
# industries, and posting times, then pass everything to save_to_sql
def get_post_fact_city_num(soup, html):
    l_post_name = []
    l_fact_name = []
    l_city_name = []
    l_num = []
    l_lab = []
    l_post_time = []

    # Job titles: the class attribute embeds tracking tokens, so match it
    # with a regex. Each listing appears twice on the page, hence [::2].
    regex = "__ga__fullResult(.*)postname_clicksfullresult(.*)postnames_001"
    posts = soup.find_all("a", {"class": re.compile(regex)})
    for post in posts[::2]:
        l_post_name.append(post.get_text())

    # Company names (also duplicated per listing)
    facts = soup.find_all("p", {"class": "searchResultCompanyname"})
    for fact in facts[::2]:
        l_fact_name.append(fact.get_text())

    # Cities (also duplicated per listing)
    cities = soup.find_all("em", {"class": "searchResultJobCityval"})
    for city in cities[::2]:
        l_city_name.append(city.get_text())

    # Number of openings
    nums = soup.find_all("em", {"class": "searchResultJobPeopnum"})
    for num in nums:
        l_num.append(num.get_text())

    # Company industry
    labs = soup.find_all("p", {"class": "searchResultCompanyIndustry"})
    for lab in labs:
        l_lab.append(lab.get_text())

    # The posting time sits in a <span> with no usable class, so pull it
    # straight out of the raw HTML with a regex
    time_regex = r"<span>发布时间\:<em></em>(.*)</span>"
    for t in re.findall(re.compile(time_regex), html):
        l_post_time.append(t)

    save_to_sql(l_post_name, l_fact_name, l_city_name, l_num, l_lab, l_post_time)

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
                         "(KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"}

# The page range can be set to anything covered by the search results
urls = ["http://xiaoyuan.zhaopin.com/full/0/0_0_0_0_0_-1_%E7%BB%9F%E8%AE%A1_{}_0".format(x)
        for x in range(2, 20)]

db = MySQLdb.connect(host="localhost", user="root",
                     passwd="your_password_here",  # put your own password here
                     db="test", use_unicode=True, charset="utf8")
cursor = db.cursor()

# Keep the column sizes generous -- some company descriptions run long.
# (This errors out if the table already exists; run it once.)
sqlxx = """CREATE TABLE zhilian_tongji(
    post_name VARCHAR(100),
    fact_name VARCHAR(100),
    city_name VARCHAR(20),
    num VARCHAR(50),
    lab VARCHAR(200),
    post_time VARCHAR(50),
    now_time VARCHAR(50)
)"""
cursor.execute(sqlxx)

def save_to_sql(l_post_name, l_fact_name, l_city_name, l_num, l_lab, l_post_time):
    now_time = datetime.datetime.now()
    sql = """INSERT INTO zhilian_tongji
             SET post_name=%s, fact_name=%s, city_name=%s, num=%s,
                 lab=%s, post_time=%s, now_time=%s"""
    for x in range(len(l_post_name)):
        cursor.execute(sql, (l_post_name[x], l_fact_name[x], l_city_name[x],
                             l_num[x], l_lab[x], l_post_time[x], now_time))
    db.commit()
    print("Scrape succeeded; rows saved to the database!")

for url in urls:
    pagenum = url.split("_")[-2]
    try:
        time.sleep(1)  # be polite: pause a second between requests
        get_save_url(url=url, headers=headers)
    except Exception:
        print("Page " + pagenum + " failed...")

db.close()
print("All done!!!")
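A side note: the MySQLdb driver above is from the Python 2 era. If you are on Python 3, pymysql works the same way; below is a minimal sketch of the same insert using it. This is my substitution rather than the original code, the row is dummy data, and the password is a placeholder.

# Minimal Python 3 sketch: the same insert via pymysql instead of MySQLdb.
# Assumes the zhilian_tongji table above already exists.
import datetime
import pymysql

db = pymysql.connect(host="localhost", user="root",
                     password="your_password_here",  # placeholder
                     db="test", charset="utf8")
cursor = db.cursor()

row = ("数据分析师", "某公司", "北京", "若干", "互联网",
       "2016-05-20", str(datetime.datetime.now()))  # dummy example row
cursor.execute(
    """INSERT INTO zhilian_tongji
       SET post_name=%s, fact_name=%s, city_name=%s, num=%s,
           lab=%s, post_time=%s, now_time=%s""",
    row)
db.commit()
db.close()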
The code's console output is as follows.
Querying the database gives the following results.
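If you want to spot-check the stored rows from Python rather than the MySQL client, a query along these lines works; this is a sketch of mine that opens a fresh connection, again with a placeholder password.

# Sanity check on the stored rows: print the first few entries
import MySQLdb

db = MySQLdb.connect(host="localhost", user="root",
                     passwd="your_password_here",  # placeholder
                     db="test", use_unicode=True, charset="utf8")
cursor = db.cursor()
cursor.execute("SELECT post_name, fact_name, city_name, post_time "
               "FROM zhilian_tongji LIMIT 5")
for row in cursor.fetchall():
    print(row)
db.close()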