"""Extract job postings from a sample HTML table with BeautifulSoup.

The script parses the embedded ``html`` snippet, skips the table's header
row, collects the five text cells of every remaining row into a dict, and
prints the resulting list of dicts.
"""
from bs4 import BeautifulSoup

# Sample listing table: one header row (class "h") followed by ten data rows
# that alternate between the "even" and "odd" classes.
html = """
<table class="tablelist" cellpadding="0" cellspacing="0">
    <tr class="h">
        <td class="l" width="374">职位名称</td>
        <td>职位类别</td>
        <td>人数</td>
        <td>地点</td>
        <td>发布时间</td>
    </tr>
    <tr class="even">
        <td class="l square"><a target="_blank" href="position_detail.php?id=45021&keywords=python&tid=0&lid=0">22989-腾讯云计费PHP高级开发工程师</a></td>
        <td>技术类</td>
        <td>2</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="odd">
        <td class="l square"><a target="_blank" href="position_detail.php?id=45005&keywords=python&tid=0&lid=0">25663-腾讯云高级后台开发(互联网业务)(北京)</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>北京</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="even">
        <td class="l square"><a target="_blank" href="position_detail.php?id=45007&keywords=python&tid=0&lid=0">TEG06-云计算架构师(深圳)</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="odd">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44980&keywords=python&tid=0&lid=0">PCG04-PCG研发部数据科学家(深圳/北京)</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="even">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44981&keywords=python&tid=0&lid=0">PCG04-PCG研发部业务运维工程师(深圳)</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="odd">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44971&keywords=python&tid=0&lid=0">23674-腾讯新闻大数据分析工程师(北京)</a></td>
        <td>技术类</td>
        <td>2</td>
        <td>北京</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="even">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44964&keywords=python&tid=0&lid=0">TEG05-高级数据挖掘工程师(深圳)</a></td>
        <td>技术类</td>
        <td>2</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="odd">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44968&keywords=python&tid=0&lid=0">PCG01-QQ后台推荐算法工程师</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="even">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44969&keywords=python&tid=0&lid=0">PCG01-QQ后台大数据开发工程师</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="odd">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44952&keywords=python&tid=0&lid=0">22989-腾讯云AI产品高级咨询顾问(深圳北京)</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
</table>
"""

soup = BeautifulSoup(html, "lxml")

# find_all() usage notes (reference only; none of these run):
#
# 1. Find every <tr> tag.
#        trs = soup.find_all("tr")
# 2. Get the second <tr>: `limit` caps how many matches are returned, and the
#    specific tag is then picked out of the returned list by index.
#        tr = soup.find_all("tr", limit=2)[1]
# 3. Find every <tr> whose class is "even". `class` is a Python keyword, so
#    the keyword argument carries a trailing underscore.
#        trs = soup.find_all("tr", class_="even")
# 4. The same filter through `attrs`, which accepts several key/value pairs.
#        trs = soup.find_all("tr", attrs={"class": "even"})
# 5. Find every <a> that has target="_blank"; several keyword filters can be
#    combined in one call.
#        aList = soup.find_all("a", target="_blank")
# 6. Read the href attribute of every <a>.
#        aList = soup.find_all("a")
#        for a in aList:
#            href = a["href"]        # subscript access
#            href = a.attrs["href"]  # via the attrs dict
#
# Alternative to stripped_strings for the extraction below: read the cells
# one by one with tr.find_all("td") and take each cell's `.string`.

# Keys of each job dict, in the same order as the table's columns.
JOB_FIELDS = ("title", "category", "nums", "city", "pubtime")

# Collect every posting; the [1:] slice drops the header row.
trs = soup.find_all("tr")[1:]
jobs = []
for tr in trs:
    # stripped_strings yields each text node under the row with surrounding
    # whitespace removed and whitespace-only nodes skipped.
    infos = list(tr.stripped_strings)
    # Index explicitly so a row with fewer than five strings still raises
    # IndexError rather than silently producing a partial record.
    job = {field: infos[position] for position, field in enumerate(JOB_FIELDS)}
    jobs.append(job)

print(jobs)
# View Code
# 本站文章如无特殊说明,均为本站原创,如若转载,请注明出处:Python爬虫bs4解析实战 - Python技术站