Python爬虫必备—BeautifulSoup

1. Python 3 中只需记住两个库：urllib 和 requests

GET一个URL
>>> import urllib.request
>>> with urllib.request.urlopen('http://www.python.org/') as f:
...     print(f.read(300))

发送一个PUT请求
import urllib.request

# Bytes sent as the body of the PUT request.
payload = b'some data'
put_request = urllib.request.Request(
    url='http://localhost:8080',
    data=payload,
    method='PUT',
)
# The body is not read here; the response is only opened and closed.
with urllib.request.urlopen(put_request) as response:
    pass
# status and reason are still readable after the response has been closed.
print(response.status)
print(response.reason)

基本的HTTP认证
import urllib.request

# Register the credentials to use for this realm and URI.
basic_auth = urllib.request.HTTPBasicAuthHandler()
basic_auth.add_password(
    realm='PDQ Application',
    uri='https://mahler:8092/site-updates.py',
    user='klem',
    passwd='kadidd!ehopper',
)

# Build an opener around the handler and install it as the module-wide
# default, so the plain urlopen() call below goes through it.
opener = urllib.request.build_opener(basic_auth)
urllib.request.install_opener(opener)
urllib.request.urlopen('http://www.example.com/login.html')

使用proxy
# Route plain-HTTP requests through the proxy at www.example.com:3128.
http_proxy = urllib.request.ProxyHandler(
    {'http': 'http://www.example.com:3128/'}
)

# Credentials to present when the proxy asks for authentication.
proxy_credentials = urllib.request.ProxyBasicAuthHandler()
proxy_credentials.add_password('realm', 'host', 'username', 'password')

# This opener is used directly; it is not installed as the global default.
opener = urllib.request.build_opener(http_proxy, proxy_credentials)
opener.open('http://www.example.com/login.html')

添加头部
import urllib.request

# Attach the Referer header when the request object is created.
req = urllib.request.Request(
    'http://www.example.com/',
    headers={'Referer': 'http://www.python.org/'},
)
r = urllib.request.urlopen(req)

更改User-agent
import urllib.request

# Headers assigned to the opener's addheaders list; here only User-agent.
custom_headers = [('User-agent', 'Mozilla/5.0')]

opener = urllib.request.build_opener()
opener.addheaders = custom_headers
opener.open('http://www.example.com/')

使用GET时设置URL的参数
>>> import urllib.request
>>> import urllib.parse
>>> params = urllib.parse.urlencode({'spam': 1, 'eggs': 2, 'bacon': 0})
>>> url = "http://www.musi-cal.com/cgi-bin/query?%s" % params
>>> with urllib.request.urlopen(url) as f:
...     print(f.read().decode('utf-8'))
...

使用POST时设置参数
>>> import urllib.request
>>> import urllib.parse
>>> data = urllib.parse.urlencode({'spam': 1, 'eggs': 2, 'bacon': 0})
>>> data = data.encode('ascii')
>>> with urllib.request.urlopen("http://requestb.in/xrbl82xr", data) as f:
...     print(f.read().decode('utf-8'))
...

指定proxy
>>> import urllib.request
>>> proxies = {'http': 'http://proxy.example.com:8080/'}
>>> opener = urllib.request.FancyURLopener(proxies)
>>> with opener.open("http://www.python.org") as f:
...     f.read().decode('utf-8')
...
不使用proxy, 覆盖环境变量的proxy
>>> import urllib.request
>>> opener = urllib.request.FancyURLopener({})
>>> with opener.open("http://www.python.org/") as f:
...     f.read().decode('utf-8')
...

Requests 官方文档

http://docs.python-requests.org/zh_CN/latest/user/quickstart.html

http://docs.python-requests.org/zh_CN/latest/

Beautiful Soup 文档

BeautifulSoup基本用法总结

爬虫必备—BeautifulSoup

# 字符串处理函数整理 https://www.cnblogs.com/hardsoftware/p/6220374.html
# https://www.cnblogs.com/OldJack/p/7455124.html

import requests
from requests_ntlm import HttpNtlmAuth
from bs4 import BeautifulSoup

# Root of the site being scraped; also used to turn site-relative links
# (those starting with "/") into absolute URLs.
BASE_URL = "http://e"

# Fetch the page with NTLM authentication. Other pages on the same site
# (for example "/Share/Index") can be fetched the same way.
# The timeout keeps the script from hanging forever on an unreachable host,
# and raise_for_status() stops early instead of parsing an error page.
response = requests.get(
    BASE_URL + "/CAD/Index",
    auth=HttpNtlmAuth('cnsvwsh00\\lh', 'xxx'),
    timeout=10,
)
response.raise_for_status()

bs = BeautifulSoup(response.text, 'html5lib')

# Debugging tip: print(bs.prettify()) shows the parsed document, nicely indented.

# find() returns None when no element matches, so fall back to an empty list
# rather than failing with AttributeError on None.
container = bs.find(class_="layui-container clearfix")
text = container.find_all('a') if container is not None else []

for a in text:
    # a.string is None when the anchor does not contain exactly one text
    # child; such anchors are skipped.
    if a.string is None:
        continue
    print(a.string)

    # Anchors without an href attribute have no link to print.
    href = a.get('href')
    if href is None:
        continue
    if href.startswith('/'):
        print(BASE_URL + href)
    else:
        print(href)

本站文章如无特殊说明，均为本站原创，如若转载，请注明出处：Python爬虫爬虫必备—BeautifulSoup - Python技术站