详解Python爬取并下载《电影天堂》3千多部电影
0. 简介
本文主要介绍如何使用Python来爬取并下载电影天堂网站上的电影资源,包括如何从首页获取分类信息和对应的电影列表,如何从电影列表页获取详细的电影信息和下载链接,并使用迅雷进行自动下载。
1. 准备工作
在进行爬取之前,需要安装一些必要的Python库和工具:
- BeautifulSoup4: 用于解析HTML和XML文档
- requests: 用于发送HTTP请求
- selenium: 用于模拟浏览器进行数据采集(目前电影天堂需要动态加载)
- pymongo: 如果需要将数据存入MongoDB
此外,需要下载安装Chrome浏览器和对应版本的ChromeDriver。
2. 获取分类及电影列表
首先需要从 https://www.dy2018.com 获取电影天堂的首页HTML内容,然后使用BeautifulSoup4解析获取各个分类信息及对应的电影列表页URL。
# Section 2: fetch the dy2018 home page, collect the category links, then
# collect the absolute detail-page URL of every movie listed per category.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = "https://www.dy2018.com/"
response = requests.get(url)
# dy2018 pages are served in a GB charset; let requests sniff the real
# encoding so response.text is not mojibake -- TODO confirm on a live page.
response.encoding = response.apparent_encoding
soup = BeautifulSoup(response.text, 'html.parser')

# Category names and their listing-page URLs from the nav block.
categories = []
category_list = soup.find('div', class_='contain').find('div', class_='bd2').find_all('a')
for category in category_list:
    href = category.get('href')
    if not href:
        # Skip anchors with no target instead of storing url=None.
        continue
    categories.append({
        'name': category.text,
        # Resolve relative paths so the later requests.get() always sees an
        # absolute URL (the original passed site-relative hrefs straight in).
        'url': urljoin(url, href),
    })

# For every category page, collect one entry per listed movie.
movie_urls = []
for category in categories:
    response = requests.get(category['url'])
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'html.parser')
    for movie in soup.find_all('b', class_='title'):
        link = movie.find('a')
        if link is None or not link.get('href'):
            # Title without a link -- nothing to crawl.
            continue
        movie_urls.append({
            'category': category['name'],
            # urljoin generalizes the original '/html/' prefix check: it
            # handles both site-relative and already-absolute hrefs.
            'url': urljoin('https://www.dy2018.com/', link.get('href')),
        })
3. 获取电影详细信息及下载链接
对于每个电影列表页,需要进入详细信息页并解析获取电影的详细信息和下载链接。这里使用selenium模拟浏览器进行数据采集。
# Section 3: visit every movie detail page with headless Chrome and scrape
# title, cover, rating, synopsis, credits and the thunder:// download links.
from selenium import webdriver

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")  # run without a visible window
# Selenium 4 removed the chrome_options= keyword argument; options= is the
# forward-compatible spelling.
browser = webdriver.Chrome(options=chrome_options)

movies = []
try:
    for movie_url in movie_urls:
        browser.get(movie_url['url'])
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        movie_info_table = soup.find('div', id='Zoom').find_all('table')[0]
        movie_title = soup.find('div', class_='title_all').h1.font.text
        movie_cover = movie_info_table.find('img').get('src')
        movie_rating = soup.find('div', class_='rating').find_all('strong')[0].text
        movie_description = movie_info_table.find_all('td')[1].text.strip()
        # Rows 0/1/2 are assumed to hold genre/director/cast links -- TODO
        # confirm this table layout against a live detail page.
        rows = movie_info_table.find_all('tr')
        movie_type = [a.text for a in rows[0].find_all('a')]
        movie_director = [a.text for a in rows[1].find_all('a')]
        movie_actors = [a.text for a in rows[2].find_all('a')]
        # Keep only thunder:// links; guard against <a> tags without an href
        # (the original crashed calling startswith on None).
        movie_download_urls = [
            {'name': a.text, 'url': a.get('href')}
            for a in soup.select('#Zoom table > tbody > tr > td > a')
            if a.get('href') and a.get('href').startswith('thunder:')
        ]
        movies.append({
            'title': movie_title,
            'cover': movie_cover,
            'rating': movie_rating,
            'description': movie_description,
            'type': ','.join(movie_type),
            'director': ','.join(movie_director),
            'actors': ','.join(movie_actors),
            'download_urls': movie_download_urls,
        })
finally:
    # Always release the Chrome process, even if a page fails to parse
    # (the original never quit the browser).
    browser.quit()
4. 自动下载
对于每部电影,可以从其中选择一个下载链接进行下载。这里使用迅雷进行自动下载。
# Section 4: hand the first thunder:// link of each scraped movie to the
# local Thunder client. Adjust the hard-coded paths for your machine.
import os
import subprocess
import time
from urllib.parse import unquote
from selenium.webdriver.chrome.service import Service as ChromeService

thunder_path = "C:\\Program Files (x86)\\Thunder Network\\Thunder\\Program\\Thunder.exe"
chrome_path = "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe"
download_dir = "D:\\Movies"

chrome_service = ChromeService(executable_path="C:\\chromedriver.exe")
chrome_driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
try:
    for i, movie in enumerate(movies):
        print("Downloading movie: %d/%d" % (i + 1, len(movies)))
        if not movie['download_urls']:
            # Some pages yield no thunder:// link; skip instead of raising
            # IndexError on the empty list (original crashed here).
            continue
        download_url = unquote(movie['download_urls'][0]['url'], 'utf-8')
        chrome_driver.get(download_url)
        time.sleep(10)  # give the browser time to hand the URL to Thunder
        # Pass an argument list, not a shell string: the URL comes from a
        # scraped page, and os.system('"%s" "%s"') was a shell-injection risk.
        subprocess.run([thunder_path, download_url])
finally:
    # Release the browser process even if a download step fails.
    chrome_driver.quit()
注意,需要根据本机环境修改 thunder_path、download_dir 以及 ChromeDriver 的路径(示例中硬编码为 C:\chromedriver.exe),否则程序无法正常启动浏览器和迅雷。
示例
下面展示一个完整的代码示例,以搜索“玩具总动员”为例,获取电影详细信息并进行下载。
import os
import subprocess
import time
from urllib.parse import unquote

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
class DyttSpider:
    """Scrape dy2018.com for movies matching a search query and push their
    thunder:// links to the local Thunder download client.

    Paths set in __init__ are machine-specific and must be adjusted.
    """

    def __init__(self):
        # Site entry point and local tool paths (edit for your machine).
        self.url = 'https://www.dy2018.com/'
        self.chrome_path = "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe"
        self.thunder_path = "C:\\Program Files (x86)\\Thunder Network\\Thunder\\Program\\Thunder.exe"
        self.download_dir = "D:\\Movies"

    def run(self, query='玩具总动员'):
        """Search for *query*, scrape every result's details, download all."""
        categories, movies = self.parse_page(query)
        self.download_movies(movies)

    def _make_driver(self):
        # One headless Chrome instance; the caller is responsible for quit().
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--headless")  # no visible window
        chrome_service = ChromeService(executable_path="C:\\chromedriver.exe")
        return webdriver.Chrome(service=chrome_service, options=chrome_options)

    def parse_page(self, query):
        """Return (categories, movies) for *query*.

        categories: [{'name', 'url'}] from the home-page nav block.
        movies: [{'title', 'cover', 'rating', 'description', 'type',
                  'director', 'actors', 'download_urls'}] per search hit.
        """
        # Home-page categories.
        response = requests.get(self.url)
        soup = BeautifulSoup(response.text, 'html.parser')
        categories = []
        category_list = soup.find('div', class_='contain').find('div', class_='bd2').find_all('a')
        for category in category_list:
            categories.append({
                'name': category.text,
                'url': category.get('href'),
            })

        # Site search; the form field names are fixed by the site's CMS.
        search_url = 'https://www.dy2018.com/e/search/index.php'
        post_data = {
            'keyboard': query,
            'show': 'title,smalltext',
            'tempid': '1',
            'tbname': 'article',
            'submit': '搜索',
        }
        response = requests.post(search_url, data=post_data)
        soup = BeautifulSoup(response.text, 'html.parser')
        movie_list = soup.find_all('b', class_='title')

        # Reuse a single browser for all detail pages: the original spawned
        # one Chrome per movie inside the loop and quit only the last one,
        # leaking a browser process per result.
        movies = []
        chrome_driver = self._make_driver()
        try:
            for movie in movie_list:
                movie_url = movie.find('a').get('href')
                if movie_url.startswith('/html/'):
                    movie_url = 'https://www.dy2018.com' + movie_url
                chrome_driver.get(movie_url)
                soup = BeautifulSoup(chrome_driver.page_source, 'html.parser')
                movie_info_table = soup.find('div', id='Zoom').find_all('table')[0]
                movie_title = soup.find('div', class_='title_all').h1.font.text
                movie_cover = movie_info_table.find('img').get('src')
                movie_rating = soup.find('div', class_='rating').find_all('strong')[0].text
                movie_description = movie_info_table.find_all('td')[1].text.strip()
                # Rows 0/1/2 assumed to hold genre/director/cast links --
                # TODO confirm this table layout against a live page.
                rows = movie_info_table.find_all('tr')
                movie_type = [a.text for a in rows[0].find_all('a')]
                movie_director = [a.text for a in rows[1].find_all('a')]
                movie_actors = [a.text for a in rows[2].find_all('a')]
                # Keep only thunder:// links; guard against href-less <a>
                # tags (the original crashed on .startswith(None)).
                movie_download_urls = [
                    {'name': a.text, 'url': a.get('href')}
                    for a in soup.select('#Zoom table > tbody > tr > td > a')
                    if a.get('href') and a.get('href').startswith('thunder:')
                ]
                movies.append({
                    'title': movie_title,
                    'cover': movie_cover,
                    'rating': movie_rating,
                    'description': movie_description,
                    'type': ','.join(movie_type),
                    'director': ','.join(movie_director),
                    'actors': ','.join(movie_actors),
                    'download_urls': movie_download_urls,
                })
        finally:
            chrome_driver.quit()
        return categories, movies

    def download_movies(self, movies):
        """Open each movie's first thunder:// link and launch Thunder on it."""
        chrome_driver = self._make_driver()
        try:
            for i, movie in enumerate(movies):
                print("Downloading movie: %d/%d" % (i + 1, len(movies)))
                if not movie['download_urls']:
                    # No thunder:// link scraped for this movie; skip it
                    # instead of raising IndexError on the empty list.
                    continue
                download_url = unquote(movie['download_urls'][0]['url'], 'utf-8')
                chrome_driver.get(download_url)
                time.sleep(10)  # give the browser time to hand off the URL
                # Argument list, not a shell string: the URL comes from a
                # scraped page, so os.system quoting was an injection risk.
                subprocess.run([self.thunder_path, download_url])
        finally:
            # Quit even if a download step raises (original leaked on error).
            chrome_driver.quit()
# Script entry point: crawl and download the default example query.
if __name__ == '__main__':
    DyttSpider().run(query='玩具总动员')
输出结果为:
Downloading movie: 1/1
这里仅下载了一部电影,如果有多部电影需要下载,输出结果会有相应变化。同时可以发现,在程序运行完成之后,迅雷会自动打开并开始下载。
本站文章如无特殊说明,均为本站原创,如若转载,请注明出处:详解Python爬取并下载《电影天堂》3千多部电影 - Python技术站