如何爬取电影信息并保存为JSON文件?
摘要:1. 网站地址:https://ssr1.scrape.center 2. 代码 import requests from lxml import etree import json # 存放所有电影信息 all_movies_data = [] ……
1. 网站地址:https://ssr1.scrape.center
2. 代码
import requests
from lxml import etree
import json
# Crawl listing pages 1-10 of the demo site, follow every movie's detail
# link to collect its description, and dump all records to a JSON file.

# Accumulates one dict per movie across all listing pages.
all_movies_data = []

# Site root; listing URLs and the relative detail links are built on it.
BASE_URL = 'https://ssr1.scrape.center'
# Seconds to wait on the server. requests has no default timeout, so
# without this a stalled connection would block the crawl forever.
REQUEST_TIMEOUT = 10
# Placeholder stored when a field is absent from the page.
MISSING = '暂无'


def _fetch_html(session, url):
    """Download ``url`` and return it parsed as an lxml HTML tree.

    Raises ``requests.HTTPError`` for 4xx/5xx responses so that an error
    page is never parsed as if it were real content.
    """
    response = session.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Force UTF-8 for every page (listing and detail alike) instead of
    # relying on the charset requests guesses from the response headers.
    response.encoding = 'utf-8'
    return etree.HTML(response.text)


def _first_match(node, path, default=MISSING):
    """Return the first result of ``node.xpath(path)``, or ``default`` if none."""
    matches = node.xpath(path)
    return matches[0] if matches else default


def _parse_movie(session, movie):
    """Build the record dict for one movie card ``<div>`` of a listing page.

    Single-valued fields fall back to ``MISSING`` when their node is
    absent, so one incomplete card does not abort the whole crawl with an
    ``IndexError``.
    """
    # The description only exists on the detail page behind the poster link.
    href = _first_match(movie, './div/div/div[1]/a/@href', default=None)
    if href is None:
        detail_text = MISSING
    else:
        detail_tree = _fetch_html(session, BASE_URL + href)
        detail_text = _first_match(
            detail_tree,
            '//*[@id="detail"]/div[1]/div/div/div[1]/div/div[2]/div[4]/p/text()',
        ).strip()

    return {
        # Movie title.
        'name': _first_match(movie, './div/div/div[2]/a/h2/text()'),
        # Categories: kept as the full list of matches, possibly empty.
        'type': movie.xpath('./div/div/div[2]/div[1]/button/span/text()'),
        # Region.
        'area': _first_match(movie, './div/div/div[2]/div[2]/span[1]/text()'),
        # Running time.
        'length': _first_match(movie, './div/div/div[2]/div[2]/span[3]/text()'),
        # Release date text.
        'time': _first_match(movie, './div/div/div[2]/div[3]/span/text()'),
        # Score; the text node carries surrounding whitespace.
        'score': _first_match(movie, './div/div/div[3]/p[1]/text()').strip(),
        # Description taken from the detail page.
        'detail': detail_text,
    }


# One Session for the whole crawl so connections to the host are reused.
with requests.Session() as session:
    for page_num in range(1, 11):
        page_tree = _fetch_html(session, f'{BASE_URL}/page/{page_num}')
        # Each matched <div> is one movie card on the current page.
        for movie in page_tree.xpath('//*[@id="index"]/div[1]/div[1]/div'):
            all_movies_data.append(_parse_movie(session, movie))
        print(f'page {page_num} done !')

# Save everything to a local JSON file; ensure_ascii=False keeps the
# Chinese text readable instead of \uXXXX escapes.
with open('movies_info.json', mode='w', encoding='utf-8') as f:
    json.dump(all_movies_data, f, ensure_ascii=False, indent=4)
