如何爬取电影信息并保存为JSON文件?

摘要:1. 网站地址:https://ssr1.scrape.center 2. 代码 import requests from lxml import etree import json # 存放所有电影信息 all_movies_data =
# 1. Target site: https://ssr1.scrape.center
# 2. Code
import json

import requests
from lxml import etree

# Root of the site; list pages and detail links are resolved against it.
BASE_URL = 'https://ssr1.scrape.center'
# Seconds to wait on a request before giving up, so a stalled connection
# cannot hang the whole crawl.
REQUEST_TIMEOUT = 10
# Placeholder stored when a field is absent from the page.
MISSING = '暂无'


def fetch_tree(session, url):
    """Download *url* and return the response parsed as an lxml HTML tree.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.Timeout: no answer within REQUEST_TIMEOUT seconds.
    """
    res = session.get(url=url, timeout=REQUEST_TIMEOUT)
    # Fail loudly instead of parsing an error page as if it were content.
    res.raise_for_status()
    # Force UTF-8 for every page (list and detail alike) so Chinese text
    # is decoded consistently.
    res.encoding = 'utf-8'
    return etree.HTML(res.text)


def first_text(node, path, default=MISSING):
    """Return the first result of XPath *path* under *node*, or *default*."""
    found = node.xpath(path)
    return found[0] if found else default


def parse_movie(session, movie):
    """Build the info dict for one movie card on a list page.

    *movie* is the card's div element. The synopsis is not on the list
    page, so the card's link is followed to the detail page to read it.
    """
    info = './div/div/div[2]'

    movie_name = first_text(movie, f'{info}/a/h2/text()')
    # A movie can have several categories; keep them all as a list.
    movie_type = movie.xpath(f'{info}/div[1]/button/span/text()')
    movie_area = first_text(movie, f'{info}/div[2]/span[1]/text()')
    movie_length = first_text(movie, f'{info}/div[2]/span[3]/text()')
    movie_time = first_text(movie, f'{info}/div[3]/span/text()')
    movie_score = first_text(movie, './div/div/div[3]/p[1]/text()').strip()

    href = first_text(movie, './div/div/div[1]/a/@href', default=None)
    if href is None:
        detail_text = MISSING
    else:
        tree_detail = fetch_tree(session, BASE_URL + href)
        detail_text = first_text(
            tree_detail,
            '//*[@id="detail"]/div[1]/div/div/div[1]/div/div[2]/div[4]/p/text()',
        ).strip()

    return {
        'name': movie_name,
        'type': movie_type,
        'area': movie_area,
        'length': movie_length,
        'time': movie_time,
        'score': movie_score,
        'detail': detail_text,
    }


# Holds the info dicts of every movie scraped.
all_movies_data = []

# One Session reuses the underlying connection across all requests.
with requests.Session() as session:
    for page_num in range(1, 11):
        tree_html = fetch_tree(session, f'{BASE_URL}/page/{page_num}')
        # Every movie card div on the current list page.
        movies_list = tree_html.xpath('//*[@id="index"]/div[1]/div[1]/div')
        for movie in movies_list:
            all_movies_data.append(parse_movie(session, movie))
        print(f'page {page_num} done !')

# Save the collected movie info to a local JSON file.
with open('movies_info.json', mode='w', encoding='utf-8') as f:
    json.dump(all_movies_data, f, ensure_ascii=False, indent=4)
阅读全文