ssr2 介绍:电影数据网站,无反爬,无 HTTPS 证书,适合用来练习请求时对 HTTPS 证书验证的处理。
0x00 网站分析
与案例 ssr1 基本一致,区别在于本案例考察网站没有 HTTPS 证书的情况下应如何编写请求代码。
0x01 考查要点
使用 verify=False 跳过证书验证后,每次请求都会输出如下警告,可以用下面的代码将其禁用:
InsecureRequestWarning: Unverified HTTPS request is being made to host 'ssr2.scrape.center'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings
warnings.warn(
from urllib3.exceptions import InsecureRequestWarning
from urllib3 import disable_warnings
# Disable the SSL warning: suppress urllib3's InsecureRequestWarning so that
# unverified HTTPS requests no longer print the message shown above.
disable_warnings(InsecureRequestWarning)
0x02 爬虫构建
import requests
from bs4 import BeautifulSoup
from urllib3.exceptions import InsecureRequestWarning
from urllib3 import disable_warnings
# Disable the SSL warning: suppress urllib3's InsecureRequestWarning, which
# would otherwise be printed for every request sent with verify=False.
disable_warnings(InsecureRequestWarning)
# Base URL template for the movie listing pages; '{}' is replaced with the
# page number via str.format().
base_url = 'https://ssr2.scrape.center/page/{}'
def get_movie_data(page):
    """Fetch one listing page and print the details of every movie on it.

    The page URL is built from the module-level ``base_url`` template. For
    each movie card found, the name, categories, country/region, duration
    and release date are printed, followed by a separator line.

    Args:
        page: Page number substituted into ``base_url``.

    Returns:
        None. Results are written to standard output only.

    Raises:
        requests.RequestException: If the request fails at the network
            level (connection error, timeout, ...).
    """
    url = base_url.format(page)
    # verify=False skips SSL certificate verification (the site has no
    # certificate); the timeout keeps a stalled connection from hanging
    # the crawl forever.
    response = requests.get(url, verify=False, timeout=10)
    if not response.ok:
        # Do not try to parse an error page as if it were a movie listing.
        print(f'请求失败: {url} (状态码 {response.status_code})')
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the data of each movie card.
    movies = soup.find_all('div', class_='el-card item m-t is-hover-shadow')
    for movie in movies:
        name_tag = movie.find('a', class_='name')
        name = name_tag.get_text(strip=True) if name_tag else ''

        categories = [
            button.get_text(strip=True)
            for button in movie.find_all('button', class_='category')
        ]

        # The first info row is treated as "<country> / <duration>" and the
        # second as the release date; either row may be missing on a card.
        info = movie.find_all('div', class_='m-v-sm info')
        country_duration = info[0].get_text(strip=True).split('/') if info else ['']
        country = country_duration[0].strip()
        duration = country_duration[1].strip() if len(country_duration) > 1 else ''
        release_date = info[1].get_text(strip=True) if len(info) > 1 else ''

        print('名称:', name)
        print('类别:', ', '.join(categories))
        print('国家/地区:', country)
        print('时长:', duration)
        print('上映日期:', release_date)
        print('-' * 50)
# Iterate over pages 1 through 11 (the upper bound of range() is exclusive)
# and print the movies found on each page.
for page in range(1, 12):
    print(f'第 {page} 页的电影数据:')
    get_movie_data(page)