Python 爬虫入门 | Story Begins……

爬虫扫盲练习。

实战练习：豆瓣电影 TOP250

源码：

import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with every request so the script is not
# rejected as an obvious bot.
header = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Mobile Safari/537.36 Edg/110.0.1587.63"
}

# The list is paginated 25 films per page through the ?start= query parameter.
for start_num in range(0, 250, 25):
    response = requests.get(
        f"https://movie.douban.com/top250?start={start_num}",
        headers=header,
        timeout=10,  # without a timeout a stalled connection blocks forever
    )
    # Fail loudly on an error status instead of silently parsing an error page.
    response.raise_for_status()
    html = response.text
    soup = BeautifulSoup(html, "html.parser")  # "html.parser" selects the parser

    # find_all is the current name of the legacy findAll alias.
    all_titles = soup.find_all("span", attrs={"class": "title"})
    for title in all_titles:
        title_string = title.string
        # .string is None when the tag does not wrap exactly one string.
        if title_string is None:
            continue
        # Original-language titles are rendered as " / Name"; print only the
        # titles without a slash.
        if "/" not in title_string:
            print(title_string)

实现步骤

安装 Requests 和 BeautifulSoup 库。

pip install requests
pip install beautifulsoup4

导入 Requests 库，并爬取豆瓣电影 TOP250

import requests

# Plain request without any extra headers.
url = "https://movie.douban.com/top250"
response = requests.get(url)
print(response.status_code)  # printing `response` itself shows the status too

运行代码，返回 418，表示豆瓣不想理你。

加入请求头（headers）将代码伪装成浏览器。

浏览器打开豆瓣电影 Top 250 (douban.com)，点击右键→选择【检查】→选择【网络】，在【名称】中找到【top250】，查看【标头】→【请求标头】→【User-Agent】，复制冒号后面的内容。
在代码中加入 headers。

import requests

# User-Agent value copied from the browser's developer tools.
user_agent = "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Mobile Safari/537.36 Edg/110.0.1587.63"
header = {"User-Agent": user_agent}

response = requests.get("https://movie.douban.com/top250", headers=header)
status, body = response.status_code, response.text
print(status)  # 200 means the request was served normally
print(body)  # the HTML source of the page

可以把 response.text 的返回值命名为 html，使用 BeautifulSoup 去解析，返回值命名为 soup，然后调用 soup 的 findAll 方法，返回值命名为 all_titles。

import requests
from bs4 import BeautifulSoup  # required: BeautifulSoup(...) is called below

header = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Mobile Safari/537.36 Edg/110.0.1587.63"
}

response = requests.get("https://movie.douban.com/top250", headers=header)
html = response.text
soup = BeautifulSoup(html, "html.parser")
# Collect every <span class="title"> element on the page.
all_titles = soup.findAll("span", attrs={"class": "title"})
for title in all_titles:
    print(title)
---    
#打印结果 
<span class="title">疯狂动物城</span>
<span class="title"> / Zootopia</span>
---
#如果只想要文字元素，则改为
for title in all_titles:
    # .string yields only the text inside the tag, without the markup.
    print(title.string)
---
#打印结果
疯狂动物城
 / Zootopia
---
#这时候会有原名，分析发现原名前面都有 /，使用 if 语句判断，即可剔除掉
for title in all_titles:
    title_string = title.string
    # Original-language titles carry a leading "/"; skip them.
    if "/" in title_string:
        continue
    print(title_string)
---
#打印结果
疯狂动物城
---

我们注意到，这里只打印了前 25 个结果，如果想要打印 250 个结果，那么就要分析网站地址，发现第二页的网站地址是这样的：

https://movie.douban.com/top250?start=25&filter=

在后面写上 ?start=xx 即可。于是可以写一个 for 循环来完成：

# Offsets of the ten result pages: 0, 25, ..., 225.
PAGE_SIZE = 25
for start_num in range(0, 10 * PAGE_SIZE, PAGE_SIZE):
    print(start_num)
---
#打印结果
0 25 50 …… 225

将 requests.get() 里面的网址字符串格式化。字符串前面的 f 表示格式化字符串（f-string），它可以将大括号 {} 中的变量替换为实际的值。在这个网址中，{start_num} 会被替换为实际的数字，从而实现动态生成网址的功能。

再将前面打印前 25 电影名的代码，写进 for 循环体。

# One request per result page; start_num is the ?start= offset.
for start_num in range(0, 250, 25):
    page_url = f"https://movie.douban.com/top250?start={start_num}"
    response = requests.get(page_url, headers=header)
    soup = BeautifulSoup(response.text, "html.parser")
    for title in soup.findAll("span", attrs={"class": "title"}):
        title_string = title.string
        # Titles containing "/" are the original-language names; skip them.
        if "/" in title_string:
            continue
        print(title_string)
---
#打印结果
肖申克的救赎
霸王别姬
阿甘正传
泰坦尼克号
这个杀手不太冷
…… （一直到第 250 个）

完成！

拓展练习

以上是根据 B 站一位 Up 主的教程，手把手教着写的。现在只是在终端里打印下来了这 250 个名字，如果我想要做个表格，有序号、中文名、原名、年份、评分、时长、简介。于是开始折腾……

加入了电影原名（第 3 列）

import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook

# Browser-like User-Agent sent with every request.
header = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Mobile Safari/537.36 Edg/110.0.1587.63"
}

# Workbook with a single sheet; row 1 holds the column headers.
wb = Workbook()
ws = wb.active
ws.title = "豆瓣电影TOP250"
ws["A1"] = "序号"
ws["B1"] = "电影名称"
ws["C1"] = "电影原名"
ws["D1"] = "其他名称"

row_num = 2  # next free row
# Row of the most recently written film. Initialised up front so the
# original-title branch can never reference an unbound name.
last_row_num = 2

for start_num in range(0, 250, 25):

    response = requests.get(
        f"https://movie.douban.com/top250?start={start_num}",
        headers=header,
        timeout=10,  # do not hang forever on a stalled connection
    )
    response.raise_for_status()  # stop instead of parsing an error page
    html = response.text
    soup = BeautifulSoup(html, "html.parser")

    # find_all is the current name of the legacy findAll alias.
    all_titles = soup.find_all("span", attrs={"class": "title"})
    for title in all_titles:
        title_string = title.string
        if title_string is None:  # span does not wrap exactly one string
            continue
        if "/" not in title_string:
            # A span without "/" starts a new film row. The sequence number
            # is derived from the row, because the span list also contains
            # the original-title spans and its index would skip numbers.
            ws.cell(row=row_num, column=1, value=row_num - 1)
            ws.cell(row=row_num, column=2, value=title_string)
            last_row_num = row_num
            row_num += 1
        else:
            # " / Original Name" belongs to the film written just before.
            # maxsplit=1 keeps names that themselves contain "/".
            ws.cell(row=last_row_num, column=3, value=title_string.split('/', 1)[1].strip())

wb.save("douban_top250_5.xlsx")

又加入了别名（第 4 列）

import requests
import re
from bs4 import BeautifulSoup
from openpyxl import Workbook

# Browser-like User-Agent sent with every request.
header = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Mobile Safari/537.36 Edg/110.0.1587.63"
}

# Workbook with a single sheet; row 1 holds the column headers.
wb = Workbook()
ws = wb.active
ws.title = "豆瓣电影TOP250"
ws["A1"] = "序号"
ws["B1"] = "电影名称"
ws["C1"] = "电影原名"
ws["D1"] = "其他名称"

row_num = 2  # next free row
last_row_num = 2  # row of the most recently written film
for start_num in range(0, 250, 25):

    response = requests.get(
        f"https://movie.douban.com/top250?start={start_num}",
        headers=header,
        timeout=10,  # do not hang forever on a stalled connection
    )
    response.raise_for_status()  # stop instead of parsing an error page
    html = response.text
    soup = BeautifulSoup(html, "html.parser")

    # Title and alias spans in document order.
    all_elements = soup.find_all("span", attrs={"class": ["title", "other", ""]})

    for element in all_elements:
        element_string = element.string
        if element_string is None:  # span does not wrap exactly one string
            continue
        if element.has_attr("class") and "other" in element["class"]:
            # Alias text looks like " / A  /  B  /  C". Tighten every
            # separator to a bare "/" and drop the leading one, so any
            # number of aliases is normalised, not just the first two.
            aliases = re.sub(r"\s*/\s*", "/", element_string).strip().lstrip("/")
            ws.cell(row=last_row_num, column=4, value=aliases)
        elif "/" not in element_string:
            # A span without "/" starts a new film row. The sequence number
            # is written only here and derived from the row, so it counts
            # films rather than spans.
            ws.cell(row=row_num, column=1, value=row_num - 1)
            ws.cell(row=row_num, column=2, value=element_string)
            last_row_num = row_num
            row_num += 1
        else:
            # " / Original Name" belongs to the film written just before.
            ws.cell(row=last_row_num, column=3, value=element_string.split('/', 1)[1].strip())

wb.save("douban_top250_18.xlsx")

又加入了年份、国家（第 5、6 列）

import requests
import re
from bs4 import BeautifulSoup
from openpyxl import Workbook

# Browser-like User-Agent sent with every request.
header = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Mobile Safari/537.36 Edg/110.0.1587.63"
}

# Workbook with a single sheet; row 1 holds the column headers.
wb = Workbook()
ws = wb.active
ws.title = "豆瓣电影TOP250"
ws["A1"] = "序号"
ws["B1"] = "电影名称"
ws["C1"] = "电影原名"
ws["D1"] = "其他名称"
ws["E1"] = "年份"
ws["F1"] = "国家"
ws["G1"] = "评分"
ws["H1"] = "简介"

row_num = 2  # next free row
last_row_num = 2  # row of the most recently written film
for start_num in range(0, 250, 25):

    response = requests.get(
        f"https://movie.douban.com/top250?start={start_num}",
        headers=header,
        timeout=10,  # do not hang forever on a stalled connection
    )
    response.raise_for_status()  # stop instead of parsing an error page
    html = response.text
    soup = BeautifulSoup(html, "html.parser")

    # Paragraphs plus title/alias spans, in document order.
    all_elements = soup.select("p, span.title, span.other")

    for element in all_elements:
        if element.name == "p":
            text = element.text.strip()
            # "<year> / <country>": capture both parts directly. The country
            # group stops at whitespace or "/", so the two-value unpack can
            # no longer fail when the text has no spaces around the slashes.
            match = re.search(r"(\d{4})\s*/\s*([^\s/]+)", text)
            if match:
                year, country = match.groups()
                ws.cell(row=last_row_num, column=5, value=year)
                ws.cell(row=last_row_num, column=6, value=country)
        elif element.name == "span" and "title" in element["class"]:
            element_string = element.string
            if element_string is None:  # span does not wrap exactly one string
                continue
            if "/" not in element_string:
                # A title without "/" starts a new film row.
                ws.cell(row=row_num, column=2, value=element_string)
                last_row_num = row_num
                row_num += 1
            else:
                # " / Original Name" belongs to the film written just before.
                ws.cell(row=last_row_num, column=3, value=element_string.split('/', 1)[1].strip())
        elif element.name == "span" and "other" in element["class"]:
            # Tighten every " / " separator to "/" and drop the leading one.
            # Only whitespace around slashes is removed, so spaces inside an
            # alias are kept.
            aliases = re.sub(r"\s*/\s*", "/", element.text).strip().lstrip("/")
            ws.cell(row=last_row_num, column=4, value=aliases)

wb.save("douban_top250_20.xlsx")

又加入了评分、简介（第 7、8 列）

import requests
import re #用于正则匹配
from bs4 import BeautifulSoup
from openpyxl import Workbook #用于生成 Excel 表格

# Request header that makes the script look like a browser.
header = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Mobile Safari/537.36 Edg/110.0.1587.63"
}

# Header row
wb = Workbook()
ws = wb.active
ws.title = "豆瓣电影TOP250"  # worksheet name
ws["A1"] = "序号"
ws["B1"] = "电影名称"
ws["C1"] = "电影原名"
ws["D1"] = "其他名称"
ws["E1"] = "年份"
ws["F1"] = "国家"
ws["G1"] = "评分"
ws["H1"] = "简介"

row_num = 2  # content starts in the second row
last_row_num = 2  # row of the most recent film; the other columns go here

# Outer loop: one request per page; the offset goes after ?start=
for start_num in range(0, 250, 25):

    response = requests.get(
        f"https://movie.douban.com/top250?start={start_num}",
        headers=header,
        timeout=10,  # do not hang forever on a stalled connection
    )
    response.raise_for_status()  # stop instead of parsing an error page
    html = response.text  # page source as text
    soup = BeautifulSoup(html, "html.parser")

    # "span.xxx" selects <span> elements whose class is xxx.
    all_elements = soup.select("p, span.title, span.other, span.rating_num, span.inq")

    # Inner loop: walk the selected elements of this page in document order.
    for element in all_elements:
        if element.name == "p":
            text = element.text.strip()  # drop leading/trailing whitespace
            # "<year> / <country>": capture both parts directly. The country
            # group stops at whitespace or "/", so the two-value unpack can
            # no longer fail when the text has no spaces around the slashes.
            match = re.search(r"(\d{4})\s*/\s*([^\s/]+)", text)
            if match:
                year, country = match.groups()
                ws.cell(row=last_row_num, column=5, value=year)
                ws.cell(row=last_row_num, column=6, value=country)
        elif element.name == "span" and "title" in element["class"]:
            title_string = element.string
            if title_string is None:  # span does not wrap exactly one string
                continue
            if "/" not in title_string:
                # A title without "/" starts a new film row.
                ws.cell(row=row_num, column=2, value=title_string)
                last_row_num = row_num
                row_num += 1
            else:
                # " / Original Name" belongs to the film written just before.
                ws.cell(row=last_row_num, column=3, value=title_string.split('/', 1)[1].strip())
        elif element.name == "span" and "other" in element["class"]:
            # Tighten every " / " separator to "/" and drop the leading one.
            # Only whitespace around slashes is removed, so spaces inside an
            # alias are kept.
            other_string = re.sub(r"\s*/\s*", "/", element.text).strip().lstrip("/")
            ws.cell(row=last_row_num, column=4, value=other_string)
        elif element.name == "span" and "rating_num" in element["class"]:
            rating_value = float(element.string.strip())
            ws.cell(row=last_row_num, column=7, value=rating_value)
        elif element.name == "span" and "inq" in element["class"]:
            inq_string = element.string.strip()
            ws.cell(row=last_row_num, column=8, value=inq_string)

wb.save("douban_top250_26.xlsx")

但是！它只显示一个国家，而有的作品制片国家不止一个，有两个三个四个的情况，经过一天的努力，通过任务分解法（先去掉一部分，再去掉一部分，再完成替换），于是有了下面的最终完成版！！！

拓展的最终版（注释完全版）

import requests
import re #用于正则匹配
from bs4 import BeautifulSoup
from openpyxl import Workbook #用于生成 Excel 表格

# Request header that makes the script look like a browser.
header = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Mobile Safari/537.36 Edg/110.0.1587.63"
}

# Header row
wb = Workbook()
ws = wb.active
ws.title = "豆瓣电影TOP250"  # worksheet name
ws["A1"] = "序号"  # not filled by the script; fill down in the spreadsheet (250 rows)
ws["B1"] = "电影名称"  # span.title without "/"
ws["C1"] = "电影原名"  # span.title containing "/"
ws["D1"] = "其他名称"  # span.other
ws["E1"] = "年份"  # first \d{4} match in a <p>
ws["F1"] = "国家"  # text between the year and the next "/"
ws["G1"] = "评分"  # span.rating_num
ws["H1"] = "简介"  # span.inq

row_num = 2  # content starts in the second row
last_row_num = 2  # row of the most recent film; the other columns go here
# Most recently seen title span. Initialised so the <p> branch cannot hit an
# unbound name if a paragraph precedes the first title.
title_string = ""

# Outer loop: one request per page; ?start= is 0 for page one, 25 for page two, ...
for start_num in range(0, 250, 25):

    response = requests.get(
        f"https://movie.douban.com/top250?start={start_num}",
        headers=header,
        timeout=10,  # do not hang forever on a stalled connection
    )
    response.raise_for_status()  # stop instead of parsing an error page
    html = response.text  # page source as text
    soup = BeautifulSoup(html, "html.parser")

    # "span.xxx" selects <span> elements whose class is xxx.
    all_elements = soup.select("p, span.title, span.other, span.rating_num, span.inq")

    # Inner loop: walk the selected elements of this page in document order.
    for element in all_elements:
        if element.name == "p":
            text = element.text.strip()  # drop leading/trailing whitespace

            # Year
            match = re.search(r'\d{4}', text)
            if match:
                # Titles are written first, so last_row_num is the row of
                # the film this paragraph belongs to.
                ws.cell(row=last_row_num, column=5, value=match.group(0))

            # Country. The "\n" test is the author's way of telling the
            # multi-line info paragraph from the one-line quote paragraph;
            # an earlier keyword test ("导演") also matched a quote.
            if '\n' in text:
                # These two entries list their years in a different format,
                # so the country is deliberately not written for them.
                if title_string not in ("大闹天宫", "茶馆"):
                    # Capture everything after "<year> /" on the same line.
                    match = re.search(r'\d{4}\s*/\s*(.*)', text)
                    # Guard added: a paragraph without this pattern used to
                    # crash on match.group(1) with AttributeError.
                    if match:
                        # Keep the text up to the next "/", then join
                        # multiple countries with "/" instead of spaces.
                        country = match.group(1).split('/')[0].strip().replace(' ', '/')
                        ws.cell(row=last_row_num, column=6, value=country)

        elif element.name == "span" and "title" in element["class"]:
            title_string = element.string
            if title_string is None:  # span does not wrap exactly one string
                title_string = ""
                continue
            if "/" not in title_string:
                ws.cell(row=row_num, column=2, value=title_string)
                last_row_num = row_num
                # Only place the row advances: every other element of the
                # same film is written to last_row_num.
                row_num += 1
            else:
                # " / Original Name" belongs to the film written just before.
                ws.cell(row=last_row_num, column=3, value=title_string.split('/', 1)[1].strip())
        elif element.name == "span" and "other" in element["class"]:  # aliases
            # Raw text looks like " / A  /  B". Tighten every separator to
            # "/" and drop the leading one. Only whitespace around slashes
            # is removed, so spaces inside an alias are kept.
            other_string = re.sub(r"\s*/\s*", "/", element.text).strip().lstrip("/")
            ws.cell(row=last_row_num, column=4, value=other_string)
        elif element.name == "span" and "rating_num" in element["class"]:
            rating_value = float(element.string.strip())  # numeric cell rather than text
            ws.cell(row=last_row_num, column=7, value=rating_value)
        elif element.name == "span" and "inq" in element["class"]:
            inq_string = element.string.strip()  # drop surrounding whitespace
            ws.cell(row=last_row_num, column=8, value=inq_string)

wb.save("douban_top250_1.xlsx")

算下来花了有两天的功夫，其中昨天（2023/03/09）上午跟着 Up 主学，下午开始写拓展，到了晚上的时候完成。结果快下班的时候发现了国家和地区那里出了问题，一开始想一行代码就完成正则匹配，奈何一直无解，困扰了今天（2023/03/10）一整个上午。下午去车管所回来四点，更新了思路，一步一步实现。又遇到了大闹天宫、茶馆年份和别的条目不太一样的问题，让师弟看了一下，加了 if 判断，搞定！然后又遇到了哈利波特与阿兹卡班的囚徒那里报错，原来是它的 quote 里面有导演这个词，引用就进入了国家的条目，匹配不到，就报错，换了 \n 解决了问题！断点调试流程如下图。（刚刚想起来：年份那里有个 if 判断，而我在国家那里没有加，所以才报错！换成下面的代码就不报错了，即使用导演也没有关系。哎呀，被自己蠢到。）

# Country
if '导演' in text and title_string not in ("大闹天宫", "茶馆"):
    # Capture everything after "<year> /" on the same line.
    match = re.search(r'\d{4}\s*/\s*(.*)', text)
    if match:
        # Keep the text up to the next "/", then join multiple countries
        # with "/" instead of spaces.
        after_year = match.group(1)
        country = after_year.split('/')[0].strip().replace(' ', '/')
        ws.cell(row=last_row_num, column=6, value=country)

完结撒花✿✿ヽ(°▽°)ノ✿✿

参考

https://b23.tv/svWjQBz