Scraping the Bilibili ranking page with Python
Inspecting the page structure
import requests
url = 'https://www.bilibili.com/ranking'
# Send the HTTP request
response = requests.get(url)
# Print the returned text
print(response.text)
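If the request comes back empty or blocked, sending a browser-like User-Agent header usually helps. A minimal sketch, assuming the default requests User-Agent is being rejected (the header string below is just a placeholder):

import requests

url = 'https://www.bilibili.com/ranking'
# Pretend to be a regular browser; the exact string is a placeholder.
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url, headers=headers)
response.raise_for_status()  # raise an exception for 4xx/5xx responses
print(response.status_code, len(response.text))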
Printing the page title
import requests
from bs4 import BeautifulSoup
url = 'https://www.bilibili.com/ranking'
# Send the HTTP request
response = requests.get(url)
html_text = response.text
soup = BeautifulSoup(html_text, 'html.parser')  # html.parser is the built-in HTML parser
# Print the page title
print(soup.title.text)
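If the printed title comes out garbled, requests may have guessed the wrong encoding for the page. A minimal sketch that forces UTF-8 before parsing (an extra precaution, not something the code above requires):

import requests
from bs4 import BeautifulSoup

url = 'https://www.bilibili.com/ranking'
response = requests.get(url)
response.encoding = 'utf-8'  # decode the body explicitly as UTF-8
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.title.text if soup.title else 'no <title> found')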
Extracting the ranking list
import requests
from bs4 import BeautifulSoup
url = 'https://www.bilibili.com/ranking'
# Send the HTTP request
response = requests.get(url)
html_text = response.text
soup = BeautifulSoup(html_text, 'html.parser')
# Extract the list of ranking entries
items = soup.findAll('li', {'class': 'rank-item'})  # rank-item is the class that marks each entry
print(len(items))
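The same lookup can also be written as a CSS selector, which some readers may find easier than passing an attribute dict. A minimal sketch of the equivalent call:

import requests
from bs4 import BeautifulSoup

url = 'https://www.bilibili.com/ranking'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')
items = soup.select('li.rank-item')  # tag name and class in one CSS expression
print(len(items))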
Extracting the titles
# Import modules
import requests
from bs4 import BeautifulSoup
url = 'https://www.bilibili.com/ranking'
# Send the HTTP request
response = requests.get(url)
html_text = response.text
soup = BeautifulSoup(html_text, 'html.parser')
# Extract the list of ranking entries
items = soup.findAll('li', {'class': 'rank-item'})  # rank-item is the class that marks each entry
for itm in items:
    title = itm.find('a', {'class': 'title'}).text
    print(title)
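If the page layout changes, find() can return None and .text will then raise AttributeError. A minimal sketch that skips entries without a title link and trims surrounding whitespace:

import requests
from bs4 import BeautifulSoup

url = 'https://www.bilibili.com/ranking'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')
for itm in soup.findAll('li', {'class': 'rank-item'}):
    link = itm.find('a', {'class': 'title'})
    if link is None:  # skip entries whose markup is different
        continue
    print(link.get_text(strip=True))  # strip surrounding whitespace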
Extracting the other fields
# Import modules
import requests
from bs4 import BeautifulSoup
url = 'https://www.bilibili.com/ranking'
# Send the HTTP request
response = requests.get(url)
html_text = response.text
soup = BeautifulSoup(html_text, 'html.parser')
# Extract the list of ranking entries
items = soup.findAll('li', {'class': 'rank-item'})  # rank-item is the class on each entry, the container for one video
for itm in items:
    title = itm.find('a', {'class': 'title'}).text
    rank = itm.find('div', {'class': 'num'}).text
    score = itm.find('div', {'class': 'pts'}).text
    url = itm.find('a', {'class': 'title'}).get('href')  # attributes are read with get()
    print(f'{title}{rank}{score}{url}')
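Note that reusing the name url inside the loop overwrites the page URL defined earlier; it does no harm here, but a distinct name avoids confusion. A minimal sketch that collects the same four fields into a dictionary under a separate video_url key:

import requests
from bs4 import BeautifulSoup

page_url = 'https://www.bilibili.com/ranking'
soup = BeautifulSoup(requests.get(page_url).text, 'html.parser')
for itm in soup.findAll('li', {'class': 'rank-item'}):
    record = {
        'title': itm.find('a', {'class': 'title'}).text,
        'rank': itm.find('div', {'class': 'num'}).text,
        'score': itm.find('div', {'class': 'pts'}).text,
        'video_url': itm.find('a', {'class': 'title'}).get('href'),
    }
    print(record)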
Creating a list to hold the extracted data
# Import modules
import requests
from bs4 import BeautifulSoup
url = 'https://www.bilibili.com/ranking'
# Send the HTTP request
response = requests.get(url)
html_text = response.text
soup = BeautifulSoup(html_text, 'html.parser')
# Define a class to hold the information for each video
class Videos:
    def __init__(self, title, rank, score, url):
        self.title = title
        self.rank = rank
        self.score = score
        self.url = url
# Extract the list of ranking entries
items = soup.findAll('li', {'class': 'rank-item'})  # rank-item is the class that marks each entry
videos = []
for itm in items:
    title = itm.find('a', {'class': 'title'}).text
    rank = itm.find('div', {'class': 'num'}).text
    score = itm.find('div', {'class': 'pts'}).text
    url = itm.find('a', {'class': 'title'}).get('href')  # attributes are read with get()
    v = Videos(title, rank, score, url)
    videos.append(v)
print(len(videos))
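A plain class with __init__ works fine; Python's dataclasses module can generate that boilerplate automatically. A minimal sketch of the same container as a dataclass (an alternative, not what the rest of this post uses; the values are placeholders):

from dataclasses import dataclass

@dataclass
class Video:
    title: str
    rank: str
    score: str
    url: str

# Placeholder values, only to show how instances print.
v = Video('example title', '1', '1000', 'https://example.com/video')
print(v)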
Saving the data
# Import modules
import requests
from bs4 import BeautifulSoup
import csv
url = 'https://www.bilibili.com/ranking'
# Send the HTTP request
response = requests.get(url)
html_text = response.text
soup = BeautifulSoup(html_text, 'html.parser')
# Define a class to hold the information for each video
class Videos:
    def __init__(self, title, rank, score, url):
        self.title = title
        self.rank = rank
        self.score = score
        self.url = url
    def to_csv(self):
        return [self.title, self.rank, self.score, self.url]
    def csv_title():
        return ['标题', '排名', '分数', 'URL']
# Extract the list of ranking entries
items = soup.findAll('li', {'class': 'rank-item'})  # rank-item is the class that marks each entry
videos = []
for itm in items:
    title = itm.find('a', {'class': 'title'}).text
    rank = itm.find('div', {'class': 'num'}).text
    score = itm.find('div', {'class': 'pts'}).text
    url = itm.find('a', {'class': 'title'}).get('href')  # attributes are read with get()
    v = Videos(title, rank, score, url)
    videos.append(v)
# Write the results to a CSV file
file_name = 'Top100.csv'
with open(file_name, 'w', newline='') as f:
    pen = csv.writer(f)
    pen.writerow(Videos.csv_title())  # header row; called on the class itself
    for v in videos:
        pen.writerow(v.to_csv())
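To check what was actually saved, the file can be read back with the csv module. A minimal sketch (open() here uses the same platform default encoding as the write above):

import csv

with open('Top100.csv', newline='') as f:
    for row in csv.reader(f):
        print(row)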
ChangeLog
20200902 Python hands-on practice