-
Notifications
You must be signed in to change notification settings - Fork 0
/
bs_playground.py
84 lines (74 loc) · 3.14 KB
/
bs_playground.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from bs4 import BeautifulSoup
import re
# Sample markup for a single movie entry (one <div class="item"> taken from a
# movie.douban.com listing), used as the input for the parsing experiment below.
html_content = """
<div class="item">
<div class="pic">
<em class="">4</em>
<a href="https://movie.douban.com/subject/1292722/">
<img width="100" alt="泰坦尼克号" src="https://img9.doubanio.com/view/photo/s_ratio_poster/public/p457760035.jpg" class="">
</a>
</div>
<div class="info">
<div class="hd">
<a href="https://movie.douban.com/subject/1292722/" class="">
<span class="title">泰坦尼克号</span>
<span class="title"> / Titanic</span>
<span class="other"> / 铁达尼号(港 / 台)</span>
</a>
<span class="playable">[可播放]</span>
</div>
<div class="bd">
<p class="">
导演: 詹姆斯·卡梅隆 James Cameron 主演: 莱昂纳多·迪卡普里奥 Leonardo...<br>
1997 / 美国 墨西哥 / 剧情 爱情 灾难
</p>
<div class="star">
<span class="rating5-t"></span>
<span class="rating_num" property="v:average">9.5</span>
<span property="v:best" content="10.0"></span>
<span>2275180人评价</span>
</div>
<p class="quote">
<span class="inq">失去的才是永恒的。 </span>
</p>
<p>
<span class="gact">
<a href="https://movie.douban.com/wish/45046660/update?add=1292722" target="_blank" class="j a_collect_btn" name="sbtn-1292722-wish" rel="nofollow">想看</a>
</span>
</p>
</div>
</div>
</div>
"""
# Parse the sample markup with BeautifulSoup's "html.parser" backend.
soup = BeautifulSoup(html_content, "html.parser")
# Matches a "/" unless it is followed by a ")" with no "(" in between, i.e. a
# slash that sits outside a parenthesised group. parseTitle uses it to split
# alternative names so that "(港 / 台)" is kept as one unit.
pattern = re.compile(r"/(?![^(]*\))")
def parseTitle(element):
    """Extract the Chinese title, English title and alternative names.

    Every ``<a>`` tag returned by ``element.find_all("a", class_="")`` is
    inspected. Inside each one, the ``<span class="title">`` entries supply
    the titles (first span -> Chinese title, any later span -> English
    title) and the first ``<span class="other">`` entry supplies the
    "/"-separated alternative names.

    Args:
        element: A BeautifulSoup object or tag wrapping one movie item.

    Returns:
        A list ``[cn_title, en_title, othername]``. The two titles are
        strings (empty when no matching span was found) and ``othername``
        is a list of non-empty, whitespace-trimmed strings.
    """
    othername = []
    cn_title = ""
    en_title = ""

    for a_tag in element.find_all("a", class_=""):
        for index, span in enumerate(a_tag.find_all("span", class_="title")):
            # Normalise non-breaking spaces, then drop only the " / "
            # separator that surrounds the title. A slash that is part of
            # the title itself (e.g. "Face/Off") must be preserved.
            title = span.text.replace("\xa0", " ").strip().strip("/").strip()
            if index == 0:
                cn_title = title
            else:
                en_title = title

        other_titles = a_tag.find("span", class_="other")
        if other_titles:
            # The module-level `pattern` splits on "/" except inside
            # parentheses, so a suffix such as "(港 / 台)" stays attached
            # to its name instead of being cut in two.
            for name in pattern.split(other_titles.text):
                name = name.replace("\xa0", " ").strip()
                if name:
                    othername.append(name)

    return [cn_title, en_title, othername]
# Demo run: parse the module-level `soup` and print the resulting
# [cn_title, en_title, othername] list.
print(parseTitle(soup))