-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrule.py
38 lines (32 loc) · 1.36 KB
/
rule.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from bs4 import BeautifulSoup
import re
# Separator between alternative names: a "/" that is NOT followed by a ")"
# without an intervening "(", i.e. not inside a parenthesised group. This keeps
# a suffix such as "(港 / 台)" attached to its name instead of splitting it.
pattern = re.compile(r"/(?![^(]*\))")

# Runs of whitespace and "/" at the very start or very end of a title string.
_TITLE_EDGES = re.compile(r"^[\s/]+|[\s/]+$")


def _clean_title(text):
    """Return *text* with non-breaking spaces normalised and edge separators removed."""
    # Only the edges are trimmed so that a "/" inside the title itself
    # (e.g. "Face/Off") is preserved.
    return _TITLE_EDGES.sub("", text.replace("\xa0", " "))


def parseTitle(element):
    """Extract the primary, secondary and alternative titles from *element*.

    Every ``<a>`` found via ``element.find_all("a", class_="")`` is scanned:

    * ``<span class="title">`` children supply the titles. The first span is
      taken as the Chinese title and any later span as the English title
      (the last one wins if there are several).
    * A ``<span class="other">`` child, when present, supplies alternative
      names separated by "/"; slashes inside parentheses do not split.

    Non-breaking spaces are converted to regular spaces and surrounding
    whitespace is removed from every value.

    Args:
        element: An object exposing ``find_all`` in the BeautifulSoup style
            (presumably a ``bs4`` Tag; confirm against the caller).

    Returns:
        A list ``[cn_title, en_title, othername]`` where the two titles are
        strings ("" when missing) and ``othername`` is a list of non-empty
        alternative names in document order.
    """
    othername = []
    cn_title = ""
    en_title = ""

    # 'a' tags matched with an empty class filter, as in the original lookup.
    for a_tag in element.find_all("a", class_=""):
        for index, elm in enumerate(a_tag.find_all("span", class_="title")):
            title = _clean_title(elm.text)
            if index == 0:
                cn_title = title
            else:
                en_title = title

        other_titles = a_tag.find("span", class_="other")
        if other_titles:
            for name in pattern.split(other_titles.text):
                name = name.replace("\xa0", " ").strip()
                if name:
                    othername.append(name)

    return [cn_title, en_title, othername]