WeCimaScraper.py
"""Scrape the wecima.tube download-series listing into ./output/WeCima.json."""
from Common import akwam_get_website_safe, split_into_ranges, remove_arabic_chars, DEBUG
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import json

# Load previously scraped series so already-known entries can be skipped.
with open("./output/WeCima.json", "r", encoding="utf-8") as fp:
    old_series = json.load(fp)
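
# split_into_ranges() comes from Common and is not shown in this file. Judging
# by how main() and scrape_pages() use it, it appears to split pages 1..total
# into half-open (start, stop) tuples. A minimal sketch under that assumption,
# kept under a private, hypothetical name so it cannot shadow the real import:
def _split_into_ranges_sketch(number_of_ranges: int, total: int) -> list[tuple[int, int]]:
    step, remainder = divmod(total, number_of_ranges)
    ranges, start = [], 1
    for i in range(number_of_ranges):
        stop = start + step + (1 if i < remainder else 0)  # spread the remainder
        ranges.append((start, stop))
        start = stop
    return ranges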

def get_number_of_pages() -> int:
    """Read the highest page number from the listing's pagination bar."""
    page = akwam_get_website_safe("https://wecima.tube/download-series/")
    soup = BeautifulSoup(page.content, "html.parser")
    # The last <li> is presumably a next-page control, so the page count
    # sits in the second-to-last entry.
    return int(soup.find("ul", "page-numbers").find_all("li")[-2].text)
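
# For reference, the selector above assumes pagination markup roughly like the
# following (a sketch, not captured from the live site):
#   <ul class="page-numbers">
#     <li>1</li> <li>2</li> ... <li>594</li> <li>Next</li>
#   </ul>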

def scrape_pages(page_range: tuple[int, int]) -> dict:
    """Scrape one half-open range of listing pages, skipping known series."""
    content_dict = {}
    for page_number in range(page_range[0], page_range[1]):
        page = akwam_get_website_safe(
            f"https://wecima.tube/download-series/?page_number={page_number}/")
        soup = BeautifulSoup(page.content, "html.parser")
        for div in soup.find_all("div", class_="GridItem"):
            div_id = div["cpd"]
            if div_id in old_series:
                continue  # already scraped on a previous run
            anchor = div.find("a")
            source = anchor["href"].replace("https://wecima.tube/series/", "")
            season_number = remove_arabic_chars(anchor["title"])
            title = div.find("strong", class_="hasyear").text.split("-")[0].strip()
            # The poster URL is embedded in a lazy-loaded CSS custom property.
            image_source = (div.find("span", "BG--GridItem")["data-lazy-style"]
                            .replace("--image:url(", "").replace(");", ""))
            content_dict[div_id] = {
                "Title": title,
                "Image Source": image_source,
                "Season Number": season_number,
                "Source": source,
            }
        if DEBUG:
            print(f"Done page {page_number}")
    return content_dict
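
# Below is a hypothetical GridItem, inferred from the selectors in
# scrape_pages() (the real site's markup may differ), parsed the same way as
# above. Defining it as a function keeps it runnable without firing at import:
def _demo_parse_grid_item() -> dict:
    sample = """
    <div class="GridItem" cpd="12345">
      <a href="https://wecima.tube/series/example-series/" title="الموسم 1"></a>
      <strong class="hasyear">Example Series - 2023</strong>
      <span class="BG--GridItem" data-lazy-style="--image:url(https://example.invalid/poster.jpg);"></span>
    </div>"""
    div = BeautifulSoup(sample, "html.parser").find("div", class_="GridItem")
    anchor = div.find("a")
    return {
        "Title": div.find("strong", class_="hasyear").text.split("-")[0].strip(),
        "Image Source": (div.find("span", "BG--GridItem")["data-lazy-style"]
                         .replace("--image:url(", "").replace(");", "")),
        "Season Number": remove_arabic_chars(anchor["title"]),  # Common helper, not shown here
        "Source": anchor["href"].replace("https://wecima.tube/series/", ""),
    }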

def main() -> None:
    page_ranges = split_into_ranges(8, get_number_of_pages())
    if DEBUG:
        print(page_ranges)
    # Each worker returns its own dict and executor.map() hands the results
    # back to this thread, so merging into old_series needs no locking.
    with ThreadPoolExecutor() as executor:
        results = executor.map(scrape_pages, page_ranges)
    for result in results:
        old_series.update(result)
    with open("./output/WeCima.json", "w", encoding="utf-8") as fp:
        json.dump(old_series, fp, indent=4, ensure_ascii=False)

if __name__ == "__main__":
    main()