-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.py
61 lines (48 loc) · 1.87 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
import hashlib
import asyncio
import pandas as pd
import tempfile
from tqdm.auto import tqdm
from typing import Dict, TYPE_CHECKING
from scraper.rthk_zh import RTHKChineseTelegramScraper
from scraper.inmediahknet import InMediaHKNetTelegramScraper
from scraper.hk01 import HK01Scraper
from scraper.stheadline import HeadlineScraper
from huggingface_hub import HfApi
if TYPE_CHECKING:
from scraper.scraper import Scraper
# Target Hugging Face dataset repository and access token, both read from the
# environment; each is None when its variable is unset.
REPO_NAME = os.environ.get("HF_REPO_NAME")
HF_TOKEN = os.environ.get("HF_TOKEN")

# Shared Hub client used for uploads, constructed with the token read above.
api = HfApi(token=HF_TOKEN)
def main(num_proc=3):
    """Scrape every configured news source and upload each article as a CSV.

    Each scraper's ``get_articles()`` coroutine is run to completion with
    ``asyncio.run``. Every returned article is written as a single-row CSV
    into a temporary directory and uploaded to the dataset repository named
    by ``REPO_NAME`` at ``articles/<source>/<md5 of article id>.csv``.

    Args:
        num_proc: Value forwarded to the ``num_proc`` argument of every
            scraper constructor.
    """
    scrapers: Dict[str, "Scraper"] = {
        "InMediaHKNet": InMediaHKNetTelegramScraper(num_proc=num_proc),
        "RTHKChinese": RTHKChineseTelegramScraper(num_proc=num_proc),
        "HK01": HK01Scraper(num_proc=num_proc),
        "Headline": HeadlineScraper(num_proc=num_proc),
    }

    # The context manager removes the directory even when scraping or an
    # upload raises, so a failed run does not leave temp files behind.
    with tempfile.TemporaryDirectory() as temp_dir:
        for key, scraper in tqdm(scrapers.items(), desc="Scraping"):
            articles = asyncio.run(scraper.get_articles())

            for article in tqdm(articles, desc=f"Uploading {key}"):
                # One article becomes a one-row DataFrame.
                df = pd.DataFrame([article.to_dict()])

                # The md5 hex digest of the article id is used as the file
                # name, both locally and in the repository path.
                article_id = hashlib.md5(article.id.encode()).hexdigest()

                # Write the row to a temporary CSV file.
                temp_file_path = os.path.join(temp_dir, f"{article_id}.csv")
                df.to_csv(temp_file_path, index=False)

                # Upload the CSV file to the Hugging Face dataset repository.
                api.upload_file(
                    path_or_fileobj=temp_file_path,
                    path_in_repo=f"articles/{key}/{article_id}.csv",
                    repo_id=REPO_NAME,
                    repo_type="dataset",
                )
# Run the scrape-and-upload pipeline only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()