Skip to content

Commit

Permalink
refactor: 重構部分名稱、用法
Browse files Browse the repository at this point in the history
- revert: Scraper 不再使用 class 包裝
  • Loading branch information
iwtba4188 authored and l7wei committed Nov 12, 2023
1 parent 607b648 commit 3eb29cc
Show file tree
Hide file tree
Showing 10 changed files with 19 additions and 48 deletions.
2 changes: 1 addition & 1 deletion src/api/routers/newsletters.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from pydantic import HttpUrl

from src.api import schemas
from src.utils.scraper import newsletter_scraper
from src.utils.scrapers import newsletter_scraper

router = APIRouter()

Expand Down
5 changes: 2 additions & 3 deletions src/api/routers/resources/careers.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from fastapi import APIRouter

from src.utils.scraper import rpage_scraper
from src.api import schemas

from src.utils.scrapers import rpage_scraper

router = APIRouter()

Expand All @@ -12,6 +11,6 @@ async def get_bulletin_recruitment():
"""
獲取清華公佈欄的徵才公告。
"""
return rpage_scraper.announcement(
return rpage_scraper.get_announcement(
"https://bulletin.site.nthu.edu.tw/p/403-1086-5075-1.php?Lang=zh-tw"
)
12 changes: 6 additions & 6 deletions src/api/routers/resources/events.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from fastapi import APIRouter

from src.api import schemas
from src.utils.scraper import (
from src.utils.scrapers import (
cac_scraper,
goodjob_scraper,
library_scraper,
Expand Down Expand Up @@ -40,7 +40,7 @@ async def get_global_affairs_events():
"""
取得國際事務處的各類活動資料。
"""
return rpage_scraper.announcement(
return rpage_scraper.get_announcement(
"https://oga.site.nthu.edu.tw/p/403-1524-9308-1.php?Lang=zh-tw"
)

Expand All @@ -50,7 +50,7 @@ async def get_health_center_events():
"""
取得衛生保健組的活動資料。
"""
return rpage_scraper.announcement(
return rpage_scraper.get_announcement(
"https://health.site.nthu.edu.tw/p/403-1001-7467-1.php?Lang=zh-tw"
)

Expand All @@ -60,7 +60,7 @@ async def get_bulletin_art_and_cultural_events():
"""
取得清華公佈欄的藝文活動。
"""
return rpage_scraper.announcement(
return rpage_scraper.get_announcement(
"https://bulletin.site.nthu.edu.tw/p/403-1086-5083-1.php?Lang=zh-tw"
)

Expand All @@ -70,7 +70,7 @@ async def get_bulletin_academic_events():
"""
取得清華公佈欄的學術活動。
"""
return rpage_scraper.announcement(
return rpage_scraper.get_announcement(
"https://bulletin.site.nthu.edu.tw/p/403-1086-5084-1.php?Lang=zh-tw"
)

Expand All @@ -80,6 +80,6 @@ async def get_bulletin_student_events():
"""
取得清華公佈欄的學生活動。
"""
return rpage_scraper.announcement(
return rpage_scraper.get_announcement(
"https://bulletin.site.nthu.edu.tw/p/403-1086-5085-1.php?Lang=zh-tw"
)
2 changes: 1 addition & 1 deletion src/api/routers/rpage.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@ def get_rpage_data(
"""
爬取指定 Rpage 公告的內容。
"""
return rpage_scraper.announcement(str(full_path))
return rpage_scraper.get_announcement(str(full_path))
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,13 @@
import re
from datetime import datetime, timedelta

import requests
import xmltodict
from bs4 import BeautifulSoup
from cachetools import TTLCache, cached
from fastapi import HTTPException

from src.utils import cached_request

@cached(cache=TTLCache(maxsize=64, ttl=60 * 60))
def _get_response(url: str, **kwargs) -> str:
    """Fetch ``url`` with browser-like request headers and return the body text.

    Results are memoized for one hour per (url, kwargs) combination via the
    TTL cache on the decorator.

    Args:
        url: Target URL; it is also sent as the ``referer`` header.
        **kwargs: Extra keyword arguments forwarded to ``requests.get``.

    Returns:
        The response body as text.

    Raises:
        HTTPException: If the server replies with a non-200 status code.
    """
    # Mimic a mobile Edge-on-Android browser; some campus endpoints reject
    # requests that do not look like a real browser.
    browser_headers = {
        "accept": "*/*",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-TW,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6,zh-CN;q=0.5",
        "dnt": "1",
        "referer": url,
        "sec-ch-ua": "'Chromium';v='119', 'Microsoft Edge';v='119', 'Not:A-Brand';v='24'",
        "sec-ch-ua-mobile": "?1",
        "sec-ch-ua-platform": "Android",
        "sec-fetch-dest": "script",
        "sec-fetch-mode": "no-cors",
        "sec-fetch-site": "same-site",
        "user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36 Edg/112.0.1722.48",
    }
    resp = requests.get(url, headers=browser_headers, **kwargs)
    if resp.status_code != 200:
        raise HTTPException(resp.status_code, f"Request error: {resp.status_code}")
    return resp.text


@cached(cache=TTLCache(maxsize=4, ttl=60 * 60))
def get_rss_data(rss_type: str) -> list:
"""
Args:
Expand All @@ -43,7 +19,7 @@ def get_rss_data(rss_type: str) -> list:
# 展覽及活動 RSS: https://www.lib.nthu.edu.tw/bulletin/RSS/export/rss_exhibit.xml
# 南大與人社分館 RSS: https://www.lib.nthu.edu.tw/bulletin/RSS/export/rss_branches.xml
url = f"https://www.lib.nthu.edu.tw/bulletin/RSS/export/rss_{rss_type}.xml"
xml_string = _get_response(url)
xml_string = cached_request.get(url)
xml_string = xml_string.replace("<br />", "")
dict = xmltodict.parse(xml_string)
rss_data = dict["rss"]["channel"]["item"]
Expand All @@ -55,7 +31,7 @@ def get_number_of_goods() -> dict:
取得總圖換證數量資訊。
"""
url = "https://adage.lib.nthu.edu.tw/goods/Public/number_of_goods_mix.js"
text = _get_response(url)
text = cached_request.get(url, update=True)
# 使用正規表達式從 text 中提取變量和值
variables = re.findall(r'var\s+(\w+)\s*=\s*(\d+|"[^"]*");', text)
# 將變量和值存儲在字典中
Expand All @@ -75,7 +51,7 @@ def get_opening_hours(libaray_name) -> dict:
取得指定圖書館的開放時間。
"""
url = f"https://www.lib.nthu.edu.tw/bulletin/OpeningHours/{libaray_name.value}.js"
text = _get_response(url)
text = cached_request.get(url)
# 使用正規表達式從 text 中提取日期和時間
match = re.search(
r"(\d{4}-\d{2}-\d{2}\s+\([\w]+\))<br />(\d{2}:\d{2})-(\d{2}:\d{2})", text
Expand All @@ -96,15 +72,14 @@ def get_space_data() -> list:
"""
# 來源: https://libsms.lib.nthu.edu.tw/build/
url = "https://libsms.lib.nthu.edu.tw/RWDAPI_New/GetDevUseStatus.aspx"
response = _get_response(url)
response = cached_request.get(url)
data = json.loads(response)
if data["resmsg"] != "成功":
raise HTTPException(404, "Not found")
else:
return data["rows"]


@cached(cache=TTLCache(maxsize=1, ttl=60 * 60))
def get_lost_and_found() -> list:
"""
取得失物招領資訊。
Expand All @@ -116,7 +91,7 @@ def get_lost_and_found() -> list:
date_end = date_end.strftime("%Y-%m-%d")
date_start = date_start.strftime("%Y-%m-%d")
# 發送 POST 請求
response = requests.post(
response = cached_request.post(
"https://adage.lib.nthu.edu.tw/find/search_it.php",
data={
"place": "0",
Expand All @@ -127,11 +102,8 @@ def get_lost_and_found() -> list:
"SUMIT": "送出",
},
)
if response.status_code != 200:
raise Exception(f"Request error: {response.status_code}")
html = response.text
# 找到表格
soup = BeautifulSoup(html, "html.parser")
soup = BeautifulSoup(response, "html.parser")
table = soup.find("table")
# 初始化一個列表來存儲所有行的數據
rows_data = []
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@
from src.utils import cached_request


def replace_numbers_in_url(url: str, new_number: str) -> str:
def replace_numbers_in_url(url: str, new_number: str) -> str:
    """Return ``url`` with its trailing ``-1.php`` page number replaced.

    Rpage announcement URLs end in ``-1.php`` for the first page; swapping
    that number selects a different page of the listing.

    Args:
        url: An Rpage announcement URL ending in ``-1.php`` (query string allowed).
        new_number: Page number to substitute into the URL.

    Returns:
        The URL with every ``-1.php`` occurrence rewritten to ``-{new_number}.php``.
    """
    # NOTE: the stray `self` parameter was a leftover from the reverted class
    # wrapper; this is a module-level function, so it takes no `self`.
    # Replace -1.php with -{new_number}.php using a regular expression.
    new_url = re.sub(r"-(1)\.php", f"-{new_number}.php", url)
    return new_url


def announcement(url: str, maxpage: int = 1) -> list:
def get_announcement(url: str, maxpage: int = 1) -> list:
"""
從 Rpage 公告頁面取得公告資料。
Args:
Expand Down

0 comments on commit 3eb29cc

Please sign in to comment.