-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
113 lines (80 loc) · 2.86 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import os
import re
import time
from datetime import datetime
from pathlib import Path
from urllib.request import urlopen
import pytz
import schedule
from bs4 import BeautifulSoup
import telebot
# Change-log pages monitored for each Binance API product line; the dict
# key doubles as the per-product log-file name under log/.
URLs = {'spot': 'https://binance-docs.github.io/apidocs/spot/en/#change-log',
        'futures': 'https://binance-docs.github.io/apidocs/futures/en/#change-log',
        'delivery': 'https://binance-docs.github.io/apidocs/delivery/en/#change-log'}
# Matches ISO-ish dates with a one- OR two-digit month (the change logs are
# inconsistent about zero-padding).  Compiled once at import time instead of
# recompiling two separate patterns on every call.
_DATE_RE = re.compile(r'\d{4}-\d{1,2}-\d{2}')


def is_valid_date(data):
    """Return a truthy match when *data* contains a change-log date.

    Accepts dates shaped like ``2023-01-15`` or ``2023-1-15`` anywhere in
    the string; returns ``None`` (falsy) when no date is present.
    """
    return _DATE_RE.search(data)
def get_current_content(webpage, latest_date_idx, trade_type):
    """Collect the change-log entry that starts at *latest_date_idx*.

    The entry opens with the date line tagged with the upper-cased
    *trade_type*; every following line is copied until the next date line
    (or the end of the page) is reached.

    webpage -- list of text lines from the rendered docs page
    latest_date_idx -- index of the date line that opens the entry
    trade_type -- key such as 'spot' / 'futures' / 'delivery'
    """
    lines = [webpage[latest_date_idx] + ' ' + trade_type.upper()]
    # Slice to the very end: the previous `len(webpage) - 1` loop bound
    # silently dropped the page's final line.
    for line in webpage[latest_date_idx + 1:]:
        if is_valid_date(line):
            break
        lines.append(line)
    # join() newline-terminates the header too; the original concatenated
    # the first body line directly onto the header with no separator.
    return '\n'.join(lines) + '\n'
def update_change_log(content, trade_type):
    """Persist *content* to ``log/<trade_type>.txt`` and report whether it changed.

    Returns True when the file did not exist yet or its stored text differs
    from *content* (the file is (re)written in both cases), False when the
    stored text already matches.  The original version leaked an open file
    handle and, on the fresh-file path, re-read the file only to compare the
    content against itself.
    """
    log_file = Path('log') / f'{trade_type}.txt'
    log_file.parent.mkdir(parents=True, exist_ok=True)
    # New file or stale content: write once and signal an update.
    if not log_file.exists() or log_file.read_text() != content:
        log_file.write_text(content)
        return True
    return False
def check_updates(webpage, trade_type):
    """Locate the newest change-log entry in *webpage* and notify on change.

    webpage -- full plain-text dump of one rendered docs page
    trade_type -- key such as 'spot' / 'futures' / 'delivery'; also names
        the log file used for change detection
    """
    webpage = webpage.split('\n')
    change_log_idx = -1
    # Walk the page backwards looking for the 'Change Log' heading; the
    # first date line after it is where the latest entry starts.
    # NOTE(review): the outer loop never breaks, so the EARLIEST occurrence
    # of 'Change Log' in the page ultimately wins — confirm this is intended
    # (navigation/sidebar text could also contain the phrase).
    for i in range(len(webpage) - 1, -1,
                   -1):  # find index of Change Log keyword, following line is the start of latest updates
        if 'Change Log' in webpage[i]:
            change_log_idx = i
            # Advance from the heading to the first dated line.
            for j in range(change_log_idx, len(webpage) - 1):
                if is_valid_date(webpage[j]):
                    change_log_idx = j
                    break
    if change_log_idx < 0:
        # Neither heading nor date found — nothing to compare or report.
        print(f'change_log_idx: {change_log_idx} is not found!')
        return
    content = get_current_content(webpage, change_log_idx, trade_type)
    has_update = update_change_log(content, trade_type)
    if has_update:
        # Stored log changed: push the fresh entry to Telegram.
        telebot.send_message(content)
def run_scraper():
    """Fetch every configured change-log page and check it for updates.

    Iterates the module-level ``URLs`` mapping; each page is downloaded,
    rendered to plain text via BeautifulSoup, and handed to check_updates.
    """
    for trade_type, URL in URLs.items():
        # Context manager closes the HTTP response; the original version
        # never closed it and leaked a socket per request.
        with urlopen(URL) as page:
            html = page.read().decode("utf-8")
        soup = BeautifulSoup(html, "html.parser")
        check_updates(soup.get_text(), trade_type)
def heartbeat_check():
    """Send a liveness ping so prolonged silence can be told apart from a crash."""
    telebot.send_check_message(f'HEARTBEAT CHECK at {datetime.now(pytz.utc)}')
# Poll the change logs every 15 minutes and send a heartbeat every 4 hours.
# NOTE(review): run_scraper is not invoked at start-up — the first scrape
# only happens 15 minutes after launch; confirm that delay is acceptable.
schedule.every(15).minutes.do(run_scraper)
schedule.every(4).hours.do(heartbeat_check)
# Simple polling loop; the 1-second sleep keeps CPU usage negligible.
while True:
    schedule.run_pending()
    time.sleep(1)