-
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcrawl.py
84 lines (69 loc) · 7.48 KB
/
crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import os
import pandas as pd
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
# CSV that accumulates one row per crawled day; its 'Date' column is read
# back below to decide where to resume.
FILE_NAME = 'lottery.csv'

# First day to crawl when there is no earlier output to resume from.
current_date = datetime(2005, 1, 1).date()
# True when FILE_NAME already has a header row, so new rows must be appended
# without writing the header again.
is_mod = False

if os.path.isfile(FILE_NAME):
    try:
        recorded_dates = pd.read_csv(FILE_NAME)['Date']
    except pd.errors.EmptyDataError:
        # Zero-byte file: there is neither a header to append under nor a
        # date to resume from, so treat it exactly like a missing file.
        recorded_dates = None
    if recorded_dates is not None:
        is_mod = True
        # A header-only file has no last row; keep the default start date
        # instead of failing on .iloc[-1].
        if not recorded_dates.empty:
            # Resume on the day after the last recorded one.
            current_date = pd.to_datetime(recorded_dates.iloc[-1]).date() + timedelta(days=1)

# Rows gathered during this run: [date, count of '0', ..., count of '9'].
data = []
# Chrome launch options used by the crawl below.
CHROME_BINARY = r'C:\Program Files\Google\Chrome\Application\chrome.exe'

options = Options()
# Pin the Windows install location only when it actually exists; otherwise
# leave binary_location unset so Selenium locates Chrome on its own instead
# of failing to launch on machines without this path.
if os.path.isfile(CHROME_BINARY):
    options.binary_location = CHROME_BINARY
# NOTE: the former add_argument('chromedriver_path') call was dropped. It
# passed the literal placeholder text to Chrome as a positional argument
# rather than configuring a driver path.
options.add_argument('--headless')  # run without a visible browser window
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Absolute XPath of the inner table whose cells the crawl loop reads.
_RESULT_TABLE = '/html/body/div[1]/div/center/div/div/div[3]/div/div/div/table/tbody/tr/td[2]/div/table/tbody/tr/td[1]/div[2]/div/div/div[1]/div[2]/table/tbody/tr/td/table[1]/tbody/tr/td/table/tbody'

# (table row index, number of indexed <div> cells in that row).
# A count of 0 means the row holds a single, un-indexed <div>.
_CELLS_PER_ROW = (
    (2, 0),
    (3, 0),
    (4, 2),
    (5, 6),
    (6, 4),
    (7, 6),
    (8, 3),
    (9, 4),
)

# XPaths of every cell to read, ordered by row and then by cell position
# (27 entries in total).
XPATHS = []
for _row_index, _cell_count in _CELLS_PER_ROW:
    _cell_base = f'{_RESULT_TABLE}/tr[{_row_index}]/td[2]/div'
    if _cell_count == 0:
        XPATHS.append(_cell_base)
    else:
        XPATHS.extend(f'{_cell_base}[{_position}]' for _position in range(1, _cell_count + 1))
# Crawl one results page per calendar day, starting at `current_date` and
# stopping before today's date, tallying how often each decimal digit occurs
# in the cells addressed by XPATHS. The tallies are then written to FILE_NAME.
try:
    with webdriver.Chrome(options=options) as browser:
        # `<` rather than `!=`: if the resume date is already past today
        # (e.g. the CSV's last row is dated today or later), `!=` would never
        # become false and the loop would run forever.
        while current_date < datetime.now().date():
            day = current_date.day
            month = current_date.month
            year = current_date.year
            print(f'Processing {day}-{month}-{year}...')
            browser.get(f'https://www.minhngoc.net.vn/tra-cuu-ket-qua-xo-so.html?mien=2&thu=0&ngay={day}&thang={month}&nam={year}')

            # row[0] is the date; row[d + 1] counts occurrences of digit d.
            row = [current_date]
            row.extend([0] * 10)
            try:
                for xpath in XPATHS:
                    for digit in browser.find_element(By.XPATH, xpath).text:
                        # isdecimal() only accepts characters int() can
                        # convert; isdigit() also accepts e.g. superscripts,
                        # which would make int() raise.
                        if digit.isdecimal():
                            row[int(digit) + 1] += 1
            except Exception:
                # Best effort: a day whose cells cannot be read is reported
                # and still recorded with whatever was tallied before the
                # failure. Exception (not a bare except) lets Ctrl-C and
                # SystemExit stop the crawl.
                # NOTE(review): the original indentation was lost; this
                # assumes failed days were meant to be kept in the output.
                print(f'{day}-{month}-{year} data has been lost!')
            data.append(row)
            current_date += timedelta(days=1)
finally:
    # Persist whatever was collected even if the crawl aborted part-way, so
    # the next run resumes after the last saved day instead of starting over.
    df = pd.DataFrame(data, columns=['Date', 'Zero', 'One', 'Two', 'Three', 'Four', 'Five', 'Six', 'Seven', 'Eight', 'Nine'])
    if not df.empty:
        if is_mod:
            df.to_csv(FILE_NAME, mode='a', header=False, index=False, lineterminator='\n')
        else:
            # Same line terminator as the append branch so a file created
            # here and extended later does not mix line endings.
            df.to_csv(FILE_NAME, index=False, lineterminator='\n')