-
Notifications
You must be signed in to change notification settings - Fork 0
/
dl-pdfs.py
91 lines (72 loc) · 4.45 KB
/
dl-pdfs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
"""
## Prompt
Python script to read URLs from an xlsx file and download them, following a folder structure.
The script should load the xlsx file found at `data/decl csv url.xlsx`, which has the following columns: data, tip, url_decl, downloaded
The script should read the date in `data` (format dd.mm.yyyy), create a folder structure of the form <yyyy>/<mm>/<tip>, and download the file from https://declaratii.integritate.eu/DownloadServlet?fileName=<url_decl> into that folder. Create required folders if they don't exist.
After file is downloaded, mark the `downloaded` column with `1` for the respective row. Or if the file is not found, mark the `downloaded` column with `0`.
Use try / except to handle errors and log them in a file called `data/error.log` with the following format: `date - data - url_decl - error message`
Pause for a random number of seconds between 0.5 and 2 before downloading the next file.
The script should not overwrite existing files: skip files that are already downloaded, so that it resumes where it left off. At the beginning of the script read `data/decl csv url.xlsx` and only attempt downloading the files that are not downloaded yet, i.e. where the downloaded column is empty.
Sample `decl csv url.xlsx` data:
data,tip,url_decl,downloaded
31.01.2024,dI,16037936_2858993_a.pdf&uniqueIdentifier=NTNTARTLNE_16037936,
19.01.2024,dI,16027942_2844314_a.pdf&uniqueIdentifier=NTNTARTLNE_16027942,
19.01.2024,dI,16027935_2843964_a.pdf&uniqueIdentifier=NTNTARTLNE_16027935,
26.05.2007,dI2,3410669_3672011_DI_2007-05-26_ALECU%20SANDRA-OANA_63369.pdf&uniqueIdentifier=NTNTARTLNE_3410669,
26.05.2007,dA2,2350515_1562660_DA_2007-05-26_VLADU%20MINODORA_40304915.pdf&uniqueIdentifier=NTNTARTLNE_2350515,
26.05.2007,dI2,1481808_594802_DI_2007-05-26_MACAU%20SALVINA_10096411.pdf&uniqueIdentifier=NTNTARTLNE_1481808,
26.05.2007,dA2,1476464_600135_DA_2007-05-26_MAN%20NICOLAE_10100544.pdf&uniqueIdentifier=NTNTARTLNE_1476464,
26.05.2007,dA2,1424389_628592_DA_2007-05-26_GHEORGHE%20SVETLANA%20EUGENIA_20101499.pdf&uniqueIdentifier=NTNTARTLNE_1424389,
"""
# Directory that holds the input spreadsheet and the error log. It must end
# with a slash because the file names below are appended to it directly.
data_root = "/Users/pax/devbox/gov2/data/"
# Spreadsheet read by the script.
file_path = f"{data_root}decl csv url.xlsx"
# URL prefix; the per-row `url_decl` value is appended to it.
base_url = "https://declaratii.integritate.eu/DownloadServlet?fileName="
# File that error lines are appended to.
err_log = f"{data_root}dlpdf_error.log"
import pandas as pd
import requests
import os
import time
import random
from datetime import datetime
def download_file(url, folder_path, file_name, timeout=30):
    """Download ``url`` and store it as ``file_name`` inside ``folder_path``.

    Args:
        url: Full URL to fetch.
        folder_path: Existing directory that receives the file.
        file_name: Name of the file to create inside ``folder_path``.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        True when the server answered with HTTP 200 and the file was written,
        False for any other status code (nothing is written in that case).

    Raises:
        requests.RequestException: on connection problems or a timeout.
        OSError: if the file cannot be written or renamed.
    """
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return False

    target_path = os.path.join(folder_path, file_name)
    # Write to a temporary sibling first and rename afterwards, so that an
    # interrupted write never leaves a truncated file under the final name
    # (an existence check would otherwise take it for a finished download).
    partial_path = target_path + ".part"
    with open(partial_path, 'wb') as f:
        f.write(response.content)
    os.replace(partial_path, target_path)
    return True
def log_error(date, data, url_decl, message):
    """Append one line to the error log.

    The line has the form ``date - data - url_decl - message`` and is
    appended to the file named by the module-level ``err_log``.

    Args:
        date: Timestamp text to put at the start of the line.
        data: Value identifying the affected row (or a general label).
        url_decl: The ``url_decl`` value being processed (or a placeholder).
        message: Description of the error.
    """
    # Pin the encoding: with the platform default, a message containing
    # characters outside that encoding would make the logger itself raise
    # UnicodeEncodeError while an error is being handled.
    with open(err_log, "a", encoding="utf-8") as log_file:
        log_file.write(f"{date} - {data} - {url_decl} - {message}\n")
def create_folder_structure(date, tip):
    """Create (if needed) and return the folder ``<yyyy>/<mm>/<tip>``.

    Args:
        date: Date text in ``dd.mm.yyyy`` format.
        tip: Name of the innermost folder.

    Returns:
        The relative path ``<yyyy>/<mm>/<tip>``; the month is zero-padded to
        two digits.

    Raises:
        ValueError: if ``date`` does not match ``dd.mm.yyyy``.
    """
    parsed = datetime.strptime(date, "%d.%m.%Y")
    path_parts = (str(parsed.year), str(parsed.month).zfill(2), tip)
    target_dir = os.path.join(*path_parts)
    # exist_ok makes repeated calls for the same date/tip harmless.
    os.makedirs(target_dir, exist_ok=True)
    return target_dir
def _timestamp():
    """Return the current local time formatted for the error log."""
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")


def main():
    """Download every not-yet-processed file listed in the spreadsheet.

    Reads the spreadsheet at ``file_path`` and handles each row whose
    ``downloaded`` cell is empty:

    * the target folder is created from the row's ``data`` and ``tip`` values;
    * if the file is already on disk the row is marked ``'1'`` without a
      request;
    * otherwise the file is fetched from ``base_url + url_decl`` and the row
      is marked ``'1'`` on success or ``'0'`` on failure, followed by a
      random 0.5-2 second pause.

    Any error while handling a row is logged and marks only that row ``'0'``;
    the remaining rows are still processed. The spreadsheet is written back
    when the loop ends, including when it ends early because of an
    interruption, so progress made so far is not lost.
    """
    try:
        df = pd.read_excel(file_path, dtype=str)
        df['downloaded'] = df['downloaded'].fillna('')  # Fill NaN with empty strings
    except Exception as e:
        # Nothing was loaded, so there is nothing to process or to save.
        log_error(_timestamp(), "General", "N/A", str(e))
        return

    try:
        for index, row in df.iterrows():
            if row['downloaded'] != '':
                continue  # Row already has a result from an earlier run

            try:
                # Folder creation and URL parsing are inside the per-row
                # handler so that one malformed row (bad date, empty cell)
                # cannot abort the whole run.
                folder_path = create_folder_structure(row['data'], row['tip'])
                file_name = row['url_decl'].split('&')[0]  # Assuming the file name is before '&'
                full_url = base_url + row['url_decl']

                # Check if file already exists to avoid re-download
                if os.path.exists(os.path.join(folder_path, file_name)):
                    df.at[index, 'downloaded'] = '1'
                    continue

                downloaded = download_file(full_url, folder_path, file_name)
                df.at[index, 'downloaded'] = '1' if downloaded else '0'
                time.sleep(random.uniform(0.5, 2))  # Wait for a random time before next download
            except Exception as e:
                log_error(_timestamp(), row['data'], row['url_decl'], str(e))
                df.at[index, 'downloaded'] = '0'
    except Exception as e:
        log_error(_timestamp(), "General", "N/A", str(e))
    finally:
        # Save in `finally` so the marks survive Ctrl-C or an unexpected
        # error part-way through the list.
        try:
            df.to_excel(file_path, index=False)
        except Exception as e:
            log_error(_timestamp(), "General", "N/A", str(e))


# Run the downloader only when this file is executed as a script.
if __name__ == "__main__":
    main()