-
Notifications
You must be signed in to change notification settings - Fork 0
/
data-exporter.py
50 lines (38 loc) · 1.45 KB
/
data-exporter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import pandas as pd
import duckdb
from datetime import datetime
import time
# Append-only text file recording export progress, one "timestamp,message" per line.
export_log = "export-log.txt"
# Socrata (data.ny.gov) API endpoint for the subway trips dataset, CSV output.
api_base = "https://data.ny.gov/resource/uhf3-t34z.csv"
# DuckDB database file that receives the exported rows.
db_file = "subway-trips.db"
# Rows requested per API page; the offset below advances by this amount per page.
limit = 1_000_000
offset = 0
# Function to log messages
def log_message(message: str) -> None:
    """Append *message* to the export log, prefixed with a timestamp.

    Each line has the form ``YYYY-MM-DD HH:MM:SS,<message>``.

    Args:
        message: Free-form text to record (e.g. a URL or an error string).
    """
    # Explicit UTF-8 avoids platform-dependent default encodings mangling
    # non-ASCII text (exception messages may contain arbitrary characters).
    with open(export_log, 'a', encoding='utf-8') as file:
        file.write(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')},{message}\n")
# Stream the dataset from the API into DuckDB one page at a time.
# The connection is closed automatically when the with-block exits.
with duckdb.connect(db_file) as con:
    while True:
        # Page through the dataset with $limit/$offset; $order=:id gives a
        # stable row ordering so pages neither overlap nor skip rows.
        api_path = f"{api_base}?$limit={limit}&$offset={offset}&$order=:id"
        log_message(f"Fetching data from: {api_path}")
        try:
            df = pd.read_csv(api_path)
            rows = df.shape[0]
            if offset == 0:
                # First page: (re)create the table from the DataFrame schema.
                # CREATE OR REPLACE lets the export be re-run from scratch
                # instead of failing on a table left by a previous run.
                con.execute("CREATE OR REPLACE TABLE trips AS SELECT * FROM df")
            else:
                # Subsequent pages: append to the existing table.
                con.execute("INSERT INTO trips SELECT * FROM df")
            if rows < limit:
                # A short page means the API has no more data.
                log_message("Finished processing all data.")
                break
            # Advance to the next page.
            offset += limit
            # Pause between requests to avoid hitting API rate limits.
            time.sleep(10)
        except Exception as e:
            # Top-level boundary: record the failure and stop rather than
            # retry forever against a broken endpoint.
            log_message(f"Error reading data: {e}")
            break