forked from corneliusroemer/pango-designation-dates
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_designation_date.py
95 lines (82 loc) · 3.09 KB
/
get_designation_date.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import datetime as dt
import io
import os
from collections import defaultdict
from typing import DefaultDict
import pandas as pd
from dateutil import parser
from pydriller import Repository
from tqdm import tqdm
import typer
def main(
    pango_path: str = "~/code/pango-designation",
):
    """Record the first designation date of every newly seen Pango lineage.

    Walks the commits that touched ``lineages.csv`` since the timestamp stored
    in ``data/previous_commit_timestamp.txt``. Every lineage not yet listed in
    ``data/lineage_designation_date.csv`` is added with the committer date of
    the first traversed commit whose ``lineages.csv`` contains it. Finally the
    author date of the last traversed commit becomes the new stored timestamp.

    Args:
        pango_path: Location of the pango-designation repository (a local
            checkout, or a repository URL when running in CI). It is passed
            unchanged to ``pydriller.Repository``.

    Returns:
        ``1`` when no traversed commit is newer than the stored timestamp
        (nothing is written in that case), otherwise ``None``.

    Raises:
        SystemExit: If the timestamp file does not exist (exit status 0).
    """
    TIMESTAMP_FILE = "data/previous_commit_timestamp.txt"
    DATES_FILE = "data/lineage_designation_date.csv"

    if not os.path.exists(TIMESTAMP_FILE):
        print("No previous commit timestamp found. Exiting.")
        # Equivalent to the site-provided ``exit()`` (status 0) without
        # relying on the ``site`` module having injected that helper.
        raise SystemExit

    print("Reading timestamp")
    with open(TIMESTAMP_FILE, "r", encoding="utf-8") as f:
        previous_commit_datetime = parser.parse(f.read())

    print("Retrieving Repo ... ", end="")
    repo = Repository(
        pango_path, filepath="lineages.csv", since=previous_commit_datetime
    )
    print("retrieved")

    print("Reading commits ...")
    commits = []
    for commit in repo.traverse_commits():
        print(
            " -",
            "Date:",
            commit.author_date.isoformat(),
            "Hash:",
            commit.hash,
            "Message:",
            commit.msg,
        )
        commits.append(commit)
    print("Commits all read")

    # NOTE(review): the cut-off check and the stored timestamp use the author
    # date, while designation dates below use the committer date - confirm
    # that this asymmetry is intended.
    if any(c.author_date > previous_commit_datetime for c in commits):
        print("New commits found")
    else:
        print("No new commits found")
        return 1

    # Lineage -> first designation date. ``None`` marks a lineage whose stored
    # date is blank or unparseable; it stays listed and is never re-dated.
    first_mention: "dict[str, dt.datetime | None]" = {}
    known = pd.read_csv(DATES_FILE, index_col=0)
    for lineage, raw_date in known["designation_date"].items():
        try:
            first_mention[lineage] = parser.parse(raw_date)
        except (ValueError, TypeError, OverflowError):
            # ValueError covers dateutil's ParserError; TypeError is raised
            # for non-string cells such as NaN from an empty field.
            first_mention[lineage] = None

    for commit in tqdm(commits, total=len(commits)):
        for file in commit.modified_files:
            if file.filename != "lineages.csv":
                continue
            try:
                # Parsing sits inside the ``try`` so that an empty snapshot
                # (e.g. the file was deleted in this commit) or a malformed
                # one is reported and skipped instead of aborting the run.
                snapshot = pd.read_csv(
                    io.StringIO(file.source_code), on_bad_lines="warn"
                )
                lineages = snapshot["lineage"].unique()
            except (pd.errors.EmptyDataError, pd.errors.ParserError, KeyError):
                print("Error parsing", commit.hash)
                continue
            for lineage in lineages:
                if lineage not in first_mention:
                    # Store a datetime, like the dates loaded above, so every
                    # row of the CSV is written in one format.
                    first_mention[lineage] = commit.committer_date

    df = pd.DataFrame.from_dict(
        first_mention, orient="index", columns=["designation_date"]
    )
    df.to_csv(DATES_FILE, index_label="lineage")

    print("Updating timestamp")
    # ``commits`` is non-empty here: the "no new commits" branch returned above.
    most_recent_commit = commits[-1]
    with open(TIMESTAMP_FILE, "w", encoding="utf-8") as f:
        f.write(most_recent_commit.author_date.isoformat())

    # Print dates and hashes of recent commits
    for commit in commits:
        print(commit.author_date, commit.hash)
# Script entry point: when executed directly (not imported), run ``main`` as a
# command-line program through typer.
if __name__ == "__main__":
    typer.run(main)