-
Notifications
You must be signed in to change notification settings - Fork 2
/
dedupe.py
72 lines (56 loc) · 2 KB
/
dedupe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env python
import csv
import gzip
import itertools
import typing
from dataclasses import dataclass
from operator import itemgetter
@dataclass
class Link:
    """A single link taken from one input row.

    Each field holds the raw string read from the input column of the same
    name; no parsing or normalisation is applied. Instances compare equal
    when all three fields match, which is what strip deduplication relies on.
    """

    position: str  # value of the row's "position" column
    text: str  # value of the row's "text" column
    url: str  # value of the row's "url" column
@dataclass
class Strip:
    """A run of consecutive timestamp groups that all carried the same links.

    The instance is deliberately mutable: ``timestamp_last`` is advanced each
    time the following timestamp group turns out to have identical links.
    """

    timestamp_first: str  # timestamp of the first group in the run
    timestamp_last: str  # timestamp of the latest group in the run
    links: list[Link]  # the links shared by every group in the run, in row order
def dedupe(reader: typing.Iterable[dict[str, str]]) -> tuple[list[str], list[Strip]]:
    """Collapse consecutive timestamp groups whose links are identical.

    Rows are grouped by runs of equal ``"timestamp"`` values, so rows that
    belong to the same timestamp must be adjacent in ``reader``. Each group
    becomes a list of ``Link`` objects built from the ``"position"``,
    ``"text"`` and ``"url"`` keys. A group whose links equal those of the
    previous strip only extends that strip's ``timestamp_last``; any other
    group starts a new strip.

    Args:
        reader: Row mappings providing the ``"timestamp"``, ``"position"``,
            ``"text"`` and ``"url"`` keys.

    Returns:
        A ``(timestamps, strips)`` tuple: the timestamp of every group in
        input order, and the deduplicated strips in input order.
    """
    seen_timestamps: list[str] = []
    strips: list[Strip] = []

    for stamp, group in itertools.groupby(reader, key=itemgetter("timestamp")):
        seen_timestamps.append(stamp)
        current = [Link(r["position"], r["text"], r["url"]) for r in group]

        # First group, or the links changed: open a new strip.
        if not strips or current != strips[-1].links:
            strips.append(Strip(stamp, stamp, current))
            continue

        # Same links as the previous group: just stretch the open strip.
        strips[-1].timestamp_last = stamp

    return seen_timestamps, strips
def main() -> None:
    """Dedupe the raw trending-strip capture and write both output files.

    Reads ``data/bfn-trending-strip-raw.tsv.gz`` (gzipped, tab-separated,
    with a header row) and passes its rows to ``dedupe``. It then writes:

    * ``data/all-timestamps.tsv``: a ``timestamp`` header followed by one
      row per timestamp group, in input order;
    * ``data/bfn-trending-strip-deduped.tsv``: tab-separated, one row per
      link of each deduplicated strip, with the columns ``timestamp_first``,
      ``timestamp_last``, ``position``, ``text`` and ``url``.

    All paths are relative to the current working directory.
    """
    # Every file handed to the csv module is opened with newline="", as the
    # csv documentation requires. Otherwise newlines embedded in quoted
    # fields are mis-read on input, and on platforms that translate "\n" to
    # "\r\n" each written row would end in "\r\r\n".
    with gzip.open("data/bfn-trending-strip-raw.tsv.gz", "rt", newline="") as f:
        reader = csv.DictReader(f, delimiter="\t")
        # dedupe() must run while the file is still open: the reader is lazy.
        timestamps, deduped = dedupe(reader)

    # Write all timestamps to file.
    # NOTE(review): this writer uses the default comma delimiter although the
    # file is named .tsv. With a single column that only affects quoting, so
    # it is left unchanged to keep the output format stable.
    with open("data/all-timestamps.tsv", "w", newline="") as f:
        ts_writer = csv.writer(f)
        ts_writer.writerow(["timestamp"])
        ts_writer.writerows([ts] for ts in timestamps)

    # Write deduped results to file, one row per link of each strip.
    with open("data/bfn-trending-strip-deduped.tsv", "w", newline="") as f:
        fieldnames = ["timestamp_first", "timestamp_last", "position", "text", "url"]
        dd_writer = csv.DictWriter(f, delimiter="\t", fieldnames=fieldnames)
        dd_writer.writeheader()
        for entry in deduped:
            for link in entry.links:
                dd_writer.writerow(
                    {
                        "timestamp_first": entry.timestamp_first,
                        "timestamp_last": entry.timestamp_last,
                        # The link's own fields supply position/text/url.
                        **vars(link),
                    }
                )


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()