-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathparse_txt_file.py
executable file
·97 lines (77 loc) · 3.49 KB
/
parse_txt_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import re
import os
from dotenv import load_dotenv, find_dotenv
# get config variables from .env file
load_dotenv(find_dotenv())
MIN_QUOTE_LENGTH = int(os.getenv("MIN_QUOTE_LENGTH"))
MAX_QUOTE_LENGTH = int(os.getenv("MAX_QUOTE_LENGTH"))
# reads flat .txt. file and parses it into a list of dictionaries
# the key 'title' is the title of the book
# the key 'author' is the author of the book
# the key 'type' can be: Highlight, Note, Bookmark
# the key 'date added' is the date when the quote was highlighted by the reader
# the key 'quote' is the quote body
def parse_to_dict(filename):
with open(filename, "r") as f:
text = f.read()
# Regex that searches for the following pattern
pattern = r"(?P<title>.+?) \((?P<author>[^)]+)\)\n- Your (?P<type>Highlight|Note|Bookmark) on (?:page (?P<page>\d+|[ivxlcdm]+) \| )?Location (?P<location>\d+-\d+) \| Added on (?P<date_added>.+?)\n\n(?P<quote>[^\n]+)"
# Searches for unique matches of the pattern within the text file
matches = re.findall(pattern, text, re.MULTILINE)
# Create a list of dictionaries, one for each match
result = []
# Use a loop to convert each tuple to a dictionary with the required keys and values
for match in matches:
title, author, htype, page, location, date_added, quote = match
words = author.split(', ') # Split author string to ['Lastname', 'Firstname']
author = words[1] + ' ' + words[0] #Replace Firstname with Lastname and conversely
result.append(
{
"title": title,
"author": author,
"type": htype,
"page": page,
"location": location,
"date_added": date_added,
"quote": quote,
}
)
return result
# Takes list of dictionaries and removes quotes that are too short, truncates quotes that are too long
def clean_list(my_list):
# Create a new list to avoid changing while iterating
cleaned_list = []
for item in my_list:
quote = item["quote"]
quote_length = len(quote)
if quote_length > MIN_QUOTE_LENGTH:
if quote_length > MAX_QUOTE_LENGTH:
truncated_quote = quote[:MAX_QUOTE_LENGTH].rsplit(" ", 1)[0] + " ..."
item["quote"] = truncated_quote
cleaned_list.append(item)
return cleaned_list
# Takes list of dictionaries and removes duplicate quotes
# Addresses a Kindle bug: when the reader edits a highlight, it registers as a new highlight
# Current logic takes the quote that reflects the most recent edit
def deduplicate_list(my_list):
new_list = []
for i, quote1 in enumerate(my_list):
is_duplicate = False
for quote2 in my_list[i + 1 :]:
# quotes are duplicates if they are a substring of each other, and are from the same book
if (
quote1["title"] == quote2["title"]
and quote1["quote"] in quote2["quote"]
):
is_duplicate = True
# keep the most recent edit of the highlight by the reader
if quote1["date_added"] <= quote2["date_added"]:
quote1 = quote2
break
if not is_duplicate:
new_list.append(quote1)
print("Number of quotes deduplicated: " + str(len(my_list) - len(new_list)))
return new_list
# run only if this is the program being explicitly run
if __name__ == "__main__":
print("You are only running" + __file__ + "and not importing it.")