-
Notifications
You must be signed in to change notification settings - Fork 1
/
autocategorize.py
246 lines (210 loc) · 11.1 KB
/
autocategorize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
# This script is called as: python3 autocategorize.py path/to/sqlite-database.db FinancialAccountName
# FinancialAccountName is one of valid_accounts from the verify_accounts() function below
# To really understand how this autocategorizer works, read the reddit comment mentioned below. It shows how it works in <10 lines of code. It's very easy, I promise! :)
# The rest of the code in this script is merely to feed data to and get data out of the autocategorizer
# https://stackabuse.com/text-classification-with-python-and-scikit-learn/
# https://towardsdatascience.com/pandas-dataframe-playing-with-csv-files-944225d19ff?gi=8fce15d7d81d
# https://towardsdatascience.com/pandas-dataframe-a-lightweight-intro-680e3a212b96
# https://www.reddit.com/r/learnpython/comments/8ickh7/grouping_bank_transactions_with_text_analysis/dyr7u67/
# On MacOS, you may need to install/configure some packages as follows
# pip install pandas
# pip install cython
# brew install xz
# brew install libomp
# Add following to .bashrc or .zshrc
#export CC=/usr/bin/clang
#export CXX=/usr/bin/clang++
#export CPPFLAGS="$CPPFLAGS -Xpreprocessor -fopenmp"
#export CFLAGS="$CFLAGS -I/usr/local/opt/libomp/include"
#export CXXFLAGS="$CXXFLAGS -I/usr/local/opt/libomp/include"
#export LDFLAGS="$LDFLAGS -Wl,-rpath,/usr/local/opt/libomp/lib -L/usr/local/opt/libomp/lib -lomp"
# pip install scikit-learn
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import re
import argparse
import sqlite3
from prompt_toolkit import prompt
from prompt_toolkit.completion import FuzzyWordCompleter
from prompt_toolkit.validation import Validator, ValidationError
def preprocess_description(description):
# Remove all special characters
document = re.sub(r'\W', ' ', description)
# Remove all words containing numbers
document = re.sub(r'\w*\d\w*', '', document).strip()
# Remove all single characters
document = re.sub(r"\b[a-zA-Z]\b", '', document)
# Alt. remove all single chars
document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
# Remove all single chars from start
document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)
# Substitute multiple spaces with single space
document = re.sub(r'\s+', ' ', document, flags=re.I)
# Convert string to lower case
document = document.lower()
# Remove "aplpay" and "com" (for .com) from string
document = document.replace('aplpay ', '')
document = document.replace('com', '')
return document
def fetch_training_data(dbconn, account, trainingfile):
df = pd.DataFrame() # This will hold the Description,Categories that will train the classifier
dbcursor = dbconn.cursor()
if trainingfile: # i.e. if trainingfile is not None, use that as source of training data
# This is a csv file with Description and Category columns
print (f'Using training file {trainingfile} for training.')
df = pd.read_csv(trainingfile) # TODO: Error checking
df['Description'] = df.Description
df['Category'] = df.Category
if len(df) < 1:
print(f'Trainingfile seems to be empty. Stopping..')
quit()
elif trainingfile == None: # Load pre-categorized transactions in db as training data
institution = account+'Trans' # See sqlite db structure. Basically db column names are either account+'Trans' for transactions or account+'Meta' for metadata
dbcursor.execute('''SELECT Payee,Memo,Category FROM '''+institution+''' WHERE Category != 'Unknown' ''')
raw_training_records = dbcursor.fetchall() # FIXME: Can blow up with memory issues if there are too many records
if len(raw_training_records) < 1:
print ("Hmm... seems like there are no categorized records available for training. Quitting.")
quit()
nTrainingTransactions = len(raw_training_records)
print (f"Using {nTrainingTransactions} previously categorized transactions for training.")
training_description = []
training_categories = []
for row in raw_training_records:
training_description.append(row[0]+' '+row[1]) # training description is combination of payee and memo
training_categories.append(row[2])
df['Description'] = training_description
df['Category'] = training_categories
return df
def get_trained_classifier(df):
# TODO: Check that df is not NULL etc. df['Description'] should have training descriptions. df['Category'] should have corresponding categories for training
# First, we need to pre-process the descriptions. See documentation comments of preprocess_description() for more
df['ProcessedDescription'] = [preprocess_description(x) for x in df.Description] # Hmm.. seems like df['Description'] and df.Description are equivalent??
tfidf=TfidfVectorizer()
x_train = tfidf.fit_transform(df['ProcessedDescription'])
le = LabelEncoder()
y_train = le.fit_transform(df['Category'])
classifier = RandomForestClassifier(n_jobs=-1, n_estimators=100)
classifier.fit(x_train.todense(), y_train)
return(classifier, tfidf, le)
def get_uncategorized_transactions(dbconn, account):
dbcursor = dbconn.cursor()
institution = account+'Trans'
dbcursor.execute('''SELECT TransactionId,Payee, Memo, Amount FROM '''+institution+''' WHERE Category = 'Unknown' ''')
uncategorized_records = dbcursor.fetchall() # FIXME: Can blow up with memory issues if there are too many records
return uncategorized_records
def categorize_transactions(classifier, tfidf, le, uncategorized_transactions):
uncategorized_transids = []
uncategorized_descs = []
#uncategorized_amounts = []
for row in uncategorized_transactions: # TransactionId,Payee,Memo,Amount
uncategorized_transids.append(row[0])
uncategorized_descs.append(row[1]+' '+row[2]) # descs == payee + memo
#uncategorized_amounts.append(row[3]) # Not sure this is actually needed in this function
descs_to_categorize = [preprocess_description(x) for x in uncategorized_descs]
x_predict = tfidf.transform(descs_to_categorize)
predicted = classifier.predict(x_predict.todense())
guessed_categories = le.inverse_transform(predicted)
return guessed_categories
def get_valid_categories(dbconn):
dbcursor = dbconn.cursor()
dbcursor.execute('''SELECT Category FROM Categories''')
valid_categories_tuple = dbcursor.fetchall()
valid_categories = []
for cat in valid_categories_tuple:
valid_categories.append(cat[0])
return valid_categories
# An interactive function
def verify_categories_and_add_notes(dbconn, uncategorized_transactions, guessed_categories):
valid_categories = get_valid_categories(dbconn)
true_categories = guessed_categories
category_completer = FuzzyWordCompleter(valid_categories)
def is_valid_category(text):
if text in valid_categories:
return True
elif text == 'q':
return True
elif text == '': #User pressed enter key
return True
else:
return False
CategoryValidator = Validator.from_callable(
is_valid_category,
error_message='Invalid category',
move_cursor_to_end=True)
notes = [''] * len(guessed_categories) # Place holder list for any notes that may need to be added
for i in range(len(guessed_categories)):
# Need to prompt user whether the category was correct. If not, user should enter the correct category
# user entered category must be validated to be in the master Category list, else polite error msg
# user entered category must overwrite guessed category
true_category = prompt(f'{uncategorized_transactions[i]} --> {guessed_categories[i]} . [Enter] to accept, q to quit, or type correct category:', completer=category_completer, validator=CategoryValidator)
if true_category == 'q':
break
elif true_category:
true_categories[i] = true_category
#print(f'True category is {true_category}')
note = prompt(f'Add note? :')
if note:
notes[i] = note
return (true_categories, notes)
def write_verified_categories_to_db(dbconn, account, uncategorized_transactions, verified_categories, notes):
dbcursor = dbconn.cursor()
# A brief sanity check
if len(uncategorized_transactions) != len(verified_categories):
print (f'Something is VERY WRONG: Length of uncategorized_transactions != Length of verified_categories. ABORTING!')
quit()
if len(notes) != len(verified_categories):
print (f'Something is VERY WRONG: Length of notes != Length of verified_categories. ABORTING!')
quit()
institution = account+'Trans'
for i in range(len(verified_categories)):
# uncategorized_transactions[i] should be the i'th row and each row is TransactionId,Payee,Memo,Amount
dbcursor.execute('''UPDATE '''+institution+''' SET Category = ?, Notes = ? WHERE "TransactionId" = ?''', (verified_categories[i], notes[i], uncategorized_transactions[i][0]))
dbconn.commit()
def auto_categorize(dbconn, account, trainingfilename):
# Decide whether to use training file or pre-categorized transactions
# Load up the training data and train the classifier
# Load up the Uncategorized transactions in account and categorize them
# Do interactive check of categorization along with additional Notes
df = fetch_training_data(dbconn, account, trainingfilename) # Remember: The training descriptions returned are not pre-processed for training
# TODO: Ensure that df is not NULL etc.
(classifier, tfidf, le) = get_trained_classifier(df)
uncategorized_transactions = get_uncategorized_transactions(dbconn, account) # Will return TransactionId,Payee,Memo,Amount of uncategorized transactions
if len(uncategorized_transactions) < 1:
print(f'There seem to be no uncategorized transactions in {account}. Stopping..')
quit()
guessed_categories = categorize_transactions(classifier, tfidf, le, uncategorized_transactions) # same length as uncategorized_transactions
(verified_categories, notes) = verify_categories_and_add_notes(dbconn, uncategorized_transactions, guessed_categories)
write_verified_categories_to_db(dbconn, account, uncategorized_transactions, verified_categories, notes)
def parseargs():
parser = argparse.ArgumentParser()
parser.add_argument('sqlitedbfile', help="SQLite database. Expected to have extention .db")
parser.add_argument('account', help="Name of the institution whose transactions should be categorized")
parser.add_argument('-t', '--trainingfile', default=None, help="Name of input csv file with Description,Category cols used for training")
args = parser.parse_args()
return(args.sqlitedbfile, args.account, args.trainingfile)
def verify_account(account):
valid_accounts = ['AmexBlueCash', 'ChaseSapphireReserve', 'TechCUChecking', 'TechCUSavings', 'BofAChecking', 'BofASavings']
if account in valid_accounts:
return True
else:
return False
# TODO: Add error checking to this function
def open_sqlite_db(sqlitedbfilename):
# TODO: Verify sqlitedbfilename exists?
dbconn = sqlite3.connect(sqlitedbfilename)
# TODO: Verify connect is successful?
return dbconn
def main():
(sqlitedbfilename, account, trainingfilename) = parseargs()
if verify_account(account) == False:
print(f'Unknown account: {account}. Stopping..')
quit()
dbconn = open_sqlite_db(sqlitedbfilename)
# TODO: Error checking of auto_categorize() below
# If trainingfilename is valid, auto_categorize() will use its content for training. Else it'll use already categorized transactions in account for training
auto_categorize(dbconn, account, trainingfilename)
dbconn.close()
if __name__ == "__main__":
main()