-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathJY_Utils.py
337 lines (230 loc) · 12.5 KB
/
JY_Utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import plotly.express as px
import plotly
import plotly.graph_objs as go
import networkx as nx
import math
# Helper function to map the day-of-month (1st-31st) to a circle with x/y coordinates
def map_values_to_circle(df):
min_val = 1 #df.min()
max_val = 31 #df.max()
# Number of unique values in the range
value_range = max_val - min_val
# Normalize values to range [0, 2*pi]
normalized_df = (df - min_val) / value_range * 2 * np.pi
# Calculate x and y coordinates
x_coords = np.cos(normalized_df)
y_coords = np.sin(normalized_df)
return x_coords, y_coords
# Helper function to assess wether 2 points on a circle are within the range of a fixed value epsilon
# This is used in the time-proximity critera for the graph algorithim.
def are_points_within_epsilon(x1, y1, x2, y2, epsilon):
# Calculate the Euclidean distance
distance = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
# Check if the distance is within epsilon
return distance <= epsilon
# Helper function to filter the dataframe according to various criteria
def filter_transactions(df,
public_token,
min_exp_threshold,
max_exp_threshold,
min_inc_threshold,
max_inc_threshold,
start_date,
end_date):
# applicant filters
if (type(public_token) is str) or (type(public_token) is np.str_):
df_filtered = df[df['public_token']==public_token]
elif type(public_token) is list:
df_filtered = df[df['public_token'].isin(public_token)]
# date filters
df_filtered = df_filtered[(df_filtered['date_posted']>=start_date) & (df_filtered['date_posted']<=end_date)]
# income and expense filters
df_filtered = df_filtered[((df_filtered['amount']<=-min_exp_threshold) & (df_filtered['amount']>=-max_exp_threshold)) | ((df_filtered['amount']>=min_inc_threshold) & (df_filtered['amount']<=max_inc_threshold))]
return df_filtered
# Helper Function to create the Description-match Matrix 'D'
def create_desc_match_matrix(series, match_method='exact'):
# Get the length of the series
n = len(series)
# Create an empty n x n matrix
M = np.zeros((n, n))
if match_method=='exact':
# Populate the matrix based on matching values
for i in range(n):
for j in range(n):
if (series.iloc[i] == series.iloc[j]):
M[i, j] = 1
elif match_method=='any':
M = np.ones((n, n))
return M
# Helper Function to create the Amount-match Matrix 'A'
def create_amt_match_matrix(series, dollar_threshold=0):
# Get the length of the series
n = len(series)
# Create an empty n x n matrix
M = np.zeros((n, n))
for i in range(n):
for j in range(n):
upper_bound = series.iloc[i] + dollar_threshold
lower_bound = series.iloc[i] - dollar_threshold
if ((upper_bound > series.iloc[j]) and (lower_bound < series.iloc[j])):
M[i, j] = 1
return M
# Helper Function to create the Time-match Matrix 'T'
def create_time_match_matrix(df, month_range=2, day_range=10):
# Get the length of the series
n = len(df)
# Create an empty n x n matrix
M = np.zeros((n, n))
# Calculate epsilon parameter to test if transaction days are nearby
C = 2*math.pi*1 #circumference of circle with radius 1, corresponding to xcoord and ycoord mapping
arc = day_range/31 # arc length
epsilon = (C/(math.pi))*(np.sin(arc*math.pi/C))
for i in range(n):
for j in range(n):
upper_bound = df['months_since_start'].iloc[i] + month_range
lower_bound = df['months_since_start'].iloc[i] - month_range
# first test if the point is within the month range
if ((upper_bound > df['months_since_start'].iloc[j]) and (lower_bound < df['months_since_start'].iloc[j])):
# if so, test wether it is within the day range
x_i = df['xcoord'].iloc[i]
y_i = df['ycoord'].iloc[i]
x_j = df['xcoord'].iloc[j]
y_j = df['ycoord'].iloc[j]
if are_points_within_epsilon(x_i, y_i, x_j, y_j, epsilon):
M[i, j] = 1
return M
def recurring_streams(df,
min_recurrences=3,
methods={'desc_match_params': ['any'],
'amt_match_params': [0],
'date_match_params': [2]}):
"""
The main algorithim to identify and group related recurring payment streams.
Args:
df: The pre-processed dataframe of transactions.
This should include all the necessary feature engineering
min_recurrences: The minimum number of recurring payments necessary to be considered part of a stream.
For example, if min_recurrences=4 then any streams of <4 transactions will be grouped
together in the 'all other transactions' category with value of -1.
methods: A dictionary of lists. Each key refers to the t
methods['desc_match_params'][0] can be 'exact' if only identical descriptions will match or 'any' if description is ignored.
methods['amt_match_params'][0] represents the dollar threshold needed for transactions to match. For example if 20 is used,
then a $200 transaction can match a $180 through $220 transaction. Defaut is zero, meaning only exact $ amount matches will match.
methods['time_match_params'][0] is the month range.
Return:
payment_stream_group: A pandas series of integers identifying which payment stream the transaction belongs to.
The series is the same length as the input dataframe.
Unassigned transactions will be grouped together with a value of -1.
"""
desc_match_method = methods['desc_match_params'][0]
dollar_threshold = methods['amt_match_params'][0]
month_range = methods['time_match_params'][0]
day_range = methods['time_match_params'][1]
### Step 1: Create Pairwise Matricies:
# D is a matrix of description-matching
# D_ij = 1 if the description of transaction i is an exact match of transaction j's description. D_ij=0 otherwise.
D = create_desc_match_matrix(pd.Series(df['description']),
match_method=desc_match_method
).astype(bool)
# A is a matrix of amount-matching
# A_ij = 1 if the transaction amount i is a near-match of transaction j's amount. A_ij=0 otherwise.
A = create_amt_match_matrix(pd.Series(df['amount']),
dollar_threshold=dollar_threshold
).astype(bool)
# T is a matrix of time-matching
# T_ij = 1 if the transaction amount i is a near-match of transaction j's amount. A_ij=0 otherwise.
T = create_time_match_matrix(df,
month_range=month_range,
day_range=day_range
).astype(bool)
### Step 2: Use the pairwise matricies above to build the adjacency matrix according to the method specified.
adj_matrix = (D & A) & T # Matrix where all matching conditions are met
np.fill_diagonal(adj_matrix, 0) # No self-connected nodes
### Step 3: Build the Graph and unpack the connected components
# build graph from adj_matrix
G = nx.from_numpy_array(adj_matrix)
# Collect recurring payment streams from connected components
recurring_streams = [c for c in sorted(nx.connected_components(G), key=len, reverse=True) if len(c)>=min_recurrences]
# All communities will start out as unassigned and given an index of -1
payment_stream_group = [-1]*nx.number_of_nodes(G)
# Go through each item in each set and assign it to the connected component
set_ind = 0
for set in recurring_streams:
for item in set:
# If the item was previously unassigned, its assigned to the set its found in
if payment_stream_group[item] == -1:
payment_stream_group[item] = set_ind
# Note: The else condition below should never trigger, since connected components should be mutually exclusive
# If the item was already in another set, its assigned to -2, which contains multi-community nodes
else:
payment_stream_group[item] = -2
set_ind += 1
return payment_stream_group
# The main algorithim used to identify late and missed payments
def label_missed_payments(df, groupings, days_late):
return_df = df.copy()
# initialize the return columns
return_df['nearest'] = pd.to_datetime('1999-01-01')
return_df['days_late'] = 1000
return_df['missed_pmt'] = False
for group in groupings.unique():
filter = (groupings == group)
df_temp = return_df[filter]
indices_to_update = groupings == group
#values_to_set = grp[grp.str[-36:] == applicant]
return_df.loc[indices_to_update, 'nearest'] = nearest_dates(pmt_dates=df_temp['date_posted'],
target_dates=get_target_dates(df_temp['date_posted'])
)#.values
return_df['days_late'] = return_df['date_posted'] - return_df['nearest']
return_df['late_payment'] = return_df['days_late'].dt.days>days_late
# test for missed payments
missed_pmt_df = pd.DataFrame()
for group in groupings.unique():
filter = (groupings == group)
df_temp = return_df[filter]
for expected_pmt in get_target_dates(df_temp['date_posted']):
if expected_pmt not in [i for i in df_temp['nearest']]:
# Define the fields to copy from the first row
fields_to_copy = ['public_token', 'group']
first_row = df_temp.iloc[0]
new_row = {field: first_row[field] for field in fields_to_copy}
#new_row = {field: first_row[field] if field in fields_to_copy else np.nan for field in df_temp.columns}
new_row_df = pd.DataFrame([new_row])
new_row_df['date_posted'] = expected_pmt
new_row_df['nearest'] = expected_pmt
new_row_df['missed_pmt'] = True
# Append the new row to the DataFrame
missed_pmt_df = pd.concat([missed_pmt_df, new_row_df], ignore_index=True)
return_df = pd.concat([return_df, missed_pmt_df], ignore_index=True)
return return_df
def get_target_dates(dates):
# Find the earliest and latest month in the date range
start_date = dates.min() - pd.DateOffset(months=1)
end_date = dates.max() + pd.DateOffset(months=1)
# Generate all months in the range
all_months = pd.date_range(start=start_date, end=end_date, freq='MS')
# Create a series with the median_day for each month
median_day = np.median(dates.dt.day)
median_day_series = all_months + pd.DateOffset(days=median_day - 1)
# Ensure that the day does not exceed the number of days in the month
#median_day_series = median_day_series[median_day_series.dt.day == median_day]
return pd.Series(median_day_series)
def nearest_dates(pmt_dates, target_dates):
# Convert dates to ordinal numbers for easier comparison
pmt_ordinals = pmt_dates.map(pd.Timestamp.toordinal).values
target_ordinals = target_dates.map(pd.Timestamp.toordinal).values
# Find the nearest date for each date in pmt_dates
nearest_dates = []
for pmt_date in pmt_ordinals:
# Compute the absolute differences with target_dates
differences = np.abs(target_ordinals - pmt_date)
# Find the index of the minimum difference
closest_index = differences.argmin()
# Append the corresponding date from target_dates
nearest_dates.append(target_dates.iloc[closest_index])
return nearest_dates