% !TEX program = xelatex
% !TEX pweaveOutputFormat = tex
% cSpell: disable
\documentclass{report}
\usepackage{arxiv}
\renewcommand{\arraystretch}{1.25}
\usepackage{multirow}
\usepackage{amssymb,mathtools}
\usepackage{booktabs}
\usepackage{verbatim}
\usepackage{hyperref}
\hypersetup
{ pdfauthor = {Gyan Sinha},
pdftitle={Loan Payment Deferments Due to Labor Market Shocks: A Case Study},
colorlinks=TRUE,
linkcolor=black,
citecolor=blue,
urlcolor=blue
}
%
\RequirePackage{fontspec}
\setmainfont{Source Sans Pro}
\usepackage{graphicx}
\graphicspath{{/home/gsinha/admin/docs/logos/}}
\setcounter{tocdepth}{3}
\setcounter{secnumdepth}{3}
<<imports, echo=False>>=
import warnings
warnings.filterwarnings("ignore")
import sys
import datetime
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
sns.set()
plt.rcParams.update({
"font.family": "Source Sans Pro",
"font.serif": ["Source Sans Pro"],
"font.sans-serif": ["Source Sans Pro"],
"font.size": 10,
})
import pathlib
import joblib
import collections
import numpy as np
import pandas as pd
import geopandas as gpd
import pytoml
from fredapi import Fred
import feather
import pymc3 as pm
import arviz as az
import lifelines
from lifelines import KaplanMeierFitter, NelsonAalenFitter
from scipy.special import expit
from analytics import utils
omap = {"LC": "I", "PR": "II", "ALL": None}
base_dir = "/home/gsinha/admin/db/dev/Python/projects/models/"
results_dir = {
"LC": base_dir + "defers/pymc3/" + "originator_" + omap["LC"] + "/results",
"PR": base_dir + "defers/pymc3/" + "originator_" + omap["PR"] + "/results",
"ALL": base_dir + "defers/pymc3/" + "results/"
}
import_dir = base_dir + "defers/"
sys.path.append(import_dir)
from common import *
data_dir = base_dir + "data/"
idx = pd.IndexSlice
ASOF_DATE = datetime.date(2020, 7, 12)
dep_var = "defer"
fname = data_dir + "claims.pkl"
with open(fname, "rb") as f:
claims_dict = joblib.load(f)
@
\title{Loan Payment Deferments Due to Labor Market Shocks: A Case Study}
\author{Gyan Sinha, Godolphin Capital Management, LLC%
\thanks{\scriptsize \emph{%Godolphin Capital Management, LLC,%
\href{mailto:gsinha@godolphincapital.com}{Email Gyan}. This report
has been prepared by Godolphin Capital Management, LLC
(``Godolphin'') and is provided for informational purposes only and
does not constitute an offer to sell or a solicitation to purchase
any security. The contents of this research report are not intended
to provide investment advice and under no circumstances does this
research report represent a recommendation to buy or sell a security.
The information contained herein reflects the opinions of Godolphin.
Such opinions are based on information received by Godolphin from
independent sources. While Godolphin believes that the information
provided to it by its sources is accurate, Godolphin has not independently
verified such information and does not vouch for its accuracy. Neither
the author, nor Godolphin has undertaken any responsibility to update
any portion of this research report in response to events which may
transpire subsequent to its original publication date. As such, there
can be no guarantee that the information contained herein continues
to be accurate or timely or that Godolphin continues to hold the views
contained herein. Godolphin is an investment adviser. Investments
made by Godolphin are made in the context of various factors including
investment parameters and restrictions. As such, there may not be
a direct correlation between the views expressed in this article and
Godolphin's trading on behalf of its clients.
<%print(f'Version:{datetime.datetime.now()}') %>}}
}
\date{\today}
%{\includegraphics[width=.25\textwidth]{GodolphinLogo.jpg}}
\begin{document}
\maketitle
\begin{abstract}
This report analyzes loan payment deferment as a result of
COVID-19 related shutdowns in the US. We focus on a
portfolio of unsecured consumer loans originated by 2 different
institutions. Our analysis focuses on a few key questions:
\begin{itemize}
\item what is the magnitude of COVID-related deferment?
\item are there systematic relationships between loan attributes and
payment deferment?
\item how are labor market trends related to the probability
of loan deferment?
\item does the sensitivity to labor market shocks vary by region?
\end{itemize}
The model and results presented provide a general framework that
can be applied not only to unsecured consumer loans but also more broadly
to other lending sectors. While the data are still
preliminary and the events they capture relatively recent, our conclusions are based on
a rigorous and transparent statistical analysis and presented with confidence
bounds that respect the intrinsic uncertainty of the data-generating process.
The chief technical contribution of this paper is the
use of a ``mixed model'' with random effects within a Bayesian estimation
framework, which has enabled us to answer some of the questions posed
above in ways that would not have been possible using more traditional approaches.
This study should be useful to investors and policy-makers alike, allowing
for data-driven estimates of potential deferment (and distress) rates on
loan portfolios.
\end{abstract}
<<out_dicts, echo=False>>=
out_dict = {}
for i in ["pooled", "hier"]:
out_dict[i] = read_results(i, None, ASOF_DATE, results_dir["ALL"])
@
<<datasets, echo=False>>=
hard_df = out_dict["hier"]["hard_df"]
ic_date = (
out_dict["hier"]["pipe"]["p_s_1"].named_steps.add_state_macro_vars.ic_long_df["edate"].max().date()
)
data_scaler = (
out_dict["hier"]["pipe"]["p_s_2"].named_steps["standardize"].numeric_transformer.named_steps["scaler"]
)
numeric_features = [
"fico", "original_balance", "dti", "stated_monthly_income", "age", "pct_ic"
]
data_scaler_dict = {
"mu" : dict(zip(numeric_features, data_scaler.mean_)),
"sd": dict(zip(numeric_features, data_scaler.scale_))
}
@
\section{Introduction}
Our reasons for undertaking this research project were driven by
practical considerations --- like many other investors in consumer and
mortgage lending, we happen to be long these loans. As such, it is
critical for us to evaluate future losses and prospective returns on
these loans and make assessments about their ``fundamental'' value.
We do this with the explicit recognition of the unprecedented nature
of the COVID shock and the fact that in many ways, we are sailing
through uncharted waters.
While our motivations were pragmatic, a natural question that
arises in this context is the applicability of the analysis
to a broader population of loans. While there is a natural
tendency to always seek out more and greater amounts of data, in
practice, investors in most cases hold narrow subsets of the overall population of
loans. While larger datasets may give us more precise estimates (up to
a point), the fact is that we want to make statements about OUR
portfolio, not a fictional universe which is not owned by anyone in
particular. The challenge, then, is to employ statistical methods that
allow us to extract information from ``small'' rather than ``big'' data and
turn it into useful insights for decision-making, while still
providing guidance about the broader population as well.
This is where the Bayesian methods we deploy in this report come in useful: they
treat inferential uncertainty as intrinsic to the problem and
can be used to provide insights in other contexts as well.
There are two parts to our project. First, we
describe the data set in some detail and present
stratifications by different loan attributes. We also
present the deferment rates within each stratum in order to get
intuition around the impact of loan attributes. We then
provide statistics around the labor markets in various states. We look
at the impact of the annual percentage change in initial claims,
starting March 14th (which we peg as the start of the COVID crisis for our
purposes) and through the week ending <%print(f'{ic_date.strftime("%B %-d, %Y")}')%>.
An open question that the modeling seeks to answer is the impact of the
claims variables on deferment rates and whether these can be leveraged into a
prediction framework going forward. A discussion of the statistical model
that relates the observed outcome (did the loan defer: Yes/No?) to the
various loan attributes is provided in the appendix. The framework employed is based on
Survival Analysis, using a hierarchical Bayes approach as
in~\cite{8328358dab6746d884ee538c687aa0dd}
and~\cite{doi:10.1198/004017005000000661}.
In the second part of our work, we develop a methodology for
forecasting the path of initial claims at the national and state
levels over the next few months. This analysis is unique in its own
way and leverages a brief descriptive note put out by Federal Reserve
Bank of NY researchers in a blog article. We use the claims forecast
as inputs into the predictions for deferment rates at the end of
second quarter of 2020, which is our forecast horizon. The model
and the estimation results and forecasts are provided in an
accompanying piece.
Before we dive into the details, there are three key technical aspects in
this report that are worth highlighting. \textbf{First, the use of Survival or
Hazard models} to estimate the marginal deferment probability, as a
function of weeks elapsed since the crisis, is \textbf{key} to sensible
projections of deferment\footnote{This is a benefit over and above
the intrinsic gain from using this framework in the context of
``censored'' data where most of the observations have not yet
experienced deferment}. As we show, these marginal
hazards have a very strong ``duration'' component which impacts
longer-term forecasts of the cumulative amount of deferment we expect
in the future.
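To make the link between the weekly hazards and cumulative deferment explicit
(this is the standard discrete-time survival identity rather than a model-specific
result), let $h_j$ denote the marginal hazard in week $j$ since March 14th. The
cumulative deferment probability through week $t$ is then
\begin{equation}
F(t) = 1 - \prod_{j \le t} \left(1 - h_j\right),
\end{equation}
so even modest weekly hazards compound over a multi-month horizon, and the shape
of the duration profile matters for longer-term forecasts of cumulative deferment.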
Second, we extend the survival model framework by incorporating \textbf{parameter
hierarchies (within a Bayesian framework)} that explicitly account for random
variation in the impact of variables across loan clusters. This allows for the
possibility of ``unobserved heterogeneity'' in the data by
explicitly modeling a cluster-specific random variable that interacts
with and modifies the hazards for loan clusters. This
is an important enhancement since (i) there may be
differences in the composition of the workforce across groups that
impact the way in which a given volume of claims affects deferment
rates, and (ii) the borrower base itself may differ across groups in both
observable and unobservable ways. We control for the observed
attributes explicitly but the hierarchical framework allows us to
model unobserved factors as well.
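As a concrete illustration of this structure, the chunk below is a minimal,
hypothetical sketch of a cluster-level random effect in PyMC3; the variable
names, priors, and the logistic link are placeholders rather than the
specification estimated in this report (which is detailed in the appendix).
<<hier_sketch, echo=True, evaluate=False>>=
# Hypothetical sketch: a cluster-level random intercept in a PyMC3
# hazard-style regression. Names, priors and the logistic link are
# placeholders, not the specification estimated in this report.
import numpy as np
import pymc3 as pm

X = np.random.randn(500, 3)                   # standardized covariates
cluster = np.random.randint(0, 8, size=500)   # e.g. originator-within-state index
y = np.random.binomial(1, 0.03, size=500)     # deferment indicator per loan-week

with pm.Model() as hier_model:
    beta = pm.Normal("beta", mu=0.0, sigma=1.0, shape=X.shape[1])
    sigma_u = pm.HalfNormal("sigma_u", sigma=1.0)
    u = pm.Normal("u", mu=0.0, sigma=sigma_u, shape=8)   # cluster random effects
    eta = pm.math.dot(X, beta) + u[cluster]
    pm.Bernoulli("defer", p=pm.math.invlogit(eta), observed=y)
    trace = pm.sample(1000, tune=1000, target_accept=0.9)
@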
Third, we develop a \textbf{statistical framework
to model ``decay'' rates for weekly claims} and the role that labor markets
play in determining deferment rates, building upon ideas first discussed
by researchers at the NY Fed. The projections from this framework serve as
inputs to our longer-term deferment forecasts and allows us to model the
impact of different economic scenarios in the future, an important tool to have
in the arsenal given the considerable uncertainties that still remain
regarding the future path of the economy.
\section{Data}
In Table~\ref{tbl:portfolio_summary}, we provide an overview of our
data sample. In all, we have <%print(f'{hard_df.shape[0]}')%> loans
in our data, in roughly a 50/50 split (by count) across the 2 institutions.
\begin{table}[ht]
\centering
\caption{Portfolio Summary}
\label{tbl:portfolio_summary}
\scalebox{0.75}{
<<portfolio_summary, echo=False, results="tex">>=
hard_df["defer_dollar"] = hard_df[dep_var] * hard_df["current_balance"]
def wavg(x):
return np.nansum(
x * hard_df.loc[x.index, "current_balance"], axis=0
)/np.nansum(hard_df.loc[x.index, "current_balance"])
aaa = hard_df.groupby(["originator", "grade"]).agg(
n=('loan_id', "count"),
original_balance=('original_balance', sum),
current_balance=('current_balance', sum),
wac=('original_rate', wavg),
age=('age', wavg),
fico=('fico', wavg),
term=('original_term', wavg),
dti=('dti', wavg),
income=('stated_monthly_income', wavg),
outcome=(dep_var, wavg),
).rename(columns={"outcome": dep_var})
bbb = hard_df.groupby(["originator"]).agg(
n=('loan_id', "count"),
original_balance=('original_balance', sum),
current_balance=('current_balance', sum),
wac=('original_rate', wavg),
age=('age', wavg),
fico=('fico', wavg),
term=('original_term', wavg),
dti=('dti', wavg),
income=('stated_monthly_income', wavg),
outcome=(dep_var, wavg),
).rename(columns={"outcome": dep_var})
bbb.index = pd.MultiIndex.from_tuples(
[(omap["LC"], 'ALL'), (omap["PR"], 'ALL')], names=['originator', 'grade']
)
aaa = pd.concat([aaa, bbb])
ccc = pd.concat(
[
pd.Series(hard_df["loan_id"].apply("count"), name="n"),
pd.Series(hard_df["original_balance"].sum(), name="original_balance"),
pd.Series(hard_df["current_balance"].sum(), name="current_balance"),
hard_df[
[
"original_rate", "age", "fico", "original_term", "dti",
"stated_monthly_income"
]
].apply(wavg).to_frame().T.rename(
columns={
"original_term": "term", "original_rate": "wac", "dti": "dti",
"stated_monthly_income": "income"
}
),
pd.Series(wavg(hard_df[dep_var]), name=dep_var)
], axis=1
)
ccc.index = [('ALL', 'ALL')]
ddd = pd.concat([aaa, ccc])
ddd["pct"] = ddd["current_balance"]/ddd.loc[pd.IndexSlice["ALL", "ALL"], "current_balance"]
ddd.index.names = ["Originator", "Grade"]
cfmt = "".join(["r"] * (ddd.shape[1] + 2))
header = [
"N", "Orig. Bal.", "Cur. Bal.", "WAC", "WALA", "FICO",
"WAOT", "DTI", "Income", "Defer", "Share",
]
tbl_fmt = {
"original_balance": utils.dollar,
"current_balance": utils.dollar,
"n": utils.integer, "fico": utils.number,
"term": utils.number, "age": utils.number,
"pct": utils.percent, dep_var: utils.percent,
"wac": utils.percent, "dti": utils.number,
"income": utils.dollar
}
print(
ddd.to_latex(
index=True, multirow=True,
header=header,
formatters=tbl_fmt,
column_format=cfmt,
multicolumn_format="r",
))
one_line = ddd.loc[pd.IndexSlice["ALL", :], :]
@
}
\end{table}
The aggregate original amount issued is \$%
<%print(f'{float(one_line.original_balance):,.2f}')%>,
with a weighted-average interest rate of
<%print(f'{utils.number(100*float(one_line.wac))}')%>\%,
a weighted-average FICO score of %
<%print(f'{utils.number(float(one_line.fico))}')%> and
is <%print(f'{utils.number(float(one_line.age))}')%>
months seasoned. The weighted-average original-term %
is <%print(f'{utils.number(float(one_line.term))}')%> months.
\textbf{Overall, the deferment rate on this portfolio is %
<%print(f'{100*float(one_line.defer):.2f}')%>\%.}
The portfolio statistics presented here are as of
<%print(f'{ASOF_DATE.strftime("%B %-d, %Y")}')%> which is more than
one month after the onset of the significant ``shelter-at-home'' orders
across the country and resulting economic disruptions. Since
most of these payment deferrals are for anywhere from 1 to
3 months, the deferment percentages can be viewed as the
cumulative share of loans deferred or delinquent since the start of the
COVID crisis. By way of comparison, we provide recent deferment figures for other
related sectors such as mortgages. Approximately 8.46\% of all mortgage loans
were in forbearance as of May 24th, 2020, which is roughly
<%print(f'{(pd.to_datetime(ASOF_DATE) - pd.to_datetime("2020-05-24")).days}') %>
days earlier than the cutoff date for our data set. In the Ginnie Mae
sector, 11.82\% of loans were in forbearance while the comparable
figure for conventional mortgages was 6.39\%.
In figure~\ref{fig:due_day_dist}, we present the frequency
distribution of the payment dates on the loans in our sample.
Since borrowers may have a tendency to hold off
on requesting a deferral until they are close to or past their
due day, and given the relatively short data window, this may
lead to biases. A relatively uniform distribution of payment
due dates would serve to assuage this concern. Thankfully,
this is exactly what we find in the data presented here,
eliminating this aspect of the data as a potential source
of bias.
\begin{figure}
\caption{Distribution of due dates}
\label{fig:due_day_dist}
\scalebox{1}{
<<due_day_dist, echo=False>>=
pos = []
for i in [omap["LC"], omap["PR"]]:
pos.append(get_due_day(i, ASOF_DATE))
pos_df = pd.concat(pos, ignore_index=True)
pos_df = pos_df[pos_df["loan_id"].isin(hard_df["loan_id"].to_list())]
fig, ax = plt.subplots(2, 1, figsize=(10, 5), sharey=True)
for i, v in enumerate([omap["LC"], omap["PR"]]):
df = pos_df[pos_df["originator"] == v]
ax[i].hist(df.pmt_day)
ax[i].set_xlabel("Due day")
ax[i].set_ylabel("Frequency")
ax[i].set_title(f"Originator: {v}")
plt.tight_layout()
@
}
\end{figure}
In Table~\ref{tbl:port_summary_purpose}, we provide a stratification of
the portfolio by loan purpose. More than two-thirds of the loans are used for
consolidating existing debt, mostly drawn on credit cards. The second
largest category is for purchases, while less than 10\% is used for
expenses such as education, weddings, etc. (``LifeCycle'').
\begin{table}[ht]
\centering
\caption{Portfolio summary, by purpose}
\label{tbl:port_summary_purpose}
\scalebox{0.7}{
<<port_summary_purpose, echo=False, results="tex">>=
purpose_tbl = summary_by_group(
["originator", "purpose"], dep_var, hard_df
)
purpose_tbl.index.names = ["Originator", "Purpose"]
cfmt = "".join(["r"] * (purpose_tbl.shape[1] + 2))
print(
purpose_tbl.to_latex(
index=True, multirow=True,
header=header,
formatters=tbl_fmt,
column_format=cfmt,
multicolumn_format="r",
))
@
}
\end{table}
In Table~\ref{tbl:port_summary_emp_status}, a stratification across
the borrower's employment status is provided. The ``Self-employed''
and ``Other'' categories generally comprise anywhere from 10\% to 15\%
of the portfolio\footnote{In the case of Originator I, the employment
category is really a dummy variable for the presence or absence
of employment history --- if there is information on this count,
this field is coded as ``Employed'' otherwise it is coded as
``Other''}.
\begin{table}[ht]
\centering
\caption{Portfolio summary, by employment status}
\label{tbl:port_summary_emp_status}
\scalebox{0.70}{
<<port_summary_emp_status, echo=False, results="tex">>=
emp_tbl = summary_by_group(
["originator", "employment_status"], dep_var, hard_df
)
emp_tbl = emp_tbl.fillna(0)
emp_tbl.index.names = ["Originator", "Employment"]
cfmt = "".join(["r"] * (emp_tbl.shape[1] + 2))
print(
emp_tbl.to_latex(
index=True, multirow=True,
header=header,
formatters=tbl_fmt,
column_format=cfmt,
multicolumn_format="r",
))
@
}
\end{table}
In Table~\ref{tbl:port_summary_homeowner}, the portfolio is stratified
across housing tenure. Across the 2 institutions, roughly a quarter to
two-thirds of the borrowing is by renters.
\begin{table}[ht]
\centering
\caption{Portfolio summary, by homeownership}
\label{tbl:port_summary_homeowner}
\scalebox{0.75}{
<<port_summary_homeowner, echo=False, results="tex">>=
homeowner_tbl = summary_by_group(
["originator", "home_ownership"], dep_var, hard_df
)
homeowner_tbl.index.names = ["Originator", "Housing"]
cfmt = "".join(["r"] * (homeowner_tbl.shape[1] + 2))
print(
homeowner_tbl.to_latex(
index=True, multirow=True,
header=header,
formatters=tbl_fmt,
column_format=cfmt,
multicolumn_format="r",
))
@
}
\end{table}
Finally, in Table~\ref{tbl:port_summary_term}, we stratify by
loan term. Across the 2 institutions, roughly 50\% - 70\% of
the loans are for 3-year amortization terms, with the remainder
for a 5-year term.
\begin{table}[ht]
\centering
\caption{Portfolio summary, by term}
\label{tbl:port_summary_term}
\scalebox{0.75}{
<<port_summary_term, echo=False, results="tex">>=
term_tbl = summary_by_group(
["originator", "original_term"], dep_var, hard_df
)
term_tbl.index.names = ["Originator", "Term"]
cfmt = "".join(["r"] * (term_tbl.shape[1] + 2))
print(
term_tbl.to_latex(
index=True, multirow=True,
header=header,
formatters=tbl_fmt,
column_format=cfmt,
multicolumn_format="r",
))
@
}
\end{table}
An important question, with possible implications for the
prospective cure rates of deferred loans, is how they compare
with the subset of loans that were already
delinquent before the crisis. This comparison is presented in
Table~\ref{tbl:pre_covid_dq_profile}.
\begin{table}[ht]
\centering
\caption{DQ \emph{vs} Deferment profile}
\label{tbl:pre_covid_dq_profile}
\scalebox{0.70}{
<<pre_covid_dq_profile, echo=False, results="tex">>=
dq_tbl = summary_by_group(
["originator", "loanstatus"], dep_var, hard_df
)
dq_tbl.index.names = ["Originator", "DQ Status"]
cfmt = "".join(["r"] * (dq_tbl.shape[1] + 2))
print(
dq_tbl.iloc[:-1].to_latex(
index=True, multirow=True,
header=header,
formatters=tbl_fmt,
column_format=cfmt,
multicolumn_format="r",
))
@
}
\end{table}
The deferment subset (labeled ``Covid'') has better credit quality, as measured
by FICO scores, than both the ``Current'' and the delinquent sub-populations
for Originator I. This may imply that the cure rate on the deferred
sub-population will be better than has been the experience on the
delinquent sub-population. In the case of Originator II, the deferment and
delinquent sets have roughly the same FICO score which is lower
than that on the set of loans that are ``Current''.
\section{Employment}
The economic disruption caused by COVID is in many ways unusual
in that it strikes at the Consumption component of overall GDP.
As such, the disruption is much broader than would be the case,
say, for an investment-led recession caused by a contraction in
an isolated segment of the economy.
In some ways, this resembles the 2008 recession, which was caused by
massive asset writedowns in the banking sector (on a global basis)
leading to an economy-wide credit crunch. To the extent that the current shock strikes
at almost two-thirds of overall economic output, the disruption is naturally
even larger, as has become obvious in the labor market figures released over
the last month. Labor markets are likely to be the key to explaining deferment, and
both the full magnitude of job losses and how quickly they are reversed
are going to be the drivers of ultimate loan performance.
The trend in the year-over-year percentage change in weekly initial claims,
together with its distribution, is presented in Figure~\ref{fig:claims_pct_trend}.
The underlying data are the individual state/week observations on claims,
merged with the appropriate loan histories starting March 14th.
When we first produced this figure, we thought we had made a mistake, but the
percentage changes depicted here are correct: on a year-over-year basis,
initial claims really did increase by approximately 8,000\% at their peak
in early April! The annual percentage changes in claims are standardized
by subtracting the mean and dividing by the standard deviation.
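For readers who wish to reproduce the claims transformation, the chunk below is
a minimal sketch (not evaluated here): it uses the national ICSA series from
FRED purely for illustration, whereas the report merges state-level series
into the loan histories, and the API key is a placeholder.
<<claims_yoy_sketch, echo=True, evaluate=False>>=
# Hypothetical sketch: year-over-year percent change in weekly initial
# claims, standardized as described above. Uses the national ICSA series
# for illustration; the report uses state-level series merged with loans.
from fredapi import Fred

fred = Fred(api_key="YOUR_FRED_API_KEY")              # placeholder key
ic = fred.get_series("ICSA")                          # weekly initial claims (SA)
pct_ic = ic.pct_change(periods=52)                    # year-over-year percent change
pct_ic_std = (pct_ic - pct_ic.mean()) / pct_ic.std()  # standardized covariate
@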
\begin{figure}[ht]
\caption{Weekly Claims (Year-over-Year pct. change): trend and distribution}
\label{fig:claims_pct_trend}
\scalebox{1}{
<<claims_pct_trend, echo=False>>=
s_3_df = out_dict["hier"]["s_3_df"]
fig, ax = plt.subplots(2, 1, figsize=(8, 6.4))
sns.boxplot(s_3_df.sdate.dt.date, s_3_df.pct_ic, ax=ax[0])
sns.distplot(s_3_df.pct_ic, ax=ax[1], kde=False)
ax[0].set_xlabel("Week ending: ");
ax[0].set_ylabel("Year-over-Year pct. change")
ax[1].set_xlabel("Year-over-Year pct. change")
ax[0].yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
ax[1].xaxis.set_major_formatter(mtick.PercentFormatter(1.0))
ax[0].set_xticklabels(ax[0].get_xticklabels(), rotation=45, ha='right')
plt.tight_layout()
@
}
\end{figure}
\subsection{Claims}
In Figures~\ref{fig:lc_defer_hazard_by_state} and
~\ref{fig:pr_defer_hazard_by_state}, we depict the
relationship between payment deferment and the claims measure.
The solid trend line is a robust fit for the scatter plot
depicted here. The patterns appear to show a relatively weak
relationship (and possibly opposite effects across the 2 originators)
--- in both cases, there is considerable variation across states.
<<defer_hazard_by_state, echo=False>>=
def plot_defer_hazard_by_state(originator):
''' plots deferment hazards by state and week '''
zzz = out_dict["hier"]["s_3_df"]
zzz = zzz[zzz["originator"] == originator].copy()
a_df = zzz.groupby(["state"]).agg(
n=("loan_id", "count"), k=(dep_var, np.sum), outcome=(dep_var, np.mean),
pct_ic=("pct_ic", np.mean)
).reset_index().rename(columns={"outcome": dep_var})
g = sns.FacetGrid(
data=a_df.reset_index(),
)
g.map(sns.regplot, "pct_ic", dep_var, ci=True)
g.ax.xaxis.set_major_formatter(mtick.PercentFormatter(1.0))
# add annotations one by one with a loop
for line in range(0, a_df.shape[0]):
g.ax.text(
a_df["pct_ic"][line]+0.001, a_df[dep_var][line], a_df["state"][line],
horizontalalignment='left', size='medium', color='red',
weight='semibold', alpha=0.20
)
g.ax.figure.set_size_inches(10, 5)
g.ax.set_xlabel("Year-over-Year pct. change")
g.ax.set_ylabel("Deferment hazard")
return g
@
\begin{figure}[htb!]
\caption{Originator I: deferment hazard}
\label{fig:lc_defer_hazard_by_state}
\scalebox{1}{
<<lc_defer_hazard_by_state, echo=False>>=
g = plot_defer_hazard_by_state(omap["LC"])
sns.despine(left=True)
@
}
\end{figure}
\begin{figure}[htb!]
\caption{Originator II: deferment hazard}
\label{fig:pr_defer_hazard_by_state}
\scalebox{1}{
<<pr_defer_hazard_by_state, echo=False>>=
g = plot_defer_hazard_by_state(omap["PR"])
sns.despine(left=True)
@
}
\end{figure}
The modeling exercise will seek to examine how much of the difference in
slopes and the variability across states can be explained by individual
loan attributes and unobservable effects modeled as ``random effects''.
\section{Model}
The modeling framework used in this report draws upon statistical
tools used in the analysis of events with a ``time-until'' component
to them. In our case, the time-until, or ``lifetime'' we are
interested in predicting is the time until a borrower asks the
servicer for a deferment or goes delinquent. Time in this context is measured from
an epoch start date of March 14th, 2020, which we take to be the
start of the COVID-19 crisis for our purposes; this epoch is the
same across all loans.
We incorporate state-based differences in the distribution
of hazard rates that manifest themselves in both the pattern of duration dependence
and the impact that changes in state-level initial claims have on hazards.
As is detailed in the appendix, duration dependence is captured using a set of
interval-specific intercepts. All numerical
covariates are standardized by subtracting the mean and dividing by the standard
deviation. Categorical features are encoded using ``dummy variables'' where the
first category is treated as the baseline or reference category.
The model is calibrated to a ``training'' data set that consists of a
random sample of 80\% of the loans from the full dataset, stratified on state.
The parameter estimates derived from the training set are then used to generate
predictions for the remaining loans that constitute the ``test'' sample.
Further details are provided in the appendix.
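A minimal sketch of this preprocessing is shown below (not evaluated here; it
assumes scikit-learn, on which the pipeline objects used earlier are built, and
is not the exact pipeline employed in the report).
<<preprocess_sketch, echo=True, evaluate=False>>=
# Hypothetical sketch of the preprocessing described above: an 80/20
# train/test split stratified on state, standardization of numeric
# covariates, and baseline-dropped dummies for categorical features.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

train_df, test_df = train_test_split(
    hard_df, train_size=0.80, stratify=hard_df["state"], random_state=42
)
scaler = StandardScaler().fit(train_df[numeric_features])
X_train = scaler.transform(train_df[numeric_features])
X_test = scaler.transform(test_df[numeric_features])        # reuse training moments
dummies = pd.get_dummies(train_df["home_ownership"], drop_first=True)  # first level = baseline
@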
\section{Estimates}
We now turn to a discussion of the results. We use the \href{https://docs.pymc.io/}{PyMC3} Python
package to estimate the model~\cite{pymc3}. The parameter estimates are presented for each
originator in turn.
\subsection{Hazards}
In this section, we present depictions of the marginal deferment
probabilities (using
\href{https://en.wikipedia.org/wiki/Nelson%E2%80%93Aalen_estimator}{Nelson-Aalen hazards}
),
to set the stage for what we should expect our fully-specified hazards to look like.
The hazard estimates depicted in Figure~\ref{fig:nelson_aalen} were computed using the
\href{https://lifelines.readthedocs.io/en/latest/#}{Lifelines}
Python package~\cite{lifelines}.
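For reference (this is the textbook definition rather than anything specific to
our data), the Nelson-Aalen estimator of the cumulative hazard is
\begin{equation}
\hat{H}(t) = \sum_{t_i \le t} \frac{d_i}{n_i},
\end{equation}
where $d_i$ is the number of deferments observed at duration $t_i$ and $n_i$ is
the number of loans still at risk just before $t_i$. The weekly hazards plotted
in Figure~\ref{fig:nelson_aalen} are kernel-smoothed increments of $\hat{H}(t)$,
with the \texttt{bandwidth} argument controlling the degree of smoothing.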
\begin{figure}[ht]
\centering
\caption{Hazards}
\label{fig:nelson_aalen}
\scalebox{1}{
<<nelson_aalen, echo=False>>=
T = hard_df.dur
E = hard_df[dep_var]
bandwidth = 1
naf = NelsonAalenFitter()
lc = hard_df["originator"].isin([omap["LC"]])
naf.fit(T[lc],event_observed=E[lc], label="Originator I")
ax = naf.plot_hazard(bandwidth=bandwidth, figsize=(10, 5))
naf.fit(T[~lc], event_observed=E[~lc], label="Originator II")
naf.plot_hazard(ax=ax, bandwidth=bandwidth)
ax.set_xlabel("Weeks since March 14th, 2020")
ax.set_ylabel("Weekly hazard")
_ = plt.xlim(0, hard_df.dur.max() + 1)
@
}
\end{figure}
The hazards rose sharply in the first weeks after the start of the crisis,
to between 2\% and 2.5\%, but have declined since then. They reveal a difference
in operating protocols where it appears that in the case of Originator I,
the initial flurry of deferment requests was processed in a batch and then approved
all at once in the second week. The overall pattern of events and censoring
is presented in Table~\ref{tbl:survival_table_all}. The ``observed'' column
indicates the count of loans where deferment was observed during
the interval specified in the ``event-at'' column on the left. Since the
study inception date is the same for all loans, a large fraction
of the data show up as ``censored''
in the last interval and are reported under the censored column. The other entries
in this column pertain to loans that either prepaid or were charged-off during
the period starting March 14th, 2020 and the cutoff date. The removed
column represents the portion of the ``at-risk'' population that is no longer
at risk since the loan was either censored or experienced an event. The
at-risk figure for the previous interval is decremented by the removed column
for that interval to give a new at-risk number.
\begin{table}[ht]
\centering
\caption{Survival table: all loans}
\label{tbl:survival_table_all}
\scalebox{1}{
<<survival_table_all, echo=False, results="tex">>=
lt_df = lifelines.utils.survival_table_from_events(
hard_df.dur, hard_df[dep_var], collapse=True
)
print(lt_df.to_latex(column_format='rrrrr'))
@
}
\end{table}
Nelson-Aalen hazards for the top 12 states by loan count are
presented in Figure~\ref{fig:na_top_12_states}.
\begin{figure}[htb!]
\centering
\caption{Top 12 states: Nelson-Aalen hazards}
\label{fig:na_top_12_states}
<<na_top_12_states, echo=False>>=
hard_df = out_dict["hier"]["hard_df"]
top_states = hard_df.groupby("state").agg(
n=("loan_id", "count")
).sort_values(by=["n"], ascending=False).iloc[:12].index.to_list()
fig, ax = plt.subplots(4, 3, figsize=(10, 10), sharex=True, sharey=True)
naf = {}
for u, v in zip(top_states, ax.flatten()):
naf[u] = fit_na(u, hard_df, "dur", dep_var)
naf[u].plot_hazard(bandwidth=1, ax=v)
v.set_xlabel("Weeks")
v.set_ylabel("Hazard")
plt.tight_layout()
@
\end{figure}
\subsection{Pooled}
<<pooled_results, echo=False>>=
pooled_result = make_az_data("pooled", out_dict)
pooled_df = out_dict["pooled"]["test"]
pooled_ppc, pooled_out_df = predict(
None, pooled_df, dep_var, out_dict["pooled"], ic_long_df=None,
n_samples=4000, verbose=False
)
@
We first provide a summary of the estimates for the pooled model, in
Table~\ref{tbl:pooled_estimates}. The pooled model treats all observations as
being derived from the same underlying distribution, ignoring the impact of differences
driven by loan clusters identified by either region or originator. The estimates
serve to provide a baseline against which the results of the hierarchical model
can be compared and contrasted.
\begin{table}
\caption{Pooled model: population means}
\label{tbl:pooled_estimates}
\scalebox{1}{
<<pooled_estimates, echo=False, results="tex">>=
pooled_b_out = pooled_result.b_out
print(
pooled_b_out[["mean", "sd", "hdi_3%", "hdi_97%", "r_hat"]].to_latex(
column_format="rrrrrr"
)
)
@
}
\end{table}
\subsubsection{Predictive distribution}
We examine the posterior predictive distribution
of the probability of the binary deferment outcome variable versus the mean of the
observed outcome in the hold-out \textbf{test} data set. This is presented in Figure~\ref{fig:pooled_ppc}
where the vertical line represents the observed deferment percent while the barchart shows
the distribution of posterior predicted probabilities in the sample, together with the
95\% Highest Posterior Density (HPD) interval. Note that these are hazards and not
unconditional probabilities.
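A generic version of this check looks as follows (a hypothetical sketch, not the
\texttt{predict} helper used to produce the figure; \texttt{model}, \texttt{trace},
and \texttt{test\_df} stand in for the fitted pooled model, its posterior trace,
and the hold-out frame).
<<ppc_sketch, echo=True, evaluate=False>>=
# Hypothetical sketch of a posterior predictive check: simulate outcomes
# from the fitted model and compare their mean to the observed deferment
# rate in the hold-out data. `model`, `trace` and `test_df` are placeholders.
import pymc3 as pm

with model:
    ppc = pm.sample_posterior_predictive(trace, samples=1000)
# the returned dictionary is keyed by the name of the observed variable
print(ppc["defer"].mean(), test_df[dep_var].mean())
@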
\begin{figure}[htb!]
\caption{Pooled model: posterior predictive distribution}
\label{fig:pooled_ppc}
\scalebox{1}{
<<pooled_ppc, echo=False>>=
pooled_train_ppc, pooled_train_df = predict(
None, out_dict["hier"]["train"], dep_var,
out_dict["hier"], ic_long_df=None,
n_samples=1000, verbose=False
)
fig = make_ppc_plot(pooled_train_ppc, pooled_train_df, dep_var)
fig.show()
@
}
\end{figure}
The mean of the distribution of predicted hazards matches the average
deferment rate in the sample quite well. We have also examined other
standard metrics for measuring convergence for the MCMC sampler that support
the validity of the sampling results presented here but have withheld
them in the interest of brevity.
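For completeness, such diagnostics can be produced with the ArviZ package
imported above; the chunk below is a sketch only (not evaluated), with
\texttt{trace} standing in for the fitted trace.
<<diag_sketch, echo=True, evaluate=False>>=
# Hypothetical sketch: standard MCMC convergence diagnostics with ArviZ.
# `trace` stands in for the fitted PyMC3 trace.
import arviz as az

summ = az.summary(trace)      # includes r_hat and effective sample sizes
az.plot_trace(trace)          # visual check of mixing across chains
@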
\subsubsection{Predicted hazard}
We depict the fitted hazard and its 90\% predictive interval
as a function of duration $t$ in Figure~\ref{fig:pooled_fitted_hazard}
for loans in the test dataset. The observed hazards are also plotted.
Again, when viewed in terms of empirical versus fitted hazards, the
model seems to capture the hold-out data quite well, with most of
the observations within the predictive interval and the
general pattern of duration dependence captured with the interval-specific
intercepts.
\begin{figure}
\caption{Pooled model: fitted hazard}
\label{fig:pooled_fitted_hazard}
<<pooled_fitted_hazard, echo=False>>=
pctile = np.percentile(pooled_ppc, q=[5, 95], axis=0).T
zzz = pd.concat(
[
pooled_out_df, pd.DataFrame(
np.hstack(
(
pooled_ppc.mean(axis=0).reshape(-1, 1), pooled_ppc.std(axis=0).reshape(-1, 1),
pctile
)
),
columns=["ymean", "ystd", "y5", "y95"], index=pooled_out_df.index
)
], axis=1
)
zzz_df = zzz.groupby("start").agg(
y=(dep_var, np.mean), ymean=("ymean", np.mean), ystd=("ystd", np.mean),
y5=("y5", np.mean), y95=("y95", np.mean)
).reset_index()
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
ax.plot(zzz_df["start"], zzz_df["ymean"], label="Predicted")
ax.scatter(zzz_df["start"], zzz_df["y"], label="Actual")
ax.fill_between(
zzz_df["start"], zzz_df["y5"], zzz_df["y95"], color="red", alpha=0.05, label="95% Interval"
)
ax.set(xlabel='Week', ylabel='Hazard')
_ = ax.legend(loc="upper right")
@
\end{figure}
\subsubsection{Feature impact}
The Average Marginal Effect (AME) of the features in the model is presented in
Figure~\ref{fig:pooled_avg_marginal_effect}. The estimates are converted to
basis points and represent the impact of a 1-unit change in the covariate
on the weekly hazard rate. Since all our numerical covariates are standardized,
the AME represents the impact of a 1 standard-deviation change in the variable.
In the case of categorical covariates, the AME measures the probability impact of a
specific level versus the reference or ``baseline'' category for that variable.
The AME is calculated as follows:
\begin{equation}
\Delta P(y \mid X\beta) = \beta \left[ \exp(X\beta) \right]
\end{equation}
The term in square brackets is an $N$-element vector, where $N$ is the number of rows
in the dataset. The $\beta$ coefficient is multiplied by the
average of the term in square brackets to derive the AME
for the covariate. For reference, the hazard is
specified as $P(y \mid X\beta) = \exp(X\beta)$.
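In code, this calculation reduces to the following minimal sketch (hypothetical
names: \texttt{beta\_j} is a posterior-mean coefficient and \texttt{eta} the
vector of linear predictors $X\beta$ over the sample; the factor of $10^{4}$
converts the result to basis points).
<<ame_sketch, echo=True, evaluate=False>>=
# Hypothetical sketch of the AME calculation described above: the
# coefficient times the sample average of exp(X*beta), in basis points.
import numpy as np

def average_marginal_effect(beta_j, eta):
    return 1e4 * beta_j * np.mean(np.exp(eta))
@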
\begin{figure}[htb!]
\caption{Pooled Model: Average Marginal Effects}
\label{fig:pooled_avg_marginal_effect}
<<pooled_avg_marginal_effect, echo=False>>=
plot_ame(out_dict, "pooled", pooled_ppc)
@
\end{figure}
\subsection{Hierarchical}
We now present the results of the multi-level model, where loans
are grouped into nested clusters of originators within states.
Treating all loans without regard to the state they belong to
papers over the regional nature of this crisis. In addition, modeling
all loans within a state as the same without regard to the