-
Notifications
You must be signed in to change notification settings - Fork 0
/
PMCC_FinalProject_Main_Code.R
796 lines (657 loc) · 37.6 KB
/
PMCC_FinalProject_Main_Code.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
# STAT 172
# Final Project Codespace
# Team Members: Joel Aguirre, Madelyn Osten, Sam Solheim
# This is the main coding environment that will be utilized during the course
# of this project. Whenever the group commits changes to our code, any changes
# made will be implemented here for our final results.
rm(list=ls())
library(rpart) # for fitting classification trees
library(rpart.plot) # for plotting trees
library(ggplot2) # for professional exploratory graphics
library(pROC) # for ROC curves
library(RColorBrewer)
library(randomForest)
library(glmnet) # for fitting lasso, ridge regressions (GLMs)
# READING IN OUR DATA --------------
cats <- read.csv("aac_shelter_cat_outcome_eng.csv", stringsAsFactors = TRUE)
# Read in the data set and then look into the summary and 'strength' of
# variables in the data set. Make sure that variables appear to have been written
# in correctly
summary(cats)
str(cats)
# CREATING BINARY Y VARIABLE --------------
table(cats$outcome_type)
# Adoption Died Disposal Euthanasia
# 3 12732 403 16 1452
# Missing Return to Owner Rto-Adopt Transfer
# 28 1431 33 13323
# For our research, there are three separate cases that would
# go into our positive outcome of a cat going home with someone,
# and these are "Adoption", "Return to Owner", and
# "Rto-Adopt". Additionally, the 3 observations for "" will be put into the
# "Transfer" outcome, so when we create the binary variable this category can simply
# go into the "No" category.
# Convert Y variable from factor to character var
cats$outcome_type <- as.character(cats$outcome_type)
# modify "" to be "Transfer"
cats$outcome_type[cats$outcome_type == ""] <- "Transfer"
table(cats$outcome_type)
# convert to a factor
cats$outcome_type <- as.factor(cats$outcome_type)
str(cats)
table(cats$outcome_type)
# Turning outcome_type back into a factor, order will not matter, as
# this will be turned into a binary outcome for our use-case
# The following creates the binary version of our y variable
# In order to accomplish this, we will use several ifelse statements, separated
# by '|' which is the equivalent of the OR operator in R.
cats$outcome_bin <- as.factor(ifelse(cats$outcome_type
%in% c("Adoption", "Return to Owner",
"Rto-Adopt"), "Yes", "No"))
# Verify that the binary y was properly created
# Should expect (12732+1431+33) = 14196 for "Yes",
# also expect (403+16+1452+28+13326) = 15225 for "No"
summary(cats$outcome_bin)
# The number that was calculated matched the number of 1's in cats$outcome_bin.
# Binary Y variable has been created, now we can go into the next steps.
# CREATION OF ADDITIONAL X VARIABLES --------------
# creating a season variable using the month the cat's outcome occurred
# going by the generally agreed upon distribution of the seasons by month
cats$season_outcome <- as.character(ifelse(cats$outcome_month %in% c(12, 1, 2), "Winter",
ifelse(cats$outcome_month %in% c(3, 4, 5), "Spring",
ifelse(cats$outcome_month %in% c(6, 7, 8), "Summer",
ifelse(cats$outcome_month %in% c(9, 10, 11), "Fall",
"N/A")))))
# check if it worked
unique(cats$season_outcome)
# change to an ordered factor
cats$season_outcome <- factor(cats$season_outcome,
levels = c("Spring", "Summer", "Fall", "Winter"))
# check if it worked
table(cats$season_outcome)
str(cats)
# season_outcome was correctly marked as a factor
# creating a variable counting the number of characters in each name to
# explore the impact of during x variable selection
str(cats$name)
#convert to a character
cats$name <- as.character(cats$name)
# create the numeric variable
cats$name_length <- as.numeric(nchar(cats$name))
# check if it worked as anticipated
unique(cats$name_length)
str(cats)
# Appears to have worked, noticed there are some cats that apparently
# don't come in with names, as there is a length 0 name. Look into this further:
table(cats$name_length)
# It appears the vast majority of cats (12774) don't have names when they come in,
# we can't really consolidate this into other groups, so it will be interesting to
# see how this affects a cat's outcome. It could possibly mean that cat wasn't anyone's pet
# before arriving at the shelter.
# create a consolidated age variable that says what age group a cat
# is in when their outcome is reported
unique(cats$age_upon_outcome)
#convert to a character
cats$age_upon_outcome <- as.character(cats$age_upon_outcome)
# Note: Since we will be creating an age group variable, when we go ahead to x variable selection the age_upon_outcome variable
# will be left out by utilizing a subset of our data.
cats$outcome_age_group[cats$age_upon_outcome %in% c("2 weeks", "1 month", "3 weeks", "4 weeks", "3 days", "6 days", "1 week", "2 days", "5 days", "1 day", "1 weeks", "4 days", "0 years")] <- "0-1 months"
# we decided to make the assumption that "0 years" should go into "0-1 months", primarily because that will be our most-common group
cats$outcome_age_group[cats$age_upon_outcome %in% c("3 months", "2 months", "5 weeks")] <- "1-3 months"
cats$outcome_age_group[cats$age_upon_outcome %in% c("5 months", "4 months", "6 months")] <- "3-6 months"
cats$outcome_age_group[cats$age_upon_outcome %in% c("8 months", "10 months", "11 months", "9 months", "7 months", "1 year")] <- "6-12 months"
cats$outcome_age_group[cats$age_upon_outcome %in% c("3 years", "4 years", "2 years", "7 years", "6 years", "8 years", "10 years", "5 years", "9 years")] <- "1-10 years"
cats$outcome_age_group[cats$age_upon_outcome %in% c("15 years", "16 years", "11 years", "14 years", "13 years", "12 years", "17 years", "20 years", "18 years", "19 years", "22 years")] <- "10+ years"
table(cats$outcome_age_group)
# 6914 + 6135 + 8416 + 634 + 2686 + 4636 = 29421, all cats covered!
# convert to an ordered factor
cats$outcome_age_group <- factor(cats$outcome_age_group,
levels = c("0-1 months", "1-3 months", "3-6 months", "6-12 months", "1-10 years", "10+ years"))
# Check to make sure ordering worked
table(cats$outcome_age_group)
str(cats$outcome_age_group)
# Looks all good! We can go ahead and proceed to the next stage
###### DATA PREPARATION FOR RANDOM FOREST ---------
# Before fitting a random forest, we want to create a subset of our data that does not
# include variables such as "age_upon_outcome" because we do not want to utilize
# variables that went into the creation of some of our newer X variables.
# The data given from these two variables will be very highly correlated, which
# could mess with our x variable selection.
# Additionally, the variable "animal_type" is useless in the case of this database,
# as ALL animals observed were cats. The "age_group" variable provided by the
# dataset will be replaced as well, as there is no context of the scale, the age groups given additionally do not make sense,
# and we also created our own variable for age groups.
# Additionally removed: outcome_age_.days. and outcome_age_.years.
# do not fully make sense in this context, and we have a
# variable that already deals with this.
# The count variable has "1" noted for every single value, therefore this will not be
# useful in creating a model, and will be removed prior to fitting of a random forest.
summary(cats)
str(cats) # Prior creating a subset, we have 41 variables.
cats <- subset(cats, select = -c(age_upon_outcome, animal_id, animal_type,
date_of_birth, datetime, monthyear,
outcome_subtype, outcome_type, sex_upon_outcome,
count, sex_age_outcome, age_group, dob_monthyear,
outcome_age_.days., outcome_age_.years.))
# Check that variables subset out of dataset are correctly removed, and
# that no issues have happened.
summary(cats)
str(cats)
# We took out 15 variables, and now it shows the data set has 26 variables.
# This checks out.
# TROUBLESHOOTING IN TREE FITTING
# In attempting to fit our base forest, we encountered an error that randomForest.default()
# isn't able to handle categorical predictors with more than 53 categories,
# so several additional variables will be removed before attempting again.
cats$name <- factor(cats$name)
str(cats$name) # There are 7410 levels in name, this will need to be removed for certain
# Variables to be removed are name, color, and breed as they all have 53+ levels
cats <- subset(cats, select = -c(name, color, breed))
str(cats) # Checking that everything dropped as expected, dropping 3 additional
# variables leaves us at 23 which checks out
# Attempting to fit our base forest using this still gives us an error about
# not being able to create this with 53+ categorical levels, must mean more than
# 53 total. Going to create one last subset which will include our binary y variable
# and 9 possible x variables we are interested in looking into.
cats <- subset(cats, select = c(outcome_bin, sex, Spay.Neuter, outcome_weekday,
outcome_hour, cfa_breed, domestic_breed,
season_outcome, name_length, outcome_age_group))
summary(cats)
str(cats)
# Double checking that everything worked properly in subsetting, then
# can finally get back to tree fitting and interpretations
## START OF ADDITIONAL CODE FROM 12/3/2022 ##
### TREE FITTING AND INTERPRETATION --------------
# Set the seed and in order to create training and testing data frames
RNGkind(sample.kind = "default")
set.seed(14038450) # picked arbitrary number
train.idx <- sample(x = 1:nrow(cats), size = .8*nrow(cats))
# create training data
train.df <- cats[train.idx, ]
# create testing data
test.df <- cats[-train.idx, ]
str(train.df) # 23536 observations in training data set
str(test.df) # # 5885 observations in testing data set
# totals up to 29421 observations -- all accounted for!
# set seed again before fitting forest
set.seed(14038450)
# fitting our baseline forest
baseforest <- randomForest(outcome_bin ~.,
# including all variables left in our subset of cats
# to see which are important to our interpretations
data = train.df, #TRAINING DATA
ntree = 1000, # this is our B
mtry = 3, # number of x variables to sample
# choose mtry -> sqrt(9) = 3
importance = TRUE)
baseforest
# Looking at the results of our base random forest
# Call:
# randomForest(formula = outcome_bin ~ ., data = train.df, ntree = 1000, mtry = 3, importance = TRUE)
# Type of random forest: classification
# Number of trees: 1000
# No. of variables tried at each split: 3
#
# OOB estimate of error rate: 12.11%
# Confusion matrix:
# No Yes class.error
# No 10454 1712 0.140720
# Yes 1138 10232 0.100088
#
# Calculating the OOB error rate in forest
mean(predict(baseforest) != train.df$outcome_bin)
# 0.1210911
# The OOB error for our base forest is about 12.11%
# TUNING FOREST ------------
# Now go ahead and tune forest
plot(baseforest) # can plot base forest to see general trend of error rate,
# by the number of trees
# create sequence of m values we want to try,
# ranging this from 1 to 9 possible variables being put in each tree in the forest.
mtry <- c(1:9)
# make room for m and oob error (empty data frame)
keeps <- data.frame(m=rep(NA, length(mtry), OOB_error_rate = rep(NA, length(mtry))))
for(idx in 1:length(mtry)){
print(paste0("Fitting m = ",mtry[idx]))
tempforest <- randomForest(outcome_bin ~.,
data = train.df,
ntree = 1000,
mtry = mtry[idx]) # mtry is varying
# record OOB error, corresponding to mtry for each forest fit
keeps[idx, "m"] <- mtry[idx]
keeps[idx, "OOB_error_rate"] <- mean(predict(tempforest) != train.df$outcome_bin)
}
keeps # Show results for each value of m and the corresponding OOB error rate
# m OOB_error_rate
# 1 1 0.1403807
# 2 2 0.1269120
# 3 3 0.1214735
# 4 4 0.1238953
# 5 5 0.1293763
# 6 6 0.1326054
# 7 7 0.1343899
# 8 8 0.1358345
# 9 9 0.1362593
# Create a plot showing these values of m compared to OOB error rates
# plot the OOB error rate vs m
ggplot(data = keeps) +
geom_line(aes(x = m, y = OOB_error_rate)) +
labs(x = "'M' Variables Sampled per Forest", y = "Estimated OOB_error_rate") +
ggtitle("Estimated OOB Error Rate by Number of Sampled Variables")
# Looking both at the chart and the graphical charting of
# our possible values of the OOB error rate compared to the number of variables sampled per
# forest, the lowest OOB error rate we can get occurs when we use
# m = 3, and that error rate is around 12.14% (0.1214735)
# Looking at this, it appears that our initial base model will also be our
# final model.
final_forest <- baseforest # storing our base forest in a new variable for the
# forest we decided to go with
final_forest
# Call:
# randomForest(formula = outcome_bin ~ ., data = train.df, ntree = 1000, mtry = 3, importance = TRUE)
# Type of random forest: classification
# Number of trees: 1000
# No. of variables tried at each split: 3
#
# OOB estimate of error rate: 12.11%
# Confusion matrix:
# No Yes class.error
# No 10454 1712 0.140720
# Yes 1138 10232 0.100088
# PREDICTIONS BASED ON FINAL FOREST --------
# Create ROC Curve
# Assume positive event is "Yes" as we are wanting to determine what
# leads to a cat being adopted or returned to their owner!
pi_hat <- predict(final_forest, test.df, type = "prob")[, "Yes"]
rocCurve <- roc(response = test.df$outcome_bin,
predictor = pi_hat,
levels = c("No", "Yes")) # Order for levels matters
# First is negative event, then positive event
plot(rocCurve, print.thres = TRUE, print.auc = TRUE)
# Interpretations of pi*, specificity, and sensitivity
# If we set pi* = 0.587, then we can achieve a specificity of 0.889
# and a sensitivity of 0.872.
# In other words, when a cat is adopted or returned to their owner, we correctly
# predict this outcome about 87.2% of the time. (specificity interpretation)
# Additionally, when we predict that a cat will not be adopted or returned to their owner,
# we correctly predict this scenario around 88.9% of the time. (specificity interpretation)
# The AUC for our rocCurve is 0.949, which we think is good since
# a better AUC will be closer to 1.
# Make predictions on our test data
# extract pi* - need for good predictions
# (that maximize sensitivity and specificity)
pi_star <- coords(rocCurve, "best", ret = "threshold")$threshold[1]
# Interpretation of pi_star
# If the forest predicts an Adoption / Return to Owner probability
# greater than 0.5865 (pi_star), then we predict that a cat will be
# going home with someone. Otherwise, we predict they will not adopted or
# returned to their owner.
# The following line creates predictions that are consistent with the
# above statements about AUC, pi_hat, specificity, and sensitivity
test.df$forest_pred <- as.factor(ifelse(pi_hat > pi_star, "Yes", "No"))
# INTERPRETATIONS UTILIZING FINAL FOREST -----------
# utilizing random forest to get a ranked list of variable importance
# to be used in our model creation
varImpPlot(final_forest, type = 1)
# all of our "MeanDecreaseAccuracy" values are positive,
# that means each variable helped the model, though to very different degrees
# Somewhat interestingly, we can see that the length of a cat's name (name_length)
# appears to be the most important variable for prediction in our data set
# Other important variables are ranked as follows:
# Grouped closely to each other:
# (2) Spay.Neuter and (3) outcome_age_group
# By itself: (4) outcome_hour
# By itself: (5) outcome_weekday
# Grouped together: (6) sex and (7) season_outcome
# Last group of variables: (8) domestic_breed and (9) cfa_breed
# After gaining further context about which variables appear most important,
# we can utilize the results to create an appropriate model using logistic regression.
# We do this because logistic regression one of its benefits over random forests is
# that we can easily make interpretations based on our results.
# The reason we didn't skip creating a random forest is that it
# allowed us to create predictions and determine which
# variables are most important for a logistic regression model.
## MODEL CREATION THROUGH LOGISTIC REGRESSION
# We chose to create a model with the Bernoulli random variable component and
# the logit link to fit our binary response variable.
m1 <- glm(outcome_bin ~ name_length, data = cats,
family = binomial(link = "logit"))
AIC(m1) # 33440.12
BIC(m1) # 33456.69
# next two important are Spay.Neuter & outcome_age_group
m2 <- glm(outcome_bin ~ name_length + Spay.Neuter + outcome_age_group,
data = cats, family = binomial(link = "logit"))
AIC(m2) # 23206.27
BIC(m2) # 23272.59
# vast improvement from the first model, both AIC and BIC have decreased
# next important is outcome_hour
m3 <- glm(outcome_bin ~ name_length + Spay.Neuter + outcome_age_group +
outcome_hour,
data = cats, family = binomial(link = "logit"))
AIC(m3) # 21489.87
BIC(m3) # 21564.48
# next important is outcome_weekday
m4 <- glm(outcome_bin ~ name_length + Spay.Neuter + outcome_age_group +
outcome_hour + outcome_weekday,
data = cats, family = binomial(link = "logit"))
AIC(m4) # 20974.08
BIC(m4) # 21098.42
# next two important are sex & season_outcome
m5 <- glm(outcome_bin ~ name_length + Spay.Neuter + outcome_age_group +
outcome_hour + outcome_weekday + sex + season_outcome,
data = cats, family = binomial(link = "logit"))
AIC(m5) # 20883.27
BIC(m5) # 21040.77
# next two important are domestic_breed & cfa_breed
m6 <- glm(outcome_bin ~ name_length + Spay.Neuter + outcome_age_group +
outcome_hour + outcome_weekday + sex + season_outcome +
domestic_breed + cfa_breed,
data = cats, family = binomial(link = "logit"))
AIC(m6) # 20822.03
BIC(m6) # 20996.11
# our sixth and final model is our best performing, with the lowest AIC & BIC
# values, we would use this model to make our interpretations
final_model <- glm(outcome_bin ~ name_length + Spay.Neuter + outcome_age_group +
outcome_hour + outcome_weekday + sex + season_outcome +
domestic_breed + cfa_breed,
data = cats, family = binomial(link = "logit"))
summary(final_model)
# VISUALIZATIONS --------------
# let's look at name_length
ggplot(data = cats) +
geom_histogram(aes(x = name_length)) +
labs(x = "Name Length", y = "Amount of Cats") +
ggtitle("Distribution of Name Lengths")
# lots of no-names here, let's approach this variable from another angle
ggplot(data = cats) +
geom_boxplot(aes(x = name_length))
ggplot(data = cats) +
geom_boxplot(aes(x = name_length, y = outcome_bin)) +
labs(x = "Name Length", y = "Adoption Outcome") +
ggtitle("Distribution of Name Lengths")
# (graph included in the supplement document)
ggplot(data = cats) +
geom_violin(aes(x = name_length, y = outcome_bin, fill = outcome_bin)) +
# geom_jitter(aes(x = name_length, y = outcome_bin, fill = outcome_bin)) +
labs(x = "Name Length", y = "Adoption Outcome") +
ggtitle("Distribution of Name Lengths vs Adoption Outcome") +
scale_fill_manual(values = c("#6d7771", "#FFFDD0"),
name = "Adoption \nOutcome")
# can see the frequencies in each graph, more no-named cats that resulted in
# no adoption as opposed to getting adopted
# cats that have more "traditional" names that fall between 4-7 letters
# were adopted at a higher frequency
# let's look at season_outcome
ggplot(data = cats) +
geom_bar(aes(x = season_outcome))
ggplot(data = cats) +
geom_bar(aes(x = season_outcome, fill = outcome_bin, colour = outcome_bin)) +
scale_fill_grey("Adoption \nOutcome")
# (graph included in the supplement document)
ggplot(data = cats) +
geom_bar(aes(x = season_outcome, fill = outcome_bin), position = "fill") +
labs(x = "Season of the Outcome", y = "Amount of Cats") +
ggtitle("Distribution of Seasons vs Adoption Outcome") +
scale_fill_manual(values = c("#6D7771", "#FFFDD0"),
name = "Adoption \nOutcome")
# general trend that the further along in the year, the higher the adoption rate
# could relate to the holidays in the winter, wanting to adopt
# let's look at sex and spay.neuter
ggplot(data = cats) +
geom_bar(aes(x = sex, fill = outcome_bin), position = "fill")
# without fill to understand frequency distributions
# (graph included in the supplement document)
ggplot(data = cats) +
geom_bar(aes(x = outcome_age_group, fill = outcome_bin)) +
labs(x = "Age Group", y = "Proportion") +
facet_wrap(~Spay.Neuter, nrow = 2, labeller = labels) +
scale_fill_manual(values = c("#6D7771", "#FFFDD0"),
name = "Adoption \nOutcome") +
coord_flip() +
ggtitle("Outcomes of Spay/Neuter by Age Group")
# generally a higher amount of cats that have been spayed/neutered
# furthermore, those cats are more likely to get adopted
# let's look at the proportions in a more meaningful way
labels <- as_labeller(c("No" = "Not Spayed or Neutered", "Yes" = "Spayed or Neutered"))
# with fill to understand proportions
# (graph included in the supplement document)
ggplot(data = cats) +
geom_bar(aes(x = outcome_age_group, fill = outcome_bin), position = "fill") +
labs(x = "Age Group", y = "Proportion") +
facet_wrap(~Spay.Neuter, nrow = 2, labeller = labels) +
scale_fill_manual(values = c("#6D7771", "#FFFDD0"),
name = "Adoption \nOutcome") +
coord_flip() +
ggtitle("Outcomes of Spay/Neuter by Age Group")
# higher proportions of adoption for cats that have spayed/neutered
# general downward trend for this subset of cats
# could imply that potential adopters do not care about spay/neutering
# the older the cat is
# R code for getting coefficients and confidence intervals ----------
# Estimate Std. Error z value Pr(>|z|)
# (Intercept) -6.278657 0.415560 -15.109 < 2e-16 ***
# name_length 0.277979 0.005857 47.464 < 2e-16 ***
# Spay.NeuterYes 2.733662 0.050528 54.101 < 2e-16 ***
# outcome_age_group1-3 months 2.819162 0.061116 46.128 < 2e-16 ***
# outcome_age_group3-6 months 1.889418 0.076400 24.731 < 2e-16 ***
# outcome_age_group6-12 months 0.658901 0.065440 10.069 < 2e-16 ***
# outcome_age_group1-10 years 0.698195 0.061474 11.358 < 2e-16 ***
# outcome_age_group10+ years 0.640101 0.106874 5.989 2.11e-09 ***
# outcome_hour 0.187533 0.005104 36.742 < 2e-16 ***
# outcome_weekdayMonday -0.215718 0.065935 -3.272 0.00107 **
# outcome_weekdaySaturday 0.823268 0.064877 12.690 < 2e-16 ***
# outcome_weekdaySunday 0.565916 0.065643 8.621 < 2e-16 ***
# outcome_weekdayThursday -0.175065 0.067886 -2.579 0.00991 **
# outcome_weekdayTuesday -0.205153 0.064037 -3.204 0.00136 **
# outcome_weekdayWednesday -0.146139 0.066249 -2.206 0.02739 *
# sexMale 0.240743 0.035865 6.712 1.91e-11 ***
# season_outcomeSummer 0.230451 0.051821 4.447 8.71e-06 ***
# season_outcomeFall -0.009918 0.052539 -0.189 0.85027
# season_outcomeWinter 0.279770 0.055576 5.034 4.80e-07 ***
# domestic_breedTrue -1.163966 0.400486 -2.906 0.00366 **
# cfa_breedTrue -0.604301 0.394852 -1.530 0.12591
# Statistical Analysis -----
beta_hat <- coef(final_model)
#maximum likelihood estimates of odds ratios (exp(beta))
exp(beta_hat)
# (Intercept) 0.001875919
# name_length 1.320457999
# Spay.NeuterYes 15.389133058
# outcome_age_group1-3 months 16.762795658
# outcome_age_group3-6 months 6.615518898
# outcome_age_group6-12 months 1.932666976
# outcome_age_group1-10 years 2.010121017
# outcome_age_group10+ years 1.896673058
# outcome_hour 1.206270344
# outcome_weekdayMonday 0.805962167
# outcome_weekdaySaturday 2.277931356
# outcome_weekdaySunday 1.761059511
# outcome_weekdayThursday 0.839402294
# outcome_weekdayTuesday 0.814522845
# outcome_weekdayWednesday 0.864037972
# sexMale 1.272194284
# season_outcomeSummer 1.259168380
# season_outcomeFall 0.990130720
# season_outcomeWinter 1.322825318
# domestic_breedTrue 0.312245468
# cfa_breedTrue 0.546456449
confint(final_model)#conf int for beta
# 2.5 % 97.5 %
# (Intercept) -7.1089653 -5.47222729
# name_length 0.2665403 0.28949914
# Spay.NeuterYes 2.6352576 2.83334146
# outcome_age_group1-3 months 2.6999704 2.93955884
# outcome_age_group3-6 months 1.7401453 2.03964676
# outcome_age_group6-12 months 0.5308419 0.78738404
# outcome_age_group1-10 years 0.5779620 0.81895635
# outcome_age_group10+ years 0.4310385 0.85006947
# outcome_hour 0.1775772 0.19758586
# outcome_weekdayMonday -0.3450045 -0.08652874
# outcome_weekdaySaturday 0.6962772 0.95060352
# outcome_weekdaySunday 0.4373778 0.69470868
# outcome_weekdayThursday -0.3081601 -0.04203955
# outcome_weekdayTuesday -0.3307162 -0.07968091
# outcome_weekdayWednesday -0.2760152 -0.01631018
# sexMale 0.1704670 0.31106209
# season_outcomeSummer 0.1289115 0.33205743
# season_outcomeFall -0.1129246 0.09303567
# season_outcomeWinter 0.1708804 0.38874634
# domestic_breedTrue -1.9413939 -0.36288649
# cfa_breedTrue -1.3704453 0.18630747
exp(confint(final_model))#conf int for exp(beta)
# 2.5 % 97.5 %
# (Intercept) 8.177407e-04 0.004201863
# name_length 1.305440e+00 1.335758287
# Spay.NeuterYes 1.394690e+01 17.002178181
# outcome_age_group1-3 months 1.487929e+01 18.907503184
# outcome_age_group3-6 months 5.698171e+00 7.687893052
# outcome_age_group6-12 months 1.700363e+00 2.197639960
# outcome_age_group1-10 years 1.782402e+00 2.268131477
# outcome_age_group10+ years 1.538855e+00 2.339809396
# outcome_hour 1.194320e+00 1.218457680
# outcome_weekdayMonday 7.082172e-01 0.917109189
# outcome_weekdaySaturday 2.006270e+00 2.587270659
# outcome_weekdaySunday 1.548641e+00 2.003125436
# outcome_weekdayThursday 7.347977e-01 0.958831857
# outcome_weekdayTuesday 7.184090e-01 0.923410950
# outcome_weekdayWednesday 7.588014e-01 0.983822110
# sexMale 1.185859e+00 1.364873959
# season_outcomeSummer 1.137589e+00 1.393832899
# season_outcomeFall 8.932180e-01 1.097500882
# season_outcomeWinter 1.186349e+00 1.475130321
# domestic_breedTrue 1.435038e-01 0.695665396
# cfa_breedTrue 2.539938e-01 1.204792644
# Linear Predictor for final model -----
# ni = -6.278 + (0.278)name_length + (2.733)Spay.NueterYes
# + (2.818)outcome_age_group1-3 months + (1.889)outcome_age_group3-6 months
# + (0.658)outcome_age_group6-12 months + (0.697)outcome_age_group1-10 years
# + (0.639)outcome_age_group10+ years + (0.187)outcome_hour
# - (0.215)outcome_weekdayMonday + (0.823)outcome_weekdaySaturday
# + (0.565)outcome_weekdaySunday - (0.175)outcome_weekdayThursday
# - (0.205)outcome_weekdayTuesday - (0.146)outcome_weekdayWednesday
# + (0.240)sexMale + (0.230)season_outcomeSummer
# - (0.009)season_outcomeFall + (0.279)season_outcomeWinter
# - (1.164)domestic_breedTrue - (0.604)cfa_breedTrue
# Statistical Coefficient Interpretations -----
# For context of interpretations, the y variable of outcome_bin is "Yes"
# (Cat had outcome of adopted) or "No" (cat did not have outcome of adopted).
#Coefficient Interpretations
#b0: e^b0 is the odds that a cat is adopted when all explanatory variables
# are set to 0. The odds of a cat being adopted that has a name length of 0,
# was not spayed/neutered, was in the outcome age group for 0-1 months,
# had been adopted at the 0th hour of the day, was adopted on Friday,
# is female, was adopted in the season of Spring, was not a domestic breed,
# and was not a CFA(Cat Fanciers' Association) breed is e^-6.27 = 0.0018.
#b1: Holding all factors constant, the odds of a cat being adopted change by
# a factor of e^0.277 = 1.319 for each additional letter/character added to
# the name length of a cat.
#b2: Holding all factors constant, the odds of a cat being adopted that is spayed/neutered
# is e^2.733 = 15.378 times the odds of a cat being adopted that is not spayed/neutered.
#b3: Holding all factors constant, the odds of a cat being adopted that is in the
# outcome age group of 1-3 months is e^2.819 = 16.760 times the odds of a cat being adopted
# that is in the outcome age group of 0-1 months.
#b4: Holding all factors constant, the odds of a cat being adopted that is in the
# outcome age group of 3-6 months is e^1.889 = 6.612 times the odds of a cat being adopted
# that is in the outcome age group of 0-1 months.
#b5: Holding all factors constant, the odds of a cat being adopted that is in the
# outcome age group of 6-12 months is e^0.658 = 1.930 times the odds of a cat being adopted
# that is in the outcome age group of 0-1 months.
#b6: Holding all factors constant, the odds of a cat being adopted that is in the
# outcome age group of 1-10 years is e^0.698 = 2.009 times the odds of a cat being adopted
# that is in the outcome age group of 0-1 months.
#b7: Holding all factors constant, the odds of a cat being adopted that is in the
# outcome age group of 10+ years is e^.640 = 1.896 times the odds of a cat being adopted
# that is in the outcome age group of 0-1 months.
#b8: Holding all factors constant, the odds of a cat being adopted change by
# a factor of e^0.187 = 1.205 for each additional hour in the day that the cat was adopted.
#b9: Holding all factors constant, the odds of a cat being adopted that happened on
# a Monday is e^-0.215 = 0.806 times the odds of a cat being adopted on Friday.
#b10: Holding all factors constant, the odds of a cat being adopted that happened on
# a Saturday is e^0.565 = 1.759 times the odds of a cat being adopted on Friday.
#b11: Holding all factors constant, the odds of a cat being adopted that happened on
# a Sunday is e^0.823 = 2.277 times the odds of a cat being adopted on Friday.
#b12: Holding all factors constant, the odds of a cat being adopted that happened on
# a Thursday is e^-0.175 = 0.839 times the odds of a cat being adopted on Friday.
#b13: Holding all factors constant, the odds of a cat being adopted that happened on
# a Tuesday is e^-0.205 = 0.814 times the odds of a cat being adopted on Friday.
#b14: Holding all factors constant, the odds of a cat being adopted that happened on
# a Wednesday is e^-0.146 = 0.864 times the odds of a cat being adopted on Friday.
#b15: Holding all factors constant, the odds of a cat being adopted that is male is
# e^0.240 = 1.271 times the odds of a cat being adopted that is female.
#b16: Holding all factors constant, the odds of a cat being adopted that happened
# in Spring is e^0.230 = 1.258 times the odds of a cat being adopted in Summer.
#b17: Holding all factors constant, the odds of a cat being adopted that happened
# in Spring is e^-0.009 = 0.991 times the odds of a cat being adopted in Fall.
#b18: Holding all factors constant, the odds of a cat being adopted that happened
# in Spring is e^0.279 = 1.321 times the odds of a cat being adopted in Winter.
#b19: Holding all factors constant, the odds of a cat being adopted that is a
# domestic breed is e^-1.163 = 0.312 times the odds of cat being adopted
# that is not a domestic breed.
#b20: Holding all factors constant, the odds of a cat being adopted that is a
# CFA(Cat Fanciers' Association) breed is e^0.546 = 0.546 times the odds
# of a cat being adopted that was not a CFA breed.
# COnfidence Interval Interpretations
# B1: Holding all factors constant, we are 95% confident that the odds of a
# cat being adopted change by a factor between the interval of (1.305 , 1.335) for
# each additional letter/character added to the name of the cat.
# B2: Holding all factors constant, we are 95% confident that the odds of a spayed/neutered
# cat being adopted is in the range of (13.946 and 17.002) times the odds of a
# non spayed/neutered cat being adopted.
# B3: Holding all factors constant, we are 95% confident that the odds of a cat adopted
# that is in outcome age group of 1-3 months is in the range of (14.879 , 18.907)
# times the odds of a cat being adopted that is in the outcome age group of 0-1 months.
# B4: Holding all factors constant, we are 95% confident that the odds of a cat adopted
# that is in outcome age group of 3-6 months is in the range of (5.698 , 7.687)
# times the odds of a cat being adopted that is in the outcome age group of 0-1 months.
# B5: Holding all factors constant, we are 95% confident that the odds of a cat adopted
# that is in outcome age group of 6-12 months is in the range of (1.700 , 2.197)
# times the odds of a cat being adopted that is in the outcome age group of 0-1 months.
# B6: Holding all factors constant, we are 95% confident that the odds of a cat adopted
# that is in outcome age group of 1-10 years is in the range of (1.782 , 2.268)
# times the odds of a cat being adopted that is in the outcome age group of 0-1 months.
# B7: Holding all factors constant, we are 95% confident that the odds of a cat adopted
# that is in outcome age group of 10+ years is in the range of (1.538 , 2.339)
# times the odds of a cat being adopted that is in the outcome age group of 0-1 months.
# B8: Holding all factors constant, we are 95% confident that the odds of a
# cat being adopted change by a factor between the interval of (1.305 , 1.335) for
# each hour in the day that a cat had been adopted.
# B9: Holding all factors constant, we are 95% confident that the odds of a cat adopted
# that happened on a Monday is in the range of (0.708 , 1.218) times the odds of a cat
# being adopted on a Friday.
# B9: Holding all factors constant, we are 95% confident that the odds of a cat adopted
# that happened on a Monday is in the range of (0.708 , 1.218) times the odds of a cat
# being adopted on a Friday.
# B10: Holding all factors constant, we are 95% confident that the odds of a cat adopted
# that happened on a Saturday is in the range of (2.006 , 2.587) times the odds of a cat
# being adopted on a Friday.
# B11: Holding all factors constant, we are 95% confident that the odds of a cat adopted
# that happened on a Sunday is in the range of (1.548 , 2.003) times the odds of a cat
# being adopted on a Friday.
# B12: Holding all factors constant, we are 95% confident that the odds of a cat adopted
# that happened on a Thursday is in the range of (0.073 , 0.958) times the odds of a cat
# being adopted on a Friday.
# B13: Holding all factors constant, we are 95% confident that the odds of a cat adopted
# that happened on a Tuesday is in the range of (0.071 , 0.923) times the odds of a cat
# being adopted on a Friday.
# B14: Holding all factors constant, we are 95% confident that the odds of a cat adopted
# that happened on a Wednesday is in the range of (0.075 , 0.983) times the odds of a cat
# being adopted on a Friday.
# B15: Holding all factors constant, we are 95% confident that the odds of a cat being
# adopted that is male is in the range of (1.185 , 1.364) times the odds of a cat being
# adopted that is female.
# B16: Holding all factors constant, we are 95% confident that the odds of a cat being
# adopted in the Summer is in the range of (1.137 , 1.393) times the odds of a cat being
# adopted in the Spring.
# B17: Holding all factors constant, we are 95% confident that the odds of a cat being
# adopted in the Fall is in the range of (1.137 , 1.393) times the odds of a cat being
# adopted in the Spring.
# B18: Holding all factors constant, we are 95% confident that the odds of a cat being
# adopted in the Winter is in the range of (1.137 , 1.393) times the odds of a cat being
# adopted in the Spring.
# B19: Holding all factors constant, we are 95% confident that the odds of a cat being
# adopted that is a domestic breed is in the range of (0.143 , 0.695) times the odds
# of a cat being adopted that is not a domestic breed.
# B20: Holding all factors constant, we are 95% confident that the odds of a cat being
# adopted that is a CFA(Cat Fanciers' Association) breed is in the range of
# (0.253 , 1.204) times the odds of a cat being adopted that is not a
# CFA(Cat Fanciers' Association) breed.