coup_violence.Rmd

---
title: "Machine Learning Models to Predict Violent Coups"
date: December 16, 2019
output: html_document
---

```{r setup, include=FALSE}
# Default for every chunk in this document: show the R source alongside its output
knitr::opts_chunk$set(echo = TRUE)
```


## Getting started

*Introduction*: The goal of this script is to find the strongest predictors of coup violence, coded in the dataset as `were_others_than_the_incumbent_killed`, and then to build the best possible predictive model of coup violence. The dataset used is publicly available from the Cline Center at the University of Illinois, and it includes over 50 coup features for more than 1,000 coups since the 1940s.

```{r load}

library(tidyverse)
require(stargazer)
library(stats)
library(dplyr)
library(janitor)
library(boot)
library(knitr)
library(rpart)
library(rpart.plot)
library(ggplot2)
library(randomForest)
library(gbm)
library(Hmisc)

# Create a dataframe: coup_df

# Read the raw coup data and normalise the headers: whitespace becomes "_"
# and everything is lower-cased. A plain assignment is used because the
# compound-assignment pipe (%<>%) is not exported by any package attached above.
coup_csv <- read_csv("coupdata.csv")
colnames(coup_csv) <- colnames(coup_csv) %>%
  stringr::str_replace_all("\\s", "_") %>%
  tolower()

# Convert to a base data.frame, keep only rows that have a value for
# `realized_coup`, then let janitor tidy up any remaining irregular names.
coup_df <- data.frame(coup_csv) %>%
  filter(!is.na(realized_coup)) %>%
  clean_names()

# Manually edit colnames of dataframe: two headers are misspelled in the
# source file; later chunks refer to the corrected spellings.
names(coup_df)[names(coup_df) == "ambigious_coup"] <- "ambiguous_coup"
names(coup_df)[names(coup_df) == "were_others_then_the_incumbent_injured"] <-
  "were_others_than_the_incumbent_injured"
```

## Construct potentially relevant variables

New binary category variables include decade (seventies, sixties, etc.) and season (spring, summer and so on).

```{r variables, warning=FALSE}

# Eliminate nulls from the dataset

# Total number of missing cells before cleaning
sum(is.na(coup_df))

# Replace only the missing values of `were_ordinary_citizens_involved` with 0,
# leaving recorded values untouched. The column is converted to numeric first
# so that the replacement value and the column share a type.
coup_df <- coup_df %>%
  mutate(
    were_ordinary_citizens_involved = as.numeric(were_ordinary_citizens_involved),
    were_ordinary_citizens_involved = replace_na(were_ordinary_citizens_involved, 0)
  )

# Should now be 0
sum(is.na(coup_df$were_ordinary_citizens_involved))

# Comment: We assume that, except where ordinary citizens were known to be involved, all other events should be coded as having negligible or no ordinary citizen involvement.

# Code coup type as numeric instead of character ("conspiracy", "attempt," etc.)

# The replacements are applied one after another, in the order listed:
# "conspiracy" -> 1, "attempt" -> 2, "coup" -> 3. Anything that does not end
# up as a plain number becomes NA when converted.
coup_df <- coup_df %>%
  mutate(
    coup_type_numeric = as.numeric(
      str_replace_all(
        type_of_coup,
        c("conspiracy" = "1", "attempt" = "2", "coup" = "3")
      )
    )
  )

# Create season variables

# Inspect the month column before deriving anything from it
head(coup_df$month_of_event)
typeof(coup_df$month_of_event)

# One 0/1 indicator per season. `%in%` tests every row's month for membership
# in the season's set of months (an `==` against a multi-element vector would
# instead compare position by position with recycling). `%in%` never returns
# NA, so rows with a missing month get 0 in all four columns.
coup_df <- coup_df %>%
  mutate(
    winter = as.numeric(month_of_event %in% c(12, 1, 2)),
    spring = as.numeric(month_of_event %in% 3:5),
    summer = as.numeric(month_of_event %in% 6:8),
    autumn = as.numeric(month_of_event %in% 9:11)
  )

# Quick look at the result: do summer and spring have the same number of
# events, and how many events fall in each season?
sum(coup_df$summer) == sum(coup_df$spring)
colSums(coup_df[c("winter", "spring", "summer", "autumn")])

# Create decade variables

typeof(coup_df$year)
head(coup_df$year)

# 1 when `yr` lies strictly between (first_year - 1) and (first_year + 10),
# otherwise 0. A missing year yields 0 rather than NA.
in_decade <- function(yr, first_year) {
  as.numeric(!is.na(yr) & yr > first_year - 1 & yr < first_year + 10)
}

# One indicator column per decade, 1940s through 2000s
coup_df <- coup_df %>%
  mutate(
    forties = in_decade(year, 1940),
    fifties = in_decade(year, 1950),
    sixties = in_decade(year, 1960),
    seventies = in_decade(year, 1970),
    eighties = in_decade(year, 1980),
    nineties = in_decade(year, 1990),
    aughties = in_decade(year, 2000)
  )

# Number of events per decade, printed one decade at a time
for (decade in c("forties", "fifties", "sixties", "seventies",
                 "eighties", "nineties", "aughties")) {
  print(sum(coup_df[[decade]]))
}

```


## Export the data as csv

For use in Tableau or Python, export the post-data wrangling clean dataset.

```{r export, warning=FALSE}

# write.csv(coup_df,"C:/Users/cindy/Dropbox/MBA2/Indep Study/coup_data_tableau.csv", row.names = FALSE)

```

## Drop irrelevant variables

Use `select(-x)` to drop variables that are irrelevant to predicting coup violence.

```{r drop, warning=FALSE}

# Remove the columns that are not used as predictors: the two auto-named
# columns (x53, x54), identifiers, and the raw date/type fields that the
# season, decade and coup_type_numeric columns were derived from.
coup_df <- coup_df %>%
  select(-c(
    x54, x53, country, coup_id, cow_code, day_of_event,
    year, month_of_event, type_of_coup
  ))

# Remaining columns
names(coup_df)

```

## Split the data into train and test

```{r split, warning=FALSE}

set.seed(1)

# Tag every row with a temporary id so the test set can be defined as
# "exactly the rows that were not sampled into train". A value-based set
# difference would also drop any row that merely looks identical to a
# training row and would collapse duplicate rows, which matters here because
# most columns are 0/1 flags and many rows share the same values.
coup_indexed <- coup_df %>%
  mutate(split_row_id = row_number())

train <- sample_frac(coup_indexed, 0.7)
test <- anti_join(coup_indexed, train, by = "split_row_id")

# The id was only needed for the split
train <- select(train, -split_row_id)
test <- select(test, -split_row_id)

# Every row must end up in exactly one of the two sets
stopifnot(nrow(train) + nrow(test) == nrow(coup_df))

sum(is.na(train))

head(test)

```

## Create a simple logistic regression model for coup violence with a promising predictor: `were_rebel_soldiers_involved_in_the_coup`

```{r rebel, warning=FALSE}

# Baseline violence rates: all coups, then only coups with rebel-soldier
# involvement. `na.rm = TRUE` belongs inside mean(); given to summarise() it
# would only create an extra column called `na.rm`.
mean(coup_df$were_others_than_the_incumbent_killed)
coup_df %>%
  filter(were_rebel_soldiers_involved_in_the_coup == 1) %>%
  summarise(
    mean_killed = mean(were_others_than_the_incumbent_killed, na.rm = TRUE)
  )

# Over 50% of rebel-backed coups are violent, while only about 23% of attempted coups overall are violent

logit_form_rebel <- formula(were_others_than_the_incumbent_killed ~ were_rebel_soldiers_involved_in_the_coup)

rebel_lm <- glm(logit_form_rebel, data = train, family = binomial(link = "logit"))

summary(rebel_lm)

# Comment: A coup's status as being rebel-backed is a strong predictor of violence, based on the logistic regression. How well does it do at prediction?

# Predicted probability of violence for every training row. predict() with
# newdata returns one value per row of `train`, so the result always lines up
# with the data frame.
train$rebel_logit <- predict(rebel_lm, newdata = train, type = "response")

# --- Training data, prediction threshold 0.5 ---
train$rebel_violence_pred <- as.numeric(train$rebel_logit > 0.5)

# Share of training rows whose predicted class differs from the outcome
error_rate_train <- train %>%
  summarise(log_error_rate = mean(were_others_than_the_incumbent_killed != rebel_violence_pred))
error_rate_train

# Confusion matrix: rows = predicted class, columns = actual outcome
with(train, table(predicted = rebel_violence_pred,
                  actual = were_others_than_the_incumbent_killed))

# Comment: The model had a 22.3% failure rate in the training data, but using a 0.5 prediction threshold, the model never predicts violence. If the threshold is lowered to 0.3, things look different...
# (Figures quoted in these comments come from the author's original run; re-check them after re-knitting.)

# --- Training data, prediction threshold 0.3 ---
train$rebel_violence_pred <- as.numeric(train$rebel_logit > 0.3)

error_rate_train <- train %>%
  summarise(log_error_rate = mean(were_others_than_the_incumbent_killed != rebel_violence_pred))
error_rate_train

with(train, table(predicted = rebel_violence_pred,
                  actual = were_others_than_the_incumbent_killed))

# --- Test data, prediction threshold 0.3 ---
test$rebel_violence_pred <- as.numeric(
  predict(rebel_lm, newdata = test, type = "response") > 0.3
)

error_rate_test <- test %>%
  summarise(log_error_rate = mean(were_others_than_the_incumbent_killed != rebel_violence_pred))
error_rate_test

with(test, table(predicted = rebel_violence_pred,
                 actual = were_others_than_the_incumbent_killed))

# Comment: Using a prediction threshold of 0.3, the error rate in the training data is almost unchanged (22.6%), but the model does then at least predict some violent coups correctly. However, in both the training and test data, far fewer than half of violent coups are categorized as such. This is not a useful model.


```

## Create a full regression model of coup violence using all potential predictors

```{r full_model, warning=FALSE}

# Model formula with every candidate predictor. The same formula object is
# reused by the decision-tree and random-forest chunks further down.
logit_form_all_violence <- formula(were_others_than_the_incumbent_killed ~ realized_coup + unrealized + military_coup + 
                              rebel_coup + palace_coup + popular_revolt + dissident_actions + foreign_coup + 
                              internationally_mediated_transitions + forced_resignation + counter_coup + auto_coup + 
                              attempted_coup + coup_conspiracies + ambiguous_coup + did_the_incumbent_flee_the_country + 
                              were_military_actors_involved_in_the_coup + were_other_security_officials_involved_in_the_coup +
                              were_rebel_soldiers_involved_in_the_coup + were_non_military_government_officials_involved_in_the_coup +
                              were_non_government_political_actors_involved_in_the_coup + 
                              were_political_radicals_involved_in_the_coup + were_business_leaders_involved_in_the_coup +
                              were_organized_labor_activists_involved_in_the_coup + were_students_or_academics_involved_in_the_coup +
                              were_religious_leaders_involved_in_the_coup + were_ethnic_group_leaders_involved + 
                              were_ordinary_citizens_involved + were_mercenaries_involved + 
                              were_non_government_foreign_actors_involved + was_a_foreign_gov_involved + 
                              winter + spring + summer + autumn + forties + fifties + sixties + seventies + eighties + nineties + 
                              aughties)

lm_all_violence <- glm(logit_form_all_violence, data = train, family = binomial(link = "logit"))

summary(lm_all_violence)

# While this model has few independently significant predictors, it is interesting that the involvement of students and academics is the strongest and most significant predictor of coup violence (highest coefficient, lowest p-value)
# (Figures quoted in these comments come from the author's original run; re-check them after re-knitting.)

# Number of rows the model was actually fitted on
length(fitted(lm_all_violence))

# --- Training data, prediction threshold 0.5 ---
# predict() with newdata returns one probability per row of `train`, so the
# new columns always line up with the data frame.
train$all_logit <- predict(lm_all_violence, newdata = train, type = "response")
train$all_violence_pred <- as.numeric(train$all_logit > 0.5)

# Share of training rows whose predicted class differs from the outcome
error_rate_train <- train %>%
  summarise(log_error_rate = mean(were_others_than_the_incumbent_killed != all_violence_pred))
error_rate_train

# Confusion matrix: rows = predicted class, columns = actual outcome
with(train, table(predicted = all_violence_pred,
                  actual = were_others_than_the_incumbent_killed))

# Comment: The model had only a 20.4% failure rate in the training data. Many violent coups were correctly predicted, but most were not. What about the test data?

# --- Test data, prediction threshold 0.5 ---
test$all_violence_pred <- as.numeric(
  predict(lm_all_violence, newdata = test, type = "response") > 0.5
)

error_rate_test <- test %>%
  summarise(log_error_rate = mean(were_others_than_the_incumbent_killed != all_violence_pred))
error_rate_test

with(test, table(predicted = all_violence_pred,
                 actual = were_others_than_the_incumbent_killed))

# Comment: The model had a 30.3% failure rate in the test data. As before, most violent coups were false negatives.


```

## Forward Stepwise Model Selection Method: Choosing the best combination of predictors

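The following algorithm performs greedy forward selection: starting from an intercept-only logistic regression, each step adds the single unused predictor that gives the lowest error rate on the test data. The model with the lowest error rate across all steps is then kept. Note that this does not test every possible combination of predictors.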

```{r stepwise, warning = FALSE}

# Function to calculate error rate for any logistic regression model
#
# Fits `f` as a logistic regression on `train_data`, predicts on `test_data`,
# classifies a row as 1 when its predicted probability exceeds `threshold`,
# and returns the share of `test_data` rows that are misclassified.
#
# Arguments:
#   f          two-sided model formula; the outcome is the first variable in it
#   threshold  probability cut-off for predicting class 1 (default 0.25)
#   train_data data used to fit the model (default: the global `train`)
#   test_data  data used to measure the error (default: the global `test`)
#
# Returns a single number between 0 and 1. Rows for which the comparison is
# NA (missing outcome or missing prediction) are counted as errors.
#
# NOTE(review): despite the `cv_` prefix, no cross-validation happens here;
# the error is measured on `test_data`, so models chosen with this function
# have been selected on the same data they are later evaluated on.
cv_fun <- function(f, threshold = 0.25, train_data = train, test_data = test) {
  glmfit <- glm(f, data = train_data, family = binomial(link = "logit"))
  predicted_prob <- predict(glmfit, newdata = test_data, type = "response")
  predicted_class <- as.numeric(predicted_prob > threshold)

  # Outcome column named on the left-hand side of the formula
  actual <- test_data[[all.vars(f)[1]]]

  correct <- actual == predicted_class
  mean(is.na(correct) | !correct)
}

# Initial settings

# Outcome variable and the full pool of candidate predictors
response <- "were_others_than_the_incumbent_killed"
X <- c("realized_coup", "unrealized", "military_coup", "rebel_coup", "palace_coup",
       "popular_revolt", "dissident_actions", "foreign_coup", "internationally_mediated_transitions", "forced_resignation",
       "counter_coup", "auto_coup", "attempted_coup", "coup_conspiracies", "ambiguous_coup",
       "did_the_incumbent_flee_the_country", "were_military_actors_involved_in_the_coup",
       "were_other_security_officials_involved_in_the_coup", "were_rebel_soldiers_involved_in_the_coup",
       "were_non_military_government_officials_involved_in_the_coup",
       "were_non_government_political_actors_involved_in_the_coup", "were_political_radicals_involved_in_the_coup",
       "were_business_leaders_involved_in_the_coup", "were_organized_labor_activists_involved_in_the_coup",
       "were_students_or_academics_involved_in_the_coup", "were_religious_leaders_involved_in_the_coup",
       "were_ethnic_group_leaders_involved", "were_ordinary_citizens_involved", "were_mercenaries_involved",
       "were_non_government_foreign_actors_involved", "was_a_foreign_gov_involved", "winter",
       "spring", "summer", "autumn", "forties", "fifties", "sixties", "seventies", "eighties", "nineties", "aughties")

length(X)

# Slot 1 holds the constant-only model; slot k + 1 holds the best model with
# k predictors. f_best_cv holds the matching error rates.
f_best <- vector("list", length(X) + 1)
f_best_cv <- rep(NA_real_, length(X) + 1)

# Constant-only model and error rate
f_best[[1]] <- as.formula(paste(response, "~ 1"), env = .GlobalEnv)
f_best_cv[1] <- cv_fun(f_best[[1]])

# Predictors already in the model, in the order they were added. Keeping this
# list explicitly avoids having to recover the used terms by splitting the
# deparsed formula text, which stops working once the text is long enough for
# R to insert line breaks into it.
selected <- character(0)

for (i in seq_along(X) + 1) {
  # Unused controls
  unused <- setdiff(X, selected)

  # All models adding one unused control to the previous best model
  f_new <- lapply(unused, function(term) {
    as.formula(
      paste(response, "~", paste(c("1", selected, term), collapse = " + ")),
      env = .GlobalEnv
    )
  })

  # Error rate for all new models
  f_new_cv <- vapply(f_new, cv_fun, numeric(1))

  # Save best model and error rate (ties go to the first candidate)
  best <- which.min(f_new_cv)
  f_best[[i]] <- f_new[[best]]
  f_best_cv[i] <- f_new_cv[best]
  selected <- c(selected, unused[best])

  # Progress: number of predictors still unused
  print(length(X) - length(selected))
}

# Which model has the lowest error rate in the test data?

f_best_forward <- f_best[[which.min(f_best_cv)]]
f_best_forward
f_best_forward_error <- f_best_cv[[which.min(f_best_cv)]]
f_best_forward_error

```


## Examine the best model from the selection algorithm

It is important to see if the model can be tweaked to lower false negatives, since violent coups are catastrophic enough that we should err on the side of predicting too many, not too few.

To do so, we must look closely at the model's confusion matrices.

```{r selected_model, warning = FALSE}

# The selected model, given below, has a 30.0% error rate. This is better than previous models, but not ideal.
# (Figures quoted in these comments come from the author's original run; re-check them after re-knitting.)

# Formula transcribed from the stepwise output; each predictor is listed once.
logit_form_stepwise <- formula(were_others_than_the_incumbent_killed ~ 1 + rebel_coup + realized_coup + 
    unrealized + dissident_actions + internationally_mediated_transitions + 
    ambiguous_coup + were_other_security_officials_involved_in_the_coup + 
    were_religious_leaders_involved_in_the_coup + were_ordinary_citizens_involved + 
    winter + autumn + fifties + sixties + were_mercenaries_involved + 
    eighties + were_non_government_foreign_actors_involved + 
    were_organized_labor_activists_involved_in_the_coup + forced_resignation + 
    counter_coup + were_political_radicals_involved_in_the_coup + 
    aughties + were_business_leaders_involved_in_the_coup + 
    coup_conspiracies + military_coup + palace_coup)

lm_coup_stepwise <- glm(logit_form_stepwise, data = train, family = binomial(link = "logit"))

summary(lm_coup_stepwise)

# --- Training data, prediction threshold 0.25 ---
# predict() with newdata returns one probability per row of `train`, so the
# new columns always line up with the data frame.
train$step_logit <- predict(lm_coup_stepwise, newdata = train, type = "response")
train$step_violent_pred <- as.numeric(train$step_logit > 0.25)

# Share of training rows whose predicted class differs from the outcome
error_rate_train <- train %>%
  summarise(log_error_rate = mean(were_others_than_the_incumbent_killed != step_violent_pred))
error_rate_train

# Confusion matrix: rows = predicted class, columns = actual outcome
with(train, table(predicted = step_violent_pred,
                  actual = were_others_than_the_incumbent_killed))

# Number of violent coups in the training data (the denominator of the share
# quoted in the next comment)
sum(train$were_others_than_the_incumbent_killed)

# Comment: Despite an overall error rate that sounds poor (34.7%), the model produces relatively useful predictions, correctly predicting about 70% of violent coups (122/175).


# Test on test data

test$step_violent_pred <- as.numeric(
  predict(lm_coup_stepwise, newdata = test, type = "response") > 0.25
)

error_rate_test <- test %>%
  summarise(log_error_rate = mean(were_others_than_the_incumbent_killed != step_violent_pred))
error_rate_test

with(test, table(predicted = step_violent_pred,
                 actual = were_others_than_the_incumbent_killed))

# Comment: At a prediction threshold of 0.25, most violent coups (74%) are correctly classified (overall error rate: 23.6%), although there is an increase in false positives, such that false positives are about as numerous as true positives. Still, this could be a useful model.

```

## Tree-based Classification: Decision Tree Models

```{r tree, warning=FALSE}

# Use the full formula with all predictors employed in the full logistic regression: `logit_form_all_violence`

# Train the decision tree

tree_violence <- rpart(logit_form_all_violence, data = train)

tree_violence

# Visualize the tree

plot(tree_violence, uniform = TRUE)
text(tree_violence, pretty = 0)

# Prune the tree: keep the complexity parameter with the lowest
# cross-validated error in rpart's cp table

index <- which.min(tree_violence$cptable[, "xerror"])
tree_min <- tree_violence$cptable[index, "CP"]
prune_tree <- prune(tree_violence, cp = tree_min)

prune_tree

# Visualize the pruned tree

plot(prune_tree, uniform = TRUE)
text(prune_tree, pretty = 0)

prp(prune_tree, extra = 1, box.palette = "auto")

# Comment: A few branches were trimmed

# Predict using the test data and transform into predictions
# NOTE(review): predictions below come from the unpruned tree (`tree_violence`);
# `prune_tree` is only printed and plotted. Confirm this is intended.

# Numeric tree predictions for the test rows, computed once and reused for
# every threshold below
tree_prob <- predict(tree_violence, newdata = test)

# Start with the default 0.5 threshold
tree_predict <- as.numeric(tree_prob > 0.5)

# Mean squared error (MSE)

mean((tree_predict - test$were_others_than_the_incumbent_killed)^2)

# Comment: 32.4% MSE. Because both the predictions and the outcome are 0/1 here, this MSE is the same number as the misclassification rate used for the previous models.
# (Figures quoted in these comments come from the author's original run; re-check them after re-knitting.)

# Compare predicted violent coups with actual violent coups, i.e., the error rate on violent coups

sum(test$were_others_than_the_incumbent_killed)
sum(tree_predict)

# Net difference between actual and predicted counts
sum(test$were_others_than_the_incumbent_killed - tree_predict)

# Number of violent coups the tree failed to flag (false negatives)
test %>%
  mutate(tree_predict = tree_predict) %>%
  filter(were_others_than_the_incumbent_killed == 1) %>%
  summarise(sum(were_others_than_the_incumbent_killed - tree_predict))

# Comment: Of 68 violent coups, this model predicted 14, or 20.5%. This is slightly worse than the stepwise selected logistic regression. However, we can use a lower prediction threshold, as was done above.

# Lower the prediction threshold

tree_predict <- as.numeric(tree_prob > 0.25)

error_rate_test <- test %>%
  summarise(log_error_rate = mean(were_others_than_the_incumbent_killed != tree_predict))
error_rate_test

# Confusion matrix: rows = predicted class, columns = actual outcome
table(predicted = tree_predict,
      actual = test$were_others_than_the_incumbent_killed)

# Comment: With a lower prediction threshold (0.25), the error rate is still only 38.6%, and the vast majority of violent coups are predicted (72/78, or 92%), and the number of false positives is tolerable.

# Increase prediction threshold

tree_predict <- as.numeric(tree_prob > 0.5)

error_rate_test <- test %>%
  summarise(log_error_rate = mean(were_others_than_the_incumbent_killed != tree_predict))
error_rate_test

table(predicted = tree_predict,
      actual = test$were_others_than_the_incumbent_killed)

# Despite a slightly lower error rate, a threshold of 0.5 misses the vast majority of violent coups. 


```

## Using Random Forest to predict Coup Violence

Random Forest is an ensemble method that builds many decision trees, each of which considers only a random subset of the attributes at every split, to prevent overfitting.

```{r random, warning=FALSE}

# Number of predictors on the right-hand side of the shared model formula
n_predictors <- length(attr(terms(logit_form_all_violence), "term.labels"))

# randomForest only accepts an mtry between 1 and the number of predictors,
# so the value is derived from the formula instead of being hard-coded.
# With mtry equal to the number of predictors every split may use every
# predictor (i.e. bagging); pass a smaller mtry to get random feature
# subsetting at each split.
rf_fit <- randomForest(logit_form_all_violence, data = train, mtry = n_predictors, importance = TRUE)

summary(rf_fit)

# Visualize error rates on the training data (error against number of trees)

plot(rf_fit, main = "Errors By Tree Size, Random Forest Models, Coup Violence")

# Test: numeric predictions for the test rows, reused for both thresholds below

rf_test <- predict(rf_fit, newdata = test)

# MSE

rf_mse <- mean((rf_test - test$were_others_than_the_incumbent_killed)^2)

rf_mse


# Comment: At 19%, this is a good MSE result
# (Figures quoted in these comments come from the author's original run; re-check them after re-knitting.)

# Prediction Error Rate Test: 0.5 prediction threshold

rf_test_binary_1 <- as.numeric(rf_test > 0.5)

error_rate_test <- test %>%
  summarise(log_error_rate = mean(were_others_than_the_incumbent_killed != rf_test_binary_1))
error_rate_test

# Confusion matrix: rows = predicted class, columns = actual outcome
table(predicted = rf_test_binary_1,
      actual = test$were_others_than_the_incumbent_killed)


# Comment: Despite a good MSE, binary prediction results are worse. Less than 20% of violent coups are predicted. What happens if the prediction threshold is lowered to 0.25?

rf_test_binary_2 <- as.numeric(rf_test > 0.25)

error_rate_test <- test %>%
  summarise(log_error_rate = mean(were_others_than_the_incumbent_killed != rf_test_binary_2))
error_rate_test

table(predicted = rf_test_binary_2,
      actual = test$were_others_than_the_incumbent_killed)

# Comment: At this lower prediction level, the error rate rises slightly from 31% to 33%, but now 74% of violent coups are predicted.

# Determine node purity and MSE by predictor

# varImpPlot(rf_fit)
impToPlot <- importance(rf_fit, scale = FALSE)

# write.csv(impToPlot,"varImpPlot_violence.csv", row.names = TRUE)



```

*Conclusion*: Machine learning models are partly effective in predicting coup violence. A logistic regression selected by a forward stepwise algorithm, with the prediction threshold lowered to 0.25, reduced the number of false negatives (our main goal) to <10%, while the overall error rate was only 23%. The decision tree classification model also performed reasonably well when the prediction threshold was again lowered. The Random Forest model was the least effective in this case.