econ_predictors_of_coups.Rmd

---
title: "Machine Learning Models to Predict Coups Using Economic Data "
date: December 16, 2019
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```


## Getting started

*Introduction*: The goal of this script is to find the strongest predictors that a country will have a coup in a given year, and then to build the best possible predictive model of coup occurrence.

Two datasets are used in these models. First, I have used a coup-focused dataset publicly available from the Cline Center at the University of Illinois, which includes over 50 coup features for more than 1,000 coups since the 1940s. Second, I have used a historical economic dataset covering the same period from the University of Groningen.

```{r load}

library(tidyverse)
require(stargazer)
library(stats)
library(dplyr)
library(janitor)
library(boot)
library(knitr)
library(rpart)
library(rpart.plot)
library(ggplot2)
library(randomForest)
library(gbm)
library(Hmisc)

# Create a dataframe: coup_df

# Read the coup events file and normalise its headers: whitespace becomes "_"
# and everything is lower-cased. Plain assignment is used here because the
# compound-assignment pipe `%<>%` lives in magrittr, which is not attached by
# any of the library() calls above.
coup_csv <- read_csv("coupdata.csv")
colnames(coup_csv) <- colnames(coup_csv) %>%
  stringr::str_replace_all("\\s", "_") %>%
  tolower()

# Keep only events with a recorded `realized_coup` value, then let
# janitor::clean_names() tidy whatever irregular column names remain.
coup_df <- data.frame(coup_csv) %>%
  filter(!is.na(realized_coup)) %>%
  clean_names()

# Manually edit colnames of dataframe (correct misspelled source headers).
# If a misspelled name is absent, the assignment is a no-op.

names(coup_df)[names(coup_df) == "ambigious_coup"] <- "ambiguous_coup"
names(coup_df)[names(coup_df) == "were_others_then_the_incumbent_injured"] <-
  "were_others_than_the_incumbent_injured"
```

## Construct potentially relevant variables

New binary category variables include decade (seventies, sixties, etc.) and season (spring, summer and so on).

```{r variables, warning=FALSE}

# Eliminate nulls from the dataset

sum(is.na(coup_df))

# Store citizen involvement as numeric and recode only the missing entries
# as 0; values that were recorded are kept as they are.
coup_df <- coup_df %>%
  mutate(
    were_ordinary_citizens_involved =
      coalesce(as.numeric(were_ordinary_citizens_involved), 0)
  )

sum(is.na(coup_df$were_ordinary_citizens_involved))

# Comment: We assume that, except where ordinary citizens were known to be
# involved, all other events should be coded as having negligible or no
# ordinary citizen involvement.

# Code coup type as numeric instead of character ("conspiracy", "attempt," etc.)
# NOTE(review): gsub() substitutes substrings, so any label that is not exactly
# "conspiracy", "attempt" or "coup" becomes NA in as.numeric() -- confirm the
# labels actually present in `type_of_coup`.

coup_df <- coup_df %>%
  mutate(coup_type_numeric = type_of_coup)

coup_df$coup_type_numeric <- gsub("conspiracy", "1", coup_df$coup_type_numeric)
coup_df$coup_type_numeric <- gsub("attempt", "2", coup_df$coup_type_numeric)
coup_df$coup_type_numeric <- gsub("coup", "3", coup_df$coup_type_numeric)
coup_df$coup_type_numeric <- as.numeric(coup_df$coup_type_numeric)

# Create season variables

head(coup_df$month_of_event)
typeof(coup_df$month_of_event)

# Each season is a 0/1 flag based on set membership of the event month.
# `%in%` never returns NA, so events with a missing month get 0 everywhere.
coup_df <- coup_df %>%
  mutate(
    winter = as.numeric(month_of_event %in% c(12, 1, 2)),
    spring = as.numeric(month_of_event %in% 3:5),
    summer = as.numeric(month_of_event %in% 6:8),
    autumn = as.numeric(month_of_event %in% 9:11)
  )

sum(coup_df$summer) == sum(coup_df$spring)

# Create decade variables

typeof(coup_df$year)
head(coup_df$year)

# One 0/1 flag per decade; rows with a missing year fall through to 0.
coup_df <- coup_df %>%
  mutate(
    forties   = case_when(year > 1939 & year < 1950 ~ 1, TRUE ~ 0),
    fifties   = case_when(year > 1949 & year < 1960 ~ 1, TRUE ~ 0),
    sixties   = case_when(year > 1959 & year < 1970 ~ 1, TRUE ~ 0),
    seventies = case_when(year > 1969 & year < 1980 ~ 1, TRUE ~ 0),
    eighties  = case_when(year > 1979 & year < 1990 ~ 1, TRUE ~ 0),
    nineties  = case_when(year > 1989 & year < 2000 ~ 1, TRUE ~ 0),
    aughties  = case_when(year > 1999 & year < 2010 ~ 1, TRUE ~ 0)
  )

# Number of coup events recorded in each decade
sum(coup_df$forties)
sum(coup_df$fifties)
sum(coup_df$sixties)
sum(coup_df$seventies)
sum(coup_df$eighties)
sum(coup_df$nineties)
sum(coup_df$aughties)

```

## Load economic history dataset as dataframe

```{r econ_df, warning=FALSE}

# Read the economic history file and normalise its headers (whitespace -> "_",
# lower case). Plain assignment replaces `%<>%`, which belongs to magrittr and
# is not attached by the packages loaded in this document.
econ_csv <- read_csv("pwt91.csv")
colnames(econ_csv) <- colnames(econ_csv) %>%
  stringr::str_replace_all("\\s", "_") %>%
  tolower()
econ_df <- data.frame(econ_csv)

```


## Construct relevant variables: population growth, GDP per capita, per capita GDP growth, employment rate

```{r econ_variables, warning=FALSE}

# All derived series are computed per country. lag() works on row position,
# so rows are first ordered by year within country; the grouping is dropped
# afterwards so later steps operate on an ungrouped table.
#
#   popGrowth             - population growth over the past 5 years
#   gdpPerCap             - GDP per capita; uses rgdpe (expenditure side), not
#                           the output-side series, as the standard measure
#   fiveYrPerCapitaChange - cumulative per capita GDP growth over previous
#                           years, excluding the current year
#   empRate               - employment rate (emp / pop)
#   laggedEmpRate         - employment rate lagged by one year
#   threeYrXRchange       - cumulative change in exchange rate over previous
#                           years, excluding the current year
#
# NOTE(review): lag(., 1) / lag(., 5) spans four annual changes (t-5 to t-1)
# and lag(., 1) / lag(., 3) spans two (t-3 to t-1) -- confirm these are the
# intended "five year" and "three year" windows.

econ_df <- econ_df %>%
  arrange(country, year) %>%
  group_by(country) %>%
  mutate(
    popGrowth = pop / dplyr::lag(pop, n = 5) - 1,
    gdpPerCap = rgdpe / pop,
    fiveYrPerCapitaChange =
      dplyr::lag(gdpPerCap, n = 1) / dplyr::lag(gdpPerCap, n = 5) - 1,
    empRate = emp / pop,
    laggedEmpRate = dplyr::lag(empRate, n = 1),
    threeYrXRchange = dplyr::lag(xr, n = 1) / dplyr::lag(xr, n = 3) - 1
  ) %>%
  ungroup()

# Quick sanity checks on the constructed series
mean(econ_df$fiveYrPerCapitaChange, na.rm = TRUE)

mean(econ_df$laggedEmpRate, na.rm = TRUE)

# NOTE(review): this summarises the raw exchange rate `xr`, not the derived
# threeYrXRchange -- confirm which one was meant.
mean(econ_df$xr, na.rm = TRUE)

# Export dataframe as csv for Tableau or Python

# write.csv(econ_df,"econ_data_tableau.csv", row.names = FALSE)

```

## Join coup dataframe with economic history dataframe

```{r export, warning=FALSE}

# Country-years that appear in BOTH tables (coup events with economic data).
# The object deliberately keeps the name `inner_join` because later chunks
# refer to it; the function is called with its namespace to avoid ambiguity.
inner_join <- dplyr::inner_join(coup_df, econ_df, by = c("year", "country"))

head(inner_join)

# Full outer join: every coup event and every economic country-year, matched
# on year and country where possible.
joined_data <- merge(coup_df, econ_df, by = c("year", "country"), all = TRUE)

# joined_data <- left_join(coup_df, econ_df, by = c("year" = "year", "country" = "country"))

colnames(joined_data)
sum(is.na(joined_data))


## Add and improve variables to indicate that a coup event happened or was successful

# A row carries a coup event exactly when it has a coup_id; rows that come
# only from the economic table have coup_id = NA and are coded 0.
joined_data <- joined_data %>%
  mutate(coup_binary = if_else(is.na(coup_id), 0, 1))

sum(joined_data$coup_binary)
mean(joined_data$coup_binary)

# Rows without a coup event have no realized coup either
joined_data$realized_coup[is.na(joined_data$realized_coup)] <- 0
mean(joined_data$realized_coup)

## Export joined dataframes as csv

# write.csv(joined_data,"joined_data.csv", row.names = FALSE)
# write.csv(inner_join,"inner_join.csv", row.names = FALSE)


```



## Split the data into train and test

```{r split, warning=FALSE}

# 70/30 train/test split of each joined table.
# A temporary row id identifies the sampled rows, and the test set is every
# row whose id was not sampled. Splitting on row identity (rather than on row
# contents) keeps duplicate rows and rows that happen to equal a training row.
# The seed and sample_frac() call are unchanged, so the training samples are
# the same as before.

set.seed(1)
joined_rows <- joined_data %>% mutate(.row_id = row_number())
train_full <- sample_frac(joined_rows, 0.7)
test_full <- joined_rows %>%
  anti_join(train_full, by = ".row_id") %>%
  dplyr::select(-.row_id)
train_full <- train_full %>% dplyr::select(-.row_id)

set.seed(1)
inner_rows <- inner_join %>% mutate(.row_id = row_number())
train_inner <- sample_frac(inner_rows, 0.7)
test_inner <- inner_rows %>%
  anti_join(train_inner, by = ".row_id") %>%
  dplyr::select(-.row_id)
train_inner <- train_inner %>% dplyr::select(-.row_id)


```

## Create a simple logistic regression model of employment rate (employed people / population) to predict coup attempts of any kind

```{r employment, warning=FALSE}

# Keep only rows where both the outcome and the single predictor are known
train_emp_coup <- train_full %>%
  filter(!is.na(coup_binary), !is.na(laggedEmpRate))

nrow(train_emp_coup)
sum(train_emp_coup$coup_binary)
mean(train_emp_coup$laggedEmpRate)

test_emp_coup <- test_full %>%
  filter(!is.na(coup_binary), !is.na(laggedEmpRate))

# Build the regression model

empRate_form <- formula(coup_binary ~ laggedEmpRate)

empRate_lm <- glm(empRate_form, data = train_emp_coup, family = binomial(link = "logit"))

summary(empRate_lm)

# Comment: A country's employment rate (total employment / population) is strongly and very significantly associated with a reduced chance of a coup event

# Fitted probabilities are obtained with predict() on the training rows so
# they stay aligned with the data frame row by row; a probability above 0.5
# counts as a predicted coup.
train_emp_coup$empRate_logit <- as.numeric(
  predict(empRate_lm, newdata = train_emp_coup, type = "response")
)
train_emp_coup <- train_emp_coup %>%
  mutate(empRate_pred = as.numeric(empRate_logit > 0.5))

# Share of training rows where prediction and outcome disagree
error_rate_train <- train_emp_coup %>%
  summarise(log_error_rate = mean(coup_binary != empRate_pred))
error_rate_train

# Confusion matrix: rows are predictions, columns are actual outcomes
train_emp_coup %>%
  count(coup_binary, empRate_pred) %>%
  pivot_wider(names_from = coup_binary, values_from = n)

sum(train_emp_coup$coup_binary)

# Comment: The model had a 7% failure rate in the training data, but only because coups are rare, so any model that predicts a coup *won't ever occur* is a safe bet. This model is therefore not useful.


```

## Create a full logistic regression model of all potential economic predictors to predict coup attempts of any kind

```{r full_model, warning=FALSE}

# Filter data to remove null values of potential predictors.
# Every variable that appears in the model formula is listed, including
# gdpPerCap, so glm() never has to drop rows itself and the fitted values
# stay the same length as the data.

train_all <- train_full %>%
  drop_na(coup_binary, popGrowth, fiveYrPerCapitaChange, laggedEmpRate,
          threeYrXRchange, gdpPerCap)

test_all <- test_full %>%
  drop_na(coup_binary, popGrowth, fiveYrPerCapitaChange, laggedEmpRate,
          threeYrXRchange, gdpPerCap)

# Build the full logistic regression model

logit_form_all_econ <- formula(coup_binary ~ popGrowth + fiveYrPerCapitaChange + laggedEmpRate + threeYrXRchange + gdpPerCap)

lm_all_econ <- glm(logit_form_all_econ, data = train_all, family = binomial(link = "logit"))

stargazer(lm_all_econ, type = "text")

# Fitted probability per training row; above 0.5 counts as a predicted coup
train_all$all_logit <- as.numeric(
  predict(lm_all_econ, newdata = train_all, type = "response")
)
train_all <- train_all %>%
  mutate(all_econ_pred = as.numeric(all_logit > 0.5))

# Share of training rows where prediction and outcome disagree
error_rate_train <- train_all %>%
  summarise(log_error_rate = mean(coup_binary != all_econ_pred))
error_rate_train

# Confusion matrix: rows are predictions, columns are actual outcomes
train_all %>%
  count(coup_binary, all_econ_pred) %>%
  pivot_wider(names_from = coup_binary, values_from = n)

sum(train_all$coup_binary)

# Comment: The model had only a 6.5% failure rate in the training data, but it only predicted 3 coups. In fact, 380 occurred.


```

## Forward Stepwise Model Selection Method: Choosing the best combination of predictors

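The following forward stepwise algorithm starts from an intercept-only logistic regression and, at each step, adds the single unused predictor that gives the lowest error rate on the test data. It then selects the model with the lowest error rate among the models visited; it does not test every possible combination of predictors.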

```{r stepwise, warning = FALSE}

# Function to calculate error rate for any logistic regression model
# Score a candidate formula: fit a logistic regression on `train_all`, predict
# on `test_all` with a 0.5 probability cut-off, and return the share of test
# rows that are misclassified.
#
# f: a model formula with `coup_binary` on the left-hand side.
# Returns a single number between 0 and 1.
cv_fun <- function(f) {
  candidate_fit <- glm(f, data = train_all, family = binomial(link = "logit"))
  test_prob <- predict(candidate_fit, newdata = test_all, type = "response")
  predicted_coup <- as.numeric(test_prob > 0.5)
  agrees <- test_all$coup_binary == predicted_coup
  # Only a definite match counts as correct; a mismatch or an NA comparison
  # is scored as an error.
  mean(!(agrees %in% TRUE))
}

# Initial settings
# X holds the candidate predictors that have not yet entered the model.
X <- list("popGrowth", "fiveYrPerCapitaChange", "laggedEmpRate", "threeYrXRchange", "gdpPerCap")

length(X)
# f_best[[i]] is the best formula found at step i (step 1 is the
# intercept-only model) and f_best_cv[[i]] is its error rate from cv_fun().
# Both have one slot per predictor plus one for the intercept-only model.
f_best = vector("list", length(X) + 1)
f_best_cv = vector("list", length(X) + 1)
i <- 1

# Each pass after the first adds exactly one predictor to the previous best
# model. The loop stops once X has been reduced to a single remaining
# candidate and that candidate has been added.
while(length(X) > 1) {
  if (i > 1) {
    # Best model from previous iteration
    f_old <- f_best[[i-1]]
    
    # Unused controls: split the right-hand side of f_old on " + " and remove
    # those terms from the remaining candidates
    X <- as.character(f_old)[3] %>% 
      strsplit(split = " + ", fixed = TRUE) %>% 
      unlist() %>% 
      setdiff(X, .)
    
    # All models adding one unused control to f_old
    f_new <- paste(as.character(f_old)[2], "~", as.character(f_old)[3], "+", X) %>% 
      lapply(as.formula, env = .GlobalEnv)
    
    # Error rate (from cv_fun) for all new models
    f_new_cv <- lapply(f_new, cv_fun)
    
    # Save the best new model and its error rate
    f_best[[i]] <- f_new[[which.min(f_new_cv)]]
    f_best_cv[[i]] <- f_new_cv[[which.min(f_new_cv)]]
  } else {
    # Constant-only model and its error rate
    f_best[[i]] <- formula(coup_binary ~ 1)
    f_best_cv[[i]] <- cv_fun(formula(coup_binary ~ 1))
  }
  i <- i + 1
  print(length(X))
}

# Which model has the lowest error rate in the test data?
# NOTE(review): the error rate used for selection is computed by cv_fun() on
# test_all, so the winning model's error is not an independent estimate.

f_best_forward <- f_best[[which.min(f_best_cv)]]
f_best_forward
f_best_forward_error <- f_best_cv[[which.min(f_best_cv)]]
f_best_forward_error


```


## Examine the best model from the selection algorithm

It is important to see if the model can be tweaked to lower false negatives, since coups are rare and significant enough that we should err on the side of predicting too many, not too few. This tweaking can be done by changing the threshold at which a value counts as a positive prediction from 0.5 to something lower, like 0.25.

To do so, we must look closely at the model's confusion matrices.

```{r selected_model, warning = FALSE}

# The model, given below, has a low error rate and significant p-values, but this is mostly because the model recognized which countries would NOT have coups. By contrast, the model only successfully predicted 2 coups out of 380. Lowering the prediction threshold could be appropriate. As a result, I have marked fitted values above 0.1 as a prediction, coded as `1`.

# Probability cut-off above which a row counts as a predicted coup; defined
# once so the training and test evaluations cannot drift apart.
step_threshold <- 0.1

logit_form_stepwise <- formula(coup_binary ~ 1 + fiveYrPerCapitaChange + laggedEmpRate)

lm_econ_stepwise <- glm(logit_form_stepwise, data = train_all, family = binomial(link = "logit"))

summary(lm_econ_stepwise)

# --- Training data ---

# Fitted probabilities via predict() so they stay aligned with the rows
train_all$step_logit <- as.numeric(
  predict(lm_econ_stepwise, newdata = train_all, type = "response")
)
train_all <- train_all %>%
  mutate(step_econ_pred = as.numeric(step_logit > step_threshold))

error_rate_train <- train_all %>%
  summarise(log_error_rate = mean(coup_binary != step_econ_pred))
error_rate_train

# Confusion matrix: rows are predictions, columns are actual outcomes
train_all %>%
  count(coup_binary, step_econ_pred) %>%
  pivot_wider(names_from = coup_binary, values_from = n)

mean(train_all$coup_binary)
sum(train_all$coup_binary)

# --- Test data ---

test_all$step_logit <- as.numeric(
  predict(lm_econ_stepwise, newdata = test_all, type = "response")
)
test_all <- test_all %>%
  mutate(step_econ_pred = as.numeric(step_logit > step_threshold))

error_rate_test <- test_all %>%
  summarise(log_error_rate = mean(coup_binary != step_econ_pred))
error_rate_test

test_all %>%
  count(coup_binary, step_econ_pred) %>%
  pivot_wider(names_from = coup_binary, values_from = n)

sum(test_all$coup_binary)

# Comment: With a lower threshold (0.1), the model predicts far more coups, but still misses 2/3 of coup attempts. In addition,
# false positives increased dramatically, at a ratio of 7:1 against true positives. The model is better than useless, though.


```

## Tree-based Classification: Decision Tree Models

```{r tree, warning=FALSE}

# Use the full formula with all predictors employed in the full logistic regression: `logit_form_all_econ`

# Train the decision tree
# NOTE(review): coup_binary is numeric and no `method` is given; per the rpart
# documentation a numeric response is fitted as a regression ("anova") tree,
# so predictions below are node means that are then thresholded -- confirm
# this is intended rather than method = "class".

tree_econ <- rpart(logit_form_all_econ, data = train_all)

tree_econ

# Visualize the tree

plot(tree_econ, uniform = TRUE)
text(tree_econ, pretty = 0)

# Prune the tree at the complexity parameter with the lowest
# cross-validated error in rpart's cptable

index <- which.min(tree_econ$cptable[, "xerror"])
tree_min <- tree_econ$cptable[index, "CP"]
prune_tree <- prune(tree_econ, cp = tree_min)

prune_tree

# Comment: No pruning is needed

# Visualize the pruned tree

plot(prune_tree, uniform = TRUE)
text(prune_tree, pretty = 0)

# Predict using the test data and transform into 0/1 predictions (cut-off 0.5).
# NOTE(review): predictions come from the unpruned `tree_econ`; if pruning
# ever changes the tree, consider predicting from `prune_tree` instead.

tree_predict <- predict(tree_econ, newdata = test_all)
tree_predict <- as.numeric(tree_predict > 0.5)

# Mean squared error (MSE)

mean((tree_predict - test_all$coup_binary)^2)

# Comment (original run): 32.4% MSE.
# NOTE(review): with 0/1 predictions and a 0/1 outcome, each squared error is
# 0 or 1, so this MSE is numerically the same quantity as the misclassification
# rate computed below.

# Error rate and confusion matrix on the test data, using the same 0.5
# cut-off predictions as above

error_rate_test <- test_all %>%
  summarise(log_error_rate = mean(coup_binary != tree_predict))
error_rate_test

# Confusion matrix: rows are predictions, columns are actual outcomes
test_all %>%
  mutate(tree_predict = tree_predict) %>%
  count(coup_binary, tree_predict) %>%
  pivot_wider(names_from = coup_binary, values_from = n)

# Comment: This model has minimized its error rate, but also predicted only 9 coups in thousands of rows, of which 3 occurred. In fact, there were 158.

```


# *Conclusion*: Economic predictors, or at least the ones I have constructed, cannot be used to successfully predict coups on their own. However, logistic regression models for coup attempts (`coup_binary`) are very statistically significant, and the variables selected by the forward stepwise selection algorithm were 5-year growth in GDP per capita and employment as a share of population, with the latter having the larger coefficient. When both are high, a coup event is less likely.