DOMin_Mean_Range_trends.Rmd

---
title: "Assessing Climate Change Effects in VA Reservoirs"
subtitle: "Part 3: Deriving Trends for Dissolved Oxygen (DOMax, DOMin, DOMean, DORange)"
author: "Andrew Cameron"
output:  
  html_document:
    toc: true
    toc_depth: 4
    toc_float: true
---

```{r setup, include=TRUE, warning=FALSE, message=FALSE}
knitr::opts_chunk$set(echo = TRUE)

# Install any missing packages
install_packages <- function(packages) {
  new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
  if(length(new_packages) > 0) {
    install.packages(new_packages, dependencies = TRUE)
  }
}

packages <- c("ggplot2", "tidyverse", "mblm")
install_packages(packages)

library(ggplot2)
library(tidyverse)
library(mblm)
```

## Data Preprocessing

```{r `read in data`}
trendSites.df <- openxlsx::read.xlsx("input_data/data_original.xlsx", sheet = 2)
allData.df <- openxlsx::read.xlsx("input_data/data_original.xlsx", sheet = 3)

# Convert `date` column from Excel encoded date to a more legible date format. Otherwise date shows as numeric value, e.g. '44230'.
    allData.df$Date <- allData.df$Date * 86400      # 86400 = seconds in a day.
    allData.df$Date <- as.POSIXct(allData.df$Date, origin = "1899-12-30", tz = "UTC")
    
```

```{r 'remove anomlies'}
# a surface temperature (Tmax) of 2.8 C in August
# 13774 	6ACNR000.00 	8/28/1990 	0.3 	NA 	NA 	NA 	NA

which(grepl("13774", allData.df$X1))
allData.df[13770, (5:8)] <- NA

```

```{r `subset and filter data`}
station_IDS <- trendSites.df$FDT_STA_ID
excluded_sites <- c("2-JKS053.48",
                    "2-XDD000.40",
                    "4AROA192.55",
                    "4AROA196.05",
                    "5ASRN000.66",
                    "6ACNR000.00",
                    "6APNR008.15") # only surface water measurements ever taken

# subset 34 trend sites (41 - 7 excluded)
trendsites <- allData.df %>%
  mutate(MonthNum = lubridate::month(Date),
         Year = lubridate::year(Date)) %>%
  filter(FDT_STA_ID %in% station_IDS) %>%
  filter(!FDT_STA_ID %in% excluded_sites) %>%
  filter(MonthNum %in% 5:10)

# apply filters (based on 'Filter' column in "Station Specific Trend Sites.xlsx")
specialFiltersNeeded <- c("4ABWR017.42", "4AROA038.49", "4AROA192.94", "9-NEW098.32", "9-PKC004.16")

  special1 <- trendsites %>%
    filter(FDT_STA_ID == "4ABWR017.42") %>%
    filter(MaxDepth.x >= 5)
  
  special2 <- trendsites %>%
    filter(FDT_STA_ID == "4AROA038.49") %>%
    filter(MaxDepth.x >= 6)
  
  special3 <- trendsites %>%
    filter(FDT_STA_ID == "4AROA192.94") %>%
    filter(MaxDepth.x >= 3)
  
  special4 <- trendsites %>%
    filter(FDT_STA_ID == "9-NEW098.32") %>%
    filter(MaxDepth.x >= 6)
  
  special5 <- trendsites %>%
    filter(FDT_STA_ID == "9-PKC004.16") %>%
    filter(MaxDepth.x >= 3)


sites_less85max <- trendsites %>%
  filter(!FDT_STA_ID %in% specialFiltersNeeded) %>%
  filter(MaxDepth.x >= .85*MeanDepth)

# Bring back into single df
subset.df <- rbind(sites_less85max, special1, special2, special3, special4, special5)


# "add additional filter excluding trend results for profiles where NObs<10 (I count there are ~7 of these)"
# applying this filter to the df for deriving trends (`subset.df`) complicates the code substantially because it results in there not being a corresponding MonthNum 5:10 for each station ID. Filter applied after the trends are derived but before plotting. 

countPerGroup <- subset.df %>%
  group_by(FDT_STA_ID, MonthNum) %>%
  summarise(Nobs = n()) %>%
  mutate(key = paste(FDT_STA_ID, MonthNum, sep="_"))

insufficient_Nobs <- countPerGroup %>%
  filter(Nobs < 10)

insufficient_Nobs <- insufficient_Nobs$key

```

## Minimum Dissovled Oxygen

```{r `derive DOMin by month` }
monthlyDOMin <- subset.df %>%
  group_by(FDT_STA_ID, MonthNum) %>%
  summarize(Mean = mean(DOMin, na.rm = TRUE),
            Min = min(DOMin, na.rm = TRUE),
            Max = max(DOMin, na.rm = TRUE),
            Nobs = n(),
            .groups = 'drop') %>%
  mutate(Month = month.name[MonthNum])

 # write_csv(monthlyDOMin, "output_data/DOMin_stats_monthlyByStation.csv")

cols_to_keep <- colnames(subset.df)

df.DOMin <- subset.df %>%
  left_join(monthlyDOMin, by = c("FDT_STA_ID", "MonthNum")) %>%
  select(all_of(cols_to_keep), Mean) 

```

```{r `DOMin fit regressions for each month-station`, warning=FALSE}
# fit a linear regression for DOMean vs. year by month-station using the mblm function (median based linear model) as this is thought to be less sensitive to outliers.

#Usage
#mblm(formula, dataframe, repeated = TRUE)
  #Arguments
    #formula A formula of type y ~ x (only linear models are accepted)
    #dataframe Optional dataframe
    #repeated If set to true, model is computed using repeated medians. If false, a single median estimators are calculated

# remove NA values -- mblm() does not handle NA as lm() does
df.DOMin <- df.DOMin %>%
  filter(!is.na(DOMin))

# create list to store regression results
modelSummaries <- list()

for (station_id in unique(df.DOMin$FDT_STA_ID)) {
  for (month_num in 5:10) {
    
    month_station <- df.DOMin %>%
      filter(FDT_STA_ID == station_id & MonthNum == month_num)

      model <- mblm(DOMin ~ Year, data = month_station)
      mod.sum <- summary.mblm(model)
      
      # store results
      modelSummaries[[paste(station_id, month_num, sep = "_")]] <- list(
        slope = mod.sum$coefficients[2,1],
        MAD = mod.sum$coefficients["Year", "MAD"] ,
        pvalue = mod.sum$coefficients["Year", 4],
        intercept = mod.sum$coefficients[1,1]
        )
  }
}

## --------DOMin, with 1975 as y intercept-------------
q <- df.DOMin %>%
  mutate(Year1975 = Year - 1975) %>%
  filter(MonthNum == 8)

modelSummaries_DOMin1975 <- list()

for (station_id in unique(q$FDT_STA_ID)) {
      data = subset(q, FDT_STA_ID == station_id)
      model <- mblm(DOMin ~ Year1975, data = data)
      mod.sum <- summary.mblm(model)
      
      # store results
      modelSummaries_DOMin1975[[paste(station_id)]] <- list(
        slope = mod.sum$coefficients[2,1],
        MAD = mod.sum$coefficients["Year1975", "MAD"] ,
        pvalue = mod.sum$coefficients["Year1975", 4],
        intercept = mod.sum$coefficients[1,1]
        )
  }


# mblm does not return a standard error. Instead, summary.mblm can be used to extract the MAD or Median Absolute Deviation." It's a robust measure of variability that is less sensitive to outliers than the standard deviation, which is commonly used in traditional statistical analyses. 

```

```{r `DOMin add reg stats to summary df`}
# create station-month key
monthlyDOMin <- monthlyDOMin %>%
  mutate( key = paste(FDT_STA_ID, MonthNum, sep="_"),
          model_slope = NA,
          model_MAD = NA,
          model_pval = NA,
          model_intercept = NA)

for (i in 1:nrow(monthlyDOMin)) {
  key = monthlyDOMin$key[i]
  
  monthlyDOMin$model_slope[i] <-  modelSummaries[[key]]$slope
   monthlyDOMin$model_MAD[i] <-  modelSummaries[[key]]$MAD
    monthlyDOMin$model_pval[i] <-  modelSummaries[[key]]$pvalue
      monthlyDOMin$model_intercept[i] <-  modelSummaries[[key]]$intercept
  
}

DOMin_final <- monthlyDOMin %>%
  filter(!key %in% insufficient_Nobs)

#--------------------------------------------
# reg stats for August DOMin non-normalized with 1975 as y intercept
monthly_DOMin1975 <- monthlyDOMin %>%
  mutate( model_slope = NA,
          model_MAD = NA,
          model_pval = NA,
          model_intercept = NA) %>%
  filter(MonthNum == 8)

for (i in 1:nrow(monthly_DOMin1975)) {
  key = monthly_DOMin1975$FDT_STA_ID[i]
  
  monthly_DOMin1975$model_slope[i] <-  modelSummaries_DOMin1975[[key]]$slope
   monthly_DOMin1975$model_MAD[i] <-  modelSummaries_DOMin1975[[key]]$MAD
    monthly_DOMin1975$model_pval[i] <-  modelSummaries_DOMin1975[[key]]$pvalue
     monthly_DOMin1975$model_intercept[i] <-  modelSummaries_DOMin1975[[key]]$intercept
  
}


DOMin1975_final <- monthly_DOMin1975 %>%
  filter(!key %in% insufficient_Nobs)

#------------------------------------------------
write_csv(DOMin_final, "output_data/DOMin_mblmModelStatistics.csv")

# do any month-stations return a 0 slope with sig pval? In this case, no
wonky_results <- DOMin_final %>%
  filter(model_slope == 0 & model_pval <= 0.05)

paste(nrow(wonky_results), "station-months returned a slope of 0 with a significant p-value in DOMin trend analysis.", sep=" ")

```

## Mean Dissovled Oxygen

```{r `derive DOMean by month` }
monthlyDOMean <- subset.df %>%
  group_by(FDT_STA_ID, MonthNum) %>%
  summarize(Mean = mean(DOMean, na.rm = TRUE),
            Min = min(DOMean, na.rm = TRUE),
            Max = max(DOMean, na.rm = TRUE),
            Nobs = n(),
            .groups = 'drop') %>%
  mutate(Month = month.name[MonthNum])

 # write_csv(monthlyDOMean, "output_data/DOMean_stats_monthlyByStation.csv")

cols_to_keep <- colnames(subset.df)

df.DOMean <- subset.df %>%
  left_join(monthlyDOMean, by = c("FDT_STA_ID", "MonthNum")) %>%
  select(all_of(cols_to_keep), Mean)
```

```{r `DOMean fit regressions for each month-station`, warning = FALSE}
# fit a linear regression for DOMean vs. year by month-station using the mblm function (median based linear model) as this is thought to be less sensitive to outliers.

#Usage
#mblm(formula, dataframe, repeated = TRUE)
  #Arguments
    #formula A formula of type y ~ x (only linear models are accepted)
    #dataframe Optional dataframe
    #repeated If set to true, model is computed using repeated medians. If false, a single median estimators are calculated

# remove NA values -- mblm() does not handle NA as lm() does
df.DOMean <- df.DOMean %>%
  filter(!is.na(DOMean))


# create list to store regression results
modelSummaries <- list()

for (station_id in unique(df.DOMean$FDT_STA_ID)) {
  for (month_num in 5:10) {
    
    month_station <- df.DOMean %>%
      filter(FDT_STA_ID == station_id & MonthNum == month_num)

      model <- mblm(DOMean ~ Year, data = month_station)
      mod.sum <- summary.mblm(model)
      
      # store results
      modelSummaries[[paste(station_id, month_num, sep = "_")]] <- list(
        slope = mod.sum$coefficients[2,1],
        MAD = mod.sum$coefficients["Year", "MAD"] ,
        pvalue = mod.sum$coefficients["Year", 4],
        intercept = mod.sum$coefficients[1,1]
        )
  }
}

## --------DOMin, with 1975 as y intercept-------------
q <- df.DOMean %>%
  mutate(Year1975 = Year - 1975) %>%
  filter(MonthNum == 8)

modelSummaries_DOMean1975 <- list()

for (station_id in unique(q$FDT_STA_ID)) {
      data = subset(q, FDT_STA_ID == station_id)
      model <- mblm(DOMean ~ Year1975, data = data)
      mod.sum <- summary.mblm(model)
      
      # store results
      modelSummaries_DOMean1975[[paste(station_id)]] <- list(
        slope = mod.sum$coefficients[2,1],
        MAD = mod.sum$coefficients["Year1975", "MAD"] ,
        pvalue = mod.sum$coefficients["Year1975", 4],
        intercept = mod.sum$coefficients[1,1]
        )
  }


# mblm does not return a standard error. Instead, summary.mblm can be used to extract the MAD or Median Absolute Deviation." It's a robust measure of variability that is less sensitive to outliers than the standard deviation, which is commonly used in traditional statistical analyses. 

```

```{r `DOMean add reg stats to summary df`}
# create station-month key
monthlyDOMean <- monthlyDOMean %>%
  mutate( key = paste(FDT_STA_ID, MonthNum, sep="_"),
          model_slope = NA,
          model_MAD = NA,
          model_pval = NA,
          model_intercept = NA)

for (i in 1:nrow(monthlyDOMean)) {
  key = monthlyDOMean$key[i]
  
  monthlyDOMean$model_slope[i] <-  modelSummaries[[key]]$slope
   monthlyDOMean$model_MAD[i] <-  modelSummaries[[key]]$MAD
    monthlyDOMean$model_pval[i] <-  modelSummaries[[key]]$pvalue
    monthlyDOMean$model_intercept[i] <-  modelSummaries[[key]]$intercept
  
}


DOMean_final <- monthlyDOMean %>%
  filter(!key %in% insufficient_Nobs)

#--------------------------------------------
# reg stats for August DOMean non-normalized with 1975 as y intercept
monthly_DOMean1975 <- monthlyDOMean %>%
  mutate( model_slope = NA,
          model_MAD = NA,
          model_pval = NA,
          model_intercept = NA) %>%
  filter(MonthNum == 8)

for (i in 1:nrow(monthly_DOMean1975)) {
  key = monthly_DOMean1975$FDT_STA_ID[i]
  
  monthly_DOMean1975$model_slope[i] <-  modelSummaries_DOMean1975[[key]]$slope
   monthly_DOMean1975$model_MAD[i] <-  modelSummaries_DOMean1975[[key]]$MAD
    monthly_DOMean1975$model_pval[i] <-  modelSummaries_DOMean1975[[key]]$pvalue
     monthly_DOMean1975$model_intercept[i] <-  modelSummaries_DOMean1975[[key]]$intercept
  
}


DOMean1975_final <- monthly_DOMean1975 %>%
  filter(!key %in% insufficient_Nobs)

#------------------------------------------------

write_csv(DOMean_final, "output_data/DOMean_mblmModelStatistics.csv")

# do any month-stations return a 0 slope with sig pval? In this case, no
wonky_results <- DOMean_final %>%
  filter(model_slope == 0 & model_pval <= 0.05)

paste(nrow(wonky_results), "station-months returned a slope of 0 with a significant p-value in DOMean trend analysis.", sep=" ")

```

## Dissovled Oxygen Range

```{r `derive DORange by month` }
monthlyDORange <- subset.df %>%
  group_by(FDT_STA_ID, MonthNum) %>%
  summarize(Mean = mean(DORange, na.rm = TRUE),
            Min = min(DORange, na.rm = TRUE),
            Max = max(DORange, na.rm = TRUE),
            Nobs = n(),
            .groups = 'drop') %>%
  mutate(Month = month.name[MonthNum])

 # write_csv(monthlyDORange, "output_data/DORange_stats_monthlyByStation.csv")

cols_to_keep <- colnames(subset.df)

df.DORange <- subset.df %>%
  left_join(monthlyDORange, by = c("FDT_STA_ID", "MonthNum")) %>%
  select(all_of(cols_to_keep), Mean)  

```

```{r `DORange fit regressions for each month-station`, warning = FALSE}
# fit a linear regression for DORange vs. year by month-station using the mblm function (median based linear model) as this is thought to be less sensitive to outliers.

#Usage
#mblm(formula, dataframe, repeated = TRUE)
  #Arguments
    #formula A formula of type y ~ x (only linear models are accepted)
    #dataframe Optional dataframe
    #repeated If set to true, model is computed using repeated medians. If false, a single median estimators are calculated

# remove NA values -- mblm() does not handle NA as lm() does
df.DORange <- df.DORange %>%
  filter(!is.na(DORange))

# create list to store regression results
modelSummaries <- list()

for (station_id in unique(df.DORange$FDT_STA_ID)) {
  for (month_num in 5:10) {
    
    month_station <- df.DORange %>%
      filter(FDT_STA_ID == station_id & MonthNum == month_num)

      model <- mblm(DORange ~ Year, data = month_station)
      mod.sum <- summary.mblm(model)
      
      # store results
      modelSummaries[[paste(station_id, month_num, sep = "_")]] <- list(
        slope = mod.sum$coefficients[2,1],
        MAD = mod.sum$coefficients["Year", "MAD"] ,
        pvalue = mod.sum$coefficients["Year", 4],
        intercept = mod.sum$coefficients[1,1]
        )
  }
}

## --------DORange, with 1975 as y intercept-------------
q <- df.DORange %>%
  mutate(Year1975 = Year - 1975) %>%
  filter(MonthNum == 8)

modelSummaries_DORange1975 <- list()

for (station_id in unique(q$FDT_STA_ID)) {
      data = subset(q, FDT_STA_ID == station_id)
      model <- mblm(DORange ~ Year1975, data = data)
      mod.sum <- summary.mblm(model)
      
      # store results
      modelSummaries_DORange1975[[paste(station_id)]] <- list(
        slope = mod.sum$coefficients[2,1],
        MAD = mod.sum$coefficients["Year1975", "MAD"] ,
        pvalue = mod.sum$coefficients["Year1975", 4],
        intercept = mod.sum$coefficients[1,1]
        )
  }


# mblm does not return a standard error. Instead, summary.mblm can be used to extract the MAD or Median Absolute Deviation." It's a robust measure of variability that is less sensitive to outliers than the standard deviation, which is commonly used in traditional statistical analyses. 

```

```{r `DORange add reg stats to summary df`}
# create station-month key
monthlyDORange <- monthlyDORange %>%
  mutate( key = paste(FDT_STA_ID, MonthNum, sep="_"),
          model_slope = NA,
          model_MAD = NA,
          model_pval = NA,
          model_intercept = NA)

for (i in 1:nrow(monthlyDORange)) {
  key = monthlyDORange$key[i]
  
  monthlyDORange$model_slope[i] <-  modelSummaries[[key]]$slope
   monthlyDORange$model_MAD[i] <-  modelSummaries[[key]]$MAD
    monthlyDORange$model_pval[i] <-  modelSummaries[[key]]$pvalue
     monthlyDORange$model_intercept[i] <-  modelSummaries[[key]]$intercept
  
}


DORange_final <- monthlyDORange %>%
  filter(!key %in% insufficient_Nobs)

#--------------------------------------------
# reg stats for August DORange non-normalized with 1975 as y intercept
monthly_DORange1975 <- monthlyDORange %>%
  mutate( model_slope = NA,
          model_MAD = NA,
          model_pval = NA,
          model_intercept = NA) %>%
  filter(MonthNum == 8)

for (i in 1:nrow(monthly_DORange1975)) {
  key = monthly_DORange1975$FDT_STA_ID[i]
  
  monthly_DORange1975$model_slope[i] <-  modelSummaries_DORange1975[[key]]$slope
   monthly_DORange1975$model_MAD[i] <-  modelSummaries_DORange1975[[key]]$MAD
    monthly_DORange1975$model_pval[i] <-  modelSummaries_DORange1975[[key]]$pvalue
     monthly_DORange1975$model_intercept[i] <-  modelSummaries_DORange1975[[key]]$intercept
  
}


DORange1975_final <- monthly_DORange1975 %>%
  filter(!key %in% insufficient_Nobs)

#------------------------------------------------

write_csv(DORange_final, "output_data/DORange_mblmModelStatistics.csv")

# do any month-stations return a 0 slope with sig pval? In this case, no
wonky_results <- DORange_final %>%
  filter(model_slope == 0 & model_pval <= 0.05)

paste(nrow(wonky_results), "station-months returned a slope of 0 with a significant p-value in DORange trend analysis.", sep=" ")
```

## Max Dissolved Oxygen

```{r `derive DOMax by month`, message = FALSE }
# This does not require filtering; surface only measurements are fine (?)
# Subset data more inclusively than in above calcs

DOMax.df <- allData.df %>%
  mutate(MonthNum = lubridate::month(Date),
         Year = lubridate::year(Date)) %>%
  filter(FDT_STA_ID %in% station_IDS) %>%
  filter(!FDT_STA_ID %in% c("6APNR008.15", "6ACNR000.00")) %>%
  filter(MonthNum %in% 5:10)

countPerGroup_domax <- DOMax.df %>%
  group_by(FDT_STA_ID, MonthNum) %>%
  summarise(Nobs = n()) %>%
  mutate(key = paste(FDT_STA_ID, MonthNum, sep="_"))

insufficient_Nobs_domax <- countPerGroup_domax %>%
  filter(Nobs < 10)

insufficient_Nobs_domax <- insufficient_Nobs_domax$key

#--------------------------

monthlyDOMax <- DOMax.df %>%
  group_by(FDT_STA_ID, MonthNum) %>%
  summarize(Mean = mean(DOMax, na.rm = TRUE),
            Min = min(DOMax, na.rm = TRUE),
            Max = max(DOMax, na.rm = TRUE),
            Nobs = n(),
            .groups = 'drop') %>%
  mutate(Month = month.name[MonthNum])

 # write_csv(monthlyDOMin, "output_data/DOMin_stats_monthlyByStation.csv")

cols_to_keep <- colnames(DOMax.df)

df.DOMax <- DOMax.df %>%
  left_join(monthlyDOMax, by = c("FDT_STA_ID", "MonthNum")) %>%
  select(all_of(cols_to_keep), Mean) 

```

```{r `DOMax fit regressions for each month-station_DOMax`, warning=FALSE}
# fit a linear regression for DOMean vs. year by month-station using the mblm function (median based linear model) as this is thought to be less sensitive to outliers.

#Usage
#mblm(formula, dataframe, repeated = TRUE)
  #Arguments
    #formula A formula of type y ~ x (only linear models are accepted)
    #dataframe Optional dataframe
    #repeated If set to true, model is computed using repeated medians. If false, a single median estimators are calculated

# remove NA values -- mblm() does not handle NA as lm() does
df.DOMax <- df.DOMax %>%
  filter(!is.na(DOMax))

# create list to store regression results
modelSummaries <- list()

for (station_id in unique(df.DOMax$FDT_STA_ID)) {
  for (month_num in 5:10) {
    
    month_station <- df.DOMax %>%
      filter(FDT_STA_ID == station_id & MonthNum == month_num)

      model <- mblm(DOMax ~ Year, data = month_station)
      mod.sum <- summary.mblm(model)
      
      # store results
      modelSummaries[[paste(station_id, month_num, sep = "_")]] <- list(
        slope = mod.sum$coefficients[2,1],
        MAD = mod.sum$coefficients["Year", "MAD"] ,
        pvalue = mod.sum$coefficients["Year", 4],
        intercept = mod.sum$coefficients[1,1]
        )
  }
}

## --------DOMax, with 1975 as y intercept-------------
q <- df.DOMax %>%
  mutate(Year1975 = Year - 1975) %>%
  filter(MonthNum == 8)

modelSummaries_DOMax1975 <- list()

for (station_id in unique(q$FDT_STA_ID)) {
      data = subset(q, FDT_STA_ID == station_id)
      model <- mblm(DOMax ~ Year1975, data = data)
      mod.sum <- summary.mblm(model)
      
      # store results
      modelSummaries_DOMax1975[[paste(station_id)]] <- list(
        slope = mod.sum$coefficients[2,1],
        MAD = mod.sum$coefficients["Year1975", "MAD"] ,
        pvalue = mod.sum$coefficients["Year1975", 4],
        intercept = mod.sum$coefficients[1,1]
        )
  }


# mblm does not return a standard error. Instead, summary.mblm can be used to extract the MAD or Median Absolute Deviation." It's a robust measure of variability that is less sensitive to outliers than the standard deviation, which is commonly used in traditional statistical analyses. 

```

```{r `DOMax add reg stats to summary df_DOMax`}
# create station-month key
monthlyDOMax <- monthlyDOMax %>%
  mutate( key = paste(FDT_STA_ID, MonthNum, sep="_"),
          model_slope = NA,
          model_MAD = NA,
          model_pval = NA,
          model_intercept = NA)

for (i in 1:nrow(monthlyDOMax)) {
  key = monthlyDOMax$key[i]
  
  monthlyDOMax$model_slope[i] <-  modelSummaries[[key]]$slope
   monthlyDOMax$model_MAD[i] <-  modelSummaries[[key]]$MAD
    monthlyDOMax$model_pval[i] <-  modelSummaries[[key]]$pvalue
     monthlyDOMax$model_intercept[i] <-  modelSummaries[[key]]$intercept
  
}


DOMax_final <- monthlyDOMax %>%
  filter(!key %in% insufficient_Nobs)

#--------------------------------------------
# reg stats for August DOMax non-normalized with 1975 as y intercept
monthly_DOMax1975 <- monthlyDOMax %>%
  mutate( model_slope = NA,
          model_MAD = NA,
          model_pval = NA,
          model_intercept = NA) %>%
  filter(MonthNum == 8)

for (i in 1:nrow(monthly_DOMax1975)) {
  key = monthly_DOMax1975$FDT_STA_ID[i]
  
  monthly_DOMax1975$model_slope[i] <-  modelSummaries_DOMax1975[[key]]$slope
   monthly_DOMax1975$model_MAD[i] <-  modelSummaries_DOMax1975[[key]]$MAD
    monthly_DOMax1975$model_pval[i] <-  modelSummaries_DOMax1975[[key]]$pvalue
     monthly_DOMax1975$model_intercept[i] <-  modelSummaries_DOMax1975[[key]]$intercept
  
}


DOMax1975_final <- monthly_DOMax1975 %>%
  filter(!key %in% insufficient_Nobs)

#------------------------------------------------

write_csv(DOMax_final, "output_data/DOMax_mblmModelStatistics.csv")

# do any month-stations return a 0 slope with sig pval? In this case, no
wonky_results <- DOMax_final %>%
  filter(model_slope == 0 & model_pval <= 0.05)

paste(nrow(wonky_results), "station-months returned a slope of 0 with a significant p-value in DOMax trend analysis.", sep=" ")

```

### Single df with all 1975 y intercept model results

```{r}
a <- DOMin1975_final %>%
  mutate(variable = "DOMin")
b <- DOMean1975_final %>%
  mutate(variable = "DOMean")
c <- DORange1975_final %>%
  mutate(variable = "DORange")
d <- DOMax1975_final %>%
  mutate(variable = "DOMax")

e <- rbind(a, b, c, d)

readr::write_csv(e, "output_data/1975_y_intercept_DOVars_model_results.csv")


```

## Data viz

"a box-whisker plot showing the distribution of values by month across all stations."

```{r `custom func`}
# custom function to prepare plotting data for each variable
makePlotData <- function(df) {

# derive mean warming and incorporate back into df for plotting
df.allMonths <- df %>%
  group_by(FDT_STA_ID) %>%
  summarize(mean_slope = mean(model_slope, na.rm = TRUE),
            .groups = "drop") %>%
  mutate(Month = "All Months",
         trend= "N/A")

df.plot <- bind_rows(df, df.allMonths)

# populate model slope column with mean slope where Month == "All Months"
df.plot <- df.plot %>%
  mutate(model_slope = if_else(Month == "All Months", mean_slope, model_slope))


# set month as factor
df.plot <- df.plot %>%
  mutate(Month = factor(Month, levels = c("January", "February", "March", 
                                          "April", "May", "June", "July", 
                                          "August", "September", "October", 
                                          "November", "December", "All Months")),
         trend = ifelse(is.na(df.plot$model_pval), "N/A",
                        ifelse(df.plot$model_pval <= 0.05, "significant", "not significant")))

df.plot$trend <- factor(df.plot$trend, levels = c("significant", "not significant", "N/A"))

return(df.plot)

}

```

### DOMin

```{r}
DOMin_plot.df <- makePlotData(DOMin_final)

ggplot(DOMin_plot.df, aes(x = Month, y = model_slope, fill = Month)) +
  geom_boxplot(alpha = .85) +
  labs(title = "Variation in Warming Rates by Month (DOMin)",
       x = "Month",
       y = "T Trend (C/y)",
       fill = "Month") +
  theme_minimal() +
  theme(axis.title.x = element_text(margin = margin(t = 10)),
        legend.position = "none")

ggsave("output_data/DOMin_variationByMonth.jpg")

```

### DOMean

```{r}
DOMean_plot.df <- makePlotData(DOMean_final)

ggplot(DOMean_plot.df, aes(x = Month, y = model_slope, fill = Month)) +
  geom_boxplot(alpha = .85) +
  labs(title = "Variation in Warming Rates by Month (DOMean)",
       x = "Month",
       y = "T Trend (C/y)",
       fill = "Month") +
  theme_minimal() +
  theme(axis.title.x = element_text(margin = margin(t = 10)),
        legend.position = "none")

ggsave("output_data/DOMean_variationByMonth.jpg")

```

### TRange

```{r}
DORange_plot.df <- makePlotData(DORange_final)

ggplot(DORange_plot.df, aes(x = Month, y = model_slope, fill = Month)) +
  geom_boxplot(alpha = .85) +
  labs(title = "Variation in Warming Rates by Month (DORange)",
       x = "Month",
       y = "T Trend (C/y)",
       fill = "Month") +
  theme_minimal() +
  theme(axis.title.x = element_text(margin = margin(t = 10)),
        legend.position = "none")

ggsave("output_data/DORange_variationByMonth.jpg")

```