Analysis_Script.Rmd

---
title: "Analysis Script - Bias in AI"
author: "Vorisek_CN"
date: "30 12 2021"
output:
  html_document: default
  pdf_document: default
  word_document: default
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

## Prepare Analysis
*Load relevant libraries*
```{r}
library(readxl)
library(tidyverse)
library(writexl)
library(knitr)
library(arsenal)
library(data.table)
library(ggplot2)
library(dplyr)
library(ggplot2)
library(ggpubr)
```
*Bar theme*

colors: https://www.datanovia.com/en/blog/ggplot-colors-best-tricks-you-will-love/

```{r}
# define theme and colors for barplots
bar_theme <-   theme(rect = element_blank(),
        axis.ticks.y = element_line(color = "grey40"),
        panel.grid.major.x = element_line(color = "grey90"))
bar_color <-  "#C4961A"
bar_width <- 0.6
```
*Read in file created in "Data_Preparation.Rmd" and "Data Cleaning.Rmd"*
```{r}
analysis <- read_excel("AnalysisFile_Bias_AI.xlsx")
View(analysis)
```

# Demographics
**Prepare Data**
```{r}
analysis$arbeitsland = as.factor(analysis$arbeitsland)
analysis$geschlecht = as.factor(analysis$geschlecht)
analysis$sektor = as.factor(analysis$sektor)
analysis$quelle = as.factor(analysis$quelle)
analysis$alter = as.numeric(analysis$alter)
```
# Age group
```{r}
setDT(analysis)   
```
### create category age group
```{r}
analysis[,agegroup4:= cut(alter,4,include.lowest=TRUE,
           labels=c("<30","30-40","40-50",">50"))]
analysis[,table(agegroup4)]
```


***
## How did this questionnaire reach you?
```{r}
source <- analysis %>% 
  group_by(quelle) %>% 
  summarize(N = n()) %>% 
  mutate(Percent = N / sum(N) * 100,
         Percent = round(Percent, 1)) %>% 
  arrange(desc(N))

source$quelle <- factor(source$quelle,levels = c(1,2,3,4,5,6), labels = c("Twitter","Linkedin","Email distribution\nlist of my institution", "Personal contact", "Other", "Not specified"))

knitr::kable(source)
```

```{r}
source_plot <- ggplot(source) + geom_col(aes(reorder(quelle, N, sum), N),
                              fill = bar_color,
                              width = bar_width) +
  labs(x = "", y = "Number of Participants", title = "How did this questionnaire reach you?") +
  scale_y_continuous(breaks = seq(0,100,10)) +
  coord_flip() +
  bar_theme

source_plot
```


## Which country do you work in?
```{r}
country <- analysis %>% 
  group_by(arbeitsland) %>% 
  summarize(N = n()) %>% 
  mutate(Percent = N / sum(N) * 100,
         Percent = round(Percent, 1)) %>% 
  arrange(desc(N))

knitr::kable(country)
```

```{r}
country_plot <- ggplot(country) + geom_col(aes(reorder(arbeitsland, N, sum), N),
                              fill = bar_color,
                              width = bar_width) +
  labs(x = "", y = "Number of Participants", title = "Which country do you work in?") +
  scale_y_continuous(breaks = seq(0,150,25)) +
  coord_flip() +
  bar_theme

country_plot
```


## What is your gender?
```{r}
gender <- analysis %>% 
  group_by(geschlecht) %>% 
  summarize(N = n()) %>% 
  mutate(Percent = N / sum(N) * 100,
         Percent = round(Percent, 1)) %>% 
  arrange(desc(N))

gender$geschlecht <- factor(gender$geschlecht,levels = c(1,2,3,4,5), labels = c("Female","Male","Diverse", "Undefined", "Not specified"))

knitr::kable(gender)
```
```{r}
gender_plot <- ggplot(gender) + geom_col(aes(reorder(geschlecht, N, sum), N),
                              fill = bar_color,
                              width = bar_width) +
  labs(x = "", y = "Number of Participants", title = "What is your gender?") +
  scale_y_continuous(breaks = seq(0,200,25)) +
  coord_flip() +
  bar_theme

gender_plot
```


## What is your age?
*check for normality*
```{r}
hist(analysis$alter)
summary.default(analysis$alter)
```
## Age Group
```{r}
ageGroup <- analysis %>% 
  group_by(agegroup4) %>% 
  summarize(N = n()) %>% 
  mutate(Percent = N / sum(N) * 100,
         Percent = round(Percent, 1)) %>% 
  arrange(desc(N))


knitr::kable(ageGroup)
```

group by median 
```{r}
analysis$agegroup2 <- as.factor(ifelse(analysis$alter <30, 1, 0))
```

```{r}
ageGroup_2 <- analysis %>% 
  group_by(agegroup2) %>% 
  summarize(N = n()) %>% 
  mutate(Percent = N / sum(N) * 100,
         Percent = round(Percent, 1)) %>% 
  arrange(desc(N))


knitr::kable(ageGroup_2)
```

## In which area do you work in?
```{r}
section <- analysis %>% 
  group_by(sektor) %>% 
  summarize(N = n()) %>% 
  mutate(Percent = N / sum(N) * 100,
         Percent = round(Percent, 1)) %>% 
  arrange(desc(N))

section$sektor <- factor(section$sektor,levels = c(1,2,3,4,5), labels = c("Science","Clinical Work","Industry", "Other", "Not specified"))

knitr::kable(section)
```
```{r}
ggplot(section) + geom_col(aes(reorder(sektor, N, sum), N),
                              fill = bar_color,
                              width = bar_width) +
  labs(x = "", y = "Number of Participants", title = "In which area do you work in?") +
  scale_y_continuous(breaks = seq(0,200,50)) +
  coord_flip() +
  bar_theme
```


## At what stage are you or have you been developing AI-projects?
```{r}
which(colnames(analysis) == "ki_entwicklungsphase___1")
which(colnames(analysis) == "ki_entwicklungsphase___8")
```
*create dataset*
```{r}
kiphase<-analysis %>% select(19:26)

view(kiphase)
```
**Multi-response analysis**
*Pct.of.cases is the percentage of respondents who selected that particular option. You'll notice that these don't add up to 100% because people could choose more than one option.Pct.of.resp. is the percentage of responses (so not like I said in the video respondents, but responses).  These do add up to 100%. *
```{r}
kiphase_analysis <-data.frame(Freq = colSums(kiphase),
           Pct.of.Resp = (colSums(kiphase)/sum(kiphase))*100,
           Pct.of.Cases = (colSums(kiphase)/nrow(kiphase))*100)

rownames(kiphase_analysis) <- c("Project planning","Data acquisition/\ndata preprocessing","Data annotation","Identification\nof AI algorithms", "Training and\noptimization", "Practical testing\nof AI algorithms", "Other", "Not specified")

knitr::kable(kiphase_analysis)
```
*create bar graph*
*rownames as column*
```{r}
kiphase_analysis1 <- kiphase_analysis                                         
kiphase_analysis1$row_names <- row.names(kiphase_analysis1)                     
kiphase_analysis1 
```
*graph*
```{r}
ggplot(kiphase_analysis1) + geom_col(aes(reorder(row_names, Pct.of.Cases, sum), Pct.of.Cases),
                              fill = bar_color,
                              width = bar_width) +
  labs(x = "", y = "Percentage of participants", title = "At what stage are you or have you been\ndeveloping AI-projects??") +
  scale_y_continuous(breaks = seq(0,200,25)) +
  coord_flip() +
  bar_theme
```


## In which medical specialty are you developing AI solutions?
```{r}
which(colnames(analysis) == "med_fachbereich___1")
which(colnames(analysis) == "med_fachbereich___37")
```
*create dataset*
```{r}
specialty<-analysis %>% select(28:64)
view(specialty)
```
**Multi-response analysis**
*Pct.of.cases is the percentage of respondents who selected that particular option. You'll notice that these don't add up to 100% because people could choose more than one option.Pct.of.resp. is the percentage of responses (so not like I said in the video respondents, but responses).  These do add up to 100%. *
```{r}
specialty_analysis <-data.frame(Freq = colSums(specialty),
           Pct.of.Resp = (colSums(specialty)/sum(specialty))*100,
           Pct.of.Cases = (colSums(specialty)/nrow(specialty))*100)


knitr::kable(specialty_analysis)
```
*create bar graph*
*rownames as column*
```{r}
specialty_analysis1 <- specialty_analysis                                         
specialty_analysis1$row_names <- row.names(specialty_analysis1)                     
specialty_analysis1 
```
*graph*
```{r}
ggplot(specialty_analysis1) + geom_col(aes(reorder(row_names, Pct.of.Cases, sum), Pct.of.Cases),
                              fill = bar_color,
                              width = bar_width) +
  labs(x = "", y = "Percentage of participants", title = "In which medical specialty are you developing AI solutions?") +
  scale_y_continuous(breaks = seq(0,200,25)) +
  coord_flip() +
  bar_theme
```


## In which of the following areas of AI are you developing?
```{r}
which(colnames(analysis) == "ki_art___1")
which(colnames(analysis) == "ki_art___4")
```
*create dataset*
```{r}
areas<-analysis %>% select( 66:69)
view(areas)
```
**Multi-response analysis**
*Pct.of.cases is the percentage of respondents who selected that particular option. You'll notice that these don't add up to 100% because people could choose more than one option.Pct.of.resp. is the percentage of responses (so not like I said in the video respondents, but responses).  These do add up to 100%. *
```{r}
areas_analysis <-data.frame(Freq = colSums(areas),
           Pct.of.Resp = (colSums(areas)/sum(areas))*100,
           Pct.of.Cases = (colSums(areas)/nrow(areas))*100)
```

```{r}
rownames(areas_analysis) <- c("Machine Learning","Deep Learning","Other type of AI", "Not specified")
knitr::kable(areas_analysis)
```
*create bar graph*
*rownames as column*
```{r}
areas_analysis1 <- areas_analysis                                         
areas_analysis1$row_names <- row.names(areas_analysis1)                     
areas_analysis1 
```
*graph*
```{r}
ggplot(areas_analysis1) + geom_col(aes(reorder(row_names, Pct.of.Cases, sum), Pct.of.Cases),
                              fill = bar_color,
                              width = bar_width) +
  labs(x = "", y = "Percentage of participants", title = "In which of the following areas of AI are you developing?") +
  scale_y_continuous(breaks = seq(0,200,25)) +
  coord_flip() +
  bar_theme
```


## Please specify the type of Machine Learning (ML) you aredeveloping:
```{r}
which(colnames(analysis) == "ki_art_ml___1")
which(colnames(analysis) == "ki_art_ml___6")
```

```{r}
ml <-analysis %>% select( 70:75)
view(ml)
```

**Multi-response analysis**
*Pct.of.cases is the percentage of respondents who selected that particular option. You'll notice that these don't add up to 100% because people could choose more than one option.Pct.of.resp. is the percentage of responses (so not like I said in the video respondents, but responses).  These do add up to 100%. *
```{r}
ml_analysis <-data.frame(Freq = colSums(ml),
           Pct.of.Resp = (colSums(ml)/sum(ml))*100,
           Pct.of.Cases = (colSums(ml)/nrow(ml))*100)

rownames(ml_analysis) <- c("Supervised ML","Semi-supervised ML","Unsupervised ML", "Reinforcement learning", "Other", "Not specified")

knitr::kable(ml_analysis)
```
*create bar graph*
*rownames as column*
```{r}
ml_analysis1 <- ml_analysis                                         
ml_analysis1$row_names <- row.names(ml_analysis1)                     
ml_analysis1 
```
*graph*
```{r}
ggplot(ml_analysis1) + geom_col(aes(reorder(row_names, Pct.of.Cases, sum), Pct.of.Cases),
                              fill = bar_color,
                              width = bar_width) +
  labs(x = "", y = "Percentage of participants", title = "Please specify the type of Machine Learning you aredeveloping") +
  scale_y_continuous(breaks = seq(0,200,25)) +
  coord_flip() +
  bar_theme
```


## Please specify the type of Deep Learning you are developing:
```{r}
which(colnames(analysis) == "ki_art_dl___1")
which(colnames(analysis) == "ki_art_dl___5")
```

```{r}
dl <-analysis %>% select( 76:80)
view(dl)
```
**Multi-response analysis**
*Pct.of.cases is the percentage of respondents who selected that particular option. You'll notice that these don't add up to 100% because people could choose more than one option.Pct.of.resp. is the percentage of responses (so not like I said in the video respondents, but responses).  These do add up to 100%. *
```{r}
dl_analysis <-data.frame(Freq = colSums(dl),
           Pct.of.Resp = (colSums(dl)/sum(dl))*100,
           Pct.of.Cases = (colSums(dl)/nrow(dl))*100)

rownames(dl_analysis) <- c("Convolutional Networks","Recurrent neural Networks","Autoencoders", "Other", "Not specified")

knitr::kable(dl_analysis)
```
*create bar graph*
*rownames as column*
```{r}
dl_analysis1 <- dl_analysis                                         
dl_analysis1$row_names <- row.names(dl_analysis1)                     
dl_analysis1 
```
*graph*
```{r}
ggplot(dl_analysis1) + geom_col(aes(reorder(row_names, Pct.of.Cases, sum), Pct.of.Cases),
                              fill = bar_color,
                              width = bar_width) +
  labs(x = "", y = "Percentage of participants", title = "Please specify the type of Deep Learning you are developing") +
  scale_y_continuous(breaks = seq(0,200,25)) +
  coord_flip() +
  bar_theme
```


## In which of the following areas of AI are you developing?
```{r}
which(colnames(analysis) == "ki_bereich___1")
which(colnames(analysis) == "ki_bereich___7")
```

```{r}
areas_2 <-analysis %>% select(81:87)
view(areas_2)
```
**Multi-response analysis**
*Pct.of.cases is the percentage of respondents who selected that particular option. You'll notice that these don't add up to 100% because people could choose more than one option.Pct.of.resp. is the percentage of responses (so not like I said in the video respondents, but responses).  These do add up to 100%. *
```{r}
areas_2_analysis <-data.frame(Freq = colSums(areas_2),
           Pct.of.Resp = (colSums(areas_2)/sum(areas_2))*100,
           Pct.of.Cases = (colSums(areas_2)/nrow(areas_2))*100)
```

```{r}
rownames(areas_2_analysis) <- c("Natural Language Processing","Clinical Decision Support","Image processing", "Computer vision", "Robotics", "Other", "Not specified")
knitr::kable(areas_2_analysis)
```
*create bar graph*
*rownames as column*
```{r}
areas_2_analysis1 <- areas_2_analysis                                         
areas_2_analysis1$row_names <- row.names(areas_2_analysis1)                     
areas_2_analysis1 
```
*graph*
```{r}
ggplot(areas_2_analysis1) + geom_col(aes(reorder(row_names, Pct.of.Cases, sum), Pct.of.Cases),
                              fill = bar_color,
                              width = bar_width) +
  labs(x = "", y = "Percentage of participants", title = " In which of the following areas of AI are you developing?") +
  scale_y_continuous(breaks = seq(0,200,25)) +
  coord_flip() +
  bar_theme
```


# Main Outcomes
## How familiar are you with biases in AI?
```{r}
familiar <- analysis %>% 
  group_by(bias_ki) %>% 
  summarize(N = n()) %>% 
  mutate(Percent = N / sum(N) * 100,
         Percent = round(Percent, 1)) %>% 
  arrange(desc(N))

familiar$bias_ki <- factor(familiar$bias_ki,levels = c(1,2,3), labels = c("I have never heard of biases in AI","I have heard of biases in AI but I can not think ofconcrete examples","I have heard of biases in AI and I do know specifi cuse cases"))

knitr::kable(familiar)
```

```{r}
ggplot(familiar) + geom_col(aes(reorder(bias_ki, N, sum), N),
                              fill = bar_color,
                              width = bar_width) +
  labs(x = "", y = "Number of Participants", title = "How familiar are you with biases in AI?") +
  scale_y_continuous(breaks = seq(0,200,25)) +
  coord_flip() +
  bar_theme
```


## Where do you think biases in AI can occur?
```{r}
which(colnames(analysis) == "bias_feld___1")
which(colnames(analysis) == "bias_feld___6")
```

```{r}
occur <-analysis %>% select(90:95)
view(occur)
```

**Multi-response analysis**
*Pct.of.cases is the percentage of respondents who selected that particular option. You'll notice that these don't add up to 100% because people could choose more than one option.Pct.of.resp. is the percentage of responses (so not like I said in the video respondents, but responses).  These do add up to 100%. *
```{r}
occur_2_analysis <-data.frame(Freq = colSums(occur),
           Pct.of.Resp = (colSums(occur)/sum(occur))*100,
           Pct.of.Cases = (colSums(occur)/nrow(occur))*100)
```

```{r}
rownames(occur_2_analysis) <- c("Methodology \nof algorithms","Societal factors","Bias due to data\nvalidation or \ndata security", "Not specified", "Other", "None of the above")
knitr::kable(occur_2_analysis)
```
*create bar graph*
*rownames as column*
```{r}
occur_2_analysis1 <- occur_2_analysis                                         
occur_2_analysis1$row_names <- row.names(occur_2_analysis1)                     
occur_2_analysis1 
```
*graph*
```{r}
ggplot(occur_2_analysis1) + geom_col(aes(reorder(row_names, Pct.of.Cases, sum), Pct.of.Cases),
                              fill = bar_color,
                              width = bar_width) +
  labs(x = "", y = "Percentage of participants", title = "Where do you think biases in AI can occur?") +
  scale_y_continuous(breaks = seq(0,200,25)) +
  coord_flip() +
  bar_theme
```


## What data do you use to train AI algorithms?
```{r}
which(colnames(analysis) == "ki_trainingsdaten___1")
which(colnames(analysis) == "ki_trainingsdaten___4")
```

```{r}
train <-analysis %>% select(97:100)
view(train)
```

**Multi-response analysis**
*Pct.of.cases is the percentage of respondents who selected that particular option. You'll notice that these don't add up to 100% because people could choose more than one option.Pct.of.resp. is the percentage of responses (so not like I said in the video respondents, but responses).  These do add up to 100%. *
```{r}
train_2_analysis <-data.frame(Freq = colSums(train),
           Pct.of.Resp = (colSums(train)/sum(train))*100,
           Pct.of.Cases = (colSums(train)/nrow(train))*100)
```

```{r}
rownames(train_2_analysis) <- c("Image data","Audio data","Text data", "Not specified")
knitr::kable(train_2_analysis)
```
*create bar graph*
*rownames as column*
```{r}
train_2_analysis1 <- train_2_analysis                                         
train_2_analysis1$row_names <- row.names(train_2_analysis1)                     
train_2_analysis1 
```
*graph*
```{r}
ggplot(train_2_analysis1) + geom_col(aes(reorder(row_names, Pct.of.Cases, sum), Pct.of.Cases),
                              fill = bar_color,
                              width = bar_width) +
  labs(x = "", y = "Percentage of respondents", title = "What data do you use to train AI algorithms?") +
  scale_y_continuous(breaks = seq(0,200,25)) +
  coord_flip() +
  bar_theme
```


## What is the origin of your data?
```{r}
which(colnames(analysis) == "ki_daten_quelle___1")
which(colnames(analysis) == "ki_daten_quelle___6")
```

```{r}
origin <-analysis %>% select( 101:106)
view(origin)
```

**Multi-response analysis**
*Pct.of.cases is the percentage of respondents who selected that particular option. You'll notice that these don't add up to 100% because people could choose more than one option.Pct.of.resp. is the percentage of responses (so not like I said in the video respondents, but responses).  These do add up to 100%. *
```{r}
origin_2_analysis <-data.frame(Freq = colSums(origin),
           Pct.of.Resp = (colSums(origin)/sum(origin))*100,
           Pct.of.Cases = (colSums(origin)/nrow(origin))*100)
```

```{r}
rownames(origin_2_analysis) <- c("Wearables","Registries","Database from\none center", "Multi-center\ndatabase", "Other", "Not specified")

knitr::kable(origin_2_analysis)
```
*create bar graph*
*rownames as column*
```{r}
origin_2_analysis1 <- origin_2_analysis                                         
origin_2_analysis1$row_names <- row.names(origin_2_analysis1)                     
origin_2_analysis1 
```
*graph*
```{r}
ggplot(origin_2_analysis1) + geom_col(aes(reorder(row_names, Pct.of.Cases, sum), Pct.of.Cases),
                              fill = bar_color,
                              width = bar_width) +
  labs(x = "", y = "Percentage of respondents", title = "What is the origin of your data?") +
  scale_y_continuous(breaks = seq(0,200,25)) +
  coord_flip() +
  bar_theme
```


## Do you work with national or international data?
```{r}
national <- analysis %>% 
  group_by(nat_internat_daten) %>% 
  summarize(N = n()) %>% 
  mutate(Percent = N / sum(N) * 100,
         Percent = round(Percent, 1)) %>% 
  arrange(desc(N))

national$nat_internat_daten <- factor(national$nat_internat_daten,levels = c(1,2,3, 4), labels = c("National data","International data","National and international data", "Not specified"))

knitr::kable(national)
```

```{r}
ggplot(national) + geom_col(aes(reorder(nat_internat_daten, N, sum), N),
                              fill = bar_color,
                              width = bar_width) +
  labs(x = "", y = "Number of Participants", title = "Do you work with national or international data?") +
  scale_y_continuous(breaks = seq(0,200,25)) +
  coord_flip() +
  bar_theme
```


## Do you think that using standardized data (internationalsemantic and syntactic standards such as HL7 FHIR, SNOMEDCT) when training AI algorithms can prevent bias? (radio)
```{r}
standard <- analysis %>% 
  group_by(standard_daten) %>% 
  summarize(N = n()) %>% 
  mutate(Percent = N / sum(N) * 100,
         Percent = round(Percent, 1)) %>% 
  arrange(desc(N))

standard$standard_daten <- factor(standard$standard_daten,levels = c(1,2,3), labels = c("Yes","No","Not specified"))

knitr::kable(standard)
```

```{r}
ggplot(standard) + geom_col(aes(reorder(standard_daten, Percent, sum), Percent),
                              fill = bar_color,
                              width = bar_width) +
  labs(x = "", y = "Percentage of Participants", title = "Do you think that using standardized data\nwhen training AI algorithms can prevent bias?") +
  scale_y_continuous(breaks = seq(0,100,10)) +
  coord_flip() +
  bar_theme
```


## Do you know any of the following preventive measures to avoid bias in AI applications?
```{r}
which(colnames(analysis) == "massnahmen_bias___1")
which(colnames(analysis) == "massnahmen_bias___6")
```

```{r}
measures <-analysis %>% select( 110:115)
view(measures)
```

**Multi-response analysis**
*Pct.of.cases is the percentage of respondents who selected that particular option. You'll notice that these don't add up to 100% because people could choose more than one option.Pct.of.resp. is the percentage of responses (so not like I said in the video respondents, but responses).  These do add up to 100%. *
```{r}
measures_analysis <-data.frame(Freq = colSums(measures),
           Pct.of.Resp = (colSums(measures)/sum(measures))*100,
           Pct.of.Cases = (colSums(measures)/nrow(measures))*100)
```

```{r}
rownames(measures_analysis) <- c("Employ Explainable\nArtificial Intelligence (XAI)","Collecting sociodemographic\ndata points", "Statistical analysis", "Software evaluating\nfairness in AI", "I do not know\nany of them", "Other")

knitr::kable(measures_analysis)
```
*create bar graph*
*rownames as column*
```{r}
measures_analysis1 <- measures_analysis                                         
measures_analysis1$row_names <- row.names(measures_analysis1)                     
measures_analysis1 
```
*graph*
```{r}
ggplot(measures_analysis1) + geom_col(aes(reorder(row_names, Pct.of.Cases, sum), Pct.of.Cases),
                              fill = bar_color,
                              width = bar_width) +
  labs(x = "", y = "Percentage of participants", title = "Do you know any of the following preventive\nmeasures to avoid bias in AI?") +
  scale_y_continuous(breaks = seq(0,200,25)) +
  coord_flip() +
  bar_theme
```


## What sociodemographic data would you collect to prevent biases in AI?
```{r}
which(colnames(analysis) == "soziodem_daten___1")
which(colnames(analysis) == "soziodem_daten___7")
```
```{r}
soziodem <-analysis %>% select( 117:123)
view(soziodem)
```

**Multi-response analysis**
*Pct.of.cases is the percentage of respondents who selected that particular option. You'll notice that these don't add up to 100% because people could choose more than one option.Pct.of.resp. is the percentage of responses (so not like I said in the video respondents, but responses).  These do add up to 100%. *
```{r}
soziodem_analysis <-data.frame(Freq = colSums(soziodem),
           Pct.of.Resp = (colSums(soziodem)/sum(soziodem))*100,
           Pct.of.Cases = (colSums(soziodem)/nrow(soziodem))*100)
```

```{r}
rownames(soziodem_analysis) <- c("Biological gender","Social gender","Age", "Origin", "Other", "None", "Not specified")

knitr::kable(soziodem_analysis)
```
*create bar graph*
*rownames as column*
```{r}
soziodem_analysis1 <- soziodem_analysis                                         
soziodem_analysis1$row_names <- row.names(soziodem_analysis1)                     
soziodem_analysis1 
```
*graph*
```{r}
ggplot(soziodem_analysis1) + geom_col(aes(reorder(row_names, Pct.of.Cases, sum), Pct.of.Cases),
                              fill = bar_color,
                              width = bar_width) +
  labs(x = "", y = "Percentage of participants", title = "What sociodemographic data would you collect to prevent biases in AI?") +
  scale_y_continuous(breaks = seq(0,200,25)) +
  coord_flip() +
  bar_theme
```


## What would you use the collected sociodemographic data for?
```{r}
which(colnames(analysis) == "soziodem_verwendung___1")
which(colnames(analysis) == "soziodem_verwendung___4")
```
```{r}
collect <-analysis %>% select(125:128)
view(collect)
nrow(collect)
```
**Multi-response analysis**
*Pct.of.cases is the percentage of respondents who selected that particular option. You'll notice that these don't add up to 100% because people could choose more than one option.Pct.of.resp. is the percentage of responses (so not like I said in the video respondents, but responses).  These do add up to 100%. *
```{r}
collect_analysis <-data.frame(Freq = colSums(collect),
           Pct.of.Resp = (colSums(collect)/sum(collect))*100,
           Pct.of.Cases = (colSums(collect)/nrow(collect))*100)
```

```{r}
rownames(collect_analysis) <- c("AI modelling","Analysis","Data acquisition", "Not specified")

knitr::kable(collect_analysis)
```
*create bar graph*
*rownames as column*
```{r}
collect_analysis1 <- collect_analysis                                         
collect_analysis1$row_names <- row.names(collect_analysis1)                     
collect_analysis1 
```
*graph*
```{r}
ggplot(collect_analysis1) + geom_col(aes(reorder(row_names, Pct.of.Cases, sum), Pct.of.Cases),
                              fill = bar_color,
                              width = bar_width) +
  labs(x = "", y = "Percentage of Participants", title = "What would you use the collected sociodemographic data for?") +
  scale_y_continuous(breaks = seq(0,200,25)) +
  coord_flip() +
  bar_theme
```


## How would you rate the level of fairness of AI in your AI development? (radio)
```{r}
levelfair <- analysis %>% 
  group_by(fairness_ki) %>% 
  summarize(N = n()) %>% 
  mutate(Percent = N / sum(N) * 100,
         Percent = round(Percent, 1)) %>% 
  arrange(desc(N))

levelfair$fairness_ki <- factor(levelfair$fairness_ki,levels = c(1,2,3,4,5,6), labels = c("Not fair at all","Barely fair","Moderately fair", "Fair", "Very fair", "Not specifi ed"))

knitr::kable(levelfair)
```

```{r}
ggplot(levelfair) + geom_col(aes(reorder(fairness_ki, Percent, sum), Percent),
                              fill = bar_color,
                              width = bar_width) +
  labs(x = "", y = "Number of Participants", title = "How would you rate the level of fairness of AI in your AI development?") +
  scale_y_continuous(breaks = seq(0,200,25)) +
  coord_flip() +
  bar_theme
```


## What do you think is preventing fair AI in your use case?
```{r}
which(colnames(analysis) == "fairness_ki_block___1")
which(colnames(analysis) == "fairness_ki_block___7")
which(colnames(analysis) == "fairness_ki")
```
```{r}
prevent <-analysis %>% select(130:136, 129)
view(prevent)
nrow(prevent)
```
*Show the fi eld ONLY if:*
*[fairness_ki_1] = '1' or [fairness_ki_1] = '2' or [fairness_ki_1] ='3' or [fairness_ki_1] = '4' or [fairness_ki_1] = '6'*
```{r}
prevent <- prevent[(prevent$fairness_ki==1    | prevent$fairness_ki==2    | prevent$fairness_ki==3 | prevent$fairness_ki==4| prevent$fairness_ki==6),]
nrow(prevent)
```
*drop column*
```{r}
prevent <- subset(prevent, select = -c(fairness_ki))
```

**Multi-response analysis**
*Pct.of.cases is the percentage of respondents who selected that particular option. You'll notice that these don't add up to 100% because people could choose more than one option.Pct.of.resp. is the percentage of responses (so not like I said in the video respondents, but responses).  These do add up to 100%. *
```{r}
prevent_analysis <-data.frame(Freq = colSums(prevent),
           Pct.of.Resp = (colSums(prevent)/sum(prevent))*100,
           Pct.of.Cases = (colSums(prevent)/nrow(prevent))*100)
```
*Create table*
```{r}
rownames(prevent_analysis) <- c("Lack of resources","Lack of support from \nsuperiors/institution","Lack of knowledge", "Lack of fair data", "Lack of guidelines/\nrecommendations", "Other", "Not specified")

knitr::kable(prevent_analysis)
```
*create bar graph*
*rownames as column*
```{r}
prevent_analysis1 <- prevent_analysis                                         
prevent_analysis1$row_names <- row.names(prevent_analysis1)                     
prevent_analysis1 
```

```{r}
ggplot(prevent_analysis1) + geom_col(aes(reorder(row_names, Pct.of.Cases, sum), Pct.of.Cases),
                              fill = bar_color,
                              width = bar_width) +
  labs(x = "", y = "Percentage of Participants", title = "What is preventing fair AI in your use case?") +
  scale_y_continuous(breaks = seq(0,100,10)) +
  coord_flip() +
  bar_theme
```

***

# Subanalysis

```{r}
library(arsenal)
```

## create dataset for subanalysis
```{r}
subanalysis <-analysis %>% select(fairness_ki, geschlecht, alter, sektor, agegroup2, agegroup4)
view(subanalysis)
nrow(subanalysis)
```
```{r}
is.factor(subanalysis$geschlecht)
is.factor(subanalysis$fairness_ki)
is.factor(subanalysis$sektor)
is.factor(subanalysis$agegroup2)
is.factor(subanalysis$agegroup4)
is.numeric(subanalysis$alter)
```
```{r}
subanalysis$fairness_ki = as.factor(subanalysis$fairness_ki)
subanalysis$fairness_ki <- factor(subanalysis$fairness_ki,levels = c(1,2,3,4,5,6), labels = c("Not fair at all","Barely fair","Moderately fair", "Fair", "Very fair", "Not specified"))
subanalysis$geschlecht <- factor(subanalysis$geschlecht,levels = c(1,2,3,4), labels = c("Female","Male","Diverse", "Undefined"))
```


```{r}
library(data.table)
```
### convert to data table
```{r}
is.data.table(subanalysis) == TRUE
```

# Write File for Analysis
```{r}
write_xlsx(subanalysis, 'subanalysis.xlsx')
```

## Age Group 4
```{r}
table(subanalysis$fairness_ki, subanalysis$agegroup4)
table2 = table(subanalysis$fairness_ki, subanalysis$agegroup4)
chisq.test(table2)
chisq.test(table2)$expected
```
```{r}
round(prop.table(table2,2),2)
```
Age Group 2
```{r}
table(subanalysis$fairness_ki, subanalysis$agegroup2)
table2 = table(subanalysis$fairness_ki, subanalysis$agegroup2)
chisq.test(table2)
chisq.test(table2)$expected
```
```{r}
round(prop.table(table2,2),2)
```


## gender
```{r}
subanalysis <- subanalysis %>% drop_na(geschlecht)
nrow(subanalysis)
```
## Chi Square
```{r}
table(subanalysis$fairness_ki, subanalysis$geschlecht)
table1 = table(subanalysis$fairness_ki, subanalysis$geschlecht)
chisq.test(table1)
chisq.test(table1)$expected
```
```{r}
fisher.test(table1,conf.int = T,conf.level = 0.99)
```


###column%
```{r}
round(prop.table(table1,2),2)
```


## Sektor
subanalysis$sektor <- factor(subanalysis$sektor,levels = c(1,2,3,4,5), labels = c("Science","Clinical Work","Industry", "Other", "Not specified"))

```{r}
subanalysis_2 <- subset(subanalysis, subanalysis$sektor != 4 & subanalysis$sektor !=  5)
```

```{r}
subanalysis_2$sektor <- factor(subanalysis_2$sektor,levels = c(1,2,3), labels = c("Science","Clinical Work","Industry"))
```

```{r}
table(subanalysis_2$fairness_ki, subanalysis_2$sektor)
table1 = table(subanalysis_2$fairness_ki, subanalysis_2$sektor)
chisq.test(table1)
chisq.test(table1)$expected
```


###column%
```{r}
round(prop.table(table1,2),2)
```


https://jakec007.github.io/2021-06-23-R-likert/
http://reganmian.net/blog/2013/10/02/likert-graphs-in-r-embedding-metadata-for-easier-plotting/
https://github.com/jbryer/likert/blob/master/R/likert-package.R