added multivariate eda

bit2r · May 15, 2024 · da18e31 · da18e31
1 parent e0dcf62
commit da18e31
Show file tree

Hide file tree

Showing 11 changed files with 3,116 additions and 100 deletions.
diff --git a/02_eda/multivariate_stat/index.qmd b/02_eda/multivariate_stat/index.qmd
@@ -0,0 +1,391 @@
+---
+title: "다변량 기술통계"
+author: "이광춘"
+date: today
+image: thumbnail.png
+categories: [다변량, 기술통계, 범주형, 연속형, 막대그래프, 원그래프, 히스토그램, 상자그림, 시각화]
+editor_options: 
+  chunk_output_type: console
+---
+
+내장 데이터셋과 외부 CSV 파일에서 변수를 하나 이상 선택하면 범주형, 숫자형을 인식하고 선택한 변수 조합에 따라 적절한 기술통계를 계산하고 시각화할 수 있다.
+
+# Shiny 앱
+
+:::{.column-page}
+
+```{shinylive-r}
+#| label: shinylive-desc-stat-multivariate
+#| viewerWidth: 800
+#| viewerHeight: 600
+#| standalone: true
+
+library(shiny)
+library(ggplot2)
+library(GGally)
+library(vcd)
+library(dplyr)
+
+# Names of the built-in datasets offered in the app (iris is left out)
+datasets <- c("mtcars", "faithful", "ChickWeight")
+
+# Sidebar: data source controls plus the server-rendered variable pickers
+sidebar_controls <- sidebarPanel(
+  fileInput(inputId = "file", label = "CSV 파일 업로드", accept = ".csv"),
+  radioButtons(
+    inputId = "dataset",
+    label = "데이터셋 선택",
+    choices = c("내장 데이터셋", "업로드한 CSV 파일"),
+    selected = "내장 데이터셋"
+  ),
+  uiOutput("datasetInput"),
+  uiOutput("categoricalInput"),
+  uiOutput("numericInput")
+)
+
+# Main area: one tab for the plot, one for the text summaries
+result_tabs <- mainPanel(
+  tabsetPanel(
+    tabPanel("시각화", plotOutput("plot")),
+    tabPanel(
+      "요약 통계",
+      verbatimTextOutput("summary"),
+      verbatimTextOutput("correlation")
+    )
+  )
+)
+
+# UI definition: title on top, sidebar layout underneath
+ui <- fluidPage(
+  titlePanel("2변량 이상 기술통계 분석"),
+  sidebarLayout(sidebar_controls, result_tabs)
+)
+
+# Server logic.
+#
+# Builds the dynamic sidebar controls and renders the plot, the summary
+# statistics and the association output for the variables the user picked.
+#
+# Args:
+#   input:   Shiny input values (`file`, `dataset`, `selectedDataset`,
+#            `categoricalVariables`, `numericVariables`).
+#   output:  Shiny output slots (`datasetInput`, `categoricalInput`,
+#            `numericInput`, `summary`, `correlation`, `plot`).
+#   session: Shiny session object; not used directly.
+server <- function(input, output, session) {
+  # Radio-button value that means "use one of the built-in datasets"
+  builtin_choice <- "내장 데이터셋"
+
+  # Draw an empty base-graphics canvas that carries a single message.
+  draw_message <- function(msg) {
+    plot(1, type = "n", axes = FALSE, xlab = "", ylab = "")
+    text(1, 1, msg)
+  }
+
+  # Dataset picker: built-in names, or the name of the uploaded file
+  output$datasetInput <- renderUI({
+    if (input$dataset == builtin_choice) {
+      selectInput("selectedDataset", "내장 데이터셋 선택", choices = datasets)
+    } else {
+      selectInput("selectedDataset", "업로드한 CSV 파일 선택",
+                  choices = input$file$name)
+    }
+  })
+
+  # Currently selected data frame, or NULL when nothing usable is selected
+  selected_data <- reactive({
+    if (input$dataset == builtin_choice) {
+      dataset_name <- input$selectedDataset
+      # Only look up whitelisted names: right after the radio button is
+      # toggled the select input can still hold the uploaded file name, and
+      # get() on an arbitrary string would fail or fetch an unrelated object.
+      if (isTRUE(dataset_name %in% datasets)) {
+        get(dataset_name)
+      } else {
+        NULL
+      }
+    } else if (!is.null(input$file)) {
+      read.csv(input$file$datapath)
+    } else {
+      NULL
+    }
+  })
+
+  # Checkbox group listing the factor/character columns
+  output$categoricalInput <- renderUI({
+    data <- selected_data()
+    if (!is.null(data)) {
+      # vapply keeps the result logical even for a zero-column data frame
+      is_categorical <- vapply(
+        data,
+        function(column) is.factor(column) || is.character(column),
+        logical(1)
+      )
+      checkboxGroupInput("categoricalVariables", "범주형 변수 선택",
+                         choices = names(data)[is_categorical])
+    }
+  })
+
+  # Checkbox group listing the numeric columns
+  output$numericInput <- renderUI({
+    data <- selected_data()
+    if (!is.null(data)) {
+      is_number <- vapply(data, is.numeric, logical(1))
+      checkboxGroupInput("numericVariables", "연속형 변수 선택",
+                         choices = names(data)[is_number])
+    }
+  })
+
+  # Selected categorical variables. Selections left over from a previously
+  # chosen dataset are dropped so they cannot trigger "undefined columns"
+  # errors while the checkbox groups are being re-rendered.
+  selected_categorical_vars <- reactive({
+    intersect(input$categoricalVariables, names(selected_data()))
+  })
+
+  # Selected numeric variables (stale selections dropped, as above)
+  selected_numeric_vars <- reactive({
+    intersect(input$numericVariables, names(selected_data()))
+  })
+
+  # All selected variables, categorical ones first
+  selected_vars <- reactive({
+    c(selected_categorical_vars(), selected_numeric_vars())
+  })
+
+  # Summary statistics
+  output$summary <- renderPrint({
+    data <- selected_data()
+    # Wait for a dataset only; an empty selection must reach the hint below
+    req(data)
+    cat_vars <- selected_categorical_vars()
+    num_vars <- selected_numeric_vars()
+    chosen <- c(cat_vars, num_vars)
+
+    if (length(cat_vars) == 1 && length(num_vars) == 1) {
+      # One categorical + one numeric variable: per-group statistics
+      grouped_summary <- data %>%
+        group_by(across(all_of(cat_vars))) %>%
+        summarise(
+          mean = mean(.data[[num_vars]], na.rm = TRUE),
+          median = median(.data[[num_vars]], na.rm = TRUE),
+          sd = sd(.data[[num_vars]], na.rm = TRUE),
+          variance = var(.data[[num_vars]], na.rm = TRUE),
+          min = min(.data[[num_vars]], na.rm = TRUE),
+          max = max(.data[[num_vars]], na.rm = TRUE),
+          q1 = quantile(.data[[num_vars]], 0.25, na.rm = TRUE),
+          q3 = quantile(.data[[num_vars]], 0.75, na.rm = TRUE),
+          .groups = "drop"
+        )
+
+      print(grouped_summary)
+    } else if (length(chosen) >= 1) {
+      summary(data[, chosen, drop = FALSE])
+    } else {
+      "1개 이상의 변수를 선택해주세요."
+    }
+  })
+
+  # Association between the selected variables
+  output$correlation <- renderPrint({
+    req(selected_vars())
+    data <- selected_data()
+    cat_vars <- selected_categorical_vars()
+    num_vars <- selected_numeric_vars()
+
+    if (length(cat_vars) == 2) {
+      # Two categorical variables: contingency table and chi-squared test
+      contingency_table <- table(data[, cat_vars], useNA = "ifany")
+      chisq_test <- chisq.test(contingency_table)
+
+      cat("Contingency Table:\n")
+      print(contingency_table)
+      cat("\nChi-squared Test:\n")
+      print(chisq_test)
+    } else if (length(cat_vars) > 0 && length(num_vars) > 0) {
+      "범주형 변수와 연속형 변수 간의 상관관계는 계산할 수 없습니다."
+    } else if (length(cat_vars) >= 2) {
+      "범주형 변수가 너무 많습니다. 2개만 선택해 주세요."
+    } else if (length(num_vars) >= 2) {
+      cor_matrix <- cor(data[, num_vars, drop = FALSE],
+                        use = "pairwise.complete.obs")
+      print(cor_matrix)
+    } else {
+      "2개 이상의 동일한 유형의 변수를 선택해주세요."
+    }
+  })
+
+  # Selected data with the chosen categorical columns converted to factors
+  data_with_factors <- reactive({
+    data <- selected_data()
+    cat_vars <- selected_categorical_vars()
+
+    if (length(cat_vars) > 0) {
+      data[cat_vars] <- lapply(data[cat_vars], as.factor)
+    }
+
+    data
+  })
+
+  # Plot chosen from the number of categorical / numeric variables selected
+  output$plot <- renderPlot({
+    data <- data_with_factors()
+    # Wait for a dataset only; an empty selection must reach the hint below
+    req(data)
+    cat_vars <- selected_categorical_vars()
+    num_vars <- selected_numeric_vars()
+    n_cat <- length(cat_vars)
+    n_num <- length(num_vars)
+
+    if (n_num == 1 && n_cat == 0) {
+      # One numeric variable: histogram with a density curve
+      ggplot(data, aes(x = .data[[num_vars[1]]])) +
+        geom_histogram(aes(y = after_stat(density)), fill = "lightblue",
+                       color = "black", bins = 30) +
+        geom_density(color = "red") +
+        labs(title = "Histogram and Density Plot", x = num_vars[1],
+             y = "Density") +
+        theme_minimal()
+    } else if (n_num == 2 && n_cat == 0) {
+      # Two numeric variables: scatter plot
+      ggplot(data, aes(x = .data[[num_vars[1]]], y = .data[[num_vars[2]]])) +
+        geom_point() +
+        labs(title = "Scatter Plot", x = num_vars[1], y = num_vars[2]) +
+        theme_minimal()
+    } else if (n_num == 1 && n_cat == 1) {
+      # One numeric + one categorical variable: box plot per group
+      ggplot(data, aes(x = .data[[cat_vars[1]]], y = .data[[num_vars[1]]])) +
+        geom_boxplot() +
+        labs(title = "Box Plot", x = cat_vars[1], y = num_vars[1]) +
+        theme_minimal()
+    } else if (n_num == 2 && n_cat == 1) {
+      # Two numeric + one categorical variable: scatter plot colored by group
+      ggplot(data, aes(x = .data[[num_vars[1]]], y = .data[[num_vars[2]]],
+                       color = .data[[cat_vars[1]]])) +
+        geom_point() +
+        labs(title = "Scatter Plot with Group Color", x = num_vars[1],
+             y = num_vars[2], color = cat_vars[1]) +
+        theme_minimal()
+    } else if (n_cat == 2) {
+      # Two categorical variables: mosaic plot. This branch deliberately keeps
+      # precedence over any numeric variables that are also selected.
+      if (nrow(table(data[, cat_vars])) == 0) {
+        draw_message("선택한 범주형 변수의 조합에 해당하는 데이터가 없습니다.")
+      } else {
+        mosaic(table(data[, cat_vars]), shade = TRUE, legend = TRUE)
+      }
+    } else if (n_cat == 1 && n_num == 0) {
+      # Exactly one categorical variable and nothing else: bar + pie chart.
+      # The n_num == 0 guard lets one categorical plus three or more numeric
+      # variables fall through to the pair plot instead of being drawn as a
+      # bar chart that ignores the numeric selection.
+      p1 <- ggplot(data, aes(x = .data[[cat_vars[1]]])) +
+        geom_bar(fill = "lightblue", color = "black") +
+        labs(title = "Bar Plot", x = cat_vars[1], y = "Count") +
+        theme_minimal()
+
+      p2 <- ggplot(data, aes(x = factor(1), fill = .data[[cat_vars[1]]])) +
+        geom_bar(width = 1) +
+        coord_polar("y", start = 0) +
+        labs(title = "Pie Chart", fill = cat_vars[1]) +
+        theme_void() +
+        theme(legend.position = "right")
+
+      gridExtra::grid.arrange(p1, p2, ncol = 2)
+    } else if (n_cat + n_num >= 3) {
+      # Three or more variables: pair plot
+      if (n_num < 2) {
+        draw_message("Pair Plot을 그리려면 연속형 변수가 2개 이상 필요합니다.")
+      } else {
+        ggpairs(data[, c(cat_vars, num_vars), drop = FALSE],
+                title = "Pair Plot")
+      }
+    } else {
+      draw_message("1개 이상의 변수를 선택해주세요.")
+    }
+  })
+}
+# Launch the app
+shinyApp(ui = ui, server = server)
+
+```
+
+:::
+
+
+# 코딩
+
+```{webr-r}
+#| label: desc-stat-multivariate-webr
+
+# 필요한 라이브러리 로드
+library(ggplot2)
+library(GGally)
+library(vcd)
+library(dplyr)
+library(purrr)
+
+# Load the data (mtcars serves as the example dataset)
+data <- mtcars
+
+# Columns treated as continuous and as categorical
+numeric_vars <- c("mpg", "disp", "hp", "drat", "wt", "qsec")
+categorical_vars <- c("vs", "am", "gear", "carb")
+
+# Turn every categorical column into a factor
+data[categorical_vars] <- lapply(data[categorical_vars], as.factor)
+
+# Visualization function.
+#
+# Picks a plot type from the number of categorical and numeric variables:
+#   1 numeric                     -> histogram with density curve
+#   2 numeric                     -> scatter plot
+#   1 numeric + 1 categorical     -> box plot per group
+#   2 numeric + 1 categorical     -> scatter plot colored by group
+#   2 categorical                 -> mosaic plot (takes precedence over any
+#                                    numeric variables passed alongside)
+#   1 categorical, no numeric     -> bar chart and pie chart side by side
+#   3 or more numeric             -> pair plot of the numeric variables
+#   anything else                 -> blank canvas with a hint
+#
+# Args:
+#   data:             data frame holding the columns named below.
+#   categorical_vars: character vector of categorical column names.
+#   numeric_vars:     character vector of numeric column names.
+#
+# Returns:
+#   The plot object for the ggplot2 / GGally branches; the other branches
+#   draw on the active graphics device.
+visualize_data <- function(data, categorical_vars, numeric_vars) {
+  n_cat <- length(categorical_vars)
+  n_num <- length(numeric_vars)
+
+  # Draw an empty base-graphics canvas that carries a single message.
+  draw_message <- function(msg) {
+    plot(1, type = "n", axes = FALSE, xlab = "", ylab = "")
+    text(1, 1, msg)
+  }
+
+  if (n_num == 1 && n_cat == 0) {
+    # One numeric variable
+    ggplot(data, aes(x = .data[[numeric_vars[1]]])) +
+      geom_histogram(aes(y = after_stat(density)), fill = "lightblue",
+                     color = "black", bins = 30) +
+      geom_density(color = "red") +
+      labs(title = "히스토그램과 밀도 그래프", x = numeric_vars[1], y = "밀도") +
+      theme_minimal()
+  } else if (n_num == 2 && n_cat == 0) {
+    # Two numeric variables
+    ggplot(data, aes(x = .data[[numeric_vars[1]]],
+                     y = .data[[numeric_vars[2]]])) +
+      geom_point() +
+      labs(title = "산점도", x = numeric_vars[1], y = numeric_vars[2]) +
+      theme_minimal()
+  } else if (n_num == 1 && n_cat == 1) {
+    # One numeric + one categorical variable
+    ggplot(data, aes(x = .data[[categorical_vars[1]]],
+                     y = .data[[numeric_vars[1]]])) +
+      geom_boxplot() +
+      labs(title = "상자 그림", x = categorical_vars[1], y = numeric_vars[1]) +
+      theme_minimal()
+  } else if (n_num == 2 && n_cat == 1) {
+    # Two numeric + one categorical variable
+    ggplot(data, aes(x = .data[[numeric_vars[1]]],
+                     y = .data[[numeric_vars[2]]],
+                     color = .data[[categorical_vars[1]]])) +
+      geom_point() +
+      labs(title = "그룹별 색상이 적용된 산점도", x = numeric_vars[1],
+           y = numeric_vars[2], color = categorical_vars[1]) +
+      theme_minimal()
+  } else if (n_cat == 2) {
+    # Two categorical variables
+    if (nrow(table(data[, categorical_vars])) == 0) {
+      draw_message("선택한 범주형 변수의 조합에 해당하는 데이터가 없습니다.")
+    } else {
+      mosaic(table(data[, categorical_vars]), shade = TRUE, legend = TRUE)
+    }
+  } else if (n_cat == 1 && n_num == 0) {
+    # Exactly one categorical variable and nothing else. The n_num == 0 guard
+    # lets one categorical plus three or more numeric variables fall through
+    # to the pair plot instead of being drawn as a bar chart that ignores
+    # the numeric variables.
+    p1 <- ggplot(data, aes(x = .data[[categorical_vars[1]]])) +
+      geom_bar(fill = "lightblue", color = "black") +
+      labs(title = "막대 그래프", x = categorical_vars[1], y = "개수") +
+      theme_minimal()
+
+    p2 <- ggplot(data, aes(x = factor(1),
+                           fill = .data[[categorical_vars[1]]])) +
+      geom_bar(width = 1) +
+      coord_polar("y", start = 0) +
+      labs(title = "원 그래프", fill = categorical_vars[1]) +
+      theme_void() +
+      theme(legend.position = "right")
+
+    gridExtra::grid.arrange(p1, p2, ncol = 2)
+  } else if (n_num >= 3) {
+    # Three or more numeric variables
+    ggpairs(data[, numeric_vars, drop = FALSE], title = "변수 간 관계 그래프")
+  } else {
+    draw_message("1개 이상의 변수를 선택해주세요.")
+  }
+}
+
+# Summary statistics function.
+#
+# Prints a summary chosen from the number of variables of each kind:
+#   1 categorical + 1 numeric -> per-group descriptive statistics
+#   2 categorical             -> contingency table and chi-squared test
+#   2 or more numeric         -> pairwise correlation matrix
+#   anything else             -> a hint listing the supported combinations
+#
+# Args:
+#   data:             data frame holding the columns named below.
+#   categorical_vars: character vector of categorical column names.
+#   numeric_vars:     character vector of numeric column names.
+#
+# Returns:
+#   Invisibly, the object that was printed last (NULL for the hint branch).
+summarize_data <- function(data, categorical_vars, numeric_vars) {
+  if (length(categorical_vars) == 1 && length(numeric_vars) == 1) {
+    # One categorical + one numeric variable. The numeric column is addressed
+    # through the .data pronoun instead of get(), so the lookup is limited to
+    # the data frame's columns and cannot pick up an unrelated object.
+    grouped_summary <- data %>%
+      group_by(across(all_of(categorical_vars))) %>%
+      summarise(
+        평균 = mean(.data[[numeric_vars]], na.rm = TRUE),
+        중앙값 = median(.data[[numeric_vars]], na.rm = TRUE),
+        표준편차 = sd(.data[[numeric_vars]], na.rm = TRUE),
+        분산 = var(.data[[numeric_vars]], na.rm = TRUE),
+        최소값 = min(.data[[numeric_vars]], na.rm = TRUE),
+        최대값 = max(.data[[numeric_vars]], na.rm = TRUE),
+        `1사분위수` = quantile(.data[[numeric_vars]], 0.25, na.rm = TRUE),
+        `3사분위수` = quantile(.data[[numeric_vars]], 0.75, na.rm = TRUE),
+        .groups = "drop"
+      )
+
+    print(grouped_summary)
+  } else if (length(categorical_vars) == 2) {
+    # Two categorical variables
+    contingency_table <- table(data[, categorical_vars], useNA = "ifany")
+    chisq_test <- chisq.test(contingency_table)
+
+    cat("분할표:\n")
+    print(contingency_table)
+    cat("\n카이제곱 검정:\n")
+    print(chisq_test)
+  } else if (length(numeric_vars) >= 2) {
+    # Two or more numeric variables
+    cor_matrix <- cor(data[, numeric_vars, drop = FALSE],
+                      use = "pairwise.complete.obs")
+    print(cor_matrix)
+  } else {
+    cat("요약 통계량을 계산할 수 없습니다.\n")
+    cat("범주형 변수 1개와 연속형 변수 1개, 범주형 변수 2개, 또는 연속형 변수 2개 이상을 선택해주세요.\n")
+  }
+}
+
+# Draw the plot for the chosen variables
+visualize_data(
+  data = data,
+  categorical_vars = categorical_vars,
+  numeric_vars = numeric_vars
+)
+
+# Print the matching summary statistics
+summarize_data(
+  data = data,
+  categorical_vars = categorical_vars,
+  numeric_vars = numeric_vars
+)
+```
+