added ci

bit2r · May 15, 2024 · 2a4a8b8 · 2a4a8b8
1 parent db81c93
commit 2a4a8b8
Show file tree

Hide file tree

Showing 10 changed files with 2,938 additions and 6 deletions.
diff --git a/05_infer/ci/index.qmd b/05_infer/ci/index.qmd
@@ -0,0 +1,368 @@
+---
+title: "신뢰구간"
+author: "이광춘"
+date: today
+categories: [추론, 신뢰구간]
+image: "thumbnail.png"
+editor_options: 
+  chunk_output_type: console
+---
+
+신뢰구간은 통계학에서 모수 추정에 사용되는 중요한 개념입니다. 
+신뢰구간은 표본에서 계산된 추정량이 실제 모수를 포함할 확률을 나타내는 구간을 의미합니다. 신뢰구간은 모수 추정의 불확실성을 고려하여 모수 추정에 대한 더 정확한 정보를 제공합니다. 대표적으로 모집단 평균과 모집단 비율에 대한 신뢰구간을 계산하는 방법을 시각적으로 확인할 수 있습니다.
+
+# Shiny 앱
+
+:::{.column-page}
+
+```{shinylive-r}
+#| label: shinylive-infer-ci
+#| viewerWidth: 800
+#| viewerHeight: 600
+#| standalone: true
+
+library(shiny)
+library(ggplot2)
+library(tidyverse)
+library(showtext)
+showtext_auto()  
+
+ui <- fluidPage(
+  title = "신뢰구간 모의실험",
+
+  fluidRow(
+    column(3,
+           h2('신뢰구간'),
+           hr(),
+           radioButtons("case", "평균과 비율 추정 선택:",
+                        choices = c("모집단 평균", "모집단 비율"),
+                        inline = TRUE,
+                        selected = "모집단 평균"),
+           hr(),
+           conditionalPanel(
+             condition = "input.case == '모집단 비율'",
+             sliderInput("pop_prop",
+                         "모집단 비율:",
+                         min = 0,
+                         max = 1,
+                         step = 0.05,
+                         value = 0.4)
+           ),
+           conditionalPanel(
+             condition = "input.case == '모집단 평균'",
+             numericInput("pop_mean", "모집단 평균:", value = 10),
+             numericInput("pop_sd", "모집단 표준편차:", value = 5, min = 0)
+           ),
+           hr(),
+           sliderInput("sample_size",
+                       "표본 크기:",
+                       min = 10,
+                       max = 1000,
+                       step = 10,
+                       value = 100),
+           sliderInput("n_samples",
+                       "표본 수:",
+                       min = 0,
+                       max = 500,
+                       step = 10,
+                       value = 10),
+           sliderInput("conf_level",
+                       "신뢰 수준:",
+                       min = 5,
+                       max = 99,
+                       step = 1,
+                       value = 95)
+    ),
+    column(8,
+           mainPanel(
+             plotOutput('IntervalPlot', height = '500px'),
+             verbatimTextOutput("summary"),
+             verbatimTextOutput("ci")
+           )
+    )
+  )
+)
+
+server <- function(input, output) {
+
+  output$IntervalPlot <- renderPlot({
+    n_samples = input$n_samples
+    sample_size = input$sample_size
+    alpha = 1 - (input$conf_level / 100)
+
+    if (input$case == "모집단 비율") {
+      n_population <- 100000
+      population <- rbernoulli(n_population, p = input$pop_prop)
+      true_value <- mean(population)
+
+      sample_stats <- vector('numeric', n_samples)
+      upper_cis <- vector('numeric', n_samples)
+      lower_cis <- vector('numeric', n_samples)
+      contains_true_value <- vector('numeric', n_samples)
+      for (i in 1:n_samples) {
+        sample <- sample(population, sample_size, replace = FALSE)
+        sample_stat <- mean(sample)
+        sample_std <- (sqrt(sample_stat * (1 - sample_stat)) / sqrt(sample_size))
+        z_value <- qnorm(alpha/2, lower.tail = FALSE)
+        margin_of_error <- z_value * sample_std
+
+        sample_stats[i] <- sample_stat
+        upper_cis[i] <- sample_stat + margin_of_error
+        lower_cis[i] <- sample_stat - margin_of_error
+        contains_true_value[i] <- ifelse(true_value >= lower_cis[i] & true_value <= upper_cis[i], 1, 0)
+      }
+
+      title_text <- paste(input$conf_level, "% 신뢰구간 (모집단 비율)", sep = " ")
+      y_label <- "표본 비율"
+
+    } else if (input$case == "모집단 평균") {
+      n_population <- 100000
+      population <- rnorm(n_population, mean = input$pop_mean, sd = input$pop_sd)
+      true_value <- mean(population)
+
+      sample_stats <- vector('numeric', n_samples)
+      upper_cis <- vector('numeric', n_samples)
+      lower_cis <- vector('numeric', n_samples)
+      contains_true_value <- vector('numeric', n_samples)
+      for (i in 1:n_samples) {
+        sample <- sample(population, sample_size, replace = FALSE)
+        sample_stat <- mean(sample)
+        sample_std <- sd(sample) / sqrt(sample_size)
+        t_value <- qt(alpha/2, df = sample_size - 1, lower.tail = FALSE)
+        margin_of_error <- t_value * sample_std
+
+        sample_stats[i] <- sample_stat
+        upper_cis[i] <- sample_stat + margin_of_error
+        lower_cis[i] <- sample_stat - margin_of_error
+        contains_true_value[i] <- ifelse(true_value >= lower_cis[i] & true_value <= upper_cis[i], 1, 0)
+      }
+
+      title_text <- paste(input$conf_level, "% 신뢰구간 (모집단 평균)", sep = " ")
+      y_label <- "표본 평균"
+    }
+
+    subtitle_text <- paste0('관측된 포함률: ',
+                            round(mean(contains_true_value), 2) * 100, '%')
+
+    contains_true_value_f <- ifelse(contains_true_value==1, "포함", "미포함")
+    contains_true_value_f <- factor(contains_true_value_f)
+
+    p <- tibble(
+      sample_stat = sample_stats,
+      upper_ci = upper_cis,
+      lower_ci = lower_cis) %>%
+      mutate(sample_number = as.factor(row_number())) %>%
+      ggplot(aes(x = sample_number, y = sample_stat)) +
+      coord_flip() +
+      ggtitle(title_text, subtitle = subtitle_text) +
+      ylab(y_label) +
+      xlab("") +
+      theme_minimal() +
+      theme(axis.text.y = element_blank(),
+            axis.ticks.y = element_blank(),
+            axis.line.x = element_line(color="black", size = 0.5),
+            legend.position = "none") +
+      scale_x_discrete(breaks = NULL)
+
+    p <- p + geom_hline(aes(yintercept = true_value),
+                        linetype = 'dashed', size = 1)
+    p <- p +
+      geom_linerange(aes(ymin = lower_ci, ymax = upper_ci, color=contains_true_value_f)) +
+      geom_point(aes(color=contains_true_value_f), size=2) +
+      scale_color_manual(values=c("포함" = "#009E73", "미포함" = "#D55E00"))
+
+
+    p
+  })
+
+  output$summary <- renderPrint({
+    n_samples = input$n_samples
+    sample_size = input$sample_size
+
+    if (input$case == "모집단 비율") {
+      n_population <- 100000
+      population <- rbernoulli(n_population, p = input$pop_prop)
+
+      sample_stats <- vector('numeric', n_samples)
+      for (i in 1:n_samples) {
+        sample <- sample(population, sample_size, replace = FALSE)
+        sample_stats[i] <- mean(sample)
+      }
+
+      cat("표본 비율의 평균(점추정):\n")
+      print(mean(sample_stats))
+
+    } else if (input$case == "모집단 평균") {
+      n_population <- 100000
+      population <- rnorm(n_population, mean = input$pop_mean, sd = input$pop_sd)
+
+      sample_stats <- vector('numeric', n_samples)
+      for (i in 1:n_samples) {
+        sample <- sample(population, sample_size, replace = FALSE)
+        sample_stats[i] <- mean(sample)
+      }
+
+      cat("표본 평균의 평균(점 추정):\n")
+      print(mean(sample_stats))
+    }
+  })
+
+  output$ci <- renderPrint({
+    n_samples = input$n_samples
+    sample_size = input$sample_size
+    alpha = 1 - (input$conf_level / 100)
+
+    if (input$case == "모집단 비율") {
+      n_population <- 100000
+      population <- rbernoulli(n_population, p = input$pop_prop)
+
+      sample_stats <- vector('numeric', n_samples)
+      for (i in 1:n_samples) {
+        sample <- sample(population, sample_size, replace = FALSE)
+        sample_stats[i] <- mean(sample)
+      }
+
+      se <- sqrt(mean(sample_stats) * (1 - mean(sample_stats)) / sample_size)
+      lower_ci <- mean(sample_stats) - qnorm(1 - alpha/2) * se
+      upper_ci <- mean(sample_stats) + qnorm(1 - alpha/2) * se
+
+      cat(paste0(input$conf_level, "% 신뢰구간 (모집단 비율):\n"))
+      cat(paste0("[", round(lower_ci, 2), ", ", round(upper_ci, 2), "]"))
+
+    } else if (input$case == "모집단 평균") {
+      n_population <- 100000
+      population <- rnorm(n_population, mean = input$pop_mean, sd = input$pop_sd)
+
+      sample_stats <- vector('numeric', n_samples)
+      for (i in 1:n_samples) {
+        sample <- sample(population, sample_size, replace = FALSE)
+        sample_stats[i] <- mean(sample)
+      }
+
+      se <- sd(sample_stats) / sqrt(n_samples)
+      lower_ci <- mean(sample_stats) - qt(1 - alpha/2, df = n_samples - 1) * se
+      upper_ci <- mean(sample_stats) + qt(1 - alpha/2, df = n_samples - 1) * se
+
+      cat(paste0(input$conf_level, "% 신뢰구간 (모집단 평균):\n"))
+      cat(paste0("[", round(lower_ci, 2), ", ", round(upper_ci, 2), "]"))
+    }
+  })
+}
+
+shinyApp(ui = ui, server = server)
+```
+
+:::
+
+
+
+# 코딩
+
+```{webr-r}
+library(ggplot2)
+
+# 설정값 지정
+n_samples <- 100
+sample_size <- 50
+conf_level <- 0.95
+
+# 모집단 비율에 대한 신뢰구간 계산 및 시각화
+pop_prop <- 0.4
+n_population <- 100000
+population <- rbernoulli(n_population, p = pop_prop)
+true_prop <- mean(population)
+
+sample_props <- vector('numeric', n_samples)
+upper_cis_prop <- vector('numeric', n_samples)
+lower_cis_prop <- vector('numeric', n_samples)
+contains_true_prop <- vector('numeric', n_samples)
+
+for (i in 1:n_samples) {
+  sample <- sample(population, sample_size, replace = FALSE)
+  sample_prop <- mean(sample)
+  sample_std <- sqrt(sample_prop * (1 - sample_prop) / sample_size)
+  z_value <- qnorm((1 - conf_level) / 2, lower.tail = FALSE)
+  margin_of_error <- z_value * sample_std
+  
+  sample_props[i] <- sample_prop
+  upper_cis_prop[i] <- sample_prop + margin_of_error
+  lower_cis_prop[i] <- sample_prop - margin_of_error
+  contains_true_prop[i] <- ifelse(true_prop >= lower_cis_prop[i] & true_prop <= upper_cis_prop[i], 1, 0)
+}
+
+prop_df <- data.frame(
+  sample_prop = sample_props,
+  upper_ci = upper_cis_prop,
+  lower_ci = lower_cis_prop,
+  contains_true_prop = contains_true_prop
+)
+
+prop_plot <- ggplot(prop_df, aes(x = 1:n_samples, y = sample_prop)) +
+  geom_point() +
+  geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci, color = factor(contains_true_prop))) +
+  geom_hline(yintercept = true_prop, linetype = "dashed", color = "red") +
+  labs(title = "모집단 비율에 대한 신뢰구간",
+       x = "표본 번호",
+       y = "표본 비율") +
+  scale_color_manual(values = c("0" = "gray", "1" = "blue"),
+                     labels = c("포함 안 함", "포함"),
+                     name = "참값 포함 여부") +
+  theme_minimal()
+
+print(prop_plot)
+
+cat("모집단 비율에 대한 신뢰구간 포함률:", mean(contains_true_prop), "\n")
+
+# 모집단 평균에 대한 신뢰구간 계산 및 시각화
+pop_mean <- 10
+pop_sd <- 2
+n_population <- 100000
+population <- rnorm(n_population, mean = pop_mean, sd = pop_sd)
+true_mean <- mean(population)
+
+sample_means <- vector('numeric', n_samples)
+upper_cis_mean <- vector('numeric', n_samples)
+lower_cis_mean <- vector('numeric', n_samples)
+contains_true_mean <- vector('numeric', n_samples)
+
+for (i in 1:n_samples) {
+  sample <- sample(population, sample_size, replace = FALSE)
+  sample_mean <- mean(sample)
+  sample_std <- sd(sample) / sqrt(sample_size)
+  t_value <- qt((1 - conf_level) / 2, df = sample_size - 1, lower.tail = FALSE)
+  margin_of_error <- t_value * sample_std
+  
+  sample_means[i] <- sample_mean
+  upper_cis_mean[i] <- sample_mean + margin_of_error
+  lower_cis_mean[i] <- sample_mean - margin_of_error
+  contains_true_mean[i] <- ifelse(true_mean >= lower_cis_mean[i] & true_mean <= upper_cis_mean[i], 1, 0)
+}
+
+mean_df <- data.frame(
+  sample_mean = sample_means,
+  upper_ci = upper_cis_mean,
+  lower_ci = lower_cis_mean,
+  contains_true_mean = contains_true_mean
+)
+
+mean_plot <- ggplot(mean_df, aes(x = 1:n_samples, y = sample_mean)) +
+  geom_point() +
+  geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci, color = factor(contains_true_mean))) +
+  geom_hline(yintercept = true_mean, linetype = "dashed", color = "red") +
+  labs(title = "모집단 평균에 대한 신뢰구간",
+       x = "표본 번호",
+       y = "표본 평균") +
+  scale_color_manual(values = c("0" = "gray", "1" = "blue"),
+                     labels = c("포함 안 함", "포함"),
+                     name = "참값 포함") +
+  theme_minimal()
+
+
+print(mean_plot)
+
+cat("모집단 평균에 대한 신뢰구간 포함률:", mean(contains_true_mean), "\n")
+```
+
+
+
+