Skip to content

Commit

Permalink
added ci
Browse files Browse the repository at this point in the history
  • Loading branch information
statkclee committed May 15, 2024
1 parent db81c93 commit 2a4a8b8
Show file tree
Hide file tree
Showing 10 changed files with 2,938 additions and 6 deletions.
368 changes: 368 additions & 0 deletions 05_infer/ci/index.qmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,368 @@
---
title: "신뢰구간"
author: "이광춘"
date: today
categories: [추론, 신뢰구간]
image: "thumbnail.png"
editor_options:
chunk_output_type: console
---

신뢰구간은 통계학에서 모수 추정에 사용되는 중요한 개념입니다.
신뢰구간은 표본에서 계산된 추정량이 실제 모수를 포함할 확률을 나타내는 구간을 의미합니다. 신뢰구간은 모수 추정의 불확실성을 고려하여 모수 추정에 대한 더 정확한 정보를 제공합니다. 대표적으로 모집단 평균과 모집단 비율에 대한 신뢰구간을 계산하는 방법을 시각적으로 확인할 수 있습니다.

# Shiny 앱

:::{.column-page}

```{shinylive-r}
#| label: shinylive-infer-ci
#| viewerWidth: 800
#| viewerHeight: 600
#| standalone: true
library(shiny)
library(ggplot2)
library(tidyverse)
library(showtext)
showtext_auto()
ui <- fluidPage(
title = "신뢰구간 모의실험",
fluidRow(
column(3,
h2('신뢰구간'),
hr(),
radioButtons("case", "평균과 비율 추정 선택:",
choices = c("모집단 평균", "모집단 비율"),
inline = TRUE,
selected = "모집단 평균"),
hr(),
conditionalPanel(
condition = "input.case == '모집단 비율'",
sliderInput("pop_prop",
"모집단 비율:",
min = 0,
max = 1,
step = 0.05,
value = 0.4)
),
conditionalPanel(
condition = "input.case == '모집단 평균'",
numericInput("pop_mean", "모집단 평균:", value = 10),
numericInput("pop_sd", "모집단 표준편차:", value = 5, min = 0)
),
hr(),
sliderInput("sample_size",
"표본 크기:",
min = 10,
max = 1000,
step = 10,
value = 100),
sliderInput("n_samples",
"표본 수:",
min = 0,
max = 500,
step = 10,
value = 10),
sliderInput("conf_level",
"신뢰 수준:",
min = 5,
max = 99,
step = 1,
value = 95)
),
column(8,
mainPanel(
plotOutput('IntervalPlot', height = '500px'),
verbatimTextOutput("summary"),
verbatimTextOutput("ci")
)
)
)
)
server <- function(input, output) {
output$IntervalPlot <- renderPlot({
n_samples = input$n_samples
sample_size = input$sample_size
alpha = 1 - (input$conf_level / 100)
if (input$case == "모집단 비율") {
n_population <- 100000
population <- rbernoulli(n_population, p = input$pop_prop)
true_value <- mean(population)
sample_stats <- vector('numeric', n_samples)
upper_cis <- vector('numeric', n_samples)
lower_cis <- vector('numeric', n_samples)
contains_true_value <- vector('numeric', n_samples)
for (i in 1:n_samples) {
sample <- sample(population, sample_size, replace = FALSE)
sample_stat <- mean(sample)
sample_std <- (sqrt(sample_stat * (1 - sample_stat)) / sqrt(sample_size))
z_value <- qnorm(alpha/2, lower.tail = FALSE)
margin_of_error <- z_value * sample_std
sample_stats[i] <- sample_stat
upper_cis[i] <- sample_stat + margin_of_error
lower_cis[i] <- sample_stat - margin_of_error
contains_true_value[i] <- ifelse(true_value >= lower_cis[i] & true_value <= upper_cis[i], 1, 0)
}
title_text <- paste(input$conf_level, "% 신뢰구간 (모집단 비율)", sep = " ")
y_label <- "표본 비율"
} else if (input$case == "모집단 평균") {
n_population <- 100000
population <- rnorm(n_population, mean = input$pop_mean, sd = input$pop_sd)
true_value <- mean(population)
sample_stats <- vector('numeric', n_samples)
upper_cis <- vector('numeric', n_samples)
lower_cis <- vector('numeric', n_samples)
contains_true_value <- vector('numeric', n_samples)
for (i in 1:n_samples) {
sample <- sample(population, sample_size, replace = FALSE)
sample_stat <- mean(sample)
sample_std <- sd(sample) / sqrt(sample_size)
t_value <- qt(alpha/2, df = sample_size - 1, lower.tail = FALSE)
margin_of_error <- t_value * sample_std
sample_stats[i] <- sample_stat
upper_cis[i] <- sample_stat + margin_of_error
lower_cis[i] <- sample_stat - margin_of_error
contains_true_value[i] <- ifelse(true_value >= lower_cis[i] & true_value <= upper_cis[i], 1, 0)
}
title_text <- paste(input$conf_level, "% 신뢰구간 (모집단 평균)", sep = " ")
y_label <- "표본 평균"
}
subtitle_text <- paste0('관측된 포함률: ',
round(mean(contains_true_value), 2) * 100, '%')
contains_true_value_f <- ifelse(contains_true_value==1, "포함", "미포함")
contains_true_value_f <- factor(contains_true_value_f)
p <- tibble(
sample_stat = sample_stats,
upper_ci = upper_cis,
lower_ci = lower_cis) %>%
mutate(sample_number = as.factor(row_number())) %>%
ggplot(aes(x = sample_number, y = sample_stat)) +
coord_flip() +
ggtitle(title_text, subtitle = subtitle_text) +
ylab(y_label) +
xlab("") +
theme_minimal() +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.line.x = element_line(color="black", size = 0.5),
legend.position = "none") +
scale_x_discrete(breaks = NULL)
p <- p + geom_hline(aes(yintercept = true_value),
linetype = 'dashed', size = 1)
p <- p +
geom_linerange(aes(ymin = lower_ci, ymax = upper_ci, color=contains_true_value_f)) +
geom_point(aes(color=contains_true_value_f), size=2) +
scale_color_manual(values=c("포함" = "#009E73", "미포함" = "#D55E00"))
p
})
output$summary <- renderPrint({
n_samples = input$n_samples
sample_size = input$sample_size
if (input$case == "모집단 비율") {
n_population <- 100000
population <- rbernoulli(n_population, p = input$pop_prop)
sample_stats <- vector('numeric', n_samples)
for (i in 1:n_samples) {
sample <- sample(population, sample_size, replace = FALSE)
sample_stats[i] <- mean(sample)
}
cat("표본 비율의 평균(점추정):\n")
print(mean(sample_stats))
} else if (input$case == "모집단 평균") {
n_population <- 100000
population <- rnorm(n_population, mean = input$pop_mean, sd = input$pop_sd)
sample_stats <- vector('numeric', n_samples)
for (i in 1:n_samples) {
sample <- sample(population, sample_size, replace = FALSE)
sample_stats[i] <- mean(sample)
}
cat("표본 평균의 평균(점 추정):\n")
print(mean(sample_stats))
}
})
output$ci <- renderPrint({
n_samples = input$n_samples
sample_size = input$sample_size
alpha = 1 - (input$conf_level / 100)
if (input$case == "모집단 비율") {
n_population <- 100000
population <- rbernoulli(n_population, p = input$pop_prop)
sample_stats <- vector('numeric', n_samples)
for (i in 1:n_samples) {
sample <- sample(population, sample_size, replace = FALSE)
sample_stats[i] <- mean(sample)
}
se <- sqrt(mean(sample_stats) * (1 - mean(sample_stats)) / sample_size)
lower_ci <- mean(sample_stats) - qnorm(1 - alpha/2) * se
upper_ci <- mean(sample_stats) + qnorm(1 - alpha/2) * se
cat(paste0(input$conf_level, "% 신뢰구간 (모집단 비율):\n"))
cat(paste0("[", round(lower_ci, 2), ", ", round(upper_ci, 2), "]"))
} else if (input$case == "모집단 평균") {
n_population <- 100000
population <- rnorm(n_population, mean = input$pop_mean, sd = input$pop_sd)
sample_stats <- vector('numeric', n_samples)
for (i in 1:n_samples) {
sample <- sample(population, sample_size, replace = FALSE)
sample_stats[i] <- mean(sample)
}
se <- sd(sample_stats) / sqrt(n_samples)
lower_ci <- mean(sample_stats) - qt(1 - alpha/2, df = n_samples - 1) * se
upper_ci <- mean(sample_stats) + qt(1 - alpha/2, df = n_samples - 1) * se
cat(paste0(input$conf_level, "% 신뢰구간 (모집단 평균):\n"))
cat(paste0("[", round(lower_ci, 2), ", ", round(upper_ci, 2), "]"))
}
})
}
shinyApp(ui = ui, server = server)
```

:::



# 코딩

```{webr-r}
library(ggplot2)
# 설정값 지정
n_samples <- 100
sample_size <- 50
conf_level <- 0.95
# 모집단 비율에 대한 신뢰구간 계산 및 시각화
pop_prop <- 0.4
n_population <- 100000
population <- rbernoulli(n_population, p = pop_prop)
true_prop <- mean(population)
sample_props <- vector('numeric', n_samples)
upper_cis_prop <- vector('numeric', n_samples)
lower_cis_prop <- vector('numeric', n_samples)
contains_true_prop <- vector('numeric', n_samples)
for (i in 1:n_samples) {
sample <- sample(population, sample_size, replace = FALSE)
sample_prop <- mean(sample)
sample_std <- sqrt(sample_prop * (1 - sample_prop) / sample_size)
z_value <- qnorm((1 - conf_level) / 2, lower.tail = FALSE)
margin_of_error <- z_value * sample_std
sample_props[i] <- sample_prop
upper_cis_prop[i] <- sample_prop + margin_of_error
lower_cis_prop[i] <- sample_prop - margin_of_error
contains_true_prop[i] <- ifelse(true_prop >= lower_cis_prop[i] & true_prop <= upper_cis_prop[i], 1, 0)
}
prop_df <- data.frame(
sample_prop = sample_props,
upper_ci = upper_cis_prop,
lower_ci = lower_cis_prop,
contains_true_prop = contains_true_prop
)
prop_plot <- ggplot(prop_df, aes(x = 1:n_samples, y = sample_prop)) +
geom_point() +
geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci, color = factor(contains_true_prop))) +
geom_hline(yintercept = true_prop, linetype = "dashed", color = "red") +
labs(title = "모집단 비율에 대한 신뢰구간",
x = "표본 번호",
y = "표본 비율") +
scale_color_manual(values = c("0" = "gray", "1" = "blue"),
labels = c("포함 안 함", "포함"),
name = "참값 포함 여부") +
theme_minimal()
print(prop_plot)
cat("모집단 비율에 대한 신뢰구간 포함률:", mean(contains_true_prop), "\n")
# 모집단 평균에 대한 신뢰구간 계산 및 시각화
pop_mean <- 10
pop_sd <- 2
n_population <- 100000
population <- rnorm(n_population, mean = pop_mean, sd = pop_sd)
true_mean <- mean(population)
sample_means <- vector('numeric', n_samples)
upper_cis_mean <- vector('numeric', n_samples)
lower_cis_mean <- vector('numeric', n_samples)
contains_true_mean <- vector('numeric', n_samples)
for (i in 1:n_samples) {
sample <- sample(population, sample_size, replace = FALSE)
sample_mean <- mean(sample)
sample_std <- sd(sample) / sqrt(sample_size)
t_value <- qt((1 - conf_level) / 2, df = sample_size - 1, lower.tail = FALSE)
margin_of_error <- t_value * sample_std
sample_means[i] <- sample_mean
upper_cis_mean[i] <- sample_mean + margin_of_error
lower_cis_mean[i] <- sample_mean - margin_of_error
contains_true_mean[i] <- ifelse(true_mean >= lower_cis_mean[i] & true_mean <= upper_cis_mean[i], 1, 0)
}
mean_df <- data.frame(
sample_mean = sample_means,
upper_ci = upper_cis_mean,
lower_ci = lower_cis_mean,
contains_true_mean = contains_true_mean
)
mean_plot <- ggplot(mean_df, aes(x = 1:n_samples, y = sample_mean)) +
geom_point() +
geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci, color = factor(contains_true_mean))) +
geom_hline(yintercept = true_mean, linetype = "dashed", color = "red") +
labs(title = "모집단 평균에 대한 신뢰구간",
x = "표본 번호",
y = "표본 평균") +
scale_color_manual(values = c("0" = "gray", "1" = "blue"),
labels = c("포함 안 함", "포함"),
name = "참값 포함") +
theme_minimal()
print(mean_plot)
cat("모집단 평균에 대한 신뢰구간 포함률:", mean(contains_true_mean), "\n")
```




Loading

0 comments on commit 2a4a8b8

Please sign in to comment.