-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10 changed files
with
2,938 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,368 @@ | ||
--- | ||
title: "신뢰구간" | ||
author: "이광춘" | ||
date: today | ||
categories: [추론, 신뢰구간] | ||
image: "thumbnail.png" | ||
editor_options: | ||
chunk_output_type: console | ||
--- | ||
|
||
신뢰구간은 통계학에서 모수 추정에 사용되는 중요한 개념입니다. | ||
신뢰구간은 표본에서 계산된 추정량이 실제 모수를 포함할 확률을 나타내는 구간을 의미합니다. 신뢰구간은 모수 추정의 불확실성을 고려하여 모수 추정에 대한 더 정확한 정보를 제공합니다. 대표적으로 모집단 평균과 모집단 비율에 대한 신뢰구간을 계산하는 방법을 시각적으로 확인할 수 있습니다. | ||
|
||
# Shiny 앱 | ||
|
||
:::{.column-page} | ||
|
||
```{shinylive-r} | ||
#| label: shinylive-infer-ci | ||
#| viewerWidth: 800 | ||
#| viewerHeight: 600 | ||
#| standalone: true | ||
library(shiny) | ||
library(ggplot2) | ||
library(tidyverse) | ||
library(showtext) | ||
showtext_auto() | ||
ui <- fluidPage( | ||
title = "신뢰구간 모의실험", | ||
fluidRow( | ||
column(3, | ||
h2('신뢰구간'), | ||
hr(), | ||
radioButtons("case", "평균과 비율 추정 선택:", | ||
choices = c("모집단 평균", "모집단 비율"), | ||
inline = TRUE, | ||
selected = "모집단 평균"), | ||
hr(), | ||
conditionalPanel( | ||
condition = "input.case == '모집단 비율'", | ||
sliderInput("pop_prop", | ||
"모집단 비율:", | ||
min = 0, | ||
max = 1, | ||
step = 0.05, | ||
value = 0.4) | ||
), | ||
conditionalPanel( | ||
condition = "input.case == '모집단 평균'", | ||
numericInput("pop_mean", "모집단 평균:", value = 10), | ||
numericInput("pop_sd", "모집단 표준편차:", value = 5, min = 0) | ||
), | ||
hr(), | ||
sliderInput("sample_size", | ||
"표본 크기:", | ||
min = 10, | ||
max = 1000, | ||
step = 10, | ||
value = 100), | ||
sliderInput("n_samples", | ||
"표본 수:", | ||
min = 0, | ||
max = 500, | ||
step = 10, | ||
value = 10), | ||
sliderInput("conf_level", | ||
"신뢰 수준:", | ||
min = 5, | ||
max = 99, | ||
step = 1, | ||
value = 95) | ||
), | ||
column(8, | ||
mainPanel( | ||
plotOutput('IntervalPlot', height = '500px'), | ||
verbatimTextOutput("summary"), | ||
verbatimTextOutput("ci") | ||
) | ||
) | ||
) | ||
) | ||
server <- function(input, output) { | ||
output$IntervalPlot <- renderPlot({ | ||
n_samples = input$n_samples | ||
sample_size = input$sample_size | ||
alpha = 1 - (input$conf_level / 100) | ||
if (input$case == "모집단 비율") { | ||
n_population <- 100000 | ||
population <- rbernoulli(n_population, p = input$pop_prop) | ||
true_value <- mean(population) | ||
sample_stats <- vector('numeric', n_samples) | ||
upper_cis <- vector('numeric', n_samples) | ||
lower_cis <- vector('numeric', n_samples) | ||
contains_true_value <- vector('numeric', n_samples) | ||
for (i in 1:n_samples) { | ||
sample <- sample(population, sample_size, replace = FALSE) | ||
sample_stat <- mean(sample) | ||
sample_std <- (sqrt(sample_stat * (1 - sample_stat)) / sqrt(sample_size)) | ||
z_value <- qnorm(alpha/2, lower.tail = FALSE) | ||
margin_of_error <- z_value * sample_std | ||
sample_stats[i] <- sample_stat | ||
upper_cis[i] <- sample_stat + margin_of_error | ||
lower_cis[i] <- sample_stat - margin_of_error | ||
contains_true_value[i] <- ifelse(true_value >= lower_cis[i] & true_value <= upper_cis[i], 1, 0) | ||
} | ||
title_text <- paste(input$conf_level, "% 신뢰구간 (모집단 비율)", sep = " ") | ||
y_label <- "표본 비율" | ||
} else if (input$case == "모집단 평균") { | ||
n_population <- 100000 | ||
population <- rnorm(n_population, mean = input$pop_mean, sd = input$pop_sd) | ||
true_value <- mean(population) | ||
sample_stats <- vector('numeric', n_samples) | ||
upper_cis <- vector('numeric', n_samples) | ||
lower_cis <- vector('numeric', n_samples) | ||
contains_true_value <- vector('numeric', n_samples) | ||
for (i in 1:n_samples) { | ||
sample <- sample(population, sample_size, replace = FALSE) | ||
sample_stat <- mean(sample) | ||
sample_std <- sd(sample) / sqrt(sample_size) | ||
t_value <- qt(alpha/2, df = sample_size - 1, lower.tail = FALSE) | ||
margin_of_error <- t_value * sample_std | ||
sample_stats[i] <- sample_stat | ||
upper_cis[i] <- sample_stat + margin_of_error | ||
lower_cis[i] <- sample_stat - margin_of_error | ||
contains_true_value[i] <- ifelse(true_value >= lower_cis[i] & true_value <= upper_cis[i], 1, 0) | ||
} | ||
title_text <- paste(input$conf_level, "% 신뢰구간 (모집단 평균)", sep = " ") | ||
y_label <- "표본 평균" | ||
} | ||
subtitle_text <- paste0('관측된 포함률: ', | ||
round(mean(contains_true_value), 2) * 100, '%') | ||
contains_true_value_f <- ifelse(contains_true_value==1, "포함", "미포함") | ||
contains_true_value_f <- factor(contains_true_value_f) | ||
p <- tibble( | ||
sample_stat = sample_stats, | ||
upper_ci = upper_cis, | ||
lower_ci = lower_cis) %>% | ||
mutate(sample_number = as.factor(row_number())) %>% | ||
ggplot(aes(x = sample_number, y = sample_stat)) + | ||
coord_flip() + | ||
ggtitle(title_text, subtitle = subtitle_text) + | ||
ylab(y_label) + | ||
xlab("") + | ||
theme_minimal() + | ||
theme(axis.text.y = element_blank(), | ||
axis.ticks.y = element_blank(), | ||
axis.line.x = element_line(color="black", size = 0.5), | ||
legend.position = "none") + | ||
scale_x_discrete(breaks = NULL) | ||
p <- p + geom_hline(aes(yintercept = true_value), | ||
linetype = 'dashed', size = 1) | ||
p <- p + | ||
geom_linerange(aes(ymin = lower_ci, ymax = upper_ci, color=contains_true_value_f)) + | ||
geom_point(aes(color=contains_true_value_f), size=2) + | ||
scale_color_manual(values=c("포함" = "#009E73", "미포함" = "#D55E00")) | ||
p | ||
}) | ||
output$summary <- renderPrint({ | ||
n_samples = input$n_samples | ||
sample_size = input$sample_size | ||
if (input$case == "모집단 비율") { | ||
n_population <- 100000 | ||
population <- rbernoulli(n_population, p = input$pop_prop) | ||
sample_stats <- vector('numeric', n_samples) | ||
for (i in 1:n_samples) { | ||
sample <- sample(population, sample_size, replace = FALSE) | ||
sample_stats[i] <- mean(sample) | ||
} | ||
cat("표본 비율의 평균(점추정):\n") | ||
print(mean(sample_stats)) | ||
} else if (input$case == "모집단 평균") { | ||
n_population <- 100000 | ||
population <- rnorm(n_population, mean = input$pop_mean, sd = input$pop_sd) | ||
sample_stats <- vector('numeric', n_samples) | ||
for (i in 1:n_samples) { | ||
sample <- sample(population, sample_size, replace = FALSE) | ||
sample_stats[i] <- mean(sample) | ||
} | ||
cat("표본 평균의 평균(점 추정):\n") | ||
print(mean(sample_stats)) | ||
} | ||
}) | ||
output$ci <- renderPrint({ | ||
n_samples = input$n_samples | ||
sample_size = input$sample_size | ||
alpha = 1 - (input$conf_level / 100) | ||
if (input$case == "모집단 비율") { | ||
n_population <- 100000 | ||
population <- rbernoulli(n_population, p = input$pop_prop) | ||
sample_stats <- vector('numeric', n_samples) | ||
for (i in 1:n_samples) { | ||
sample <- sample(population, sample_size, replace = FALSE) | ||
sample_stats[i] <- mean(sample) | ||
} | ||
se <- sqrt(mean(sample_stats) * (1 - mean(sample_stats)) / sample_size) | ||
lower_ci <- mean(sample_stats) - qnorm(1 - alpha/2) * se | ||
upper_ci <- mean(sample_stats) + qnorm(1 - alpha/2) * se | ||
cat(paste0(input$conf_level, "% 신뢰구간 (모집단 비율):\n")) | ||
cat(paste0("[", round(lower_ci, 2), ", ", round(upper_ci, 2), "]")) | ||
} else if (input$case == "모집단 평균") { | ||
n_population <- 100000 | ||
population <- rnorm(n_population, mean = input$pop_mean, sd = input$pop_sd) | ||
sample_stats <- vector('numeric', n_samples) | ||
for (i in 1:n_samples) { | ||
sample <- sample(population, sample_size, replace = FALSE) | ||
sample_stats[i] <- mean(sample) | ||
} | ||
se <- sd(sample_stats) / sqrt(n_samples) | ||
lower_ci <- mean(sample_stats) - qt(1 - alpha/2, df = n_samples - 1) * se | ||
upper_ci <- mean(sample_stats) + qt(1 - alpha/2, df = n_samples - 1) * se | ||
cat(paste0(input$conf_level, "% 신뢰구간 (모집단 평균):\n")) | ||
cat(paste0("[", round(lower_ci, 2), ", ", round(upper_ci, 2), "]")) | ||
} | ||
}) | ||
} | ||
shinyApp(ui = ui, server = server) | ||
``` | ||
|
||
::: | ||
|
||
|
||
|
||
# 코딩 | ||
|
||
```{webr-r} | ||
library(ggplot2) | ||
# 설정값 지정 | ||
n_samples <- 100 | ||
sample_size <- 50 | ||
conf_level <- 0.95 | ||
# 모집단 비율에 대한 신뢰구간 계산 및 시각화 | ||
pop_prop <- 0.4 | ||
n_population <- 100000 | ||
population <- rbernoulli(n_population, p = pop_prop) | ||
true_prop <- mean(population) | ||
sample_props <- vector('numeric', n_samples) | ||
upper_cis_prop <- vector('numeric', n_samples) | ||
lower_cis_prop <- vector('numeric', n_samples) | ||
contains_true_prop <- vector('numeric', n_samples) | ||
for (i in 1:n_samples) { | ||
sample <- sample(population, sample_size, replace = FALSE) | ||
sample_prop <- mean(sample) | ||
sample_std <- sqrt(sample_prop * (1 - sample_prop) / sample_size) | ||
z_value <- qnorm((1 - conf_level) / 2, lower.tail = FALSE) | ||
margin_of_error <- z_value * sample_std | ||
sample_props[i] <- sample_prop | ||
upper_cis_prop[i] <- sample_prop + margin_of_error | ||
lower_cis_prop[i] <- sample_prop - margin_of_error | ||
contains_true_prop[i] <- ifelse(true_prop >= lower_cis_prop[i] & true_prop <= upper_cis_prop[i], 1, 0) | ||
} | ||
prop_df <- data.frame( | ||
sample_prop = sample_props, | ||
upper_ci = upper_cis_prop, | ||
lower_ci = lower_cis_prop, | ||
contains_true_prop = contains_true_prop | ||
) | ||
prop_plot <- ggplot(prop_df, aes(x = 1:n_samples, y = sample_prop)) + | ||
geom_point() + | ||
geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci, color = factor(contains_true_prop))) + | ||
geom_hline(yintercept = true_prop, linetype = "dashed", color = "red") + | ||
labs(title = "모집단 비율에 대한 신뢰구간", | ||
x = "표본 번호", | ||
y = "표본 비율") + | ||
scale_color_manual(values = c("0" = "gray", "1" = "blue"), | ||
labels = c("포함 안 함", "포함"), | ||
name = "참값 포함 여부") + | ||
theme_minimal() | ||
print(prop_plot) | ||
cat("모집단 비율에 대한 신뢰구간 포함률:", mean(contains_true_prop), "\n") | ||
# 모집단 평균에 대한 신뢰구간 계산 및 시각화 | ||
pop_mean <- 10 | ||
pop_sd <- 2 | ||
n_population <- 100000 | ||
population <- rnorm(n_population, mean = pop_mean, sd = pop_sd) | ||
true_mean <- mean(population) | ||
sample_means <- vector('numeric', n_samples) | ||
upper_cis_mean <- vector('numeric', n_samples) | ||
lower_cis_mean <- vector('numeric', n_samples) | ||
contains_true_mean <- vector('numeric', n_samples) | ||
for (i in 1:n_samples) { | ||
sample <- sample(population, sample_size, replace = FALSE) | ||
sample_mean <- mean(sample) | ||
sample_std <- sd(sample) / sqrt(sample_size) | ||
t_value <- qt((1 - conf_level) / 2, df = sample_size - 1, lower.tail = FALSE) | ||
margin_of_error <- t_value * sample_std | ||
sample_means[i] <- sample_mean | ||
upper_cis_mean[i] <- sample_mean + margin_of_error | ||
lower_cis_mean[i] <- sample_mean - margin_of_error | ||
contains_true_mean[i] <- ifelse(true_mean >= lower_cis_mean[i] & true_mean <= upper_cis_mean[i], 1, 0) | ||
} | ||
mean_df <- data.frame( | ||
sample_mean = sample_means, | ||
upper_ci = upper_cis_mean, | ||
lower_ci = lower_cis_mean, | ||
contains_true_mean = contains_true_mean | ||
) | ||
mean_plot <- ggplot(mean_df, aes(x = 1:n_samples, y = sample_mean)) + | ||
geom_point() + | ||
geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci, color = factor(contains_true_mean))) + | ||
geom_hline(yintercept = true_mean, linetype = "dashed", color = "red") + | ||
labs(title = "모집단 평균에 대한 신뢰구간", | ||
x = "표본 번호", | ||
y = "표본 평균") + | ||
scale_color_manual(values = c("0" = "gray", "1" = "blue"), | ||
labels = c("포함 안 함", "포함"), | ||
name = "참값 포함") + | ||
theme_minimal() | ||
print(mean_plot) | ||
cat("모집단 평균에 대한 신뢰구간 포함률:", mean(contains_true_mean), "\n") | ||
``` | ||
|
||
|
||
|
||
|
Oops, something went wrong.