added 표본추출 시각화

bit2r · May 12, 2024 · 2e57aa1 · 2e57aa1
1 parent 547a3b6
commit 2e57aa1
Show file tree

Hide file tree

Showing 16 changed files with 2,831 additions and 2,066 deletions.
diff --git a/01_data/dist_continuous/index.qmd b/01_data/dist_continuous/index.qmd
@@ -3,7 +3,7 @@ title: "연속형 분포"
 author: "이광춘"
 date: today
 image: thumbnail.png
-categories: [news]
+categories: ["연속형 분포", "확률분포", "균등분포", "정규분포", "지수분포", "감마분포", "베타분포"]
 editor_options: 
   chunk_output_type: console
 ---
@@ -14,13 +14,6 @@ editor_options:
 
 2. **코딩**: 분석 대상 데이터가 어떤 분포를 따르는지를 이해하고, 이를 그래픽으로 표현하여 데이터의 분포 형태와 경향을 확인할 수 있습니다. `continuous_dist_summary` 함수는 주어진 분포와 매개변수를 기반으로 요약통계량과 검정통계량, 히스토그램, 확률밀도함수(PDF)와 누적분포함수(CDF)를 계산하고 시각화합니다.
 
-### 주요 기능:
-- **분포의 PDF 계산**: 함수는 입력받은 분포 타입과 매개변수를 사용하여 해당 분포의 확률밀도함수를 계산합니다.
-- **분포의 CDF 계산**: 동일한 매개변수를 사용하여 누적분포함수도 계산됩니다.
-- **시각화**: 계산된 PDF와 CDF를 그래프로 그려, 분석 결과를 시각적으로 표현합니다. 이를 통해 데이터의 전반적인 분포와 변화를 쉽게 파악할 수 있습니다.
-
-이러한 기능들을 통해 `continuous_dist_summary` 함수는 데이터 과학자나 연구자가 이론적 분포를 실제 데이터에 적용해보고, 그 적합성을 평가하는 데 매우 유용합니다.
-
 
 # Shiny 앱
 

diff --git a/01_data/dist_discrete/index.qmd b/01_data/dist_discrete/index.qmd
@@ -3,7 +3,7 @@ title: "이산형 분포"
 author: "이광춘"
 date: today
 image: thumbnail.png
-categories: [news]
+categories: ["확률분포", "이산형 분포", "베르누이 분포", "이항 분포", "포아송 분포"]
 editor_options: 
   chunk_output_type: console
 ---

diff --git a/01_data/sampling_concept/index.qmd b/01_data/sampling_concept/index.qmd
@@ -0,0 +1,267 @@
+---
+title: "표본추출 개념 시각화"
+author: "이광춘"
+date: today
+image: thumbnail.png
+categories: ["표본", "표본추출", "단순임의추출", "층화추출", "군집추출", "다단계추출", "시각화"]
+editor_options: 
+  chunk_output_type: console
+---
+
+
+# Shiny 앱
+
+:::{.column-page}
+
+```{shinylive-r}
+#| label: shinylive-sampling
+#| viewerWidth: 800
+#| viewerHeight: 700
+#| standalone: true
+
+library(shiny)
+library(openintro)
+data(COL)
+
+# _____ Simple Random _____ #
+#' Draw a simple-random-sampling illustration.
+#'
+#' Scatters `N` population points uniformly over the plotting region and
+#' highlights `n` of them chosen without replacement.
+#'
+#' @param n Number of points to highlight. Capped at `N`, in the same way the
+#'   other builders cap their per-group sample size.
+#' @param N Population size (number of points drawn).
+build_srs <- function(n, N) {
+  colSamp <- COL[4]
+
+  plot(0, xlim = c(0, 2), ylim = 0:1, type = "n", axes = FALSE,
+       xlab = "", ylab = "")
+  box()
+  x <- runif(N, 0, 2)
+  y <- runif(N)
+  # Every population point uses the first palette entry and a solid dot.
+  points(x, y, col = COL[1], pch = 20)
+
+  # Capping the request keeps sampling without replacement from failing when
+  # more points are asked for than exist.
+  these <- sample.int(N, min(n, N))
+  # A small filled dot plus a larger open circle marks each sampled unit.
+  points(x[these], y[these], pch = 20, cex = 0.8, col = colSamp)
+  points(x[these], y[these], cex = 1.4, col = colSamp)
+}
+
+
+# _____ Stratified _____ #
+#' Draw a stratified-sampling illustration.
+#'
+#' Splits a population of `N` points into `numStrata` circular strata of
+#' random relative size, draws every point, and highlights a random sample
+#' inside each stratum.
+#'
+#' @param N Total population size across all strata.
+#' @param numStrata Number of strata (circles) to draw.
+#' @param sampleSizePerStratum Points to highlight per stratum; capped at the
+#'   stratum's own size.
+build_stratified <- function(N, numStrata, sampleSizePerStratum) {
+  colSamp <- COL[4]
+  plot(0, xlim = c(0, 2), ylim = 0:1 + 0.01,
+       type = "n", axes = FALSE, xlab = "", ylab = "")
+  box()
+  # seq(..., length.out = 1) collapses to the left edge, where the circle
+  # would be clipped, so a single stratum is centred instead.
+  X <- if (numStrata == 1) 1 else seq(0.1, 1.9, length.out = numStrata)
+  Y <- rep(0.5, numStrata)
+
+  # Each stratum randomly gets one of two relative weights (a quarter or
+  # three quarters of the even share); the weights are then rescaled to N.
+  evenShare <- ceiling(N / numStrata)
+  weights <- sample(evenShare * c(0.25, 0.75), numStrata, replace = TRUE)
+  strataSizes <- round(weights / sum(weights) * N)
+  # Rounding can leave the total slightly off N; the largest stratum absorbs
+  # the difference so the drawn population is exactly N.
+  largest <- which.max(strataSizes)
+  strataSizes[largest] <- strataSizes[largest] + (N - sum(strataSizes))
+
+  # Circle radius grows with the square root of the stratum size, so the
+  # circle's area is proportional to the number of points in it.
+  R <- sqrt(strataSizes / 500)
+  outline <- seq(0, 2 * pi, length.out = 99)
+  for (i in seq_len(numStrata)) {
+    size <- strataSizes[i]
+    polygon(X[i] + (R[i] + 0.01) * cos(outline),
+            Y[i] + (R[i] + 0.01) * sin(outline),
+            border = COL[5, 4])
+
+    # Uniform points inside the disc: taking the square root of a uniform
+    # draw for the radius compensates for area growing with radius squared.
+    radius <- R[i] * sqrt(runif(size))
+    angle <- runif(size, 0, 2 * pi)
+    x <- X[i] + radius * cos(angle)
+    y <- Y[i] + radius * sin(angle)
+    points(x, y, pch = 20, col = COL[1])
+
+    these <- sample.int(size, min(sampleSizePerStratum, size))
+    points(x[these], y[these],
+           pch = 20, cex = 0.8, col = colSamp)
+    points(x[these], y[these], cex = 1.4, col = colSamp)
+  }
+  # Labels sit on top of each circle (pos = 3 places text above the point).
+  text(X, Y + R,
+       paste("계층", seq_len(numStrata)),
+       pos = 3,
+       cex = 1.3)
+}
+
+# _____ Cluster _____ #
+#' Draw a cluster-sampling illustration.
+#'
+#' Draws `numClusters` circular clusters at random heights, fills each with
+#' its points, and highlights every point of the clusters listed in
+#' `selectedClusters`.
+#'
+#' @param numClusters Number of clusters (circles) to draw.
+#' @param clusterSizes Number of points in each cluster, one entry per
+#'   cluster.
+#' @param selectedClusters Indices of the clusters taken into the sample.
+build_cluster <- function(numClusters, clusterSizes, selectedClusters) {
+  colSamp <- COL[4]
+  plot(0, xlim = c(0, 2), ylim = c(0.01, 1.04), type = "n", axes = FALSE,
+       xlab = "", ylab = "")
+  box()
+  # seq(..., length.out = 1) collapses to the left edge, where the circle
+  # would be clipped, so a single cluster is centred instead.
+  X <- if (numClusters == 1) 1 else seq(0.1, 1.9, length.out = numClusters)
+  Y <- runif(numClusters, 0.2, 0.8)
+  # Radius scales with the square root of the size, so area tracks the
+  # number of points.
+  R <- sqrt(clusterSizes / 500)
+  # Label goes above circles in the upper half and below those in the lower.
+  above <- ifelse(Y > 0.5, 1, -1)
+  outline <- seq(0, 2 * pi, length.out = 99)
+  for (i in seq_len(numClusters)) {
+    size <- clusterSizes[i]
+    chosen <- i %in% selectedClusters
+    ringX <- X[i] + (R[i] + 0.02) * cos(outline)
+    ringY <- Y[i] + (R[i] + 0.02) * sin(outline)
+    polygon(ringX, ringY, border = COL[5, 4])
+    if (chosen) {
+      # A dashed overlay marks the cluster as selected.
+      polygon(ringX, ringY, border = COL[4], lty = 2, lwd = 1.5)
+    }
+
+    # Uniform points inside the disc: taking the square root of a uniform
+    # draw for the radius compensates for area growing with radius squared.
+    radius <- R[i] * sqrt(runif(size))
+    angle <- runif(size, 0, 2 * pi)
+    x <- X[i] + radius * cos(angle)
+    y <- Y[i] + radius * sin(angle)
+    points(x, y, pch = 20, col = COL[1])
+    if (chosen) {
+      # In cluster sampling every unit of a selected cluster is observed.
+      points(x, y, pch = 20, cex = 0.8, col = colSamp)
+      points(x, y, cex = 1.4, col = colSamp)
+    }
+  }
+  text(X, Y + above * (R + 0.01),
+       paste("군집", seq_len(numClusters)),
+       pos = 2 + above,
+       cex = 1.3)
+}
+
+# _____ Multistage Sampling _____ #
+#' Draw a multistage-sampling illustration.
+#'
+#' Draws `numClusters` circular clusters at random heights, fills each with
+#' its points, and highlights a random sample of points inside every cluster.
+#'
+#' @param numClusters Number of clusters (circles) to draw.
+#' @param sampleSizePerCluster Points to highlight per cluster; capped at the
+#'   cluster's own size.
+#' @param clusterSizes Number of points in each cluster, one entry per
+#'   cluster.
+build_multistage <- function(numClusters, sampleSizePerCluster, clusterSizes) {
+  colSamp <- COL[4]
+  plot(0, xlim = c(0, 2), ylim = c(0.01, 1.04), type = "n", axes = FALSE,
+       xlab = "", ylab = "")
+  box()
+  # seq(..., length.out = 1) collapses to the left edge, where the circle
+  # would be clipped, so a single cluster is centred instead.
+  X <- if (numClusters == 1) 1 else seq(0.1, 1.9, length.out = numClusters)
+  Y <- runif(numClusters, 0.2, 0.8)
+  # Radius scales with the square root of the size, so area tracks the
+  # number of points.
+  R <- sqrt(clusterSizes / 500)
+  # Label goes above circles in the upper half and below those in the lower.
+  above <- ifelse(Y > 0.5, 1, -1)
+  outline <- seq(0, 2 * pi, length.out = 99)
+  for (i in seq_len(numClusters)) {
+    size <- clusterSizes[i]
+    polygon(X[i] + (R[i] + 0.02) * cos(outline),
+            Y[i] + (R[i] + 0.02) * sin(outline),
+            border = COL[5, 4])
+
+    # Uniform points inside the disc: taking the square root of a uniform
+    # draw for the radius compensates for area growing with radius squared.
+    radius <- R[i] * sqrt(runif(size))
+    angle <- runif(size, 0, 2 * pi)
+    x <- X[i] + radius * cos(angle)
+    y <- Y[i] + radius * sin(angle)
+    points(x, y, pch = 20, col = COL[1])
+
+    these <- sample.int(size, min(sampleSizePerCluster, size))
+    points(x[these], y[these], pch = 20, cex = 0.8, col = colSamp)
+    points(x[these], y[these], cex = 1.4, col = colSamp)
+  }
+  text(X, Y + above * (R + 0.01),
+       paste("군집", seq_len(numClusters)),
+       pos = 2 + above, cex = 1.3)
+}
+
+# ui -----
+# Page layout: the sidebar holds the method selector plus the controls that
+# apply to the chosen method; the main panel shows the resulting plot.
+ui <- fluidPage(
+  titlePanel(title = "표본 추출 방법 시각화"),
+  sidebarLayout(
+    sidebarPanel(
+      radioButtons(
+        inputId = "method",
+        label = "표본 추출 방법 선택:",
+        choices = c(
+          "단순 무작위 표본 추출" = "srs",
+          "층화 표본 추출" = "stratified",
+          "군집 표본 추출" = "cluster",
+          "다단계 표본 추출" = "multistage"
+        ),
+        inline = TRUE
+      ),
+      # Each conditional panel is shown only while its method is selected.
+      conditionalPanel(
+        condition = "input.method == 'srs'",
+        sliderInput(
+          inputId = "sampleSize", label = "표본 크기:",
+          min = 1, max = 100, value = 10
+        )
+      ),
+      conditionalPanel(
+        condition = "input.method == 'stratified'",
+        sliderInput(
+          inputId = "numStrata", label = "층(Stratum) 수:",
+          min = 1, max = 5, value = 3
+        ),
+        sliderInput(
+          inputId = "sampleSizePerStratum", label = "층 내부 표본 크기:",
+          min = 1, max = 10, value = 3
+        )
+      ),
+      conditionalPanel(
+        condition = "input.method == 'cluster'",
+        sliderInput(
+          inputId = "numClusters", label = "군집 수:",
+          min = 1, max = 9, value = 3
+        )
+      ),
+      conditionalPanel(
+        condition = "input.method == 'multistage'",
+        sliderInput(
+          inputId = "numClustersMultistage", label = "군집 수:",
+          min = 1, max = 9, value = 3
+        ),
+        sliderInput(
+          inputId = "sampleSizePerClusterMultistage",
+          label = "군집 내부 표본 크기:",
+          min = 1, max = 10, value = 3
+        )
+      )
+    ),
+    mainPanel(
+      plotOutput(outputId = "samplingPlot")
+    )
+  )
+)
+
+#' Shiny server function: redraws the plot for the selected sampling method.
+#'
+#' @param input Shiny input object carrying the method choice and slider
+#'   values.
+#' @param output Shiny output object; `samplingPlot` is assigned here.
+server <- function(input, output) {
+  # Random cluster sizes: a lower bound in 5..30, an upper bound at least 5
+  # above it and at most 50, then one size per cluster drawn in between.
+  draw_cluster_sizes <- function(k) {
+    sizeMin <- sample(5:30, 1)
+    sizeMax <- sample((sizeMin + 5):50, 1)
+    sample(sizeMin:sizeMax, k, replace = TRUE)
+  }
+
+  output$samplingPlot <- renderPlot({
+    switch(input$method,
+      srs = build_srs(n = input$sampleSize, N = 100),
+      stratified = build_stratified(
+        N = 100,
+        numStrata = input$numStrata,
+        sampleSizePerStratum = input$sampleSizePerStratum
+      ),
+      cluster = {
+        k <- input$numClusters
+        clusterSizes <- draw_cluster_sizes(k)
+        # About a third of the clusters are sampled, but never fewer than
+        # one: round(1 / 3) is 0, which would leave nothing highlighted.
+        selectedClusters <- sample.int(k, max(1, round(k / 3)))
+        build_cluster(
+          numClusters = k,
+          clusterSizes = clusterSizes,
+          selectedClusters = selectedClusters
+        )
+      },
+      multistage = {
+        k <- input$numClustersMultistage
+        build_multistage(
+          numClusters = k,
+          sampleSizePerCluster = input$sampleSizePerClusterMultistage,
+          clusterSizes = draw_cluster_sizes(k)
+        )
+      }
+    )
+  })
+}
+
+# Launch the app with the UI and server defined in this chunk.
+shinyApp(ui = ui, server = server)
+
+
+```
+
+:::
+
+# 코딩
+
+
+```{webr-r}
+
+```
+
+
+