sorrychoe · sorrychoe · Aug 13, 2023 · Aug 3, 2023 · Aug 3, 2023 · Aug 6, 2023
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -1,8 +1,6 @@
 # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 on:
-  push:
-    branches: main
   pull_request:
     branches: main
 

diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml
@@ -3,8 +3,6 @@
 on:
   push:
     branches: main
-  pull_request:
-    branches: main
   release:
     types: [published]
   workflow_dispatch:

diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml
@@ -1,8 +1,6 @@
 # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 on:
-  push:
-    branches: main
   pull_request:
     branches: main
 

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,10 +1,12 @@
 Package: RBigKinds
-Title: What the Package Does (One Line, Title Case)
-Version: 0.0.1.9000
-Authors@R: 
-    person("First", "Last", , "first.last@example.com", role = c("aut", "cre"),
-           comment = c(ORCID = "YOUR-ORCID-ID"))
-Description: What the package does (one paragraph).
+Title: BigKinds Data Analysis Toolkit for R
+Version: 0.1.0.9000
+Authors@R: c(
+  person("Jaeseong", "Choe",, "cjssoote@gmail.com", role = c("aut", "cre"))
+  )
+Description: RBigKinds is a library for Data Analysis of BigKinds data through R.
+Author: Jaeseong Choe [aut, cre]
+Maintainer: Jaeseong Choe <cjssoote@gmail.com>
 License: MIT + file LICENSE
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,8 +1,27 @@
 # Generated by roxygen2: do not edit by hand
 
-export(counter_to_dataframe)
-export(duplication_remover)
+export(DBSCAN)
+export(Kmeans)
+export(MeanShift)
+export(association)
+export(day_range)
 export(header_remover)
-export(keyword_list)
-export(keyword_parser)
-export(word_counter)
+export(keyword_dataframe)
+export(keyword_dataframe_no_duplicated)
+export(keywords_wordcloud)
+export(lda)
+export(normalize_vector)
+export(press_counter)
+export(tfidf)
+export(tfidf_vector)
+export(top_words)
+export(word_tokenizer)
+import(arules)
+import(dbscan)
+import(dplyr)
+import(ggplot2)
+import(tibble)
+import(tidymodels)
+import(tidytext)
+import(tm)
+import(wordcloud2)
diff --git a/R/RBigKinds.R b/R/RBigKinds.R
@@ -12,12 +12,3 @@ NULL
 # devtools::use_package("rmarkdown","Suggests")
 # devtools::use_package("testthat","Suggests")
 #
-# ### Preprocessing
-# devtools::use_package("dplyr","Imports")
-# devtools::use_package("rvest","Imports")
-#
-# ### Import
-# devtools::use_package("readxl", "Imports")
-#
-# ### export
-# devtools::use_package("writexl",Imports")
diff --git a/R/association.R b/R/association.R
@@ -0,0 +1,29 @@
+#' Association-rule mining over article keywords
+#'
+#' Tokenizes the articles with `word_tokenizer()`, groups the resulting
+#' keywords by article title into transactions, and mines association rules
+#' with the Apriori algorithm.
+#'
+#' @param df A data frame of raw BigKinds articles.
+#' @param min_support Minimum support handed to Apriori. Rules are also kept
+#'   only when their confidence is strictly greater than this value.
+#' @param minlen Minimum number of items in a rule.
+#' @param maxlen Maximum number of items in a rule.
+#'
+#' @return A data frame with the columns `lhs`, `rhs`, `support` and
+#'   `confidence`, one row per retained rule (zero rows when no rule
+#'   qualifies).
+#'
+#' @examples
+#' \dontrun{
+#' association(df, min_support = 0.6, minlen = 3, maxlen = 10)
+#' }
+#' @import arules
+#' @export
+association <- function(df, min_support = 0.5, minlen = 2, maxlen = 10) {
+  if (!is.data.frame(df)) {
+    stop("input type is to be have to df")
+  }
+
+  words <- word_tokenizer(df)
+
+  # One transaction per article title. Repeated keywords inside a title are
+  # collapsed because a transaction is a set of distinct items.
+  baskets <- lapply(split(words$키워드, words$제목), unique)
+  transactions <- as(baskets, "transactions")
+
+  rules <- apriori(
+    transactions,
+    parameter = list(
+      support = min_support,
+      minlen = minlen,
+      maxlen = maxlen,
+      target = "rules"
+    )
+  )
+
+  # Build the table from the rule set itself rather than from inspect(),
+  # which prints every rule and yields nothing usable for an empty rule set.
+  measures <- quality(rules)
+  result <- data.frame(
+    lhs = labels(lhs(rules)),
+    rhs = labels(rhs(rules)),
+    support = measures$support,
+    confidence = measures$confidence,
+    stringsAsFactors = FALSE
+  )
+
+  # Row filter (note the comma): without it the logical vector would be
+  # used to pick columns.
+  # NOTE(review): the confidence cut-off reuses min_support; confirm whether
+  # a separate confidence threshold was intended.
+  result[result$confidence > min_support, , drop = FALSE]
+}
diff --git a/R/barplot.R b/R/barplot.R
@@ -0,0 +1,32 @@
+#' Bar chart of the most frequent keywords
+#'
+#' Counts keywords with `keyword_dataframe()` and draws a horizontal bar
+#' chart of the `top_n` most frequent ones, optionally restricted to the
+#' articles of selected press outlets.
+#'
+#' @param df A data frame of raw BigKinds articles.
+#' @param press Name(s) of the press outlet(s) to keep, matched against the
+#'   `언론사` column. `NA` (the default) keeps every outlet.
+#' @param top_n Number of keywords to plot. Defaults to 25.
+#'
+#' @return A ggplot object.
+#'
+#' @examples
+#' \dontrun{
+#' top_words(df, "경향신문", top_n = 30)
+#' }
+#' @import ggplot2
+#' @import dplyr
+#' @export
+top_words <- function(df, press = NA, top_n = 25) {
+  if (!is.data.frame(df)) {
+    stop("input type is to be have to DataFrame")
+  }
+
+  # NA (the default) or NULL means "no filtering"; a character vector may
+  # name one or several outlets.
+  if (!all(is.na(press))) {
+    # `!!press` forces the function argument even when df happens to have a
+    # column that is itself called `press`.
+    df <- filter(df, 언론사 %in% !!press)
+  }
+
+  # The code below relies on keyword_dataframe() returning the columns
+  # `키워드` and `n`.
+  counts <- keyword_dataframe(df)
+  top <- head(counts[order(counts$n, decreasing = TRUE), ], top_n)
+
+  ggplot(top, aes(reorder(키워드, n), n)) +
+    geom_bar(stat = "identity", fill = "steelblue") +
+    labs(x = "단어", y = "빈도") +
+    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
+    # ggtitle()'s second positional argument is the subtitle, so the count
+    # has to be pasted into the title text itself.
+    ggtitle(paste("사용 단어 빈도 상위", top_n)) +
+    coord_flip()
+}
diff --git a/R/clustering.R b/R/clustering.R
@@ -0,0 +1,69 @@
+#' K-means clustering
+#'
+#' Seeds the random number generator with `random_state`, runs
+#' `kmeans()` with at most 1000 iterations, and returns the cluster
+#' assignment of every row.
+#'
+#' @param vec A numeric matrix with one observation per row.
+#' @param k Number of clusters to form.
+#' @param random_state Seed passed to `set.seed()` before clustering.
+#'
+#' @return An integer vector of cluster labels, one per row of `vec`.
+#'
+#' @examples
+#' \dontrun{
+#' Kmeans(vec, k = 3)
+#' }
+#'
+#' @export
+Kmeans <- function(vec, k, random_state = 123) {
+  if (!is.matrix(vec)) {
+    stop("input type is to be have to matrix")
+  }
+
+  # Fixing the seed makes the random initial centers reproducible.
+  set.seed(random_state)
+  fit <- kmeans(vec, centers = k, iter.max = 1000)
+  fit[["cluster"]]
+}
+
+#' DBSCAN clustering
+#'
+#' Runs `dbscan::dbscan()` on the rows of a matrix and returns the cluster
+#' assignment of every row.
+#'
+#' @param vec A numeric matrix with one observation per row.
+#' @param eps Neighbourhood radius (epsilon).
+#' @param min_samples Minimum number of points required to form a dense
+#'   region; forwarded as `minPts`.
+#' @param metric Distance measure. `"euclidean"` (the default) clusters
+#'   `vec` directly; any other value is passed to `dist()` as `method` and
+#'   the clustering runs on the resulting distance object.
+#'
+#' @return An integer vector of cluster labels, one per row of `vec`.
+#'
+#' @examples
+#' \dontrun{
+#' DBSCAN(vec, eps = 0.5, min_samples = 3)
+#' }
+#'
+#' @import dbscan
+#'
+#' @export
+DBSCAN <- function(vec, eps, min_samples, metric = "euclidean") {
+  if (!is.matrix(vec)) {
+    stop("input type is to be have to matrix")
+  }
+
+  # dbscan::dbscan() has no `method` argument and rejects unknown names in
+  # `...`. Raw data is always clustered with Euclidean distance, so any
+  # other metric has to be supplied as a precomputed dist object.
+  if (identical(metric, "euclidean")) {
+    input <- vec
+  } else {
+    input <- dist(vec, method = metric)
+  }
+
+  fit <- dbscan::dbscan(input, eps = eps, minPts = min_samples)
+  fit$cluster
+}
+
+#' Mean-shift clustering
+#'
+#' Takes the bandwidth reported by `density(vec)`, prints it, runs
+#' `meanshift()` with that bandwidth, prints the number of distinct values
+#' in the result, and returns the result.
+#'
+#' @param vec A numeric matrix.
+#' @param qt Quantile label. It is only echoed in the printed bandwidth
+#'   message; the bandwidth estimate itself does not use it.
+#'
+#' @return Whatever `meanshift()` returns for `vec`.
+#'
+#' @examples
+#' \dontrun{
+#' MeanShift(vec, qt = 0.5)
+#' }
+#'
+#' @export
+MeanShift <- function(vec, qt = 0.25) {
+  if (!is.matrix(vec)) {
+    stop("input type is to be have to matrix")
+  }
+
+  bw <- density(vec)[["bw"]]
+  print(paste(qt, "기준 최적 bandwidth 값:", round(bw, 2)))
+
+  # NOTE(review): `meanshift()` is not defined in this file and no
+  # `@import` here provides it -- confirm which package is meant to supply it.
+  clusters <- meanshift(vec, bandwidth = bw)
+  print(paste("cluster 갯수:", length(unique(clusters))))
+  clusters
+}
diff --git a/R/day_range.R b/R/day_range.R
@@ -0,0 +1,18 @@
+#' Print the date range of the articles
+#'
+#' Prints the smallest and the largest value of the `일자` (date) column.
+#'
+#' @param df A data frame of raw BigKinds articles.
+#'
+#' @return Invisibly, the "last day" string that was printed.
+#'
+#' @examples
+#' \dontrun{
+#' day_range(df)
+#' }
+#'
+#' @export
+day_range <- function(df) {
+  if (!is.data.frame(df)) {
+    stop("input type is to be have to DataFrame")
+  }
+
+  dates <- df$일자
+  print(paste("first day: ", min(dates)))
+  print(paste("last day: ", max(dates)))
+}