diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index c2c5297..ecb6469 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -1,8 +1,6 @@ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: - push: - branches: main pull_request: branches: main diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 0b5cf48..f30826c 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -3,8 +3,6 @@ on: push: branches: main - pull_request: - branches: main release: types: [published] workflow_dispatch: diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 0317144..b31d89a 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -1,8 +1,6 @@ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: - push: - branches: main pull_request: branches: main diff --git a/DESCRIPTION b/DESCRIPTION index a0e8ec9..c02d9a4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,10 +1,12 @@ Package: RBigKinds -Title: What the Package Does (One Line, Title Case) -Version: 0.0.1.9000 -Authors@R: - person("First", "Last", , "first.last@example.com", role = c("aut", "cre"), - comment = c(ORCID = "YOUR-ORCID-ID")) -Description: What the package does (one paragraph). +Title: BigKinds Data Analysis Toolkit for R +Version: 0.1.0.9000 +Authors@R: c( + person("Jaeseong", "Choe",, "cjssoote@gmail.com", role = c("aut", "cre")) + ) +Description: RBigKinds is a library for Data Analysis of BigKinds data through R. 
+Author: Jaeseong Choe [aut, cre] +Maintainer: Jaeseong Choe License: MIT + file LICENSE Encoding: UTF-8 Roxygen: list(markdown = TRUE) diff --git a/NAMESPACE b/NAMESPACE index 921bce5..e9465dd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,8 +1,27 @@ # Generated by roxygen2: do not edit by hand -export(counter_to_dataframe) -export(duplication_remover) +export(DBSCAN) +export(Kmeans) +export(MeanShift) +export(association) +export(day_range) export(header_remover) -export(keyword_list) -export(keyword_parser) -export(word_counter) +export(keyword_dataframe) +export(keyword_dataframe_no_duplicated) +export(keywords_wordcloud) +export(lda) +export(normalize_vector) +export(press_counter) +export(tfidf) +export(tfidf_vector) +export(top_words) +export(word_tokenizer) +import(arules) +import(dbscan) +import(dplyr) +import(ggplot2) +import(tibble) +import(tidymodels) +import(tidytext) +import(tm) +import(wordcloud2) diff --git a/R/RBigKinds.R b/R/RBigKinds.R index acdd0c1..0c4e69a 100644 --- a/R/RBigKinds.R +++ b/R/RBigKinds.R @@ -12,12 +12,3 @@ NULL # devtools::use_package("rmarkdown","Suggests") # devtools::use_package("testthat","Suggests") # -# ### Preprocessing -# devtools::use_package("dplyr","Imports") -# devtools::use_package("rvest","Imports") -# -# ### Import -# devtools::use_package("readxl", "Imports") -# -# ### export -# devtools::use_package("writexl",Imports") \ No newline at end of file diff --git a/R/association.R b/R/association.R new file mode 100644 index 0000000..545f3c4 --- /dev/null +++ b/R/association.R @@ -0,0 +1,29 @@ +#' association +#' +#' 기사에 등장한 단어 별로 연관분석을 진행합니다. +#' 연관분석 방법은 Apriori입니다.
+#' +#' @param df BigKinds 원본 문서 +#' @param min_support 최소 지지도 +#' @param minlen 연관된 최소 갯수 +#' @param maxlen 연관된 최대 갯수 +#' +#' @examples +#' association(df, min_support = 0.6, minlen = 3, maxlen = 10) +#' @import arules +#' @export +association <- function(df, min_support = 0.5, minlen=2, maxlen = 10) { + if (is.data.frame(df)) { + words <- word_tokenizer(df) + data <- split(words$키워드, words$제목) + te_data <- as(data, "transactions") + result <- apriori(te_data, parameter = list(supp = min_support, minlen=minlen, maxlen=maxlen, target = "rules")) + result <- as.data.frame(inspect(result)) + result <- result[, c("lhs", "rhs", "support", "confidence")] + colnames(result) <- c("lhs", "rhs", "support", "confidence") + result <- result[result[, "confidence"] > min_support ] + return(result) + } else { + stop("input type is to be have to df") + } +} \ No newline at end of file diff --git a/R/barplot.R b/R/barplot.R new file mode 100644 index 0000000..1485002 --- /dev/null +++ b/R/barplot.R @@ -0,0 +1,32 @@ +#' top_words +#' +#' 언론사 별로 가장 많이 등장한 단어 순위를 시각화합니다. +#' 최대 몇개의 단어를 추출할지는 직접 정할 수 있습니다. +#' default는 25개입니다. 
+#' +#' @param df BigKinds 원본 문서 +#' @param press 확인할 언론사 이름 +#' @param top_n 시각화할 단어 갯수 +#' +#' @examples +#' top_words(df, "경향신문", top_n=30) +#' @import ggplot2 +#' @import dplyr +#' @export +top_words <- function(df, press = NA, top_n = 25) { + if (is.data.frame(df)) { + if (!is.na(press)){ + df <- df |> filter(언론사 == press) + } + data <- keyword_dataframe(df) + data <- head(data[order(data$n, decreasing = TRUE), ], top_n) + ggplot(data, aes(reorder(키워드, n), n)) + + geom_bar(stat = "identity", fill = "steelblue") + + labs(x = "단어", y = "빈도") + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + + ggtitle("사용 단어 빈도 상위", top_n) + + coord_flip() + } else { + stop("input type is to be have to DataFrame") + } +} \ No newline at end of file diff --git a/R/clustering.R b/R/clustering.R new file mode 100644 index 0000000..0b728b8 --- /dev/null +++ b/R/clustering.R @@ -0,0 +1,69 @@ +#' Kmeans +#' +#' kmeans clustering을 진행합니다. +#' +#' @param df BigKinds 원본 문서 +#' @param k 형성할 군집 갯수 +#' @param random_state seed 값 +#' +#' @examples +#' Kmeans(df, k = 3) +#' +#' @export +Kmeans <- function(vec, k, random_state = 123) { + if (is.matrix(vec)) { + set.seed(random_state) + kmeans_model <- kmeans(vec, centers = k, iter.max = 1000) + return(kmeans_model$cluster) + } else { + stop("input type is to be have to matrix") + } +} + +#' DBSCAN +#' +#' DBSCAN 알고리즘을 진행합니다. +#' +#' @param df BigKinds 원본 문서 +#' @param eps epsilon 값(보폭) +#' @param min_sample 최적 샘플 갯수 +#' @param metric 거리 계산 방법(default = euclidean) +#' +#' @examples +#' DBSCAN(vec, eps = 0.5, min_sample = 3) +#' +#' @import dbscan +#' +#' @export +DBSCAN <- function(vec, eps, min_samples, metric = "euclidean") { + if (is.matrix(vec)) { + dbscan_model <- dbscan::dbscan(vec, eps = eps, minPts = min_samples, method = metric) + return(dbscan_model$cluster) + } else { + stop("input type is to be have to matrix") + } +} + +#' MeanShift +#' +#' mean shift clustering을 진행합니다. 
+#' +#' @param vec 군집화할 행렬(matrix) +#' @param qt quantile 값(최적 bandwidth 추정을 위함) +#' +#' @examples +#' MeanShift(vec, qt = 0.5) +#' +#' @export +MeanShift <- function(vec, qt = 0.25) { + if (is.matrix(vec)) { + best_bandwidth <- density(vec)$bw + print(paste(qt, "기준 최적 bandwidth 값:", round(best_bandwidth, 2))) + + ms_model <- meanshift(vec, bandwidth = best_bandwidth) + print(paste("cluster 갯수:", length(unique(ms_model)))) + return(ms_model) + } else { + stop("input type is to be have to matrix") + } +} \ No newline at end of file diff --git a/R/day_range.R b/R/day_range.R new file mode 100644 index 0000000..03ad958 --- /dev/null +++ b/R/day_range.R @@ -0,0 +1,18 @@ +#' day_range +#' +#' 기사 일자의 범위(첫 날짜와 마지막 날짜)를 출력합니다. +#' +#' @param df BigKinds 원본 문서 +#' +#' @examples +#' day_range(df) +#' +#' @export +day_range <- function(df) { + if (is.data.frame(df)) { + print(paste("first day: ", min(df$일자))) + print(paste("last day: ", max(df$일자))) + } else { + stop("input type is to be have to DataFrame") + } +} \ No newline at end of file diff --git a/R/global.R b/R/global.R index f3fd2bb..1f26caf 100644 --- a/R/global.R +++ b/R/global.R @@ -1,123 +1,176 @@ -#' [] 표시된 헤더 삭제 +#' header_remover #' +#' 상단에 존재하는 헤더를 제거합니다. +#' +#' @param df BigKinds 원본 문서 #' -#' @param infile Path to the input file -#' @return A matrix of the infile +#' @examples +#' data <- header_remover(df) +#' head(data) #' @export header_remover <- function(df) { if (is.data.frame(df)) { ans <- gsub("\\[[^)]*\\]", "", df$`제목`) df$`제목` <- ans return(df) - } else if (is.list(df)) { - ans <- gsub("\\[[^)]*\\]", "", df) - return(ans) } else { stop("input value is to be have to list or DataFrame") } } -#' 키워드를 list로 변환 +#' word_tokenizer #' +#' 파일로부터 문서 별 키워드로 나열된 데이터 프레임으로 변환합니다.
+#' +#' @param df BigKinds 원본 문서 #' -#' @param infile Path to the input file -#' @return A matrix of the infile +#' @examples +#' data <- word_tokenizer(df) +#' view(data) +#' @import tm +#' @import tibble +#' @import dplyr #' @export -keyword_list <- function(df) { +word_tokenizer <- function(df) { if (is.data.frame(df)) { - return(df$`키워드`) - } else if (is.list(df)) { - return(df) + df |> + select(`제목`,`키워드`) |> + rowid_to_column() |> + unnest_tokens( + input = "키워드", + output = "키워드" + ) -> keywords + return(keywords) } else { stop("input value is to be have to list or DataFrame") } } -#' [] 키워드 파싱 + +#' keyword_dataframe #' +#' BigKinds 데이터 셋을 키워드 갯수 데이터프레임으로 변환합니다. +#' +#' @param df BigKinds 원본 문서 #' -#' @param infile Path to the input file -#' @return A matrix of the infile +#' @examples +#' data <- keyword_dataframe(df) +#' view(data) +#' @import tm +#' @import tibble +#' @import dplyr #' @export -keyword_parser <- function(text_list) { - if (is.list(text_list)) { - news_key <- list() - for (word in text_list) { - if (is.character(word)) { - word <- strsplit(word, ",")[[1]] - news_key <- c(news_key, list(word)) - } else { - stop("input list is not valid format") - } - } - return(news_key) +keyword_dataframe <- function(df) { + if (is.data.frame(df)) { + data <- word_tokenizer(df) + data |> + group_by(키워드) |> + tally() |> + arrange(desc(n)) |> + as_tibble() -> keywords + return(keywords) } else { - stop("input type is to be have to list") + stop("input type is to be have to DataFrame") } } -#' 중복 값 제거 +#' keyword_dataframe_no_duplicated #' +#' BigKinds 데이터 셋을 키워드 갯수 데이터프레임(중복 미포함)으로 변환합니다. 
+#' +#' @param df BigKinds 원본 문서 #' -#' @param infile Path to the input file -#' @return A matrix of the infile +#' @examples +#' data <- keyword_dataframe_no_duplicated(df) +#' view(data) +#' @import tm +#' @import tibble +#' @import dplyr #' @export -duplication_remover <- function(news_key) { - if (is.list(news_key)) { - news_value <- list() - for (j in news_key) { - if (is.list(j)) { - j <- unique(j) - news_value <- c(news_value, list(j)) - } else { - stop("input list is not valid format") - } - } - return(news_value) +keyword_dataframe_no_duplicated <- function(df) { + if (is.data.frame(df)) { + data <- word_tokenizer(df) + + keywords_no_duplicated <- data[!duplicated(data[,c(2,3)]),] + + keywords_no_duplicated |> + group_by(키워드) |> + tally() |> + arrange(desc(n)) |> + as_tibble() -> return_keywords + return(return_keywords) } else { - stop("input type is to be have to list") + stop("input type is to be have to DataFrame") } } -#' 단어 갯수 카운트 +#' tfidf #' +#' 키워드의 tfidf score를 포함한 데이터 프레임을 반환합니다. +#' +#' @param df BigKinds 원본 문서 #' -#' @param infile Path to the input file -#' @return A matrix of the infile +#' @examples +#' data <- tfidf(df) +#' view(data) +#' @import tm +#' @import tibble +#' @import dplyr +#' @import tidytext #' @export -word_counter <- function(news_value) { - if (is.list(news_value)) { - key_words <- list() - for (k in seq_along(news_value)) { - for (i in news_value[[k]]) { - if (!(i %in% names(key_words))) { - key_words[[i]] <- 1 - } else { - key_words[[i]] <- key_words[[i]] + 1 - } - } - } - return(key_words) +tfidf <- function(df) { + if (is.data.frame(df)) { + data <- word_tokenizer(df) + data |> + bind_tf_idf(term = `키워드`, document = `제목`, n = rowid) -> tfidf + return(tfidf) } else { - stop("input type is to be have to list") + stop("input type is to be have to DataFrame") } } +#' tfidf_vector +#' +#' tfidf vector로 변환합니다. 
+#' +#' @param df BigKinds 원본 문서 +#' +#' @examples +#' data <- tfidf_vector(df) +#' view(data) +#' @import tm +#' @import tibble +#' @import dplyr +#' @import tidytext +#' @export +tfidf_vector <- function(df) { + if (is.data.frame(df)) { + data <- word_tokenizer(df) + + dtm <- DocumentTermMatrix(Corpus(VectorSource(data$키워드))) + tdm <- weightTfIdf(dtm) + vec <- as.matrix(tdm) + return(vec) + } else { + stop("input type is to be have to DataFrame") + } +} -#' counter dict --> dataframe +#' normalize_vector +#' +#' 벡터를 정규화합니다.(row 기준 minmax scaling) #' +#' @param vec tfidf vector #' -#' @param infile Path to the input file -#' @return A matrix of the infile #' @export -counter_to_dataframe <- function(key_words) { - if (is.list(key_words)) { - word_df <- data.frame(matrix(unlist(key_words), ncol = 2, byrow = TRUE)) - colnames(word_df) <- c("단어", "빈도") - word_df <- word_df[order(word_df$`빈도`, decreasing = TRUE), , drop = FALSE, ] - rownames(word_df) <- NULL - return(word_df) +normalize_vector <- function(vec) { + if (is.matrix(vec)) { + vec_nor <- t(normalize(t(vec))) + return(vec_nor) } else { - stop("input type is to be have to dict") + stop("input type is to be have to matrix") } } +normalize <- function(x, na.rm = TRUE) { + return((x- min(x)) /(max(x)-min(x))) +} diff --git a/R/preprocessing.R b/R/preprocessing.R deleted file mode 100644 index 68165c7..0000000 --- a/R/preprocessing.R +++ /dev/null @@ -1,67 +0,0 @@ -source(global.R) - -keyword_dataframe <- function(df) { - if (is.data.frame(df)) { - lis <- keyword_list(df) - keywords <- keyword_parser(lis) - counter <- word_counter(keywords) - df <- counter_to_dataframe(counter) - return(df) - } else { - stop("input type is to be have to DataFrame") - } -} - -keyword_dataframe_no_duplicated <- function(df) { - if (is.data.frame(df)) { - lis <- keyword_list(df) - keywords <- keyword_parser(lis) - keywords_set <- duplication_remover(keywords) - counter <- word_counter(keywords_set) - df <- 
counter_to_dataframe(counter) - return(df) - } else { - stop("input type is to be have to DataFrame") - } -} - -tfidf <- function(df, ...) { - if (is.data.frame(df)) { - if (length(...) > 0 && is.character(...)) { - df <- df[, ...] - } - lis <- keyword_list(df) - tfidfv <- DocumentTermMatrix(Corpus(VectorSource(lis)), control = list(weighting = weightTfIdf)) - word_count <- data.frame( - 단어 = colnames(tfidfv), - 빈도 = colSums(as.matrix(tfidfv)) - ) %>% - arrange(desc(빈도)) %>% - mutate(index = row_number()) %>% - select(-index) - return(word_count) - } else { - stop("input type is to be have to DataFrame") - } -} - -tfidf_vector <- function(df) { - if (is.data.frame(df)) { - lis <- keyword_list(df) - dtm <- DocumentTermMatrix(Corpus(VectorSource(lis))) - tdm <- weightTfIdf(dtm) - vec <- as.matrix(tdm) - return(vec) - } else { - stop("input type is to be have to DataFrame") - } -} - -normalize_vector <- function(vec) { - if (is.matrix(vec)) { - vec_nor <- t(normalize(t(vec))) - return(vec_nor) - } else { - stop("input type is to be have to matrix") - } -} diff --git a/R/press_counter.R b/R/press_counter.R new file mode 100644 index 0000000..8cfa1d3 --- /dev/null +++ b/R/press_counter.R @@ -0,0 +1,20 @@ +#' press_counter +#' +#' 언론사 별 기사의 갯수를 반환합니다. 
+#' +#' @param df BigKinds 원본 문서 +#' +#' @examples +#' press_counter(df) +#' +#' @export +press_counter <- function(df) { + if (is.data.frame(df)) { + freq <- table(df$언론사) + brod_df <- data.frame(언론사 = names(freq), 기사 = as.numeric(freq)) + return(brod_df) + } else { + stop("input type is to be have to DataFrame") + } +} + diff --git a/R/representation.R b/R/representation.R deleted file mode 100644 index 095f19d..0000000 --- a/R/representation.R +++ /dev/null @@ -1,130 +0,0 @@ -library(tidyverse) -library(arules) -library(proxy) - -source(global.R) - -day_range <- function(df) { - if (is.data.frame(df)) { - print(paste("first day: ", min(df$일자))) - print(paste("last day: ", max(df$일자))) - } else { - stop("input type is to be have to DataFrame") - } -} - -press_counter <- function(df) { - if (is.data.frame(df)) { - freq <- table(df$언론사) - brod_df <- data.frame(언론사 = names(freq), 기사 = as.numeric(freq)) - return(brod_df) - } else { - stop("input type is to be have to DataFrame") - } -} - -pca <- function(vec, Random_State = 123) { - if (is.matrix(vec)) { - pca_df <- prcomp(vec, center = TRUE)$x[, 1:2] - pca_df <- data.frame(`component 0` = pca_df[, 1], `component 1` = pca_df[, 2]) - return(pca_df) - } else { - stop("input type is to be have to matrix") - } -} - -nmf <- function(vec, Random_State = 123) { - if (is.matrix(vec)) { - nmf_df <- NMF::nmf(vec, 2, seed = Random_State)$W - nmf_df <- data.frame(`component 0` = nmf_df[, 1], `component 1` = nmf_df[, 2]) - return(nmf_df) - } else { - stop("input type is to be have to matrix") - } -} - -t_sne <- function(vec, learn_Rate = 100) { - if (is.matrix(vec)) { - tsne_df <- Rtsne::Rtsne(vec, dims = 2, perplexity = learn_Rate)$Y - tsne_df <- data.frame(`component 0` = tsne_df[, 1], `component 1` = tsne_df[, 2]) - return(tsne_df) - } else { - stop("input type is to be have to matrix") - } -} - -lsa <- function(vec) { - if (is.matrix(vec)) { - svd_df <- svd(vec)$u[, 1:2] - svd_df <- data.frame(`component 0` = svd_df[, 1], 
`component 1` = svd_df[, 2]) - return(svd_df) - } else { - stop("input type is to be have to matrix") - } -} - -kmeans <- function(vec, k, random_state = 123) { - if (is.matrix(vec)) { - set.seed(random_state) - kmeans_model <- kmeans(vec, centers = k, iter.max = 1000) - return(kmeans_model$cluster) - } else { - stop("input type is to be have to matrix") - } -} - -dbscan <- function(vec, eps, min_samples, metric = "euclidean") { - if (is.matrix(vec)) { - dbscan_model <- dbscan::dbscan(vec, eps = eps, minPts = min_samples, method = metric) - return(dbscan_model$cluster) - } else { - stop("input type is to be have to matrix") - } -} - -meanshift <- function(vec, qt = 0.25) { - if (is.matrix(vec)) { - best_bandwidth <- density(vec)$bw - print(paste(qt, "기준 최적 bandwidth 값:", round(best_bandwidth, 2))) - - ms_model <- meanshift(vec, bandwidth = best_bandwidth) - print(paste("cluster 갯수:", length(unique(ms_model)))) - return(ms_model) - } else { - stop("input type is to be have to matrix") - } -} - -lda <- function(dataframe, k = 10, train = 100, fit = 10) { - if (is.data.frame(dataframe)) { - lis <- keyword_parser(keyword_list(dataframe)) - model <- LDA(lis, k = k) - - for (words in lis) { - model$add.documents(words) - } - - for (i in seq(0, train, fit)) { - model$train(fit) - } - - return(model) - } else { - stop("input type is to be have to DataFrame") - } -} - -association <- function(dataframe, min_support = 0.5, use_colnames = TRUE, min_threshold = 0.1, metric = "confidence") { - if (is.data.frame(dataframe)) { - words <- keyword_parser(keyword_list(dataframe)) - te_data <- as(words, "transactions") - result <- apriori(te_data, parameter = list(supp = min_support, minlen = 2, maxlen = Inf, target = "rules")) - result <- as.data.frame(inspect(result)) - result <- result[, c("lhs", "rhs", "support", metric)] - colnames(result) <- c("lhs", "rhs", "support", metric) - result <- result[result[, metric] > min_threshold, ] - return(result) - } else { - stop("input type 
is to be have to DataFrame") - } -} diff --git a/R/topic_model.R b/R/topic_model.R new file mode 100644 index 0000000..075bd7f --- /dev/null +++ b/R/topic_model.R @@ -0,0 +1,27 @@ +#' lda +#' +#' 토픽 모델링을 시행합니다. +#' +#' @param df BigKinds 원본 문서 +#' @param k 토픽 개수 +#' +#' @examples +#' lda(df, k = 10) +#' +#' @import tidymodels +#' @import dplyr +#' @import tm +#' @export +lda <- function(dataframe, k = 10) { + if (is.data.frame(dataframe)) { + data <- word_tokenizer(dataframe) + data <- data |> + count(키워드, 제목) |> + cast_dtm(키워드, 제목, n) + model <- LDA(data, k = k) + + return(model) + } else { + stop("input type is to be have to DataFrame") + } +} diff --git a/R/visualization.R b/R/visualization.R deleted file mode 100644 index 0f51c8b..0000000 --- a/R/visualization.R +++ /dev/null @@ -1,58 +0,0 @@ -library(wordcloud) -library(ggplot2) - -keywords_wordcloud <- function(df, press) { - if (is.data.frame(df)) { - df_keywords <- df[df$언론사 == press, ] - keywords <- keyword_list(df_keywords) - news_key <- keyword_parser(keywords) - news_key <- duplication_remover(news_key) - key <- word_counter(news_key) - news_key <- counter_to_dataframe(key) - wc <- wordcloud::wordcloud( - words = news_key$단어, - freq = news_key$빈도, - scale = c(3, 0.5), - min.freq = 1, - max.words = 200, - random.order = FALSE, - rot.per = 0.35, - colors = brewer.pal(8, "Dark2") - ) - print(wc) - } else { - stop("input type is to be have to DataFrame") - } -} - -top_words <- function(df, press, top_n = 25) { - if (is.data.frame(df)) { - df_keywords <- df[grepl(press, df$언론사), ] - keywords <- keyword_list(df_keywords) - news_key <- keyword_parser(keywords) - news_key <- duplication_remover(news_key) - key <- word_counter(news_key) - news_key <- counter_to_dataframe(key) - data <- head(news_key[order(news_key$빈도, decreasing = TRUE), ], top_n) - ggplot(data, aes(reorder(단어, -빈도), 빈도)) + - geom_bar(stat = "identity", fill = "steelblue") + - labs(x = "단어", y = "빈도") + - theme(axis.text.x = element_text(angle = 
45, hjust = 1)) + - ggtitle("언론사 별 사용 단어 빈도 상위", top_n) + - coord_flip() - } else { - stop("input type is to be have to DataFrame") - } -} - -scatterplot <- function(df, label) { - if (is.data.frame(df)) { - ggplot(df, aes(component.0, component.1, color = label)) + - geom_point() + - labs(x = "component 0", y = "component 1") + - ggtitle("Scatter plot for dimension reduction") + - theme(legend.position = "bottom") - } else { - stop("input type is to be have to DataFrame") - } -} diff --git a/R/wordclund.R b/R/wordclund.R new file mode 100644 index 0000000..9204a40 --- /dev/null +++ b/R/wordclund.R @@ -0,0 +1,24 @@ +#' keywords_wordcloud +#' +#' 언론사 별로 가장 많이 등장한 단어 순위를 wordcloud로 시각화합니다. +#' +#' @param df BigKinds 원본 문서 +#' @param press 확인할 언론사 이름 +#' +#' @examples +#' keywords_wordcloud(df, "조선일보") +#' @import wordcloud2 +#' @import dplyr +#' @export +keywords_wordcloud <- function(df, press=NA) { + if (is.data.frame(df)) { + if (!is.na(press)){ + df <- df |> filter(언론사 == press) + } + words <- keyword_dataframe(df) + wordcloud2(words) + } else { + stop("input type is to be have to DataFrame") + } +} + diff --git a/man/DBSCAN.Rd b/man/DBSCAN.Rd new file mode 100644 index 0000000..eb78cb7 --- /dev/null +++ b/man/DBSCAN.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clustering.R +\name{DBSCAN} +\alias{DBSCAN} +\title{DBSCAN} +\usage{ +DBSCAN(vec, eps, min_samples, metric = "euclidean") +} +\arguments{ +\item{eps}{epsilon 값(보폭)} + +\item{metric}{거리 계산 방법(default = euclidean)} + +\item{df}{BigKinds 원본 문서} + +\item{min_sample}{최적 샘플 갯수} +} +\description{ +DBSCAN 알고리즘을 진행합니다. 
+} +\examples{ +DBSCAN(vec, eps = 0.5, min_sample = 3) + +} diff --git a/man/Kmeans.Rd b/man/Kmeans.Rd new file mode 100644 index 0000000..4b8627b --- /dev/null +++ b/man/Kmeans.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clustering.R +\name{Kmeans} +\alias{Kmeans} +\title{Kmeans} +\usage{ +Kmeans(vec, k, random_state = 123) +} +\arguments{ +\item{k}{형성할 군집 갯수} + +\item{random_state}{seed 값} + +\item{df}{BigKinds 원본 문서} +} +\description{ +kmeans clustering을 진행합니다. +} +\examples{ +Kmeans(df, k = 3) + +} diff --git a/man/MeanShift.Rd b/man/MeanShift.Rd new file mode 100644 index 0000000..d81e157 --- /dev/null +++ b/man/MeanShift.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clustering.R +\name{MeanShift} +\alias{MeanShift} +\title{MeanShift} +\usage{ +MeanShift(vec, qt = 0.25) +} +\arguments{ +\item{qt}{quantile 값(최적 bandwidth 추정을 위함)} + +\item{df}{BigKinds 원본 문서} +} +\description{ +mean shift clustering을 진행합니다. +} +\examples{ +MeanShift(vec, qt = 0.5) + +} diff --git a/man/association.Rd b/man/association.Rd new file mode 100644 index 0000000..5ca3240 --- /dev/null +++ b/man/association.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/association.R +\name{association} +\alias{association} +\title{association} +\usage{ +association(df, min_support = 0.5, minlen = 2, maxlen = 10) +} +\arguments{ +\item{df}{BigKinds 원본 문서} + +\item{min_support}{최소 지지도} + +\item{minlen}{연관된 최소 갯수} + +\item{maxlen}{연관된 최대 갯수} +} +\description{ +기사에 등장한 단어 별로 연관분석을 진행합니다. +연관분석 방법은 Apriori입니다.
+} +\examples{ +association(df, min_support = 0.6, minlen = 3, maxlen = 10) +} diff --git a/man/counter_to_dataframe.Rd b/man/counter_to_dataframe.Rd deleted file mode 100644 index 1ea98fa..0000000 --- a/man/counter_to_dataframe.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/base.R -\name{counter_to_dataframe} -\alias{counter_to_dataframe} -\title{counter dict --> dataframe} -\usage{ -counter_to_dataframe(key_words) -} -\arguments{ -\item{infile}{Path to the input file} -} -\value{ -A matrix of the infile -} -\description{ -counter dict --> dataframe -} diff --git a/man/day_range.Rd b/man/day_range.Rd new file mode 100644 index 0000000..0409eb7 --- /dev/null +++ b/man/day_range.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/day_range.R +\name{day_range} +\alias{day_range} +\title{day_range} +\usage{ +day_range(df) +} +\arguments{ +\item{df}{BigKinds 원본 문서} +} +\description{ +기사 일자의 범위(첫 날짜와 마지막 날짜)를 출력합니다.
+} +\examples{ +day_range(df) + +} diff --git a/man/duplication_remover.Rd b/man/duplication_remover.Rd deleted file mode 100644 index b58d02a..0000000 --- a/man/duplication_remover.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/base.R -\name{duplication_remover} -\alias{duplication_remover} -\title{중복 값 제거} -\usage{ -duplication_remover(news_key) -} -\arguments{ -\item{infile}{Path to the input file} -} -\value{ -A matrix of the infile -} -\description{ -중복 값 제거 -} diff --git a/man/header_remover.Rd b/man/header_remover.Rd index 489af16..d3b7bed 100644 --- a/man/header_remover.Rd +++ b/man/header_remover.Rd @@ -1,17 +1,18 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/base.R +% Please edit documentation in R/global.R \name{header_remover} \alias{header_remover} -\title{[] 표시된 헤더 삭제} +\title{header_remover} \usage{ header_remover(df) } \arguments{ -\item{infile}{Path to the input file} -} -\value{ -A matrix of the infile +\item{df}{BigKinds 원본 문서} } \description{ -[] 표시된 헤더 삭제 +상단에 존재하는 헤더를 제거합니다. +} +\examples{ +data <- header_remover(df) +head(data) } diff --git a/man/keyword_dataframe.Rd b/man/keyword_dataframe.Rd new file mode 100644 index 0000000..a30cb01 --- /dev/null +++ b/man/keyword_dataframe.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/global.R +\name{keyword_dataframe} +\alias{keyword_dataframe} +\title{keyword_dataframe} +\usage{ +keyword_dataframe(df) +} +\arguments{ +\item{df}{BigKinds 원본 문서} +} +\description{ +BigKinds 데이터 셋을 키워드 갯수 데이터프레임으로 변환합니다. 
+} +\examples{ +data <- keyword_dataframe(df) +view(data) +} diff --git a/man/keyword_dataframe_no_duplicated.Rd b/man/keyword_dataframe_no_duplicated.Rd new file mode 100644 index 0000000..43b5d13 --- /dev/null +++ b/man/keyword_dataframe_no_duplicated.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/global.R +\name{keyword_dataframe_no_duplicated} +\alias{keyword_dataframe_no_duplicated} +\title{keyword_dataframe_no_duplicated} +\usage{ +keyword_dataframe_no_duplicated(df) +} +\arguments{ +\item{df}{BigKinds 원본 문서} +} +\description{ +BigKinds 데이터 셋을 키워드 갯수 데이터프레임(중복 미포함)으로 변환합니다. +} +\examples{ +data <- keyword_dataframe_no_duplicated(df) +view(data) +} diff --git a/man/keyword_list.Rd b/man/keyword_list.Rd deleted file mode 100644 index 241e534..0000000 --- a/man/keyword_list.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/base.R -\name{keyword_list} -\alias{keyword_list} -\title{키워드를 list로 변환} -\usage{ -keyword_list(df) -} -\arguments{ -\item{infile}{Path to the input file} -} -\value{ -A matrix of the infile -} -\description{ -키워드를 list로 변환 -} diff --git a/man/keyword_parser.Rd b/man/keyword_parser.Rd deleted file mode 100644 index 1c75b7f..0000000 --- a/man/keyword_parser.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/base.R -\name{keyword_parser} -\alias{keyword_parser} -\title{[] 키워드 파싱} -\usage{ -keyword_parser(text_list) -} -\arguments{ -\item{infile}{Path to the input file} -} -\value{ -A matrix of the infile -} -\description{ -[] 키워드 파싱 -} diff --git a/man/keywords_wordcloud.Rd b/man/keywords_wordcloud.Rd new file mode 100644 index 0000000..b7886cc --- /dev/null +++ b/man/keywords_wordcloud.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/wordclund.R +\name{keywords_wordcloud} +\alias{keywords_wordcloud} 
+\title{keywords_wordcloud} +\usage{ +keywords_wordcloud(df, press = NA) +} +\arguments{ +\item{df}{BigKinds 원본 문서} + +\item{press}{확인할 언론사 이름} +} +\description{ +언론사 별로 가장 많이 등장한 단어 순위를 wordcloud로 시각화합니다. +} +\examples{ +keywords_wordcloud(df, "조선일보") +} diff --git a/man/lda.Rd b/man/lda.Rd new file mode 100644 index 0000000..8c91d94 --- /dev/null +++ b/man/lda.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/topic_model.R +\name{lda} +\alias{lda} +\title{lda} +\usage{ +lda(dataframe, k = 10) +} +\arguments{ +\item{k}{토픽 개수} + +\item{df}{BigKinds 원본 문서} +} +\description{ +토픽 모델링을 시행합니다. +} +\examples{ +lda(df, k = 10) + +} diff --git a/man/normalize_vector.Rd b/man/normalize_vector.Rd new file mode 100644 index 0000000..5e50ab5 --- /dev/null +++ b/man/normalize_vector.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/global.R +\name{normalize_vector} +\alias{normalize_vector} +\title{normalize_vector} +\usage{ +normalize_vector(vec) +} +\arguments{ +\item{vec}{tfidf vector} +} +\description{ +벡터를 정규화합니다.(row 기준 minmax scaling) +} diff --git a/man/press_counter.Rd b/man/press_counter.Rd new file mode 100644 index 0000000..f236414 --- /dev/null +++ b/man/press_counter.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/press_counter.R +\name{press_counter} +\alias{press_counter} +\title{press_counter} +\usage{ +press_counter(df) +} +\arguments{ +\item{df}{BigKinds 원본 문서} +} +\description{ +언론사 별 기사의 갯수를 반환합니다. 
+} +\examples{ +press_counter(df) + +} diff --git a/man/tfidf.Rd b/man/tfidf.Rd new file mode 100644 index 0000000..15d6895 --- /dev/null +++ b/man/tfidf.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/global.R +\name{tfidf} +\alias{tfidf} +\title{tfidf} +\usage{ +tfidf(df) +} +\arguments{ +\item{df}{BigKinds 원본 문서} +} +\description{ +키워드의 tfidf score를 포함한 데이터 프레임을 반환합니다. +} +\examples{ +data <- tfidf(df) +view(data) +} diff --git a/man/tfidf_vector.Rd b/man/tfidf_vector.Rd new file mode 100644 index 0000000..a7408c7 --- /dev/null +++ b/man/tfidf_vector.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/global.R +\name{tfidf_vector} +\alias{tfidf_vector} +\title{tfidf_vector} +\usage{ +tfidf_vector(df) +} +\arguments{ +\item{df}{BigKinds 원본 문서} +} +\description{ +tfidf vector로 변환합니다. +} +\examples{ +data <- tfidf_vector(df) +view(data) +} diff --git a/man/top_words.Rd b/man/top_words.Rd new file mode 100644 index 0000000..685fcd5 --- /dev/null +++ b/man/top_words.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/barplot.R +\name{top_words} +\alias{top_words} +\title{top_words} +\usage{ +top_words(df, press = NA, top_n = 25) +} +\arguments{ +\item{df}{BigKinds 원본 문서} + +\item{press}{확인할 언론사 이름} + +\item{top_n}{시각화할 단어 갯수} +} +\description{ +언론사 별로 가장 많이 등장한 단어 순위를 시각화합니다. +최대 몇개의 단어를 추출할지는 직접 정할 수 있습니다. +default는 25개입니다. 
+} +\examples{ +top_words(df, "경향신문", top_n=30) +} diff --git a/man/word_counter.Rd b/man/word_counter.Rd deleted file mode 100644 index 19361f6..0000000 --- a/man/word_counter.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/base.R -\name{word_counter} -\alias{word_counter} -\title{단어 갯수 카운트} -\usage{ -word_counter(news_value) -} -\arguments{ -\item{infile}{Path to the input file} -} -\value{ -A matrix of the infile -} -\description{ -단어 갯수 카운트 -} diff --git a/man/word_tokenizer.Rd b/man/word_tokenizer.Rd new file mode 100644 index 0000000..43cbedd --- /dev/null +++ b/man/word_tokenizer.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/global.R +\name{word_tokenizer} +\alias{word_tokenizer} +\title{word_tokenizer} +\usage{ +word_tokenizer(df) +} +\arguments{ +\item{df}{BigKinds 원본 문서} +} +\description{ +파일로부터 문서 별 키워드로 나열된 데이터 프레임으로 변환합니다. +} +\examples{ +data <- word_tokenizer(df) +view(data) +}