Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

set R function #1

Merged
merged 9 commits into from
Aug 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
branches: main
pull_request:
branches: main

Expand Down
2 changes: 0 additions & 2 deletions .github/workflows/pkgdown.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
on:
push:
branches: main
pull_request:
branches: main
release:
types: [published]
workflow_dispatch:
Expand Down
2 changes: 0 additions & 2 deletions .github/workflows/test-coverage.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
branches: main
pull_request:
branches: main

Expand Down
14 changes: 8 additions & 6 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
Package: RBigKinds
Title: What the Package Does (One Line, Title Case)
Version: 0.0.1.9000
Authors@R:
person("First", "Last", , "first.last@example.com", role = c("aut", "cre"),
comment = c(ORCID = "YOUR-ORCID-ID"))
Description: What the package does (one paragraph).
Title: BigKinds Data Analysis Toolkit for R
Version: 0.1.0.9000
Authors@R: c(
person("Jaeseong", "Choe",, "cjssoote@gmail.com", role = c("aut", "cre"))
)
Description: RBigKinds is a library for Data Analysis of BigKinds data through R.
Author: Jaeseong Choe [aut, cre]
Maintainer: Jaeseong Choe <cjssoote@gmail.com>
License: MIT + file LICENSE
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
Expand Down
29 changes: 24 additions & 5 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,8 +1,27 @@
# Generated by roxygen2: do not edit by hand

export(counter_to_dataframe)
export(duplication_remover)
export(DBSCAN)
export(Kmeans)
export(MeanShift)
export(association)
export(day_range)
export(header_remover)
export(keyword_list)
export(keyword_parser)
export(word_counter)
export(keyword_dataframe)
export(keyword_dataframe_no_duplicated)
export(keywords_wordcloud)
export(lda)
export(normalize_vector)
export(press_counter)
export(tfidf)
export(tfidf_vector)
export(top_words)
export(word_tokenizer)
import(arules)
import(dbscan)
import(dplyr)
import(ggplot2)
import(tibble)
import(tidymodels)
import(tidytext)
import(tm)
import(wordcloud2)
9 changes: 0 additions & 9 deletions R/RBigKinds.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,3 @@ NULL
# devtools::use_package("rmarkdown","Suggests")
# devtools::use_package("testthat","Suggests")
#
# ### Preprocessing
# devtools::use_package("dplyr","Imports")
# devtools::use_package("rvest","Imports")
#
# ### Import
# devtools::use_package("readxl", "Imports")
#
# ### export
# devtools::use_package("writexl",Imports")
29 changes: 29 additions & 0 deletions R/association.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#' association
#'
#' Runs association-rule mining over the words that appear in each article.
#' Rules are mined with the Apriori algorithm.
#'
#' @param df A data frame of raw BigKinds articles.
#' @param min_support Minimum support passed to Apriori. The same value is
#'   also used as the (exclusive) lower bound on rule confidence when the
#'   result is filtered.
#' @param minlen Minimum number of items in a rule.
#' @param maxlen Maximum number of items in a rule.
#'
#' @return A data frame with one row per rule and the columns `lhs`, `rhs`,
#'   `support` and `confidence`. It has zero rows when no rule qualifies.
#'
#' @examples
#' \dontrun{
#' association(df, min_support = 0.6, minlen = 3, maxlen = 10)
#' }
#' @import arules
#' @export
association <- function(df, min_support = 0.5, minlen = 2, maxlen = 10) {
  if (!is.data.frame(df)) {
    stop("input type is to be have to df")
  }

  words <- word_tokenizer(df)

  # One basket of keywords per article title. A transaction is a set, so
  # repeated keywords are dropped before coercion.
  baskets <- lapply(split(words$키워드, words$제목), unique)
  te_data <- as(baskets, "transactions")

  rules <- apriori(
    te_data,
    parameter = list(
      supp = min_support,
      minlen = minlen,
      maxlen = maxlen,
      target = "rules"
    )
  )

  # Build the table from the rule object itself rather than from inspect(),
  # which is a printing function and writes every rule to the console.
  measures <- quality(rules)
  result <- data.frame(
    lhs = labels(lhs(rules)),
    rhs = labels(rhs(rules)),
    support = measures$support,
    confidence = measures$confidence,
    stringsAsFactors = FALSE
  )

  # Keep only rules whose confidence exceeds the threshold. The trailing
  # comma matters: without it the logical vector would index columns.
  result[result$confidence > min_support, , drop = FALSE]
}
32 changes: 32 additions & 0 deletions R/barplot.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#' top_words
#'
#' Plots the most frequent words as a horizontal bar chart, optionally
#' restricted to the articles of a single press outlet. The number of words
#' shown is configurable and defaults to 25.
#'
#' @param df A data frame of raw BigKinds articles.
#' @param press Name of the press outlet to restrict to. `NA` (the default)
#'   keeps every article.
#' @param top_n Number of words to plot.
#'
#' @return A ggplot object.
#'
#' @examples
#' \dontrun{
#' top_words(df, "경향신문", top_n = 30)
#' }
#' @import ggplot2
#' @import dplyr
#' @export
top_words <- function(df, press = NA, top_n = 25) {
  if (!is.data.frame(df)) {
    stop("input type is to be have to DataFrame")
  }

  if (!is.na(press)) {
    # `!!press` injects the argument's value so that a data column that
    # happens to be called `press` cannot shadow it inside filter().
    df <- dplyr::filter(df, .data[["언론사"]] == !!press)
  }

  counts <- keyword_dataframe(df)
  counts <- head(counts[order(counts$n, decreasing = TRUE), ], top_n)

  ggplot(
    counts,
    aes(
      x = stats::reorder(.data[["키워드"]], .data[["n"]]),
      y = .data[["n"]]
    )
  ) +
    geom_bar(stat = "identity", fill = "steelblue") +
    labs(x = "단어", y = "빈도") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    # ggtitle()'s second argument is the subtitle, so the count has to be
    # pasted into the label to become part of the title.
    ggtitle(paste("사용 단어 빈도 상위", top_n)) +
    coord_flip()
}
69 changes: 69 additions & 0 deletions R/clustering.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#' Kmeans
#'
#' Runs k-means clustering on a matrix and returns the cluster assignment of
#' each row.
#'
#' @param vec A matrix with one observation per row.
#' @param k Number of clusters to form.
#' @param random_state Seed handed to `set.seed()` before clustering.
#'
#' @return The `cluster` component of the fitted `kmeans` object: one cluster
#'   label per row of `vec`.
#'
#' @examples
#' m <- matrix(c(0, 0.1, 10, 10.1), ncol = 1)
#' Kmeans(m, k = 2)
#'
#' @export
Kmeans <- function(vec, k, random_state = 123) {
  # Guard clause: only matrices are accepted.
  if (!is.matrix(vec)) {
    stop("input type is to be have to matrix")
  }

  # Seed the RNG so the random initial centres are reproducible.
  set.seed(random_state)
  fit <- stats::kmeans(vec, centers = k, iter.max = 1000)
  fit$cluster
}

#' DBSCAN
#'
#' Runs the DBSCAN clustering algorithm on a matrix and returns the cluster
#' assignment of each row.
#'
#' @param vec A matrix with one observation per row.
#' @param eps Epsilon, the neighbourhood radius.
#' @param min_samples Minimum number of points required to form a dense
#'   region (passed to `dbscan::dbscan()` as `minPts`).
#' @param metric Distance measure. `"euclidean"` (the default) clusters the
#'   raw matrix directly; any other value is passed to `stats::dist()` as its
#'   `method` and the resulting distance object is clustered.
#'
#' @return The `cluster` component of the fitted model: one cluster label per
#'   row of `vec`.
#'
#' @examples
#' \dontrun{
#' DBSCAN(vec, eps = 0.5, min_samples = 3)
#' }
#'
#' @import dbscan
#'
#' @export
DBSCAN <- function(vec, eps, min_samples, metric = "euclidean") {
  if (!is.matrix(vec)) {
    stop("input type is to be have to matrix")
  }

  # dbscan::dbscan() does not take a `method` argument. It clusters raw data
  # with Euclidean distance; other metrics are supported by handing it a
  # precomputed `dist` object instead.
  input <- if (identical(metric, "euclidean")) {
    vec
  } else {
    stats::dist(vec, method = metric)
  }

  dbscan_model <- dbscan::dbscan(input, eps = eps, minPts = min_samples)
  dbscan_model$cluster
}

#' MeanShift
#'
#' Runs mean shift clustering on a matrix. The kernel bandwidth is estimated
#' from the data: for every row, the Euclidean distance to its k-th nearest
#' row (the row itself counts as the first) is taken, with
#' `k = max(1, floor(nrow(vec) * qt))`, and these distances are averaged.
#'
#' @param vec A matrix with one observation per row.
#' @param qt Quantile in `[0, 1]` that controls how many neighbours are used
#'   for the bandwidth estimate. Larger values give a larger bandwidth.
#'
#' @return Whatever `meanshift()` returns for the estimated bandwidth. The
#'   code treats it as a vector of cluster labels when counting clusters.
#'
#' @examples
#' \dontrun{
#' MeanShift(vec, qt = 0.5)
#' }
#'
#' @export
MeanShift <- function(vec, qt = 0.25) {
  if (!is.matrix(vec)) {
    stop("input type is to be have to matrix")
  }
  if (!is.numeric(qt) || length(qt) != 1L || is.na(qt) || qt < 0 || qt > 1) {
    stop("qt must be a single number between 0 and 1")
  }
  n_points <- nrow(vec)
  if (n_points < 1L) {
    stop("vec must have at least one row")
  }

  # Quantile-based bandwidth: mean distance to the k-th nearest neighbour.
  # The diagonal zero (distance to itself) is deliberately kept in each row
  # so that k = 1 refers to the point itself.
  k <- max(1L, as.integer(n_points * qt))
  pairwise <- as.matrix(stats::dist(vec))
  kth_nearest <- vapply(
    seq_len(n_points),
    function(i) sort(pairwise[i, ])[k],
    numeric(1)
  )
  best_bandwidth <- mean(kth_nearest)
  print(paste(qt, "기준 최적 bandwidth 값:", round(best_bandwidth, 2)))

  # NOTE(review): `meanshift()` is not defined or imported in the visible
  # part of this package -- confirm it is provided elsewhere.
  ms_model <- meanshift(vec, bandwidth = best_bandwidth)
  print(paste("cluster 갯수:", length(unique(ms_model))))
  ms_model
}
18 changes: 18 additions & 0 deletions R/day_range.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#' day_range
#'
#' Prints the smallest and largest values found in the `일자` (date) column
#' of a BigKinds data frame.
#'
#' @param df A data frame of raw BigKinds articles with a `일자` column.
#'
#' @return Called for its printed output; the last printed string is returned
#'   invisibly.
#'
#' @examples
#' \dontrun{
#' day_range(df)
#' }
#'
#' @export
day_range <- function(df) {
  # Guard clause: only data frames are accepted.
  if (!is.data.frame(df)) {
    stop("input type is to be have to DataFrame")
  }

  first_line <- paste("first day: ", min(df$일자))
  print(first_line)

  last_line <- paste("last day: ", max(df$일자))
  print(last_line)
}
Loading