Rcode


# python 설치
## https://www.python.org/downloads/windows/ 
## Windows x86-64 executable installer

# rpy2 설치
## https://www.lfd.uci.edu/~gohlke/pythonlibs/#rpy2
## => rpy2-2.9.5-py3-none-win_amd64.whl 으로 파일명 변경

# visual studio 설치(konlpy 설치 위해 필요)
## https://visualstudio.microsoft.com/ko/downloads/
## Desktop development with C++(C++를 사용한 데스크톱 개발), Python development(Python 개발) 체크 후 설치

## 자바 다운로드
## https://www.java.com/ko/download/

## 파이썬 module 설치를 위한 세팅
# shell("curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py")
# shell("python get-pip.py")
# shell("pip install --upgrade setuptools")

## 다운받은 rpy2 파일을 설치
# shell("pip install rpy2-2.9.5-py3-none-win_amd64.whl")

# ------------------------------------------------------------------------------------------- #

## 필요 패키지 설치
# pkg <- c("reticulate", "data.table", "magrittr", "RcppMeCab", "tidytext", "RmecabKo", "tm", "slam", "stringr")
# if(sum(!(pkg %in% rownames(installed.packages())))>0) {install.packages(setdiff(pkg, rownames(installed.packages())))}

# 텍스트 생성
t_text <- c("테스트입니다.", "어떤가요?")
# t_text <- "한글입니다."

# =========================== #
# 1. Komoran 모듈 사용(Python)
# =========================== #
## 파이썬 모듈 설치(https://konlpy-ko.readthedocs.io/ko/v0.4.3/install)
# shell("pip install nltk --force-reinstall")
# shell("pip install konlpy --force-reinstall")
# shell("pip install JPype1-1.2.0-cp38-cp38-win_amd64.whl --force-reinstall")
library(reticulate)
library(data.table)

# 형태소 분석기 모듈 로드
py_run_string(paste("from konlpy.tag import Komoran;", "komoran = Komoran();"))

# 형태소 분석
if(length(t_text) == 1){
  py_run_string("text_py = komoran.pos(r.t_text);")
}else{
  py_run_string("text_py = list(map(komoran.pos, r.t_text));")
}
# R 객체로 결과값 불러오기
text_r <- py$text_py
if(length(t_text) == 1) text_r <- list(text_r)

# garbage collection
py_run_string("del text_py")
py_gc <- import("gc")
py_gc$collect()

# 결과값 정제(NN:명사, NP:대명사, VV:동사, VA:형용사, SL:외국어, SH:한자, SN:숫자)
## 품사표 참조 : https://komorandocs.readthedocs.io/kr/latest/firststep/postypes.html
kor_morph <- lapply(text_r, function(x){
  o <- sapply(x, "[[", 1)
  names(o) <- sapply(x, "[[", 2)
  o
})

# data.table로 변환
kor_morph <- data.table(doc_id = rep(seq_along(kor_morph), times = sapply(kor_morph, length)),
                        word = unlist(kor_morph), type = names(unlist(kor_morph)))

# =========================== #
# 2. RcppMeCab 패키지 사용(은전한닢 프로젝트)
# =========================== #
library(data.table)
library(magrittr)
library(RcppMeCab)
library(tidytext)

## 최초 1번만 실행하여 mecab 설치 진행
# RmecabKo::install_mecab("C:/mecab")

# set term/doc
text_tab <- data.table(doc_id = seq_along(t_text), txt = enc2utf8(t_text))
kor_morph2 <- suppressWarnings({
  text_tab %>% unnest_tokens(word, txt, token = posParallel, to_lower = F)
})

# 단어/품사 분리
kor_morph2 <- cbind(kor_morph2[,1,with=F],
                    kor_morph2[,tstrsplit(word, "/", names = c("word", "type"))])

# =========================== #
# 부록 : RmecabKo 패키지 사용
# =========================== #
library(RmecabKo)
library(magrittr)

# 명사 추출
t_text %>% enc2native %>% token_nouns
t_text %>% enc2native %>% nouns

# 단어 추출
t_text %>% enc2native %>% token_words

# 형태소 추출
t_text %>% enc2native %>% token_morph
t_text %>% enc2native %>% RmecabKo::pos(join = F)

# =========================== #
# dtm 생성
# =========================== #
library(data.table)
library(magrittr)
library(tm)
library(slam)
# library(stringr)

# kor_morph <- kor_morph2
# kor_morph[,word:=str_to_lower(word)]

# 문서별 단어 카운트
kor_morph <- kor_morph[,.N,by=list(doc_id,word)]

# unique term/doc
term <- kor_morph[,list(word)] %>% unique %>% setorder(word) %>% .[,word]
doc <- kor_morph[,list(doc_id)] %>% unique %>% setorder(doc_id) %>% .[,doc_id]

# make dtm
m <- simple_triplet_matrix(i = chmatch(kor_morph[,word], term),
                           j = match(kor_morph[,doc_id], doc),
                           v = kor_morph[,N],
                           nrow = length(term), ncol = length(doc),
                           dimnames = list(Terms = term, Docs = as.character(doc)))

# weighting options ( weightTf / weightBin / weightTfIdf )
dtm <- t(as.TermDocumentMatrix(m, weighting = weightTf))
# inspect(dtm)