-
Notifications
You must be signed in to change notification settings - Fork 0
/
Rcode
142 lines (112 loc) · 4.51 KB
/
Rcode
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# Install Python
## https://www.python.org/downloads/windows/
## Windows x86-64 executable installer
# Install rpy2
## https://www.lfd.uci.edu/~gohlke/pythonlibs/#rpy2
## => rename the downloaded file to rpy2-2.9.5-py3-none-win_amd64.whl
# Install Visual Studio (required to install konlpy)
## https://visualstudio.microsoft.com/ko/downloads/
## Select "Desktop development with C++" (C++를 사용한 데스크톱 개발) and "Python development" (Python 개발), then install
# Download Java
## https://www.java.com/ko/download/
# Setup for installing Python modules
# shell("curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py")
# shell("python get-pip.py")
# shell("pip install --upgrade setuptools")
## Install the downloaded rpy2 file
# shell("pip install rpy2-2.9.5-py3-none-win_amd64.whl")
# ------------------------------------------------------------------------------------------- #
## Install the required R packages
# pkg <- c("reticulate", "data.table", "magrittr", "RcppMeCab", "tidytext", "RmecabKo", "tm", "slam", "stringr")
# if(sum(!(pkg %in% rownames(installed.packages())))>0) {install.packages(setdiff(pkg, rownames(installed.packages())))}
# Sample input: one character element per document.
# Every section below reads this vector.
t_text <- c(
  "테스트입니다.",
  "어떤가요?"
)
# Single-document alternative:
# t_text <- "한글입니다."
# =========================== #
# 1. Komoran tagger (Python, via konlpy)
# =========================== #
## Install the Python modules (https://konlpy-ko.readthedocs.io/ko/v0.4.3/install)
# shell("pip install nltk --force-reinstall")
# shell("pip install konlpy --force-reinstall")
# shell("pip install JPype1-1.2.0-cp38-cp38-win_amd64.whl --force-reinstall")
library(reticulate)
library(data.table)

# Load the morphological analyzer into the embedded Python session.
py_run_string(paste("from konlpy.tag import Komoran;", "komoran = Komoran();"))

# Run the analysis. A single string is tagged directly; several strings are
# mapped one by one. Either way the Python variable text_py holds the result.
if (length(t_text) == 1) {
  py_run_string("text_py = komoran.pos(r.t_text);")
} else {
  py_run_string("text_py = list(map(komoran.pos, r.t_text));")
}

# Pull the result into R. The single-string result is wrapped so that text_r
# always has one entry per document.
text_r <- py$text_py
if (length(t_text) == 1) text_r <- list(text_r)

# Release the Python-side copy and trigger Python garbage collection.
py_run_string("del text_py")
py_gc <- import("gc")
py_gc$collect()

# Tidy the result
# (NN: noun, NP: pronoun, VV: verb, VA: adjective, SL: foreign word,
#  SH: Chinese character, SN: number)
## POS tag table: https://komorandocs.readthedocs.io/kr/latest/firststep/postypes.html
# Each token is a (word, tag) pair. The two fields are extracted with vapply()
# so they are always character vectors; sapply() returned an empty list for a
# document without tokens, and when no document produced any token the
# word/type columns were silently dropped from the resulting table.
kor_morph <- local({
  pick_field <- function(idx) {
    lapply(text_r, function(doc_tokens) {
      vapply(doc_tokens, function(token) token[[idx]], character(1))
    })
  }
  words <- pick_field(1)
  types <- pick_field(2)
  # One row per token: doc_id is the position of the document in t_text.
  data.table(
    doc_id = rep(seq_along(text_r), times = lengths(words)),
    word = as.character(unlist(words, use.names = FALSE)),
    type = as.character(unlist(types, use.names = FALSE))
  )
})
# =========================== #
# 2. RcppMeCab package (Eunjeon / mecab-ko project)
# =========================== #
library(data.table)
library(magrittr)
library(RcppMeCab)
library(tidytext)
## Run once only, to install mecab
# RmecabKo::install_mecab("C:/mecab")

# One row per document, text converted to UTF-8.
text_tab <- data.table(doc_id = seq_along(t_text), txt = enc2utf8(t_text))

# Tokenize with posParallel; each resulting row holds one "word/TAG" token.
# NOTE(review): warnings are suppressed wholesale here, as in the original;
# confirm which warning this is meant to hide.
kor_morph2 <- suppressWarnings({
  text_tab %>% unnest_tokens(word, txt, token = posParallel, to_lower = FALSE)
})

# Separate word and POS tag.
# The split is done on the LAST slash only: splitting on every "/" breaks as
# soon as a token's surface form itself contains a slash (e.g. "//SC"), because
# the number of pieces then no longer matches the two column names.
kor_morph2 <- local({
  tokens <- as.data.table(kor_morph2)
  tagged <- tokens[["word"]]
  pos_tag <- sub("^.*/", "", tagged)
  # A token without any slash has no tag; keep that as NA.
  pos_tag[!grepl("/", tagged, fixed = TRUE)] <- NA_character_
  cbind(
    tokens[, 1, with = FALSE],
    data.table(word = sub("/[^/]*$", "", tagged), type = pos_tag)
  )
})
# =========================== #
# Appendix: using the RmecabKo package
# =========================== #
library(RmecabKo)
library(magrittr)

# Each call converts the text to the native encoding first and prints its
# result at top level.

# Noun extraction
token_nouns(enc2native(t_text))
nouns(enc2native(t_text))

# Word extraction
token_words(enc2native(t_text))

# Morpheme extraction
token_morph(enc2native(t_text))
RmecabKo::pos(enc2native(t_text), join = FALSE)
# =========================== #
# Build the document-term matrix
# =========================== #
library(data.table)
library(magrittr)
library(tm)
library(slam)
# Optional: switch to the RcppMeCab result and/or lower-case the words first.
# library(stringr)
# kor_morph <- kor_morph2
# kor_morph[,word:=str_to_lower(word)]

# Count how often each word occurs in each document (adds column N and
# replaces kor_morph with the aggregated table).
kor_morph <- kor_morph[, .N, by = .(doc_id, word)]

# Sorted unique terms and documents; their positions become the matrix indices.
term <- setorder(unique(kor_morph[, .(word)]), word)[, word]
doc <- setorder(unique(kor_morph[, .(doc_id)]), doc_id)[, doc_id]

# Sparse term x document matrix: rows are terms, columns are documents,
# values are the per-document counts.
m <- simple_triplet_matrix(
  i = chmatch(kor_morph[["word"]], term),
  j = match(kor_morph[["doc_id"]], doc),
  v = kor_morph[["N"]],
  nrow = length(term),
  ncol = length(doc),
  dimnames = list(Terms = term, Docs = as.character(doc))
)

# Weighting options: weightTf / weightBin / weightTfIdf.
# The matrix is wrapped as a TermDocumentMatrix and transposed so that dtm has
# documents in rows and terms in columns.
dtm <- t(as.TermDocumentMatrix(m, weighting = weightTf))
# inspect(dtm)