-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawling_all.R
69 lines (58 loc) · 2.37 KB
/
crawling_all.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# 0. Package installation ----------------------------------------------
# Install pacman only when it is missing. requireNamespace() checks for the
# package without attaching it; every later use is namespace-qualified.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}

# p_load() attaches each listed package (the script relies on pacman to
# install any that are missing).
pacman::p_load(
  tidyverse, rvest, magrittr
)
# 1.1. Save and build the search links --------------------------------------
# Base search URL: run the search with the desired query and date range, then
# copy the whole link. It ends in "&start=" so a result offset can be appended.
url <- "https://search.naver.com/search.naver?where=news&sm=tab_pge&query=%EC%9D%B4%ED%95%B4%EC%B6%A9%EB%8F%8C%EB%B0%A9%EC%A7%80&sort=2&photo=3&field=0&pd=3&ds=2011.01.01&de=2021.04.29&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:from20110101to20210429,a:all&start="

# Result offsets in steps of 10 (set `to` to the number of the very last page
# of the search results).
PAGE <- seq(from = 1, to = 3141, by = 10)

# paste0() is vectorized over PAGE, so the full list of search-page URLs is
# built in one call instead of growing a vector inside a loop.
naver_url_list <- paste0(url, PAGE)
naver_url_list
# 1.3. Extract related article URLs from the search pages ------------------
# Read one search-result page and return the distinct href values of the
# anchors nested under #main_pack > .group_news > .news_info.
collect_page_links <- function(search_page_url) {
  page <- read_html(search_page_url)
  main_pack <- html_nodes(page, "#main_pack") # "#" selects by id
  news_groups <- html_nodes(main_pack, css = ".group_news") # "." selects by class
  info_boxes <- html_nodes(news_groups, css = ".news_info")
  anchors <- html_nodes(info_boxes, "a")
  unique(html_attr(anchors, "href"))
}

# Visit every search page in PAGE order and concatenate the per-page links.
news_url <- unlist(lapply(paste0(url, PAGE), collect_page_links))
# Wrap the collected links in a tibble (single column `value`) and keep only
# the rows whose link matches the pattern "news.naver.com".
NEWS <- filter(
  as_tibble(news_url),
  str_detect(value, "news.naver.com")
)
# Prepare one row per article link with empty placeholders for the scraped
# fields. `press` stays in the column selection because the loop below writes
# to it; dropping it made the first `NEWS_prep$press[i]` assignment hit a
# non-existent tibble column.
NEWS_prep <- NEWS %>%
  rename(url = value) %>%
  mutate(
    id = row_number(),
    title = "",
    content = "",
    date = "",
    press = ""
  ) %>%
  select(id, date, title, content, press, url)

# Text of every node on `page` matching the CSS `selector`, passed through
# repair_encoding() with from = "utf-8".
# NOTE(review): repair_encoding() appears to be deprecated in recent rvest
# releases -- confirm against the installed version.
scrape_text <- function(page, selector) {
  repair_encoding(html_text(html_nodes(page, css = selector)), from = "utf-8")
}

# First element of `x`, or "" (the placeholder value) when nothing matched, so
# each single-cell assignment below receives exactly one string.
first_or_blank <- function(x) {
  if (length(x) > 0) x[[1]] else ""
}

# seq_len() makes the loop a no-op when no article links were collected
# (1:0 would iterate over 1 and 0).
for (i in seq_len(nrow(NEWS_prep))) {
  html <- read_html(as.character(NEWS_prep$url[i]))
  temp_news_title <- scrape_text(html, "#articleTitle")

  # Pages without an #articleTitle node keep their empty placeholders.
  if (length(temp_news_title) > 0) {
    NEWS_prep$title[i] <- first_or_blank(temp_news_title)
    NEWS_prep$content[i] <- first_or_blank(scrape_text(html, "#articleBodyContents"))
    NEWS_prep$date[i] <- first_or_blank(scrape_text(html, ".t11"))
    NEWS_prep$press[i] <- first_or_blank(scrape_text(html, ".copyright"))
  }
}

# Replace the link-only table with the filled-in article table.
NEWS <- NEWS_prep
# Show the scraped press column for a quick visual check.
select(NEWS, press)

# Write the full article table to news.csv.
write_excel_csv(NEWS, "news.csv")