-
Notifications
You must be signed in to change notification settings - Fork 1
/
scrap.R
84 lines (74 loc) · 2.75 KB
/
scrap.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
library(dplyr)
library(lubridate)
library(xml2)
library(selectr)
library(rvest)
library(stringr)
# Configuration ----
# Year whose articles we want to keep
req_year <- 2018
# Site root, prepended to the relative hrefs found on listing pages
baseurl <- "https://timesofindia.indiatimes.com"
# Topic listing page to scrape
# (alternate topic: "https://timesofindia.indiatimes.com/topic/HIV-infections/news")
url <- "https://timesofindia.indiatimes.com/topic/Hiv/news"
# Scrape one topic-listing page: collect article links and publication
# dates, keep only articles published in `req_year`, and fetch each
# article's body text via get_content().
#
# page     : an xml2 document (result of read_html()) of a TOI topic page
# req_year : integer year used to filter articles by publication date
#
# Returns a tibble with columns:
#   link  - full article URL (character)
#   pdate - publication date (Date)
#   art   - scraped article text ('' when scraping failed)
get_arts <- function(page, req_year){
  # ".meta" nodes hold the publication date formatted like "12 Mar 2018"
  dates <-
    html_nodes(page, css = ".meta") %>%
    html_text() %>%
    as.Date(format = "%d %b %Y")
  art_index <- year(dates) == req_year
  # Unparseable dates give NA; treat them as "not in the requested year"
  art_index[is.na(art_index)] <- FALSE
  # Hrefs are site-relative; paste0 is vectorized, so no sapply is needed
  # (sapply on an empty input would silently return list(), not character())
  arts <- html_nodes(page, css = "li.article div a") %>%
    html_attr("href") %>%
    paste0(baseurl, .)
  tibble(
    link = arts[art_index],
    pdate = as.Date(dates[art_index])
  ) %>%
    mutate(art = vapply(link, get_content, character(1), USE.NAMES = FALSE))
}
# Download one article page and return its cleaned body text.
#
# contenturl : full URL of the article page
#
# Returns a length-1 character vector with the article text, or '' when
# the page has no recognisable body or any error occurs (network
# failure, parse error, ...).
get_content <- function(contenturl) {
  tryCatch(
    {
      contentpage <- read_html(contenturl)
      # "._3WlLe" is the (obfuscated) CSS class of the article body div
      content <- contentpage %>%
        html_nodes(css = "._3WlLe") %>%
        html_text()
      if (length(content) == 0) {
        return('')
      }
      # Strip the boilerplate footer beginning "Download The Times of
      # India". str_locate() returns NA when the marker is absent; guard
      # against that, otherwise substr(..., stop = NA) would destroy the
      # whole article text.
      cut_at <- str_locate(content, "Download The Times of India")[1]
      if (!is.na(cut_at)) {
        content <- substr(content, start = 1, stop = cut_at - 1)
      }
      # Drop a leading dateline prefix such as "MUMBAI:" and collapse
      # repeated whitespace
      content <- sub("[A-Z]*:", "", content) %>%
        str_squish()
      content
    },
    error = function(e) {''}
  )
}
# Scrape page 1 plus pages 2..20 of the topic listing. Per-page tibbles
# are collected in a preallocated list and bound once at the end, rather
# than growing the result with rbind() inside the loop.
pages <- vector("list", 20)
page <- read_html(url)
pages[[1]] <- get_arts(page, req_year)
for (i in 2:20) {
  nurl <- paste(url, i, sep = '/')
  # Progress note only; do not dump the parsed document itself
  message("Scraping ", nurl)
  page <- read_html(nurl)
  pages[[i]] <- get_arts(page, req_year)
}
artlist <- bind_rows(pages)
# Raw result first, then a copy with empty (failed-scrape) articles removed
write.csv(artlist, "./scraped-data.csv")
artlist <- artlist %>% filter(nchar(art) > 0)
write.csv(artlist, "./scraped-data-clean.csv")