-
Notifications
You must be signed in to change notification settings - Fork 3
/
grab_index.R
59 lines (40 loc) · 1.19 KB
/
grab_index.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
library(tidyverse)
library(rvest)
library(glue)
library(httr)
# Base URL of the paginated index; the page number is appended to it.
base_page_url <- "https://www.pracuj.pl/praca?pn="
# Number of index pages to fetch. This value can be adjusted - check on the
# website how many index pages are currently available.
N_pages <- 759
#' Download a page and parse it as HTML without ever raising an error.
#'
#' The request is sent with a desktop-browser User-Agent header.
#'
#' @param f_url URL of the page to download.
#' @return The parsed HTML document, or `NULL` when the request fails, the
#'   response status is not 200, or the body cannot be parsed.
safe_read_html <- function(f_url) {
  browser_ua <- user_agent(paste0(
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ",
    "AppleWebKit/537.36 (KHTML, like Gecko) ",
    "Chrome/71.0.3578.80 Safari/537.36"
  ))

  # GET() signals an error on transport problems (DNS failure, timeout,
  # connection reset). Convert that into NULL so a single bad request does
  # not abort the caller.
  t_page <- tryCatch(
    GET(f_url, browser_ua),
    error = function(e) {
      warning(
        "Request failed for ", f_url, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )

  if (is.null(t_page) || status_code(t_page) != 200) {
    return(NULL)
  }

  # Parse the raw bytes directly: this avoids rawToChar() failing on embedded
  # NUL bytes and removes any path-vs-markup guessing on a character input.
  tryCatch(
    read_html(content(t_page, as = "raw")),
    error = function(e) {
      warning(
        "Could not parse HTML from ", f_url, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
}
# Walk through every index page and collect the offer ids found on it.
# Results are stored per page in a preallocated list and flattened once at
# the end, instead of growing a vector on every iteration.
ids_by_page <- vector("list", N_pages)

for (i in seq_len(N_pages)) {
  cat(glue("Wczytuje {i} stronę indeksu"))

  page_url <- glue("{base_page_url}{i}")
  page <- safe_read_html(page_url)

  head_scripts <- if (is.null(page)) {
    list()
  } else {
    html_nodes(page, "head > script")
  }

  # The ids are read from the second <script> element inside <head>. A page
  # with fewer than two such elements is reported as a failure rather than
  # stopping the whole run with a subscript error.
  if (length(head_scripts) >= 2) {
    script_text <- as.character(head_scripts[[2]])
    # Every run of at least seven digits is treated as an offer id.
    ids_by_page[[i]] <- str_extract_all(script_text, "[0-9]{7,}")[[1]]
    cat(" - gotowe\r")
  } else {
    cat(" - błąd\n")
  }
}

# Pages that failed left NULL entries, which unlist() drops.
all_links <- as.character(unlist(ids_by_page, use.names = FALSE))

# Report the number of ids before and after removing duplicates.
print(length(all_links))
all_links <- unique(all_links)
print(length(all_links))

# Turn the ids into offer URLs and store them for later use.
all_links <- paste0("https://www.pracuj.pl/praca/x,oferta,", all_links)
saveRDS(all_links, file = "offer_urls.rds")