##-------------------------------------------------------------------------------------------##
## FUNCTION FOR SCRAPING PRODUCT URLS AND IDS FROM AMAZON BEST SELLERS LIST ##
##-------------------------------------------------------------------------------------------##
## R version 3.4.3 (2017-11-30)
## Author: Lisa Hehnke || lhehnke.github.io || @DataPlanes
#-------#
# Setup #
#-------#
# Install and load packages using pacman
if (!require("pacman")) install.packages("pacman")
library(pacman)
p_load(rvest, xml2)
#-----------------------#
# Scrape URLs from list #
#-----------------------#
# Read the best sellers list into an HTML document
## Example: https://www.amazon.com/gp/bestsellers/books/3377866011/ref=pd_zg_hrsr_books_1_2
## Note: named bestsellers_page to avoid masking base::list
bestsellers_page <- read_html("https://www.amazon.com/gp/bestsellers/books/3377866011/ref=pd_zg_hrsr_books_1_2")
# Get URLs for all pages of the list
## Alternatively: append #2, #3, #4, and #5 to the base URL
list_pages <- bestsellers_page %>%
  html_nodes("#zg_paginationWrapper") %>%
  html_nodes("a") %>%
  html_attr("href")
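# Optional sanity check (not part of the original flow): the pagination wrapper
# should yield one URL per page of the list. Inspect interactively to confirm
# the selector still matches, since Amazon's markup changes over time.
head(list_pages)
length(list_pages)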
#---------------------------------#
# Function to scrape product URLs #
#---------------------------------#
# Function to scrape product URLs for all products on a list page,
# dropping review links and duplicates
product_url_scraper <- function(doc) {
  if (!"pacman" %in% installed.packages()[, "Package"]) install.packages("pacman")
  pacman::p_load(rvest)
  product_urls <- doc %>%
    html_nodes("div.zg_itemImmersion") %>%
    html_nodes("a.a-link-normal") %>%
    html_attr("href") %>%
    grep("product-reviews", ., invert = TRUE, value = TRUE) %>%
    unique()
  return(product_urls)
}
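# Example usage (illustrative): run the scraper on the page read above.
# The returned hrefs are relative paths at this point; the base URL is
# prepended in the next step.
product_url_scraper(bestsellers_page)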
#---------------------#
# Scrape product URLs #
#---------------------#
# Read a list page and return absolute product URLs
get_product_urls <- function(url) {
  url %>%
    read_html() %>%
    product_url_scraper() %>%
    paste0("https://www.amazon.com", .)
}
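# Optional variant (an assumption, not part of the original script): pause
# between requests to stay polite to the server. The 1-second delay is an
# arbitrary choice; adjust as needed.
get_product_urls_politely <- function(url) {
  Sys.sleep(1)
  get_product_urls(url)
}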
# Get all product URLs from best sellers list
product_urls <- unlist(lapply(list_pages, get_product_urls))
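# Optional (an assumption): deduplication above is per page, so drop any
# products that appear on more than one page of the list as well.
product_urls <- unique(product_urls)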
#---------------------#
# Extract product IDs #
#---------------------#
# Extract product IDs (ASINs, 10-character alphanumeric codes) from the URLs;
# matching on the /dp/ segment also covers URLs without a trailing slash
product_ids <- sub(".*/dp/([A-Z0-9]{10}).*", "\\1", product_urls)
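# Optional sanity check (illustrative): ASINs are 10 characters, so any longer
# entry means the pattern did not match and the full URL was kept instead.
product_ids[nchar(product_ids) != 10]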