Skip to content

Commit

Permalink
Add amazon sample app
Browse files Browse the repository at this point in the history
  • Loading branch information
HackyRoot authored Mar 14, 2023
1 parent 897c70b commit 7df5dc0
Show file tree
Hide file tree
Showing 5 changed files with 177 additions and 0 deletions.
19 changes: 19 additions & 0 deletions 1_extract_book.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from scrapy.http import HtmlResponse
from scrapy.loader import ItemLoader
import json, scrapy, custom_settings_config, os


class AmazonbooksSpider(scrapy.Spider):
    """Scrape basic metadata from a single Amazon UK book product page."""

    name = 'amazon-books'
    allowed_domains = ['amazon.co.uk']
    start_urls = ['https://www.amazon.co.uk/dp/178685807X']
    custom_settings = custom_settings_config.custom_settings

    def parse(self, response):
        """Yield one item with the book's URL, author list, title, price and cover."""
        item = {
            "book_url": response.url,
            'author': response.css('#bylineInfo .a-link-normal::text').getall(),
            'book_title': response.css('#productTitle::text').get(),
            'price': response.css('.a-color-price::text').get(),
            'cover': response.css('#main-image::attr(src)').get(),
        }
        yield item
39 changes: 39 additions & 0 deletions 2_parse_shelf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from scrapy.http import HtmlResponse
from scrapy.loader import ItemLoader
import json, scrapy, custom_settings_config, os


class AmazonbooksSpider(scrapy.Spider):
    """Crawl the Amazon UK books best-seller shelf and scrape each book.

    The shelf page embeds its item list as JSON in a
    ``data-client-recs-list`` attribute; each entry's ASIN is followed to
    its product page, with the best-seller rank and ASIN carried along in
    ``request.meta``.
    """

    name = 'amazonbooks'
    allowed_domains = ['amazon.co.uk']
    start_urls = ['https://www.amazon.co.uk/gp/bestsellers/books']

    custom_settings = custom_settings_config.custom_settings

    def parse(self, response):
        """Parse one shelf page: schedule product-page requests, then paginate."""
        raw_data = response.css('[data-client-recs-list]::attr(data-client-recs-list)').get()
        if raw_data is None:
            # Selector matched nothing (layout change or blocked request);
            # the original would crash in json.loads(None) with a TypeError.
            self.logger.warning('No data-client-recs-list attribute found on %s', response.url)
            return
        for item in json.loads(raw_data):
            url = 'https://www.amazon.co.uk/dp/{}'.format(item['id'])
            yield scrapy.Request(url, callback=self.parse_item,
                                 meta={'rank': item['metadataMap']['render.zg.rank'], 'id': item['id']})

        next_page = response.css('.a-last a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def parse_item(self, response):
        """Extract book metadata from a product page reached via the shelf."""
        rank = response.meta.get('rank')
        asin = response.meta.get('id')  # renamed local: don't shadow builtin id()
        # NOTE(review): the [2:] slice assumes the first two byline links are
        # not author names — confirm against live page markup.
        authors = response.css(
            '#bylineInfo_feature_div #bylineInfo span.author.notFaded a.a-link-normal::text').getall()[2:]

        yield {
            "book_url": response.url,
            'author': authors,
            'rank': rank,
            'id': asin,
            'book_title': response.css('#productTitle::text').get(),
            'price': response.css('.a-color-price::text').get(),
            'cover': response.css('#main-image::attr(src)').get()
        }
25 changes: 25 additions & 0 deletions 3_screenshot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from scrapy.http import HtmlResponse
from scrapy.loader import ItemLoader
import json, scrapy, custom_settings_config, os
from base64 import b64decode, decodebytes
import datetime

class AmazonbooksSpider(scrapy.Spider):
    """Capture a screenshot of the Amazon UK best-seller shelf via the Zyte API."""

    name = 'amazonbooks'
    allowed_domains = ['amazon.co.uk']
    custom_settings = custom_settings_config.custom_settings

    def start_requests(self):
        """Request the shelf page with the Zyte API screenshot flag enabled."""
        request_meta = {
            "zyte_api": {
                "screenshot": True,
            }
        }
        yield scrapy.Request(
            "https://www.amazon.co.uk/gp/bestsellers/books",
            meta=request_meta,
        )

    def parse(self, response):
        """Decode the base64 screenshot payload and save it to disk."""
        image_bytes: bytes = b64decode(response.raw_api_response["screenshot"])
        with open("output_3.jpg", "wb") as image_file:
            image_file.write(image_bytes)
26 changes: 26 additions & 0 deletions 4_geolocation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from scrapy.http import HtmlResponse
from scrapy.loader import ItemLoader
import json, scrapy, custom_settings_config, os
from base64 import b64decode, decodebytes
import datetime

class AmazonbooksSpider(scrapy.Spider):
    """Screenshot the Amazon UK best-seller shelf as rendered for Australia.

    Same as the plain screenshot spider, except the Zyte API request pins
    ``geolocation`` to ``"AU"`` so the page is the one served to
    Australian visitors.
    """

    name = 'amazonbooks'
    allowed_domains = ['amazon.co.uk']
    custom_settings = custom_settings_config.custom_settings

    def start_requests(self):
        """Request the shelf page with screenshot and AU geolocation enabled."""
        yield scrapy.Request(
            "https://www.amazon.co.uk/gp/bestsellers/books",
            meta={
                "zyte_api": {
                    "screenshot": True,
                    "geolocation": "AU",
                }
            }
        )

    def parse(self, response):
        """Decode the base64 screenshot and write it to the current directory.

        Bug fix: the original wrote to ``../amazon_in/output_4.jpg``, a
        hard-coded path into a sibling directory that raises
        FileNotFoundError unless that directory happens to exist. Write
        into the working directory instead, consistent with 3_screenshot.py.
        """
        screenshot: bytes = b64decode(response.raw_api_response["screenshot"])
        with open("output_4.jpg", "wb") as fh:
            fh.write(screenshot)
68 changes: 68 additions & 0 deletions 5_actions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
from scrapy.http import HtmlResponse
from scrapy.loader import ItemLoader
import json, scrapy, custom_settings_config, os
from base64 import b64decode, decodebytes
import datetime

class AmazonbooksSpider(scrapy.Spider):
    """Scrape a book page after setting a UK delivery postcode via browser actions.

    Uses Zyte API browser actions to open the delivery-location popover,
    type a postcode, and confirm it, so the page renders postcode-specific
    delivery information before the screenshot/HTML snapshot is taken.
    """

    name = 'amazonbooks'
    allowed_domains = ['amazon.co.uk']
    custom_settings = custom_settings_config.custom_settings

    def start_requests(self):
        """Request the product page with a scripted postcode-entry action chain."""
        yield scrapy.Request(
            "https://www.amazon.co.uk/dp/178685807X",
            meta={
                "zyte_api": {
                    "screenshot": True,
                    "browserHtml": True,
                    # Actions run in order; onError="return" returns the page
                    # as-is if a step fails instead of erroring the request.
                    "actions": [
                        {   # Open the "deliver to" location popover.
                            "action": "click",
                            "selector": {
                                "type": "css",
                                "value": ".cip-a-size-small"
                            },
                            "delay": 0,
                            "button": "left",
                            "onError": "return"
                        },
                        {   # Type the postcode into the popover's input field.
                            "action": "type",
                            "selector": {
                                "type": "css",
                                "value": "#GLUXZipUpdateInput"
                            },
                            "delay": 0,
                            "onError": "return",
                            "text": "NW1 5LJ",  # alternative test postcode: BT23 4AA
                        },
                        {   # Confirm the postcode.
                            "action": "click",
                            "selector": {
                                "type": "css",
                                "value": ".a-button-input"
                            },
                            "delay": 0,
                            "button": "left",
                            "onError": "return"
                        }
                    ],
                }
            }
        )

    def parse(self, response):
        """Save the screenshot, then yield book metadata including delivery info.

        Bug fix: the original wrote the screenshot to
        ``../amazon_in/output_5.jpg`` — a hard-coded sibling-directory path
        that raises FileNotFoundError unless that directory exists. Write
        into the working directory instead, consistent with 3_screenshot.py.
        """
        screenshot: bytes = b64decode(response.raw_api_response["screenshot"])
        with open("output_5.jpg", "wb") as fh:
            fh.write(screenshot)

        yield {
            "book_url": response.url,
            # NOTE(review): the [2:] slice assumes the first two byline links
            # are not author names — confirm against live page markup.
            'author': response.css('#bylineInfo a.a-link-normal::text').getall()[2:],
            'book_title': response.css('#productTitle::text').get(),
            'price': response.css('.a-color-price::text').get(),
            'cover': response.css('#main-image::attr(src)').get(),
            'delivery': response.css(
                '#mir-layout-DELIVERY_BLOCK #mir-layout-DELIVERY_BLOCK-slot-SECONDARY_DELIVERY_MESSAGE_LARGE .a-text-bold::text').get(),
        }

0 comments on commit 7df5dc0

Please sign in to comment.