diff --git a/1_extract_book.py b/1_extract_book.py
new file mode 100644
index 0000000..8535036
--- /dev/null
+++ b/1_extract_book.py
@@ -0,0 +1,19 @@
+from scrapy.http import HtmlResponse
+from scrapy.loader import ItemLoader
+import json, scrapy, custom_settings_config, os
+
+
+class AmazonbooksSpider(scrapy.Spider):
+    name = 'amazon-books'
+    allowed_domains = ['amazon.co.uk']
+    start_urls = ['https://www.amazon.co.uk/dp/178685807X']
+    custom_settings = custom_settings_config.custom_settings
+
+    def parse(self, response):
+        yield {
+            "book_url": response.url,
+            'author': response.css('#bylineInfo .a-link-normal::text').getall(),
+            'book_title': response.css('#productTitle::text').get(),
+            'price': response.css('.a-color-price::text').get(),
+            'cover': response.css('#main-image::attr(src)').get()
+        }
diff --git a/2_parse_shelf.py b/2_parse_shelf.py
new file mode 100644
index 0000000..2a4c5e5
--- /dev/null
+++ b/2_parse_shelf.py
@@ -0,0 +1,39 @@
+from scrapy.http import HtmlResponse
+from scrapy.loader import ItemLoader
+import json, scrapy, custom_settings_config, os
+
+
+class AmazonbooksSpider(scrapy.Spider):
+    name = 'amazonbooks'
+    allowed_domains = ['amazon.co.uk']
+    start_urls = ['https://www.amazon.co.uk/gp/bestsellers/books']
+
+    custom_settings = custom_settings_config.custom_settings
+
+    def parse(self, response):
+        raw_data = response.css('[data-client-recs-list]::attr(data-client-recs-list)').get()
+        data = json.loads(raw_data)
+        for item in data:
+            url = 'https://www.amazon.co.uk/dp/{}'.format(item['id'])
+            yield scrapy.Request(url, callback=self.parse_item,
+                                 meta={'rank': item['metadataMap']['render.zg.rank'], 'id': item['id']})
+
+        next_page = response.css('.a-last a::attr(href)').get()
+        if next_page is not None:
+            yield response.follow(next_page, callback=self.parse)
+
+    def parse_item(self, response):
+        rank = response.meta.get('rank')
+        item_id = response.meta.get('id')
+        authors = response.css(
+            '#bylineInfo_feature_div #bylineInfo span.author.notFaded a.a-link-normal::text').getall()[2:]
+
+        yield {
+            "book_url": response.url,
+            'author': authors,
+            'rank': rank,
+            'id': item_id,
+            'book_title': response.css('#productTitle::text').get(),
+            'price': response.css('.a-color-price::text').get(),
+            'cover': response.css('#main-image::attr(src)').get()
+        }
diff --git a/3_screenshot.py b/3_screenshot.py
new file mode 100644
index 0000000..caa8358
--- /dev/null
+++ b/3_screenshot.py
@@ -0,0 +1,25 @@
+from scrapy.http import HtmlResponse
+from scrapy.loader import ItemLoader
+import json, scrapy, custom_settings_config, os
+from base64 import b64decode, decodebytes
+import datetime
+
+class AmazonbooksSpider(scrapy.Spider):
+    name = 'amazonbooks'
+    allowed_domains = ['amazon.co.uk']
+    custom_settings = custom_settings_config.custom_settings
+
+    def start_requests(self):
+        yield scrapy.Request(
+            "https://www.amazon.co.uk/gp/bestsellers/books",
+            meta={
+                "zyte_api": {
+                    "screenshot": True,
+                }
+            }
+        )
+
+    def parse(self, response):
+        screenshot: bytes = b64decode(response.raw_api_response["screenshot"])  # decode base64 response
+        with open("output_3.jpg", "wb") as fh:
+            fh.write(screenshot)  # write bytes to the output.jpg file
diff --git a/4_geolocation.py b/4_geolocation.py
new file mode 100644
index 0000000..b8e45e7
--- /dev/null
+++ b/4_geolocation.py
@@ -0,0 +1,26 @@
+from scrapy.http import HtmlResponse
+from scrapy.loader import ItemLoader
+import json, scrapy, custom_settings_config, os
+from base64 import b64decode, decodebytes
+import datetime
+
+class AmazonbooksSpider(scrapy.Spider):
+    name = 'amazonbooks'
+    allowed_domains = ['amazon.co.uk']
+    custom_settings = custom_settings_config.custom_settings
+
+    def start_requests(self):
+        yield scrapy.Request(
+            "https://www.amazon.co.uk/gp/bestsellers/books",
+            meta={
+                "zyte_api": {
+                    "screenshot": True,
+                    "geolocation": "AU",
+                }
+            }
+        )
+
+    def parse(self, response):
+        screenshot: bytes = b64decode(response.raw_api_response["screenshot"])  # decode base64 response
+        with open("../amazon_in/output_4.jpg", "wb") as fh:
+            fh.write(screenshot)  # write bytes to the output.jpg file
diff --git a/5_actions.py b/5_actions.py
new file mode 100644
index 0000000..1406a18
--- /dev/null
+++ b/5_actions.py
@@ -0,0 +1,68 @@
+from scrapy.http import HtmlResponse
+from scrapy.loader import ItemLoader
+import json, scrapy, custom_settings_config, os
+from base64 import b64decode, decodebytes
+import datetime
+
+class AmazonbooksSpider(scrapy.Spider):
+    name = 'amazonbooks'
+    allowed_domains = ['amazon.co.uk']
+    custom_settings = custom_settings_config.custom_settings
+
+    def start_requests(self):
+        yield scrapy.Request(
+            "https://www.amazon.co.uk/dp/178685807X",
+            meta={
+                "zyte_api": {
+                    "screenshot": True,
+                    "browserHtml": True,
+                    "actions": [
+                        {
+                            "action": "click",
+                            "selector": {
+                                "type": "css",
+                                "value": ".cip-a-size-small"
+                            },
+                            "delay": 0,
+                            "button": "left",
+                            "onError": "return"
+                        },
+                        {
+                            "action": "type",
+                            "selector": {
+                                "type": "css",
+                                "value": "#GLUXZipUpdateInput"
+                            },
+                            "delay": 0,
+                            "onError": "return",
+                            "text": "NW1 5LJ",  # BT23 4AA
+                        },
+                        {
+                            "action": "click",
+                            "selector": {
+                                "type": "css",
+                                "value": ".a-button-input"
+                            },
+                            "delay": 0,
+                            "button": "left",
+                            "onError": "return"
+                        }
+                    ],
+                }
+            }
+        )
+
+    def parse(self, response):
+        screenshot: bytes = b64decode(response.raw_api_response["screenshot"])  # decode base64 response
+        with open("../amazon_in/output_5.jpg", "wb") as fh:
+            fh.write(screenshot)  # write bytes to the output.jpg file
+
+        yield {
+            "book_url": response.url,
+            'author': response.css('#bylineInfo a.a-link-normal::text').getall()[2:],
+            'book_title': response.css('#productTitle::text').get(),
+            'price': response.css('.a-color-price::text').get(),
+            'cover': response.css('#main-image::attr(src)').get(),
+            'delivery': response.css(
+                '#mir-layout-DELIVERY_BLOCK #mir-layout-DELIVERY_BLOCK-slot-SECONDARY_DELIVERY_MESSAGE_LARGE .a-text-bold::text').get(),
+        }