Skip to content

Commit

Permalink
Add amazon sample app
Browse files Browse the repository at this point in the history
  • Loading branch information
HackyRoot authored Mar 14, 2023
1 parent 897c70b commit 7df5dc0
Show file tree
Hide file tree
Showing 5 changed files with 177 additions and 0 deletions.
19 changes: 19 additions & 0 deletions 1_extract_book.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from scrapy.http import HtmlResponse
from scrapy.loader import ItemLoader
import json, scrapy, custom_settings_config, os


class AmazonbooksSpider(scrapy.Spider):
    """Scrape basic metadata from a single Amazon UK book product page."""

    name = 'amazon-books'
    allowed_domains = ['amazon.co.uk']
    start_urls = ['https://www.amazon.co.uk/dp/178685807X']
    custom_settings = custom_settings_config.custom_settings

    def parse(self, response):
        """Yield one item with the book's URL, author list, title, price and cover."""
        item = {
            "book_url": response.url,
            'author': response.css('#bylineInfo .a-link-normal::text').getall(),
            'book_title': response.css('#productTitle::text').get(),
            'price': response.css('.a-color-price::text').get(),
            'cover': response.css('#main-image::attr(src)').get(),
        }
        yield item
39 changes: 39 additions & 0 deletions 2_parse_shelf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from scrapy.http import HtmlResponse
from scrapy.loader import ItemLoader
import json, scrapy, custom_settings_config, os


class AmazonbooksSpider(scrapy.Spider):
    """Crawl the Amazon UK books best-seller shelf and scrape each book.

    The shelf page embeds its item list as JSON in a
    ``data-client-recs-list`` attribute; each entry's ASIN is followed to
    its product page, with the best-seller rank and ASIN carried along in
    ``request.meta``.
    """

    name = 'amazonbooks'
    allowed_domains = ['amazon.co.uk']
    start_urls = ['https://www.amazon.co.uk/gp/bestsellers/books']

    custom_settings = custom_settings_config.custom_settings

    def parse(self, response):
        """Parse one shelf page: schedule product-page requests, then paginate."""
        raw_data = response.css('[data-client-recs-list]::attr(data-client-recs-list)').get()
        if raw_data is None:
            # Selector matched nothing (layout change or blocked request);
            # the original would crash in json.loads(None) with a TypeError.
            self.logger.warning('No data-client-recs-list attribute found on %s', response.url)
            return
        for item in json.loads(raw_data):
            url = 'https://www.amazon.co.uk/dp/{}'.format(item['id'])
            yield scrapy.Request(url, callback=self.parse_item,
                                 meta={'rank': item['metadataMap']['render.zg.rank'], 'id': item['id']})

        next_page = response.css('.a-last a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def parse_item(self, response):
        """Extract book metadata from a product page reached via the shelf."""
        rank = response.meta.get('rank')
        asin = response.meta.get('id')  # renamed local: don't shadow builtin id()
        # NOTE(review): the [2:] slice assumes the first two byline links are
        # not author names — confirm against live page markup.
        authors = response.css(
            '#bylineInfo_feature_div #bylineInfo span.author.notFaded a.a-link-normal::text').getall()[2:]

        yield {
            "book_url": response.url,
            'author': authors,
            'rank': rank,
            'id': asin,
            'book_title': response.css('#productTitle::text').get(),
            'price': response.css('.a-color-price::text').get(),
            'cover': response.css('#main-image::attr(src)').get()
        }
25 changes: 25 additions & 0 deletions 3_screenshot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from scrapy.http import HtmlResponse
from scrapy.loader import ItemLoader
import json, scrapy, custom_settings_config, os
from base64 import b64decode, decodebytes
import datetime

class AmazonbooksSpider(scrapy.Spider):
    """Capture a screenshot of the Amazon UK best-seller shelf via the Zyte API."""

    name = 'amazonbooks'
    allowed_domains = ['amazon.co.uk']
    custom_settings = custom_settings_config.custom_settings

    def start_requests(self):
        """Request the shelf page with the Zyte API screenshot flag enabled."""
        request_meta = {
            "zyte_api": {
                "screenshot": True,
            }
        }
        yield scrapy.Request(
            "https://www.amazon.co.uk/gp/bestsellers/books",
            meta=request_meta,
        )

    def parse(self, response):
        """Decode the base64 screenshot payload and save it to disk."""
        image_bytes: bytes = b64decode(response.raw_api_response["screenshot"])
        with open("output_3.jpg", "wb") as image_file:
            image_file.write(image_bytes)
26 changes: 26 additions & 0 deletions 4_geolocation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from scrapy.http import HtmlResponse
from scrapy.loader import ItemLoader
import json, scrapy, custom_settings_config, os
from base64 import b64decode, decodebytes
import datetime

class AmazonbooksSpider(scrapy.Spider):
    """Screenshot the Amazon UK best-seller shelf as rendered for Australia.

    Same as the plain screenshot spider, except the Zyte API request pins
    ``geolocation`` to ``"AU"`` so the page is the one served to
    Australian visitors.
    """

    name = 'amazonbooks'
    allowed_domains = ['amazon.co.uk']
    custom_settings = custom_settings_config.custom_settings

    def start_requests(self):
        """Request the shelf page with screenshot and AU geolocation enabled."""
        yield scrapy.Request(
            "https://www.amazon.co.uk/gp/bestsellers/books",
            meta={
                "zyte_api": {
                    "screenshot": True,
                    "geolocation": "AU",
                }
            }
        )

    def parse(self, response):
        """Decode the base64 screenshot and write it to the current directory.

        Bug fix: the original wrote to ``../amazon_in/output_4.jpg``, a
        hard-coded path into a sibling directory that raises
        FileNotFoundError unless that directory happens to exist. Write
        into the working directory instead, consistent with 3_screenshot.py.
        """
        screenshot: bytes = b64decode(response.raw_api_response["screenshot"])
        with open("output_4.jpg", "wb") as fh:
            fh.write(screenshot)
68 changes: 68 additions & 0 deletions 5_actions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
from scrapy.http import HtmlResponse
from scrapy.loader import ItemLoader
import json, scrapy, custom_settings_config, os
from base64 import b64decode, decodebytes
import datetime

class AmazonbooksSpider(scrapy.Spider):
    """Scrape a book page after setting a UK delivery postcode via browser actions.

    Uses Zyte API browser actions to open the delivery-location popover,
    type a postcode, and confirm it, so the page renders postcode-specific
    delivery information before the screenshot/HTML snapshot is taken.
    """

    name = 'amazonbooks'
    allowed_domains = ['amazon.co.uk']
    custom_settings = custom_settings_config.custom_settings

    def start_requests(self):
        """Request the product page with a scripted postcode-entry action chain."""
        yield scrapy.Request(
            "https://www.amazon.co.uk/dp/178685807X",
            meta={
                "zyte_api": {
                    "screenshot": True,
                    "browserHtml": True,
                    # Actions run in order; onError="return" returns the page
                    # as-is if a step fails instead of erroring the request.
                    "actions": [
                        {   # Open the "deliver to" location popover.
                            "action": "click",
                            "selector": {
                                "type": "css",
                                "value": ".cip-a-size-small"
                            },
                            "delay": 0,
                            "button": "left",
                            "onError": "return"
                        },
                        {   # Type the postcode into the popover's input field.
                            "action": "type",
                            "selector": {
                                "type": "css",
                                "value": "#GLUXZipUpdateInput"
                            },
                            "delay": 0,
                            "onError": "return",
                            "text": "NW1 5LJ",  # alternative test postcode: BT23 4AA
                        },
                        {   # Confirm the postcode.
                            "action": "click",
                            "selector": {
                                "type": "css",
                                "value": ".a-button-input"
                            },
                            "delay": 0,
                            "button": "left",
                            "onError": "return"
                        }
                    ],
                }
            }
        )

    def parse(self, response):
        """Save the screenshot, then yield book metadata including delivery info.

        Bug fix: the original wrote the screenshot to
        ``../amazon_in/output_5.jpg`` — a hard-coded sibling-directory path
        that raises FileNotFoundError unless that directory exists. Write
        into the working directory instead, consistent with 3_screenshot.py.
        """
        screenshot: bytes = b64decode(response.raw_api_response["screenshot"])
        with open("output_5.jpg", "wb") as fh:
            fh.write(screenshot)

        yield {
            "book_url": response.url,
            # NOTE(review): the [2:] slice assumes the first two byline links
            # are not author names — confirm against live page markup.
            'author': response.css('#bylineInfo a.a-link-normal::text').getall()[2:],
            'book_title': response.css('#productTitle::text').get(),
            'price': response.css('.a-color-price::text').get(),
            'cover': response.css('#main-image::attr(src)').get(),
            'delivery': response.css(
                '#mir-layout-DELIVERY_BLOCK #mir-layout-DELIVERY_BLOCK-slot-SECONDARY_DELIVERY_MESSAGE_LARGE .a-text-bold::text').get(),
        }

0 comments on commit 7df5dc0

Please sign in to comment.