Skip to content

Commit

Permalink
"Fix" scrolling problem in CarousellClicker and update requirements.txt
Browse files Browse the repository at this point in the history
  • Loading branch information
jia1 committed Aug 7, 2020
1 parent bee6f26 commit 10b5347
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 51 deletions.
106 changes: 55 additions & 51 deletions CarousellClicker.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,31 @@
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


#####################################################
#################### USER INPUTS ####################
#####################################################
# These can be made into command line arguments
# Source: http://image-net.org/challenges/LSVRC/2014/browse-synsets
# TODO: Build more robust mapping between illegal_entities and search_terms

# search_terms maps each Carousell search term to the set of classifier
# labels ("illegal entities") that flag a listing image as illegal.
search_terms = {
    'tiger': {'tiger cat', 'tiger'},
}


#####################################################
################### DEFAULT INPUTS ##################
#####################################################
# Do not change unless you know what you are doing
download_directory = 'CarousellClicker'
# Offsets used to slice the JSON payload out of '<script>window.initialState=...</script>'
initial_state_start_index = len('<script>window.initialState=')
initial_state_end_index = -len('</script>')
load_more_button_xpath = '//button[text()="Load more"]'
wait_in_seconds = 10


#####################################################
def download_image(image_url, download_file_path):
    """Download *image_url* and write the raw bytes to *download_file_path*.

    Any existing file at the target path is overwritten.
    """
    # Timeout so a stalled connection cannot hang the whole crawl forever.
    # NOTE(review): no status check — a 404/500 error body would be saved as
    # the image; consider response.raise_for_status() upstream of the write.
    with open(download_file_path, 'wb') as f:
        f.write(requests.get(image_url, timeout=30).content)

def is_illegal(image_file_path, illegal_entities):
    """Return True if the classifier's top-1 label for the image is an illegal entity.

    Args:
        image_file_path: Path of the downloaded image to classify.
        illegal_entities: Collection of label strings considered illegal.
    """
    # NOTE(review): shell interpolation — image_file_path must not contain
    # untrusted characters; prefer subprocess.run([...], shell=False).
    stream = os.popen(f'. predict.sh {image_file_path}')
    # Each output line looks like 'something: label'; keep the label part.
    predictions = [line.split(':')[-1].strip() for line in stream.readlines()]
    # TODO: Add confidence threshold here
    # TODO: Can return tuple of (is_illegal, illegal_entity) for downstream processing
    if not predictions:  # classifier produced no output (e.g. script failure)
        return False
    return predictions[0] in illegal_entities  # Top 1 only


#####################################################
Expand All @@ -45,73 +57,65 @@ def get_carousell_search_url(search_term):
def is_initial_state(line):
    """Return True when *line* is the inline <script> tag embedding window.initialState."""
    prefix = '<script>window.initialState='
    return line.lstrip().startswith(prefix)

initial_state_start_index = len('<script>window.initialState=')
initial_state_end_index = -len('</script>')

def get_carousell_listing_url(collection_id, product_id):
    """Build the Carousell related-listing API URL for one product in one collection."""
    base = 'https://sg.carousell.com/api-service/related-listing/'
    query = f'collection_id={collection_id}&country_id=1880251&locale=en&product_id={product_id}'
    return f'{base}?{query}'

def is_product_img(img_url):
    """Return True for image URLs served from Carousell's product-photo CDN."""
    product_prefix = 'https://media.karousell.com/media/photos/products/'
    return img_url.startswith(product_prefix)

timeout = 10

# https://www.hackerearth.com/practice/notes/praveen97uma/crawling-a-website-that-loads-content-using-javascript-with-selenium-webdriver-in-python
def page_down(browser, page_downs):
    """Press PAGE_DOWN on the page body *page_downs* times, pausing between presses.

    Returns the same browser instance so calls can be chained/reassigned.
    """
    body = browser.find_element_by_tag_name('body')
    for _ in range(page_downs):
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(1)  # So that I can see that stuff really happened
    return browser

# NOT USING THIS FOR NOW.
# The website I'm crawling does not seem to react well to this scroll-down program. Perhaps it is
# too aggressive. Refactored from:
# https://stackoverflow.com/questions/22702277/crawl-site-that-has-infinite-scrolling-using-python
def scroll_down(browser):
    """Scroll the page down in fixed 200px steps until the document bottom is reached.

    The document height is re-read after every step so content appended by
    infinite scroll extends the walk. Refactored from:
    https://stackoverflow.com/questions/22702277/crawl-site-that-has-infinite-scrolling-using-python
    """
    per_scroll = 200
    max_height = browser.execute_script('return document.body.scrollHeight')
    new_height = per_scroll
    while max_height > new_height:
        browser.execute_script(f'window.scrollTo(0, {new_height})')
        time.sleep(1)  # So that I can see that stuff really happened
        max_height = browser.execute_script('return document.body.scrollHeight')
        new_height += per_scroll


#####################################################
################# CAROUSELL SPIDER ##################
#####################################################
# Flat crawl script: for each search term, load its Carousell results page,
# click "Load more" once, collect product image URLs, download each image,
# and run the classifier to flag illegal listings.
illegal_items = []
browser = webdriver.Chrome()

try:
    for search_term in search_terms:
        imgs_to_download = []
        try:
            browser.get(get_carousell_search_url(search_term))
            browser = page_down(browser, 4)  # Arbitrary
            is_element_present = EC.presence_of_element_located((By.XPATH, load_more_button_xpath))
            WebDriverWait(browser, wait_in_seconds).until(is_element_present)
            load_more_button = browser.find_element_by_xpath(load_more_button_xpath)
            load_more_button.click()
            time.sleep(wait_in_seconds)  # let the extra listings render
            browser = page_down(browser, 6)  # Arbitrary
            for img in browser.find_elements_by_tag_name('img'):
                src = img.get_attribute('src')
                # src can be None for lazy-loaded <img> tags; skip those
                if src and is_product_img(src):
                    imgs_to_download.append(src)
        except TimeoutException:
            print('Timed out.')
        print(imgs_to_download)  # Remove when not needed
        if imgs_to_download:
            # Create the target directory once per term, not once per image.
            Path(download_directory).mkdir(parents=True, exist_ok=True)
            for i, img_url in enumerate(imgs_to_download):
                download_file_path = f'{download_directory}/{i}.jpg'
                download_image(img_url, download_file_path)
                if is_illegal(download_file_path, search_terms[search_term]):
                    illegal_items.append(img_url)
            print(illegal_items)
finally:
    # Always release the Chrome process, even if the crawl crashes mid-loop.
    browser.quit()

print(illegal_items)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ pyOpenSSL==19.1.0
queuelib==1.5.0
requests==2.24.0
Scrapy==2.3.0
selenium==3.141.0
service-identity==18.1.0
six==1.15.0
Twisted==20.3.0
Expand Down

0 comments on commit 10b5347

Please sign in to comment.