Skip to content

Commit

Permalink
"Fix" scrolling problem in CarousellClicker and update requirements.txt
Browse files Browse the repository at this point in the history
  • Loading branch information
jia1 committed Aug 7, 2020
1 parent bee6f26 commit 10b5347
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 51 deletions.
106 changes: 55 additions & 51 deletions CarousellClicker.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,31 @@
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


#####################################################
#################### USER INPUTS ####################
#####################################################
# These can be made into command line arguments
# Source: http://image-net.org/challenges/LSVRC/2014/browse-synsets
# TODO: Build more robust mapping between illegal_entities and search_terms

# search_terms maps each Carousell search term to the set of classifier
# labels ("illegal entities") that flag a listing image as illegal.
search_terms = {
    'tiger': {'tiger cat', 'tiger'},
}


#####################################################
################### DEFAULT INPUTS ##################
#####################################################
# Do not change unless you know what you are doing
download_directory = 'CarousellClicker'
# Offsets used to slice the JSON payload out of '<script>window.initialState=...</script>'
initial_state_start_index = len('<script>window.initialState=')
initial_state_end_index = -len('</script>')
load_more_button_xpath = '//button[text()="Load more"]'
wait_in_seconds = 10


#####################################################
def download_image(image_url, download_file_path):
    """Download *image_url* and write the raw bytes to *download_file_path*.

    Any existing file at the target path is overwritten.
    """
    # Timeout so a stalled connection cannot hang the whole crawl forever.
    # NOTE(review): no status check — a 404/500 error body would be saved as
    # the image; consider response.raise_for_status() upstream of the write.
    with open(download_file_path, 'wb') as f:
        f.write(requests.get(image_url, timeout=30).content)

def is_illegal(image_file_path, illegal_entities):
    """Return True if the classifier's top-1 label for the image is an illegal entity.

    Args:
        image_file_path: Path of the downloaded image to classify.
        illegal_entities: Collection of label strings considered illegal.
    """
    # NOTE(review): shell interpolation — image_file_path must not contain
    # untrusted characters; prefer subprocess.run([...], shell=False).
    stream = os.popen(f'. predict.sh {image_file_path}')
    # Each output line looks like 'something: label'; keep the label part.
    predictions = [line.split(':')[-1].strip() for line in stream.readlines()]
    # TODO: Add confidence threshold here
    # TODO: Can return tuple of (is_illegal, illegal_entity) for downstream processing
    if not predictions:  # classifier produced no output (e.g. script failure)
        return False
    return predictions[0] in illegal_entities  # Top 1 only


#####################################################
Expand All @@ -45,73 +57,65 @@ def get_carousell_search_url(search_term):
def is_initial_state(line):
    """Return True when *line* is the inline <script> tag embedding window.initialState."""
    prefix = '<script>window.initialState='
    return line.lstrip().startswith(prefix)

initial_state_start_index = len('<script>window.initialState=')
initial_state_end_index = -len('</script>')

def get_carousell_listing_url(collection_id, product_id):
    """Build the Carousell related-listing API URL for one product in one collection."""
    base = 'https://sg.carousell.com/api-service/related-listing/'
    query = f'collection_id={collection_id}&country_id=1880251&locale=en&product_id={product_id}'
    return f'{base}?{query}'

def is_product_img(img_url):
    """Return True for image URLs served from Carousell's product-photo CDN."""
    product_prefix = 'https://media.karousell.com/media/photos/products/'
    return img_url.startswith(product_prefix)

timeout = 10

# https://www.hackerearth.com/practice/notes/praveen97uma/crawling-a-website-that-loads-content-using-javascript-with-selenium-webdriver-in-python
def page_down(browser, page_downs):
    """Press PAGE_DOWN on the page body *page_downs* times, pausing between presses.

    Returns the same browser instance so calls can be chained/reassigned.
    """
    body = browser.find_element_by_tag_name('body')
    for _ in range(page_downs):
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(1)  # So that I can see that stuff really happened
    return browser

# NOT USING THIS FOR NOW.
# The website I'm crawling does not seem to react well to this scroll-down program. Perhaps it is
# too aggressive. Refactored from:
# https://stackoverflow.com/questions/22702277/crawl-site-that-has-infinite-scrolling-using-python
def scroll_down(browser):
    """Scroll the page down in fixed 200px steps until the document bottom is reached.

    The document height is re-read after every step so content appended by
    infinite scroll extends the walk. Refactored from:
    https://stackoverflow.com/questions/22702277/crawl-site-that-has-infinite-scrolling-using-python
    """
    per_scroll = 200
    max_height = browser.execute_script('return document.body.scrollHeight')
    new_height = per_scroll
    while max_height > new_height:
        browser.execute_script(f'window.scrollTo(0, {new_height})')
        time.sleep(1)  # So that I can see that stuff really happened
        max_height = browser.execute_script('return document.body.scrollHeight')
        new_height += per_scroll


#####################################################
################# CAROUSELL SPIDER ##################
#####################################################
# Flat crawl script: for each search term, load its Carousell results page,
# click "Load more" once, collect product image URLs, download each image,
# and run the classifier to flag illegal listings.
illegal_items = []
browser = webdriver.Chrome()

try:
    for search_term in search_terms:
        imgs_to_download = []
        try:
            browser.get(get_carousell_search_url(search_term))
            browser = page_down(browser, 4)  # Arbitrary
            is_element_present = EC.presence_of_element_located((By.XPATH, load_more_button_xpath))
            WebDriverWait(browser, wait_in_seconds).until(is_element_present)
            load_more_button = browser.find_element_by_xpath(load_more_button_xpath)
            load_more_button.click()
            time.sleep(wait_in_seconds)  # let the extra listings render
            browser = page_down(browser, 6)  # Arbitrary
            for img in browser.find_elements_by_tag_name('img'):
                src = img.get_attribute('src')
                # src can be None for lazy-loaded <img> tags; skip those
                if src and is_product_img(src):
                    imgs_to_download.append(src)
        except TimeoutException:
            print('Timed out.')
        print(imgs_to_download)  # Remove when not needed
        if imgs_to_download:
            # Create the target directory once per term, not once per image.
            Path(download_directory).mkdir(parents=True, exist_ok=True)
            for i, img_url in enumerate(imgs_to_download):
                download_file_path = f'{download_directory}/{i}.jpg'
                download_image(img_url, download_file_path)
                if is_illegal(download_file_path, search_terms[search_term]):
                    illegal_items.append(img_url)
            print(illegal_items)
finally:
    # Always release the Chrome process, even if the crawl crashes mid-loop.
    browser.quit()

print(illegal_items)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ pyOpenSSL==19.1.0
queuelib==1.5.0
requests==2.24.0
Scrapy==2.3.0
selenium==3.141.0
service-identity==18.1.0
six==1.15.0
Twisted==20.3.0
Expand Down

0 comments on commit 10b5347

Please sign in to comment.