
Commit

Merge pull request #19 from ecoron/0.9.1
0.9.1
Ronald Schmidt authored Sep 14, 2017
2 parents a73a7be + caf8cc3 commit 963a7e1
Showing 9 changed files with 119 additions and 40 deletions.
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -60,7 +60,7 @@
# The short X.Y version.
version = '0.9'
# The full version, including alpha/beta/rc tags.
release = '0.9.0'
release = '0.9.1'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
5 changes: 4 additions & 1 deletion docs/configuration.rst
@@ -96,7 +96,10 @@ don't customize this setting, the default is used.
Proxy file
----------
You can provide a list of proxys which should used for scraping the search engines.
This feature is not stable in versions <= 0.9.1 if you use more than one worker
and more than one proxy in your file.
You can provide a list of proxies which should be used for scraping the search engines.
For this you have to create a proxy_file and set the path to the file in the configuration.
The proxy_file should look like this:
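A minimal sketch of wiring a proxy file into the config (the file format shown in the comment is an assumption, not part of this change):

.. code-block:: python

    import serpscrap

    # hypothetical proxy_file at /tmp/proxy_file.txt; assumed format is one
    # proxy per line, e.g. "http 111.222.33.44:8080"
    config = serpscrap.Config()
    config.set('proxy_file', '/tmp/proxy_file.txt')

    scrap = serpscrap.SerpScrap()
    scrap.init(config=config.get(), keywords=['berlin'])
    results = scrap.run()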
31 changes: 31 additions & 0 deletions docs/examples.rst
@@ -59,6 +59,35 @@ custom path to the binary.
if 'serp_title' in result and len(result['serp_title']) > 1:
print(result['serp_title'])
Using Chrome
------------

.. code-block:: bash
python examples\example_chrome.py
It is possible to use Chrome, but we recommend PhantomJS, which is installed by default.
To use Chrome you need to download the latest `chromedriver`_ and set the executable_path.

.. code-block:: python
import pprint
import serpscrap
keywords = ['berlin']
config = serpscrap.Config()
config.set('sel_browser', 'chrome')
config.set('executable_path', '/tmp/chromedriver_win32/chromedriver.exe')
scrap = serpscrap.SerpScrap()
scrap.init(config=config.get(), keywords=keywords)
results = scrap.run()
for result in results:
pprint.pprint(result)
print()
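The executable_path above is a Windows-style example; on Linux or macOS it would point at the unpacked chromedriver binary instead (illustrative path, not from this change):

.. code-block:: python

    config.set('executable_path', '/usr/local/bin/chromedriver')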
Image search
------------
@@ -157,3 +186,5 @@ References
.. _`examples`: https://github.com/ecoron/SerpScrap/tree/master/examples
.. _`example_simple.py`: https://github.com/ecoron/SerpScrap/blob/master/examples/example_simple.py
.. _`example_related.py`: https://github.com/ecoron/SerpScrap/blob/master/examples/example_related.py
.. _`chromedriver`: https://sites.google.com/a/chromium.org/chromedriver/downloads
18 changes: 18 additions & 0 deletions examples/example_chrome.py
@@ -0,0 +1,18 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import pprint
import serpscrap

keywords = ['berlin']

config = serpscrap.Config()
config.set('sel_browser', 'chrome')
config.set('executable_path', '/tmp/chromedriver_win32/chromedriver.exe')

scrap = serpscrap.SerpScrap()
scrap.init(config=config.get(), keywords=keywords)
results = scrap.run()

for result in results:
pprint.pprint(result)
print()
11 changes: 6 additions & 5 deletions scrapcore/core.py
@@ -4,6 +4,7 @@
import queue
import threading

from random import shuffle
from scrapcore.cachemanager import CacheManager
from scrapcore.database import ScraperSearch
from scrapcore.database import get_session, fixtures
@@ -78,6 +79,7 @@ def main(self, return_results=False, config=None):

if not proxies:
raise Exception('''No proxies available. Turning down.''')
shuffle(proxies)

# get a scoped sqlalchemy session
session_cls = get_session(config, scoped=True)
@@ -122,10 +124,10 @@ def main(self, return_results=False, config=None):
self.logger.info('''
Going to scrape {num_keywords} keywords with {num_proxies}
proxies by using {num_threads} threads.'''.format(
num_keywords=len(list(scrape_jobs)),
num_proxies=len(proxies),
num_threads=num_search_engines)
)
num_keywords=len(list(scrape_jobs)),
num_proxies=len(proxies),
num_threads=num_search_engines)
)

progress_thread = None

@@ -139,7 +141,6 @@ def main(self, return_results=False, config=None):
for search_engine in search_engines:

for proxy in proxies:

for worker in range(num_workers):
num_worker += 1
workers.put(
81 changes: 52 additions & 29 deletions scrapcore/scraper/selenium.py
@@ -60,7 +60,6 @@ class SelScrape(SearchEngineScrape, threading.Thread):
'yahoo': '.compPagination .next',
'baidu': '.n',
'ask': '#paging div a.txt3.l_nu',
'blekko': '',
'duckduckgo': '',
'googleimg': '#pnnext',
'baiduimg': '.n',
@@ -74,7 +73,6 @@ class SelScrape(SearchEngineScrape, threading.Thread):
'baidu': (By.NAME, 'wd'),
'duckduckgo': (By.NAME, 'q'),
'ask': (By.NAME, 'q'),
'blekko': (By.NAME, 'q'),
'google': (By.NAME, 'q'),
'googleimg': (By.NAME, 'as_q'),
'baiduimg': (By.NAME, 'word'),
@@ -102,7 +100,6 @@ class SelScrape(SearchEngineScrape, threading.Thread):
'baidu': 'http://baidu.com/',
'duckduckgo': 'https://duckduckgo.com/',
'ask': 'http://ask.com/',
'blekko': 'http://blekko.com/',
}

image_search_locations = {
@@ -113,7 +110,6 @@ class SelScrape(SearchEngineScrape, threading.Thread):
'baidu': 'http://image.baidu.com/',
'duckduckgo': None, # duckduckgo doesnt't support direct image search
'ask': 'http://www.ask.com/pictures/',
'blekko': None,
'googleimg': 'https://www.google.com/advanced_image_search',
'baiduimg': 'http://image.baidu.com/',
}
@@ -168,6 +164,7 @@ def proxy_check(self, proxy):

try:
self.webdriver.get(self.config.get('proxy_info_url'))
time.sleep(2)
try:
text = re.search(
r'(\{.*?\})',
@@ -211,7 +208,10 @@ def _save_debug_screenshot(self):
str(self.page_number),
)
)
self.webdriver.get_screenshot_as_file(location)
try:
self.webdriver.get_screenshot_as_file(location)
except (ConnectionError, ConnectionRefusedError, ConnectionResetError) as err:
logger.error(err)

def _set_xvfb_display(self):
# TODO: should we check the format of the config?
@@ -237,6 +237,7 @@ def _get_webdriver(self):

def _get_Chrome(self):
try:
chrome_ops = webdriver.ChromeOptions()
if self.proxy:
chrome_ops = webdriver.ChromeOptions()
chrome_ops.add_argument(
@@ -247,13 +248,28 @@ def _get_Chrome(self):
)
)
self.webdriver = webdriver.Chrome(
executable_path=self.config['executebale_path'],
executable_path=self.config['executable_path'],
chrome_options=chrome_ops
)
else:
self.webdriver = webdriver.Chrome(
executable_path=self.config['executable_path']

chrome_ops.add_argument('--no-sandbox')
chrome_ops.add_argument('--start-maximized')
chrome_ops.add_argument(
'--window-position={},{}'.format(
randint(10, 30),
randint(10, 30)
)
)
chrome_ops.add_argument(
'--window-size={},{}'.format(
randint(800, 1024),
randint(600, 900)
)
)
self.webdriver = webdriver.Chrome(
executable_path=self.config['executable_path'],
chrome_options=chrome_ops
)
return True
except WebDriverException:
raise
@@ -326,12 +342,16 @@ def _get_PhantomJS(self):
logger.info('useragent: {}'.format(useragent))
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = useragent
self.webdriver = webdriver.PhantomJS(
executable_path=self.config['executable_path'],
service_args=service_args,
desired_capabilities=dcap
)
return True
try:
self.webdriver = webdriver.PhantomJS(
executable_path=self.config['executable_path'],
service_args=service_args,
desired_capabilities=dcap
)
return True
except (ConnectionError, ConnectionRefusedError, ConnectionResetError) as err:
logger.error(err)
return False
except WebDriverException as e:
logger.error(e)
return False
@@ -472,7 +492,7 @@ def _goto_next_page(self):
element.click()
except WebDriverException:
# See http://stackoverflow.com/questions/11908249/debugging-element-is-not-clickable-at-point-error
# first move mouse to the next element, some times the element is not visibility, like blekko.com
# first move mouse to the next element, some times the element is not visibility
selector = self.next_page_selectors[self.search_engine_name]
if selector:
try:
@@ -550,7 +570,7 @@ def wait_until_serp_loaded(self):
elif self.search_engine_name == 'ask':
selector = '#paging .pgcsel .pg'

content = None
# content = None
try:
time.sleep(1)
WebDriverWait(self.webdriver, 5).until(
Expand All @@ -562,7 +582,7 @@ def wait_until_serp_loaded(self):
except TimeoutException:
self._save_debug_screenshot()
try:
content = self.webdriver.find_element_by_css_selector(selector).text
self.webdriver.find_element_by_css_selector(selector).text
except NoSuchElementException:
logger.error('Skipp it, no such element - SeleniumSearchError')
raise SeleniumSearchError('Stop Scraping, seems we are blocked')
@@ -614,7 +634,9 @@ def search(self):
if self.search_param_fields:
wait_res = self._wait_until_search_param_fields_appears()
if wait_res is False:
self.quit()
raise Exception('Waiting search param input fields time exceeds')

for param, field in self.search_param_fields.items():
if field[0] == By.ID:
js_tpl = '''
Expand All @@ -635,7 +657,11 @@ def search(self):
self.search_input.send_keys(self.query + Keys.ENTER)
except ElementNotVisibleException:
time.sleep(2)
self.search_input.send_keys(self.query + Keys.ENTER)
try:
self.search_input.send_keys(self.query + Keys.ENTER)
except Exception:
logger.error('send keys not possible, maybe page cannot loaded')
self.quit()
except Exception:
logger.error('send keys not possible')
pass
@@ -656,6 +682,8 @@ def search(self):
self._save_debug_screenshot()
time.sleep(.5)
self.html = self.webdriver.execute_script('return document.body.innerHTML;')
except (ConnectionError, ConnectionRefusedError, ConnectionResetError) as err:
logger.error(err)
except WebDriverException:
self.html = self.webdriver.page_source

@@ -707,8 +735,11 @@ def run(self):
self.build_search()
self.search()

if self.webdriver:
self.webdriver.quit()
self.quit()

def quit(self):
if self.webdriver:
self.webdriver.quit()


"""
@@ -754,14 +785,6 @@ def wait_until_serp_loaded(self):
super()._wait_until_search_input_field_appears()


class BlekkoSelScrape(SelScrape):
def __init__(self, *args, **kwargs):
SelScrape.__init__(self, *args, **kwargs)

def _goto_next_page(self):
pass


class AskSelScrape(SelScrape):
def __init__(self, *args, **kwargs):
SelScrape.__init__(self, *args, **kwargs)
7 changes: 5 additions & 2 deletions scrapcore/scraping.py
@@ -308,5 +308,8 @@ def update_proxy_status(self, status, ipinfo=None, online=True):
proxy.status = status
proxy.online = online

self.session.add(proxy)
self.session.commit()
try:
self.session.merge(proxy, load=True)
self.session.commit()
except:
pass
2 changes: 1 addition & 1 deletion serpscrap/config.py
@@ -53,7 +53,7 @@ class Config():
},
'proxy_file': '',
'proxy_check_url': 'http://canihazip.com/s',
'proxy_info_url': 'http://ipinfo.io/json',
'proxy_info_url': 'https://ipinfo.io/json',
'stop_on_detection': True,
'today': datetime.datetime.strftime(
datetime.datetime.utcnow(),
2 changes: 1 addition & 1 deletion setup.py
@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
from setuptools import setup, find_packages

version = '0.9.0'
version = '0.9.1'


setup(
