
Merge pull request #29 from ecoron/0.10.1
0.10.1
Ronald Schmidt authored Jan 12, 2018
2 parents 1345c37 + 88d0d19 commit f11fc5d
Showing 9 changed files with 21 additions and 21 deletions.
2 changes: 1 addition & 1 deletion README.rst
@@ -114,7 +114,7 @@ Notes about major changes between releases
0.10.0
======

-* support for headless chrome
+* support for headless chrome, adjusted default time between scrapes

0.9.0
=====
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -60,7 +60,7 @@
# The short X.Y version.
version = '0.10'
# The full version, including alpha/beta/rc tags.
-release = '0.10.0'
+release = '0.10.1'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
2 changes: 1 addition & 1 deletion requirements.txt
@@ -4,5 +4,5 @@ beautifulsoup4==4.6.0
html2text==2017.10.4
PySocks==1.6.7
sqlalchemy==1.1.15
-selenium==3.8.0
+selenium==3.8.1
cssselect==1.0.1
4 changes: 2 additions & 2 deletions scrapcore/scraper/selenium.py
@@ -215,7 +215,7 @@ def _save_debug_screenshot(self):
)
)

-if self.config.get('chrome_headless') is True:
+if self.config.get('sel_browser') == 'chrome' and self.config.get('chrome_headless') is True:
self._enable_download_in_headless_chrome(self.webdriver, screendir)
try:
self.webdriver.get_screenshot_as_file(location)
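
The tightened guard above fires only when Chrome is the selected browser, so PhantomJS or Firefox runs no longer trigger the Chrome-specific download hook. For background, enabling downloads in headless Chrome in the Selenium 3.x era was commonly done by sending the DevTools Page.setDownloadBehavior command through the driver's command executor; a sketch of what a helper like _enable_download_in_headless_chrome might look like (an assumption for illustration, not necessarily the committed scrapcore code):

# Sketch only: the common Selenium 3.x workaround for headless Chrome,
# which blocked file downloads by default. The name mirrors the diff, but
# the body is an assumption, not the project's actual implementation.
def _enable_download_in_headless_chrome(driver, download_dir):
    # Register the Chromium-specific command on the remote executor.
    driver.command_executor._commands['send_command'] = (
        'POST', '/session/$sessionId/chromium/send_command'
    )
    params = {
        'cmd': 'Page.setDownloadBehavior',
        'params': {'behavior': 'allow', 'downloadPath': download_dir},
    }
    driver.execute('send_command', params)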
@@ -455,7 +455,7 @@ def _get_search_param_fields(self):
else:
return {}

-def _wait_until_search_input_field_appears(self, max_wait=5):
+def _wait_until_search_input_field_appears(self, max_wait=10):
"""Waits until the search input field can be located for the current search engine
Args:
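
Doubling max_wait from 5 to 10 seconds gives slow-loading result pages more time before the scraper gives up on locating the search box, complementing the longer sleep intervals in the config change below. A wait like this is typically built on Selenium's explicit-wait API; a minimal sketch, where the By.NAME 'q' locator for the search input is an assumption:

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def wait_until_search_input_field_appears(driver, max_wait=10):
    # Poll until the search box is visible, or give up after max_wait seconds.
    try:
        return WebDriverWait(driver, max_wait).until(
            EC.visibility_of_element_located((By.NAME, 'q'))
        )
    except TimeoutException:
        return False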
4 changes: 2 additions & 2 deletions serpscrap/config.py
@@ -40,8 +40,8 @@ class Config():
'log_level': 'INFO',
'num_workers': 1,
'num_results_per_page': 10,
-'sleeping_min': 5,
-'sleeping_max': 15,
+'sleeping_min': 20,
+'sleeping_max': 25,
'search_type': 'normal',
'google_search_url': 'https://www.google.com/search?',
'bing_search_url': 'http://www.bing.com/search?',
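
Raising the sleep window from 5-15 to 20-25 seconds is the "adjusted default time between scrapes" noted in the README change: requests are spaced further apart, lowering the risk of rate-limiting or captchas. A pair of bounds like this is typically consumed as a random pause between scrapes; a sketch under that assumption (the helper name is illustrative, not SerpScrap's actual code):

import random
import time

def sleep_between_scrapes(config):
    # Pick a random pause within the configured bounds and block for it.
    pause = random.randint(config['sleeping_min'], config['sleeping_max'])
    time.sleep(pause)

sleep_between_scrapes({'sleeping_min': 20, 'sleeping_max': 25})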
17 changes: 8 additions & 9 deletions serpscrap/phantom_install.py
@@ -4,6 +4,7 @@
import tarfile
import urllib.request
import zipfile
+import tempfile
from scrapcore.logger import Logger

logger = Logger()
@@ -27,19 +28,13 @@ def detect_phantomjs(self):
if 'windows' in this_os:
if os.path.isfile(self.home_dir + self.binary_win):
return self.home_dir + self.binary_win
-else:
-    return False
elif 'linux' in this_os:
if sys.maxsize > 2 ** 32:
if os.path.isfile(self.home_dir + self.binary_linux64):
return self.home_dir + self.binary_linux64
-else:
-    return False
else:
if os.path.isfile(self.home_dir + self.binary_linux32):
return self.home_dir + self.binary_linux32
-else:
-    return False
else:
raise Exception('''
Platform not supported.
@@ -65,9 +60,13 @@ def download(self):
Platform not supported.
install phantomjs manualy and update the path in your config
''')
-# Download the file from `url` and save it locally under `file_name`:
-urllib.request.urlretrieve(base_url + file_name, '/tmp/' + file_name)
-self.unpack('/tmp/' + file_name, archive)
+# Download the file from `url` and save it under `file_name`:
+tmp_dir = tempfile.gettempdir() + '/'
+try:
+    urllib.request.urlretrieve(base_url + file_name, tmp_dir + file_name)
+    self.unpack(tmp_dir + file_name, archive)
+except:
+    raise Exception('Download and unpack of phantomjs failed. Check if %(tmp_dir)s exists and has write permissions' % {'tmp_dir' : tmp_dir})

def unpack(self, file_path, archive):
logger.info('unpacking phantomjs')
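
Two fixes land here: the hardcoded /tmp/ path becomes tempfile.gettempdir(), which also resolves correctly on Windows, and the download/unpack pair is wrapped so a failure raises an error naming the directory to check. Joining paths with os.path.join instead of string concatenation would be the slightly more portable idiom; a sketch of that variant (the function name is illustrative, not the committed code):

import os
import tempfile
import urllib.request

def download_to_tmp(url, file_name):
    # Resolve the platform's temp directory and build the target path portably.
    target = os.path.join(tempfile.gettempdir(), file_name)
    try:
        urllib.request.urlretrieve(url, target)
    except OSError as err:
        # URLError and HTTPError both subclass OSError in Python 3.
        raise Exception('Download of %s failed: %s' % (file_name, err))
    return target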
6 changes: 3 additions & 3 deletions serpscrap/serpscrap.py
@@ -80,15 +80,15 @@ def init(self, config=None, keywords=None):
logger.info('preparing phantomjs')
firstrun = PhantomInstall()
phantomjs = firstrun.detect_phantomjs()
-if phantomjs is False:
+if phantomjs is None:
firstrun.download()
phantomjs = firstrun.detect_phantomjs()
-if phantomjs is False:
+if phantomjs is None:
raise Exception('''
phantomjs binary not found,
provide custom path in config''')
self.config.__setitem__('executable_path', phantomjs)
-logger.info('using ' + phantomjs)
+logger.info('using ' + str(phantomjs))

# cleanup screenshot dir on init
if os.path.exists(self.config['dir_screenshot']):
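
The `is False` checks change to `is None` because detect_phantomjs no longer has explicit `return False` branches: a Python function that falls off the end returns None implicitly. Wrapping the log argument in str() keeps the logger.info call from raising if the value is ever not a string. A minimal illustration of the implicit-None behaviour (a sketch, not the project's code):

import os

def detect(path):
    # Mirrors the simplified detect_phantomjs: return the path if the
    # binary exists; otherwise fall off the end and return None implicitly.
    if os.path.isfile(path):
        return path

assert detect('/no/such/phantomjs') is None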
4 changes: 2 additions & 2 deletions setup.py
@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
from setuptools import setup, find_packages

-version = '0.10.0'
+version = '0.10.1'


setup(
@@ -28,7 +28,7 @@
'html2text==2017.10.4',
'lxml==3.8.0',
'sqlalchemy==1.1.15',
-'selenium==3.8.0',
+'selenium==3.8.1',
'cssselect==1.0.1',
],
classifiers=[
1 change: 1 addition & 0 deletions tests/test_basic.py
@@ -47,6 +47,7 @@ def test_simple(self):
assert len(results) > 0
assert len(results[0]) > 0

+
def test_screenshot(self):
keywords = random.choice(self.keyword_list)
config = Config()
