
Merge pull request #29 from ecoron/0.10.1
0.10.1
Ronald Schmidt authored Jan 12, 2018
2 parents 1345c37 + 88d0d19 commit f11fc5d
Showing 9 changed files with 21 additions and 21 deletions.
2 changes: 1 addition & 1 deletion README.rst
@@ -114,7 +114,7 @@ Notes about major changes between releases
0.10.0
======

-* support for headless chrome
+* support for headless chrome, adjusted default time between scrapes

0.9.0
=====
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -60,7 +60,7 @@
# The short X.Y version.
version = '0.10'
# The full version, including alpha/beta/rc tags.
-release = '0.10.0'
+release = '0.10.1'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
2 changes: 1 addition & 1 deletion requirements.txt
@@ -4,5 +4,5 @@ beautifulsoup4==4.6.0
html2text==2017.10.4
PySocks==1.6.7
sqlalchemy==1.1.15
-selenium==3.8.0
+selenium==3.8.1
cssselect==1.0.1
4 changes: 2 additions & 2 deletions scrapcore/scraper/selenium.py
@@ -215,7 +215,7 @@ def _save_debug_screenshot(self):
)
)

-if self.config.get('chrome_headless') is True:
+if self.config.get('sel_browser') == 'chrome' and self.config.get('chrome_headless') is True:
self._enable_download_in_headless_chrome(self.webdriver, screendir)
try:
self.webdriver.get_screenshot_as_file(location)
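
The tightened guard above fires only when Chrome is the selected browser, so PhantomJS or Firefox runs no longer trigger the Chrome-specific download hook. For background, enabling downloads in headless Chrome in the Selenium 3.x era was commonly done by sending the DevTools Page.setDownloadBehavior command through the driver's command executor; a sketch of what a helper like _enable_download_in_headless_chrome might look like (an assumption for illustration, not necessarily the committed scrapcore code):

# Sketch only: the common Selenium 3.x workaround for headless Chrome,
# which blocked file downloads by default. The name mirrors the diff, but
# the body is an assumption, not the project's actual implementation.
def _enable_download_in_headless_chrome(driver, download_dir):
    # Register the Chromium-specific command on the remote executor.
    driver.command_executor._commands['send_command'] = (
        'POST', '/session/$sessionId/chromium/send_command'
    )
    params = {
        'cmd': 'Page.setDownloadBehavior',
        'params': {'behavior': 'allow', 'downloadPath': download_dir},
    }
    driver.execute('send_command', params)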
@@ -455,7 +455,7 @@ def _get_search_param_fields(self):
else:
return {}

-def _wait_until_search_input_field_appears(self, max_wait=5):
+def _wait_until_search_input_field_appears(self, max_wait=10):
"""Waits until the search input field can be located for the current search engine
Args:
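
Doubling max_wait from 5 to 10 seconds gives slow-loading result pages more time before the scraper gives up on locating the search box, complementing the longer sleep intervals in the config change below. A wait like this is typically built on Selenium's explicit-wait API; a minimal sketch, where the By.NAME 'q' locator for the search input is an assumption:

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def wait_until_search_input_field_appears(driver, max_wait=10):
    # Poll until the search box is visible, or give up after max_wait seconds.
    try:
        return WebDriverWait(driver, max_wait).until(
            EC.visibility_of_element_located((By.NAME, 'q'))
        )
    except TimeoutException:
        return False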
4 changes: 2 additions & 2 deletions serpscrap/config.py
@@ -40,8 +40,8 @@ class Config():
'log_level': 'INFO',
'num_workers': 1,
'num_results_per_page': 10,
-'sleeping_min': 5,
-'sleeping_max': 15,
+'sleeping_min': 20,
+'sleeping_max': 25,
'search_type': 'normal',
'google_search_url': 'https://www.google.com/search?',
'bing_search_url': 'http://www.bing.com/search?',
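
Raising the sleep window from 5-15 to 20-25 seconds is the "adjusted default time between scrapes" noted in the README change: requests are spaced further apart, lowering the risk of rate-limiting or captchas. A pair of bounds like this is typically consumed as a random pause between scrapes; a sketch under that assumption (the helper name is illustrative, not SerpScrap's actual code):

import random
import time

def sleep_between_scrapes(config):
    # Pick a random pause within the configured bounds and block for it.
    pause = random.randint(config['sleeping_min'], config['sleeping_max'])
    time.sleep(pause)

sleep_between_scrapes({'sleeping_min': 20, 'sleeping_max': 25})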
17 changes: 8 additions & 9 deletions serpscrap/phantom_install.py
@@ -4,6 +4,7 @@
import tarfile
import urllib.request
import zipfile
+import tempfile
from scrapcore.logger import Logger

logger = Logger()
@@ -27,19 +28,13 @@ def detect_phantomjs(self):
if 'windows' in this_os:
if os.path.isfile(self.home_dir + self.binary_win):
return self.home_dir + self.binary_win
-else:
-    return False
elif 'linux' in this_os:
if sys.maxsize > 2 ** 32:
if os.path.isfile(self.home_dir + self.binary_linux64):
return self.home_dir + self.binary_linux64
-else:
-    return False
else:
if os.path.isfile(self.home_dir + self.binary_linux32):
return self.home_dir + self.binary_linux32
-else:
-    return False
else:
raise Exception('''
Platform not supported.
@@ -65,9 +60,13 @@ def download(self):
Platform not supported.
install phantomjs manualy and update the path in your config
''')
-# Download the file from `url` and save it locally under `file_name`:
-urllib.request.urlretrieve(base_url + file_name, '/tmp/' + file_name)
-self.unpack('/tmp/' + file_name, archive)
+# Download the file from `url` and save it under `file_name`:
+tmp_dir = tempfile.gettempdir() + '/'
+try:
+    urllib.request.urlretrieve(base_url + file_name, tmp_dir + file_name)
+    self.unpack(tmp_dir + file_name, archive)
+except:
+    raise Exception('Download and unpack of phantomjs failed. Check if %(tmp_dir)s exists and has write permissions' % {'tmp_dir' : tmp_dir})

def unpack(self, file_path, archive):
logger.info('unpacking phantomjs')
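
Two fixes land here: the hardcoded /tmp/ path becomes tempfile.gettempdir(), which also resolves correctly on Windows, and the download/unpack pair is wrapped so a failure raises an error naming the directory to check. Joining paths with os.path.join instead of string concatenation would be the slightly more portable idiom; a sketch of that variant (the function name is illustrative, not the committed code):

import os
import tempfile
import urllib.request

def download_to_tmp(url, file_name):
    # Resolve the platform's temp directory and build the target path portably.
    target = os.path.join(tempfile.gettempdir(), file_name)
    try:
        urllib.request.urlretrieve(url, target)
    except OSError as err:
        # URLError and HTTPError both subclass OSError in Python 3.
        raise Exception('Download of %s failed: %s' % (file_name, err))
    return target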
6 changes: 3 additions & 3 deletions serpscrap/serpscrap.py
@@ -80,15 +80,15 @@ def init(self, config=None, keywords=None):
logger.info('preparing phantomjs')
firstrun = PhantomInstall()
phantomjs = firstrun.detect_phantomjs()
-if phantomjs is False:
+if phantomjs is None:
firstrun.download()
phantomjs = firstrun.detect_phantomjs()
-if phantomjs is False:
+if phantomjs is None:
raise Exception('''
phantomjs binary not found,
provide custom path in config''')
self.config.__setitem__('executable_path', phantomjs)
-logger.info('using ' + phantomjs)
+logger.info('using ' + str(phantomjs))

# cleanup screenshot dir on init
if os.path.exists(self.config['dir_screenshot']):
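
The `is False` checks change to `is None` because detect_phantomjs no longer has explicit `return False` branches: a Python function that falls off the end returns None implicitly. Wrapping the log argument in str() keeps the logger.info call from raising if the value is ever not a string. A minimal illustration of the implicit-None behaviour (a sketch, not the project's code):

import os

def detect(path):
    # Mirrors the simplified detect_phantomjs: return the path if the
    # binary exists; otherwise fall off the end and return None implicitly.
    if os.path.isfile(path):
        return path

assert detect('/no/such/phantomjs') is None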
4 changes: 2 additions & 2 deletions setup.py
@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
from setuptools import setup, find_packages

-version = '0.10.0'
+version = '0.10.1'


setup(
@@ -28,7 +28,7 @@
'html2text==2017.10.4',
'lxml==3.8.0',
'sqlalchemy==1.1.15',
-'selenium==3.8.0',
+'selenium==3.8.1',
'cssselect==1.0.1',
],
classifiers=[
1 change: 1 addition & 0 deletions tests/test_basic.py
@@ -47,6 +47,7 @@ def test_simple(self):
assert len(results) > 0
assert len(results[0]) > 0

+
def test_screenshot(self):
keywords = random.choice(self.keyword_list)
config = Config()
