
Commit

Merge pull request #33 from ecoron/0.10.2
0.10.2
Ronald Schmidt authored May 8, 2018
2 parents f11fc5d + 2d33a36 commit 2b3c0c4
Showing 10 changed files with 55 additions and 63 deletions.
4 changes: 2 additions & 2 deletions docs/conf.py
@@ -50,7 +50,7 @@

# General information about the project.
project = 'SerpScrap'
copyright = '2017, ecoron'
copyright = '2017-2018, ecoron'
author = 'ecoron'

# The version info for the project you're documenting, acts as replacement for
@@ -60,7 +60,7 @@
# The short X.Y version.
version = '0.10'
# The full version, including alpha/beta/rc tags.
release = '0.10.1'
release = '0.10.2'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
44 changes: 24 additions & 20 deletions docs/configuration.rst
@@ -15,23 +15,25 @@ Ensure the executing user has read/write permissions for this folder.
Default configuration
---------------------

* cachedir: '/tmp/.serpscrap/' - path cachefiles
* clean_cache_after: 24 - clean cached files older then x hours
* database_name: '/tmp/serpscrap' - path and name sqlite db (stores scrape results)
* do_caching: True - enable / disable caching
* headers: - dict to customize request header, see below
* num_pages_for_keyword: 2 - number of result pages to scrape
* num_results_per_page: 10 - number results per searchengine page
* proxy_file: '' - path to proxy file, see below
* scrape_urls: False - scrape urls of search results
* search_engines: ['google'] - search engines (google)
* url_threads: 3 - number of threads if scrape_urls is true
* use_own_ip: True - if using proxies set to False
* sleeping_min: 5 - min seconds to sleep between scrapes
* sleeping_max: 15 - max seconds to sleep between scrapes
* screenshot: True - enable screenshots for each query
* dir_screenshot: '/tmp/screenshots' - basedir for saved screenshots
* chrome_headless: True - run chrome in headless mode, default is True
* cachedir: '/tmp/.serpscrap/' - path for cache files
* chrome_headless: True - run Chrome in headless mode, default is True
* clean_cache_after: 24 - clean cached files older than x hours
* database_name: '/tmp/serpscrap' - path and name of the sqlite db (stores scrape results)
* dir_screenshot: '/tmp/screenshots' - base dir for saved screenshots
* do_caching: True - enable / disable caching
* executable_path: '/usr/local/bin/chromedriver' - path to chromedriver
* google_search_url: 'https://www.google.com/search?' - base search url, modify for other countries
* headers: - dict to customize request headers, see below
* num_pages_for_keyword: 2 - number of result pages to scrape
* num_results_per_page: 10 - number of results per search engine page
* proxy_file: '' - path to proxy file, see below
* scrape_urls: False - scrape urls of search results
* screenshot: True - enable screenshots for each query
* search_engines: ['google'] - search engines (google)
* sleeping_max: 15 - max seconds to sleep between scrapes
* sleeping_min: 5 - min seconds to sleep between scrapes
* url_threads: 3 - number of threads if scrape_urls is True
* use_own_ip: True - set to False when using proxies

Custom configuration
--------------------
@@ -48,7 +50,9 @@ Change some config params.
scrap = serpscrap.SerpScrap()
scrap.init(config=config.get(), keywords=keywords)
Using your own configuration
You can apply your own config dictionary. It is not required to provide every possible
config key: the values you provide overwrite the defaults, while the default values
are kept for any config keys you do not provide.

.. code-block:: python
@@ -61,10 +65,10 @@
'database_name': '/tmp/serpscrap',
'do_caching': True,
'num_pages_for_keyword': 2,
'proxy_file': '',
'scrape_urls': True,
'search_engines': ['google'],
'url_threads': 3,
'google_search_url': 'https://www.google.com/search?',
'executable_path': '/usr/local/bin/chromedriver',
}
config.apply(config_new)
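
Once applied, the merged configuration is handed to the scraper in the same way as in the snippet further up. A short sketch of the full flow (the sample keyword and the run() call follow the project's usual README-style usage and are assumptions here, not part of this commit):

import serpscrap

keywords = ['example']  # assumed sample keyword

config = serpscrap.Config()
config.apply(config_new)  # config_new as defined above

scrap = serpscrap.SerpScrap()
scrap.init(config=config.get(), keywords=keywords)
results = scrap.run()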
2 changes: 1 addition & 1 deletion docs/results.rst
@@ -10,7 +10,7 @@ If you prefer to save the results use the as_csv() method.
{
'query': 'example',
'query_num_results total': 'Ungefähr 1.740.000.000 Ergebnisse (0,50 '
'query_num_results_total': 'Ungefähr 1.740.000.000 Ergebnisse (0,50 '
'Sekunden)\xa0',
'query_num_results_page': 10,
'query_page_number': 1,
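
With the key renamed in this release, code reading the result dicts should use the underscored name. A tiny sketch (assuming results is the list returned by scrap.run()):

for result in results:
    print(result['query'], result['query_num_results_total'])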
2 changes: 1 addition & 1 deletion install_chrome.sh
@@ -6,7 +6,7 @@

# Versions
CHROME_DRIVER_VERSION=`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`
SELENIUM_STANDALONE_VERSION=3.4.0
SELENIUM_STANDALONE_VERSION=3.11.0
SELENIUM_SUBDIR=$(echo "$SELENIUM_STANDALONE_VERSION" | cut -d"." -f-2)

# Remove existing downloads and binaries so we can start from scratch.
12 changes: 6 additions & 6 deletions requirements.txt
@@ -1,8 +1,8 @@
lxml==3.8.0
lxml==4.2.1
chardet==3.0.4
beautifulsoup4==4.6.0
html2text==2017.10.4
PySocks==1.6.7
sqlalchemy==1.1.15
selenium==3.8.1
cssselect==1.0.1
html2text==2018.1.9
PySocks==1.6.8
sqlalchemy==1.2.7
selenium==3.11.0
cssselect==1.0.3
28 changes: 10 additions & 18 deletions scrapcore/scraper/selenium.py
@@ -576,7 +576,7 @@ def wait_until_serp_loaded(self):
if self.search_type == 'normal':

if self.search_engine_name == 'google':
selector = '#navcnt td.cur'
selector = '#resultStats'
elif self.search_engine_name == 'yandex':
selector = '.pager__item_current_yes font font'
elif self.search_engine_name == 'bing':
@@ -591,31 +591,22 @@ def wait_until_serp_loaded(self):
elif self.search_engine_name == 'ask':
selector = '#paging .pgcsel .pg'

# content = None
try:
time.sleep(1)
WebDriverWait(self.webdriver, 5).until(
EC.text_to_be_present_in_element(
(By.CSS_SELECTOR, selector),
str(self.page_number)
)
)
except TimeoutException:
WebDriverWait(self.webdriver, 5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, selector)))
except NoSuchElementException:
logger.error('No such element. Seeing if title matches before raising SeleniumSearchError')
self._save_debug_screenshot()
try:
self.webdriver.find_element_by_css_selector(selector).text
except NoSuchElementException:
logger.error('Skip it, no such element - SeleniumSearchError')
self.wait_until_title_contains_keyword()
except TimeoutException:
self.quit()
raise SeleniumSearchError('Stop Scraping, seems we are blocked')
except Exception:
except Exception as e:
logger.error('Scrape Exception pass. Selector: ' + str(selector))
logger.error('Error: ' + str(e))
self._save_debug_screenshot()
pass

elif self.search_type == 'image':
self.wait_until_title_contains_keyword()

else:
self.wait_until_title_contains_keyword()
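
The changes above swap the page-number text check for a plain visibility wait on Google's #resultStats element. In isolation, that explicit-wait pattern looks roughly like the following sketch (assuming Selenium 3.x and a chromedriver binary on PATH; this is not the scraper's actual setup code):

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()
driver.get('https://www.google.com/search?q=example')

try:
    # Wait up to 5 seconds for the result-count element to become visible.
    WebDriverWait(driver, 5).until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, '#resultStats'))
    )
except TimeoutException:
    # In selenium.py a timeout here is treated as a possible block.
    print('SERP did not load in time')
finally:
    driver.quit()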

@@ -716,7 +707,8 @@ def search(self):

# Click the next page link, but not when leaving the loop
# in the next iteration.
if self.page_number in self.pages_per_keyword:
if self.page_number + 1 in self.pages_per_keyword:
logger.info('Requesting the next page')
next_url = self._goto_next_page()
self.requested_at = datetime.datetime.utcnow()
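
The added + 1 avoids one surplus request: the next-page link is only clicked when that page is actually part of the scrape plan. A small illustration with hypothetical values (assuming page_number is the page that was just scraped):

pages_per_keyword = [1, 2]   # pages to scrape for this keyword (hypothetical)
page_number = 2              # the page that was just scraped

old_check = page_number in pages_per_keyword        # True  -> would request page 3 needlessly
new_check = page_number + 1 in pages_per_keyword    # False -> stop after the last wanted page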

4 changes: 0 additions & 4 deletions scrapcore/user_agent.py
@@ -39,7 +39,6 @@
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0',
@@ -48,15 +47,12 @@
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0.1 Safari/604.3.5',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0.1 Safari/604.3.5',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0',
'Mozilla/5.0 (Windows NT 6.1; rv:57.0) Gecko/20100101 Firefox/57.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; rv:56.0) Gecko/20100101 Firefox/56.0',
6 changes: 3 additions & 3 deletions serpscrap/config.py
@@ -88,10 +88,10 @@ def set(self, key, value):
self.config.__setitem__(key, value)

def apply(self, config):
"""apply an individual conig
"""apply an individual config, replace default config
by values of new config
Args:
config (dict): new configuration
"""

self.config = config
self.config.update(config)
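
The switch from assignment to dict.update() is what makes the merge behaviour described in the docstring work: keys missing from the new config keep their default values. A minimal standalone sketch of that semantics (illustrative stand-in values, not SerpScrap's real defaults):

defaults = {'do_caching': True, 'num_pages_for_keyword': 2, 'scrape_urls': False}
new_config = {'scrape_urls': True}

# Old behaviour (plain assignment): unspecified defaults are lost.
replaced = dict(new_config)
assert 'do_caching' not in replaced

# New behaviour (update): new values win, missing keys keep their defaults.
merged = dict(defaults)
merged.update(new_config)
assert merged == {'do_caching': True, 'num_pages_for_keyword': 2, 'scrape_urls': True}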
2 changes: 1 addition & 1 deletion serpscrap/serpscrap.py
@@ -150,7 +150,7 @@ def scrap_serps(self):
})
for link in serp.links:
self.results.append({
'query_num_results total': serp.num_results_for_query,
'query_num_results_total': serp.num_results_for_query,
'query_num_results_page': serp.num_results,
'query_page_number': serp.page_number,
'query': serp.query,
14 changes: 7 additions & 7 deletions setup.py
@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
from setuptools import setup, find_packages

version = '0.10.1'
version = '0.10.2'


setup(
@@ -22,14 +22,14 @@
license='MIT',
packages=find_packages(),
install_requires=[
'PySocks==1.6.7',
'PySocks==1.6.8',
'chardet==3.0.4',
'beautifulsoup4==4.6.0',
'html2text==2017.10.4',
'lxml==3.8.0',
'sqlalchemy==1.1.15',
'selenium==3.8.1',
'cssselect==1.0.1',
'html2text==2018.1.9',
'lxml==4.2.1',
'sqlalchemy==1.2.7',
'selenium==3.11.0',
'cssselect==1.0.3',
],
classifiers=[
'Development Status :: 4 - Beta',
