From 9e5c917992142a6670201482ec3b56456e941275 Mon Sep 17 00:00:00 2001 From: Reese Date: Thu, 14 Nov 2024 10:23:04 -0500 Subject: [PATCH] Get the repo to pass flake8, resolve issue #7 and #9 --- README.md | 2 +- crawlerdetect/__init__.py | 4 +- crawlerdetect/__main__.py | 14 +----- crawlerdetect/src/crawlerdetect.py | 21 ++++++++ tests/fixtures/headers.json | 66 ++++++++++++++++++++++++++ tests/fixtures/user_agent/crawlers.txt | 3 +- tests/test_crawlerdetect.py | 36 +++++++------- 7 files changed, 113 insertions(+), 33 deletions(-) create mode 100644 tests/fixtures/headers.json diff --git a/README.md b/README.md index f12f1ee..b443cc4 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ ## About CrawlerDetect This is a Python wrapper for [CrawlerDetect](https://github.com/JayBizzle/Crawler-Detect) - the web crawler detection library -It helps to detect bots/crawlers/spiders via the user agent and other HTTP-headers. Currently able to detect > 1,000's of bots/spiders/crawlers. +It helps to detect bots/crawlers/spiders via the user agent and other HTTP-headers. Currently able to detect > 1,000's of bots/spiders/crawlers. ### Installation Run `pip install crawlerdetect` diff --git a/crawlerdetect/__init__.py b/crawlerdetect/__init__.py index da42330..06e64cc 100644 --- a/crawlerdetect/__init__.py +++ b/crawlerdetect/__init__.py @@ -3,6 +3,6 @@ """ from .src import providers -from .src.crawlerdetect import CrawlerDetect +from .src.crawlerdetect import CrawlerDetect, get_crawlerdetect_version -__all__ = ("CrawlerDetect", "providers") +__all__ = ("CrawlerDetect", "providers", "get_crawlerdetect_version") diff --git a/crawlerdetect/__main__.py b/crawlerdetect/__main__.py index 8915970..cbb0df1 100644 --- a/crawlerdetect/__main__.py +++ b/crawlerdetect/__main__.py @@ -1,18 +1,6 @@ -import configparser -import os import sys - -def get_crawlerdetect_version(): - config = configparser.ConfigParser() - - current_directory = os.path.dirname(os.path.abspath(__file__)) - parent_directory = os.path.abspath(os.path.join(current_directory, os.pardir)) - config_file_path = os.path.join(parent_directory, "setup.cfg") - - config.read(config_file_path) - - return config["crawlerdetect"]["version"] +from crawlerdetect import get_crawlerdetect_version if __name__ == "__main__": diff --git a/crawlerdetect/src/crawlerdetect.py b/crawlerdetect/src/crawlerdetect.py index 563bbe3..b9931d6 100644 --- a/crawlerdetect/src/crawlerdetect.py +++ b/crawlerdetect/src/crawlerdetect.py @@ -1,8 +1,25 @@ +import configparser +import os import re from .providers import Crawlers, Exclusions, Headers +def get_crawlerdetect_version(): + config = configparser.ConfigParser() + + current_directory = os.path.dirname(os.path.abspath(__file__)) + grandparent_directory = os.path.abspath(os.path.join(current_directory, os.pardir, os.pardir)) + config_file_path = os.path.join(grandparent_directory, "setup.cfg") + + config.read(config_file_path) + + return config["crawlerdetect"]["version"] + + +version = get_crawlerdetect_version() + + class CrawlerDetect(object): def __init__(self, headers=None, user_agent=""): self.crawlers = Crawlers() @@ -16,6 +33,10 @@ def __init__(self, headers=None, user_agent=""): self.setHttpHeaders(headers) self.setUserAgent(user_agent) + @property + def version(self): + return version + def setHttpHeaders(self, http_headers): self.httpHeaders = {} diff --git a/tests/fixtures/headers.json b/tests/fixtures/headers.json new file mode 100644 index 0000000..2713732 --- /dev/null +++ b/tests/fixtures/headers.json @@ -0,0 +1,66 @@ +{ + "test_current_visitor": { + "DOCUMENT_ROOT": "\/home\/test\/public_html", + "GATEWAY_INTERFACE": "CGI\/1.1", + "HTTP_ACCEPT": "*\/*", + "HTTP_ACCEPT_ENCODING": "gzip, deflate", + "HTTP_CACHE_CONTROL": "no-cache", + "HTTP_CONNECTION": "Keep-Alive", + "HTTP_FROM": "bingbot(at)microsoft.com", + "HTTP_HOST": "www.test.com", + "HTTP_PRAGMA": "no-cache", + "HTTP_USER_AGENT": "Mozilla\/5.0 (compatible; bingbot\/2.0; +http:\/\/www.bing.com\/bingbot.htm)", + "PATH": "\/bin:\/usr\/bin", + "QUERY_STRING": "order=closingDate", + "REDIRECT_STATUS": "200", + "REMOTE_ADDR": "127.0.0.1", + "REMOTE_PORT": "3360", + "REQUEST_METHOD": "GET", + "REQUEST_URI": "\/?test=testing", + "SCRIPT_FILENAME": "\/home\/test\/public_html\/index.php", + "SCRIPT_NAME": "\/index.php", + "SERVER_ADDR": "127.0.0.1", + "SERVER_ADMIN": "webmaster@test.com", + "SERVER_NAME": "www.test.com", + "SERVER_PORT": "80", + "SERVER_PROTOCOL": "HTTP\/1.1", + "SERVER_SIGNATURE": "", + "SERVER_SOFTWARE": "Apache", + "UNIQUE_ID": "Vx6MENRxerBUSDEQgFLAAAAAS", + "PHP_SELF": "\/index.php", + "REQUEST_TIME_FLOAT": 1461619728.0705, + "REQUEST_TIME": 1461619728 + }, + "test_http_from_header": { + "DOCUMENT_ROOT": "\/home\/test\/public_html", + "GATEWAY_INTERFACE": "CGI\/1.1", + "HTTP_ACCEPT": "*\/*", + "HTTP_ACCEPT_ENCODING": "gzip, deflate", + "HTTP_CACHE_CONTROL": "no-cache", + "HTTP_CONNECTION": "Keep-Alive", + "HTTP_FROM": "googlebot(at)googlebot.com", + "HTTP_HOST": "www.test.com", + "HTTP_PRAGMA": "no-cache", + "HTTP_USER_AGENT": "Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/28.0.1500.71 Safari\/537.36", + "PATH": "\/bin:\/usr\/bin", + "QUERY_STRING": "order=closingDate", + "REDIRECT_STATUS": "200", + "REMOTE_ADDR": "127.0.0.1", + "REMOTE_PORT": "3360", + "REQUEST_METHOD": "GET", + "REQUEST_URI": "\/?test=testing", + "SCRIPT_FILENAME": "\/home\/test\/public_html\/index.php", + "SCRIPT_NAME": "\/index.php", + "SERVER_ADDR": "127.0.0.1", + "SERVER_ADMIN": "webmaster@test.com", + "SERVER_NAME": "www.test.com", + "SERVER_PORT": "80", + "SERVER_PROTOCOL": "HTTP\/1.1", + "SERVER_SIGNATURE": "", + "SERVER_SOFTWARE": "Apache", + "UNIQUE_ID": "Vx6MENRxerBUSDEQgFLAAAAAS", + "PHP_SELF": "\/index.php", + "REQUEST_TIME_FLOAT": 1461619728.0705, + "REQUEST_TIME": 1461619728 + } +} \ No newline at end of file diff --git a/tests/fixtures/user_agent/crawlers.txt b/tests/fixtures/user_agent/crawlers.txt index 72a77f8..9e446cd 100644 --- a/tests/fixtures/user_agent/crawlers.txt +++ b/tests/fixtures/user_agent/crawlers.txt @@ -3675,4 +3675,5 @@ Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GoogleOther) Chro Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.5938.132 Mobile Safari/537.36 (compatible; GoogleOther) Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.94 Mobile Safari/537.36 (compatible; GoogleOther) Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots) -Mozilla/5.0 (compatible; YandexImages/3.0; +http://yandex.com/bots) \ No newline at end of file +Mozilla/5.0 (compatible; YandexImages/3.0; +http://yandex.com/bots) +Mozilla/5.0+(compatible; MonSpark/1.0; http://www.monspark.com/) diff --git a/tests/test_crawlerdetect.py b/tests/test_crawlerdetect.py index a1300d9..35a3f86 100644 --- a/tests/test_crawlerdetect.py +++ b/tests/test_crawlerdetect.py @@ -2,25 +2,29 @@ import os import re -from crawlerdetect import CrawlerDetect -from crawlerdetect import __main__ as main -from crawlerdetect import providers +from crawlerdetect import CrawlerDetect, get_crawlerdetect_version, providers from .base_case import CrawlerDetectTestCase +with open(os.path.join(os.path.dirname(__file__), "fixtures/headers.json")) as f: + test_headers = json.load(f) + + class CrawlerDetectTests(CrawlerDetectTestCase): def test_get_crawlerdetect_version(self): - version = main.get_crawlerdetect_version() + version = get_crawlerdetect_version() version_parts = version.split(".") self.assertEqual(len(version_parts), 3) self.assertTrue(version_parts[0].isdigit()) self.assertTrue(version_parts[1].isdigit()) def test_is_crawler(self): - res = self.cd.isCrawler( - "Mozilla/5.0 (iPhone; CPU iPhone OS 7_1 like Mac OS X) AppleWebKit (KHTML, like Gecko) Mobile (compatible; Yahoo Ad monitoring; https://help.yahoo.com/kb/yahoo-ad-monitoring-SLN24857.html)" + ua = ( + "Mozilla/5.0 (iPhone; CPU iPhone OS 7_1 like Mac OS X) AppleWebKit (KHTML, like Gecko) Mobile " + "(compatible; Yahoo Ad monitoring; https://help.yahoo.com/kb/yahoo-ad-monitoring-SLN24857.html)" ) + res = self.cd.isCrawler(ua) self.assertTrue(res) def test_user_agents_are_bots(self): @@ -56,9 +60,11 @@ def test_sec_ch_ua_are_devices(self): self.assertFalse(test, line) def test_it_returns_correct_matched_bot_name(self): - self.cd.isCrawler( - "Mozilla/5.0 (iPhone; CPU iPhone OS 7_1 like Mac OS X) AppleWebKit (KHTML, like Gecko) Mobile (compatible; Yahoo Ad monitoring; https://help.yahoo.com/kb/yahoo-ad-monitoring-SLN24857.html)" + ua = ( + "Mozilla/5.0 (iPhone; CPU iPhone OS 7_1 like Mac OS X) AppleWebKit (KHTML, like Gecko) " + "Mobile (compatible; Yahoo Ad monitoring; https://help.yahoo.com/kb/yahoo-ad-monitoring-SLN24857.html)" ) + self.cd.isCrawler(ua) matches = self.cd.getMatches() self.assertEqual(self.cd.getMatches(), "monitoring", matches) @@ -72,22 +78,20 @@ def test_empty_user_agent(self): self.assertFalse(test) def test_current_visitor(self): - headers = json.loads( - '{"DOCUMENT_ROOT":"\/home\/test\/public_html","GATEWAY_INTERFACE":"CGI\/1.1","HTTP_ACCEPT":"*\/*","HTTP_ACCEPT_ENCODING":"gzip, deflate","HTTP_CACHE_CONTROL":"no-cache","HTTP_CONNECTION":"Keep-Alive","HTTP_FROM":"bingbot(at)microsoft.com","HTTP_HOST":"www.test.com","HTTP_PRAGMA":"no-cache","HTTP_USER_AGENT":"Mozilla\/5.0 (compatible; bingbot\/2.0; +http:\/\/www.bing.com\/bingbot.htm)","PATH":"\/bin:\/usr\/bin","QUERY_STRING":"order=closingDate","REDIRECT_STATUS":"200","REMOTE_ADDR":"127.0.0.1","REMOTE_PORT":"3360","REQUEST_METHOD":"GET","REQUEST_URI":"\/?test=testing","SCRIPT_FILENAME":"\/home\/test\/public_html\/index.php","SCRIPT_NAME":"\/index.php","SERVER_ADDR":"127.0.0.1","SERVER_ADMIN":"webmaster@test.com","SERVER_NAME":"www.test.com","SERVER_PORT":"80","SERVER_PROTOCOL":"HTTP\/1.1","SERVER_SIGNATURE":"","SERVER_SOFTWARE":"Apache","UNIQUE_ID":"Vx6MENRxerBUSDEQgFLAAAAAS","PHP_SELF":"\/index.php","REQUEST_TIME_FLOAT":1461619728.0705,"REQUEST_TIME":1461619728}' - ) + headers = test_headers["test_current_visitor"] cd = CrawlerDetect(headers=headers) self.assertTrue(cd.isCrawler()) def test_user_agent_passed_via_contructor(self): - cd = CrawlerDetect( - user_agent="Mozilla/5.0 (iPhone; CPU iPhone OS 7_1 like Mac OS X) AppleWebKit (KHTML, like Gecko) Mobile (compatible; Yahoo Ad monitoring; https://help.yahoo.com/kb/yahoo-ad-monitoring-SLN24857.html)" + ua = ( + "Mozilla/5.0 (iPhone; CPU iPhone OS 7_1 like Mac OS X) AppleWebKit (KHTML, like Gecko) Mobile (compatible; " + "Yahoo Ad monitoring; https://help.yahoo.com/kb/yahoo-ad-monitoring-SLN24857.html)" ) + cd = CrawlerDetect(user_agent=ua) self.assertTrue(cd.isCrawler()) def test_http_from_header(self): - headers = json.loads( - '{"DOCUMENT_ROOT":"\/home\/test\/public_html","GATEWAY_INTERFACE":"CGI\/1.1","HTTP_ACCEPT":"*\/*","HTTP_ACCEPT_ENCODING":"gzip, deflate","HTTP_CACHE_CONTROL":"no-cache","HTTP_CONNECTION":"Keep-Alive","HTTP_FROM":"googlebot(at)googlebot.com","HTTP_HOST":"www.test.com","HTTP_PRAGMA":"no-cache","HTTP_USER_AGENT":"Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/28.0.1500.71 Safari\/537.36","PATH":"\/bin:\/usr\/bin","QUERY_STRING":"order=closingDate","REDIRECT_STATUS":"200","REMOTE_ADDR":"127.0.0.1","REMOTE_PORT":"3360","REQUEST_METHOD":"GET","REQUEST_URI":"\/?test=testing","SCRIPT_FILENAME":"\/home\/test\/public_html\/index.php","SCRIPT_NAME":"\/index.php","SERVER_ADDR":"127.0.0.1","SERVER_ADMIN":"webmaster@test.com","SERVER_NAME":"www.test.com","SERVER_PORT":"80","SERVER_PROTOCOL":"HTTP\/1.1","SERVER_SIGNATURE":"","SERVER_SOFTWARE":"Apache","UNIQUE_ID":"Vx6MENRxerBUSDEQgFLAAAAAS","PHP_SELF":"\/index.php","REQUEST_TIME_FLOAT":1461619728.0705,"REQUEST_TIME":1461619728}' - ) + headers = test_headers["test_http_from_header"] print(headers) cd = CrawlerDetect(headers=headers) self.assertTrue(cd.isCrawler())