From 9e5c917992142a6670201482ec3b56456e941275 Mon Sep 17 00:00:00 2001
From: Reese <mrreeseallison@gmail.com>
Date: Thu, 14 Nov 2024 10:23:04 -0500
Subject: [PATCH] Get the repo to pass flake8, resolve issues #7 and #9

---
 README.md                              |  2 +-
 crawlerdetect/__init__.py              |  4 +-
 crawlerdetect/__main__.py              | 14 +-----
 crawlerdetect/src/crawlerdetect.py     | 21 ++++++++
 tests/fixtures/headers.json            | 66 ++++++++++++++++++++++++++
 tests/fixtures/user_agent/crawlers.txt |  3 +-
 tests/test_crawlerdetect.py            | 36 +++++++-------
 7 files changed, 113 insertions(+), 33 deletions(-)
 create mode 100644 tests/fixtures/headers.json

diff --git a/README.md b/README.md
index f12f1ee..b443cc4 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 ## About CrawlerDetect
 
 This is a Python wrapper for [CrawlerDetect](https://github.com/JayBizzle/Crawler-Detect) - the web crawler detection library
-It helps to detect  bots/crawlers/spiders via the user agent and other HTTP-headers. Currently able to detect > 1,000's of bots/spiders/crawlers.
+It helps to detect bots/crawlers/spiders via the user agent and other HTTP headers. It can currently detect thousands of bots/spiders/crawlers.
 
 ### Installation
 Run `pip install crawlerdetect`
diff --git a/crawlerdetect/__init__.py b/crawlerdetect/__init__.py
index da42330..06e64cc 100644
--- a/crawlerdetect/__init__.py
+++ b/crawlerdetect/__init__.py
@@ -3,6 +3,6 @@
 """
 
 from .src import providers
-from .src.crawlerdetect import CrawlerDetect
+from .src.crawlerdetect import CrawlerDetect, get_crawlerdetect_version
 
-__all__ = ("CrawlerDetect", "providers")
+__all__ = ("CrawlerDetect", "providers", "get_crawlerdetect_version")
diff --git a/crawlerdetect/__main__.py b/crawlerdetect/__main__.py
index 8915970..cbb0df1 100644
--- a/crawlerdetect/__main__.py
+++ b/crawlerdetect/__main__.py
@@ -1,18 +1,6 @@
-import configparser
-import os
 import sys
 
-
-def get_crawlerdetect_version():
-    config = configparser.ConfigParser()
-
-    current_directory = os.path.dirname(os.path.abspath(__file__))
-    parent_directory = os.path.abspath(os.path.join(current_directory, os.pardir))
-    config_file_path = os.path.join(parent_directory, "setup.cfg")
-
-    config.read(config_file_path)
-
-    return config["crawlerdetect"]["version"]
+from crawlerdetect import get_crawlerdetect_version
 
 
 if __name__ == "__main__":
diff --git a/crawlerdetect/src/crawlerdetect.py b/crawlerdetect/src/crawlerdetect.py
index 563bbe3..b9931d6 100644
--- a/crawlerdetect/src/crawlerdetect.py
+++ b/crawlerdetect/src/crawlerdetect.py
@@ -1,8 +1,25 @@
+import configparser
+import os
 import re
 
 from .providers import Crawlers, Exclusions, Headers
 
 
+def get_crawlerdetect_version():
+    """Return the version from setup.cfg's [crawlerdetect] section, or None when it cannot be read."""
+    config = configparser.ConfigParser()
+    current_directory = os.path.dirname(os.path.abspath(__file__))
+    grandparent_directory = os.path.abspath(os.path.join(current_directory, os.pardir, os.pardir))
+    config_file_path = os.path.join(grandparent_directory, "setup.cfg")
+    # setup.cfg may be absent (e.g. next to an installed package); read() silently skips missing files,
+    # so fall back to None instead of letting a KeyError escape while this module is being imported.
+    config.read(config_file_path)
+    return config.get("crawlerdetect", "version", fallback=None)
+
+
+version = get_crawlerdetect_version()
+
+
 class CrawlerDetect(object):
     def __init__(self, headers=None, user_agent=""):
         self.crawlers = Crawlers()
@@ -16,6 +33,10 @@ def __init__(self, headers=None, user_agent=""):
         self.setHttpHeaders(headers)
         self.setUserAgent(user_agent)
 
+    @property
+    def version(self):
+        return version
+
     def setHttpHeaders(self, http_headers):
         self.httpHeaders = {}
 
diff --git a/tests/fixtures/headers.json b/tests/fixtures/headers.json
new file mode 100644
index 0000000..2713732
--- /dev/null
+++ b/tests/fixtures/headers.json
@@ -0,0 +1,66 @@
+{
+    "test_current_visitor": {
+        "DOCUMENT_ROOT": "\/home\/test\/public_html",
+        "GATEWAY_INTERFACE": "CGI\/1.1",
+        "HTTP_ACCEPT": "*\/*",
+        "HTTP_ACCEPT_ENCODING": "gzip, deflate",
+        "HTTP_CACHE_CONTROL": "no-cache",
+        "HTTP_CONNECTION": "Keep-Alive",
+        "HTTP_FROM": "bingbot(at)microsoft.com",
+        "HTTP_HOST": "www.test.com",
+        "HTTP_PRAGMA": "no-cache",
+        "HTTP_USER_AGENT": "Mozilla\/5.0 (compatible; bingbot\/2.0; +http:\/\/www.bing.com\/bingbot.htm)",
+        "PATH": "\/bin:\/usr\/bin",
+        "QUERY_STRING": "order=closingDate",
+        "REDIRECT_STATUS": "200",
+        "REMOTE_ADDR": "127.0.0.1",
+        "REMOTE_PORT": "3360",
+        "REQUEST_METHOD": "GET",
+        "REQUEST_URI": "\/?test=testing",
+        "SCRIPT_FILENAME": "\/home\/test\/public_html\/index.php",
+        "SCRIPT_NAME": "\/index.php",
+        "SERVER_ADDR": "127.0.0.1",
+        "SERVER_ADMIN": "webmaster@test.com",
+        "SERVER_NAME": "www.test.com",
+        "SERVER_PORT": "80",
+        "SERVER_PROTOCOL": "HTTP\/1.1",
+        "SERVER_SIGNATURE": "",
+        "SERVER_SOFTWARE": "Apache",
+        "UNIQUE_ID": "Vx6MENRxerBUSDEQgFLAAAAAS",
+        "PHP_SELF": "\/index.php",
+        "REQUEST_TIME_FLOAT": 1461619728.0705,
+        "REQUEST_TIME": 1461619728
+    },
+    "test_http_from_header": {
+        "DOCUMENT_ROOT": "\/home\/test\/public_html",
+        "GATEWAY_INTERFACE": "CGI\/1.1",
+        "HTTP_ACCEPT": "*\/*",
+        "HTTP_ACCEPT_ENCODING": "gzip, deflate",
+        "HTTP_CACHE_CONTROL": "no-cache",
+        "HTTP_CONNECTION": "Keep-Alive",
+        "HTTP_FROM": "googlebot(at)googlebot.com",
+        "HTTP_HOST": "www.test.com",
+        "HTTP_PRAGMA": "no-cache",
+        "HTTP_USER_AGENT": "Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/28.0.1500.71 Safari\/537.36",
+        "PATH": "\/bin:\/usr\/bin",
+        "QUERY_STRING": "order=closingDate",
+        "REDIRECT_STATUS": "200",
+        "REMOTE_ADDR": "127.0.0.1",
+        "REMOTE_PORT": "3360",
+        "REQUEST_METHOD": "GET",
+        "REQUEST_URI": "\/?test=testing",
+        "SCRIPT_FILENAME": "\/home\/test\/public_html\/index.php",
+        "SCRIPT_NAME": "\/index.php",
+        "SERVER_ADDR": "127.0.0.1",
+        "SERVER_ADMIN": "webmaster@test.com",
+        "SERVER_NAME": "www.test.com",
+        "SERVER_PORT": "80",
+        "SERVER_PROTOCOL": "HTTP\/1.1",
+        "SERVER_SIGNATURE": "",
+        "SERVER_SOFTWARE": "Apache",
+        "UNIQUE_ID": "Vx6MENRxerBUSDEQgFLAAAAAS",
+        "PHP_SELF": "\/index.php",
+        "REQUEST_TIME_FLOAT": 1461619728.0705,
+        "REQUEST_TIME": 1461619728
+    }
+}
\ No newline at end of file
diff --git a/tests/fixtures/user_agent/crawlers.txt b/tests/fixtures/user_agent/crawlers.txt
index 72a77f8..9e446cd 100644
--- a/tests/fixtures/user_agent/crawlers.txt
+++ b/tests/fixtures/user_agent/crawlers.txt
@@ -3675,4 +3675,5 @@ Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GoogleOther) Chro
 Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.5938.132 Mobile Safari/537.36 (compatible; GoogleOther)
 Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.94 Mobile Safari/537.36 (compatible; GoogleOther)
 Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)
-Mozilla/5.0 (compatible; YandexImages/3.0; +http://yandex.com/bots)
\ No newline at end of file
+Mozilla/5.0 (compatible; YandexImages/3.0; +http://yandex.com/bots)
+Mozilla/5.0+(compatible; MonSpark/1.0; http://www.monspark.com/)
diff --git a/tests/test_crawlerdetect.py b/tests/test_crawlerdetect.py
index a1300d9..35a3f86 100644
--- a/tests/test_crawlerdetect.py
+++ b/tests/test_crawlerdetect.py
@@ -2,25 +2,29 @@
 import os
 import re
 
-from crawlerdetect import CrawlerDetect
-from crawlerdetect import __main__ as main
-from crawlerdetect import providers
+from crawlerdetect import CrawlerDetect, get_crawlerdetect_version, providers
 
 from .base_case import CrawlerDetectTestCase
 
 
+with open(os.path.join(os.path.dirname(__file__), "fixtures/headers.json"), encoding="utf-8") as f:
+    test_headers = json.load(f)  # header fixtures keyed by test name; UTF-8 pinned so the locale default is not used
+
+
 class CrawlerDetectTests(CrawlerDetectTestCase):
     def test_get_crawlerdetect_version(self):
-        version = main.get_crawlerdetect_version()
+        version = get_crawlerdetect_version()
         version_parts = version.split(".")
         self.assertEqual(len(version_parts), 3)
         self.assertTrue(version_parts[0].isdigit())
         self.assertTrue(version_parts[1].isdigit())
 
     def test_is_crawler(self):
-        res = self.cd.isCrawler(
-            "Mozilla/5.0 (iPhone; CPU iPhone OS 7_1 like Mac OS X) AppleWebKit (KHTML, like Gecko) Mobile (compatible; Yahoo Ad monitoring; https://help.yahoo.com/kb/yahoo-ad-monitoring-SLN24857.html)"
+        ua = (
+            "Mozilla/5.0 (iPhone; CPU iPhone OS 7_1 like Mac OS X) AppleWebKit (KHTML, like Gecko) Mobile "
+            "(compatible; Yahoo Ad monitoring; https://help.yahoo.com/kb/yahoo-ad-monitoring-SLN24857.html)"
         )
+        res = self.cd.isCrawler(ua)
         self.assertTrue(res)
 
     def test_user_agents_are_bots(self):
@@ -56,9 +60,11 @@ def test_sec_ch_ua_are_devices(self):
                 self.assertFalse(test, line)
 
     def test_it_returns_correct_matched_bot_name(self):
-        self.cd.isCrawler(
-            "Mozilla/5.0 (iPhone; CPU iPhone OS 7_1 like Mac OS X) AppleWebKit (KHTML, like Gecko) Mobile (compatible; Yahoo Ad monitoring; https://help.yahoo.com/kb/yahoo-ad-monitoring-SLN24857.html)"
+        ua = (
+            "Mozilla/5.0 (iPhone; CPU iPhone OS 7_1 like Mac OS X) AppleWebKit (KHTML, like Gecko) "
+            "Mobile (compatible; Yahoo Ad monitoring; https://help.yahoo.com/kb/yahoo-ad-monitoring-SLN24857.html)"
         )
+        self.cd.isCrawler(ua)
         matches = self.cd.getMatches()
         self.assertEqual(self.cd.getMatches(), "monitoring", matches)
 
@@ -72,22 +78,20 @@ def test_empty_user_agent(self):
         self.assertFalse(test)
 
     def test_current_visitor(self):
-        headers = json.loads(
-            '{"DOCUMENT_ROOT":"\/home\/test\/public_html","GATEWAY_INTERFACE":"CGI\/1.1","HTTP_ACCEPT":"*\/*","HTTP_ACCEPT_ENCODING":"gzip, deflate","HTTP_CACHE_CONTROL":"no-cache","HTTP_CONNECTION":"Keep-Alive","HTTP_FROM":"bingbot(at)microsoft.com","HTTP_HOST":"www.test.com","HTTP_PRAGMA":"no-cache","HTTP_USER_AGENT":"Mozilla\/5.0 (compatible; bingbot\/2.0; +http:\/\/www.bing.com\/bingbot.htm)","PATH":"\/bin:\/usr\/bin","QUERY_STRING":"order=closingDate","REDIRECT_STATUS":"200","REMOTE_ADDR":"127.0.0.1","REMOTE_PORT":"3360","REQUEST_METHOD":"GET","REQUEST_URI":"\/?test=testing","SCRIPT_FILENAME":"\/home\/test\/public_html\/index.php","SCRIPT_NAME":"\/index.php","SERVER_ADDR":"127.0.0.1","SERVER_ADMIN":"webmaster@test.com","SERVER_NAME":"www.test.com","SERVER_PORT":"80","SERVER_PROTOCOL":"HTTP\/1.1","SERVER_SIGNATURE":"","SERVER_SOFTWARE":"Apache","UNIQUE_ID":"Vx6MENRxerBUSDEQgFLAAAAAS","PHP_SELF":"\/index.php","REQUEST_TIME_FLOAT":1461619728.0705,"REQUEST_TIME":1461619728}'
-        )
+        headers = test_headers["test_current_visitor"]
         cd = CrawlerDetect(headers=headers)
         self.assertTrue(cd.isCrawler())
 
     def test_user_agent_passed_via_contructor(self):
-        cd = CrawlerDetect(
-            user_agent="Mozilla/5.0 (iPhone; CPU iPhone OS 7_1 like Mac OS X) AppleWebKit (KHTML, like Gecko) Mobile (compatible; Yahoo Ad monitoring; https://help.yahoo.com/kb/yahoo-ad-monitoring-SLN24857.html)"
+        ua = (
+            "Mozilla/5.0 (iPhone; CPU iPhone OS 7_1 like Mac OS X) AppleWebKit (KHTML, like Gecko) Mobile (compatible; "
+            "Yahoo Ad monitoring; https://help.yahoo.com/kb/yahoo-ad-monitoring-SLN24857.html)"
         )
+        cd = CrawlerDetect(user_agent=ua)
         self.assertTrue(cd.isCrawler())
 
     def test_http_from_header(self):
-        headers = json.loads(
-            '{"DOCUMENT_ROOT":"\/home\/test\/public_html","GATEWAY_INTERFACE":"CGI\/1.1","HTTP_ACCEPT":"*\/*","HTTP_ACCEPT_ENCODING":"gzip, deflate","HTTP_CACHE_CONTROL":"no-cache","HTTP_CONNECTION":"Keep-Alive","HTTP_FROM":"googlebot(at)googlebot.com","HTTP_HOST":"www.test.com","HTTP_PRAGMA":"no-cache","HTTP_USER_AGENT":"Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/28.0.1500.71 Safari\/537.36","PATH":"\/bin:\/usr\/bin","QUERY_STRING":"order=closingDate","REDIRECT_STATUS":"200","REMOTE_ADDR":"127.0.0.1","REMOTE_PORT":"3360","REQUEST_METHOD":"GET","REQUEST_URI":"\/?test=testing","SCRIPT_FILENAME":"\/home\/test\/public_html\/index.php","SCRIPT_NAME":"\/index.php","SERVER_ADDR":"127.0.0.1","SERVER_ADMIN":"webmaster@test.com","SERVER_NAME":"www.test.com","SERVER_PORT":"80","SERVER_PROTOCOL":"HTTP\/1.1","SERVER_SIGNATURE":"","SERVER_SOFTWARE":"Apache","UNIQUE_ID":"Vx6MENRxerBUSDEQgFLAAAAAS","PHP_SELF":"\/index.php","REQUEST_TIME_FLOAT":1461619728.0705,"REQUEST_TIME":1461619728}'
-        )
+        headers = test_headers["test_http_from_header"]
         print(headers)
         cd = CrawlerDetect(headers=headers)
         self.assertTrue(cd.isCrawler())