Check robots.txt file before crawling
Mews committed Jun 19, 2024
1 parent 003c30b commit 448e2ec
Showing 1 changed file with 64 additions and 19 deletions.
83 changes: 64 additions & 19 deletions src/tiny_web_crawler/core/spider.py
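For context, a minimal usage sketch of the respect_robots_txt option introduced by this commit. The constructor arguments (root_url, max_links) are inferred from fields referenced elsewhere in this diff, so the exact Spider signature is an assumption, not part of the commit itself:

# Usage sketch (assumed Spider signature, inferred from dataclass fields
# referenced in this diff; not part of the commit).
from tiny_web_crawler.core.spider import Spider

# robots.txt rules are respected by default after this change
spider = Spider(root_url="https://example.com", max_links=25)
results = spider.start()

# Opting out triggers the warning added in __post_init__
spider = Spider(
    root_url="https://example.com",
    max_links=25,
    respect_robots_txt=False,
)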
@@ -8,10 +8,13 @@
from typing import Dict, List, Optional, Set, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
import urllib.parse
import urllib.robotparser
import requests

from tiny_web_crawler.networking.fetcher import fetch_url
from tiny_web_crawler.networking.validator import is_valid_url
from tiny_web_crawler.networking.formatter import format_url
from tiny_web_crawler.networking.robots_txt import is_robots_txt_allowed, setup_robots_txt_parser, get_robots_txt_url
from tiny_web_crawler.logging import get_logger, set_logging_level, INFO, DEBUG

DEFAULT_SCHEME: str = 'http://'
@@ -51,11 +54,14 @@ class Spider:
include_body: bool = False
internal_links_only: bool = False
external_links_only: bool = False
respect_robots_txt: bool = True

def __post_init__(self) -> None:
self.scheme = DEFAULT_SCHEME
self.scheme: str = DEFAULT_SCHEME

self.root_netloc = urllib.parse.urlparse(self.root_url).netloc
self.robots: Dict[str, urllib.robotparser.RobotFileParser] = {}

self.root_netloc: str = urllib.parse.urlparse(self.root_url).netloc

if self.internal_links_only and self.external_links_only:
raise ValueError("Only one of internal_links_only and external_links_only can be set to True")
@@ -65,6 +71,15 @@ def __post_init__(self) -> None:
else:
set_logging_level(INFO)

if not self.respect_robots_txt:
logger.warning(
"Ignoring robots.txt files! You might be at risk of:\n"+
"Agent/IP bans;\n"+
"Disrupted operation;\n"+
"Increased suspicion from anti-bot services;\n"+
"Potential legal action;"
)

def save_results(self) -> None:
"""
Saves the crawl results into a JSON file.
@@ -88,6 +103,10 @@ def crawl(self, url: str) -> None:
logger.debug("URL already crawled: %s", url)
return

if self.respect_robots_txt and not self._handle_robots_txt(url):
logger.debug("Skipped: Url doesn't allow crawling: %s", url)
return

logger.debug("Crawling: %s", url)
soup = fetch_url(url)
if not soup:
@@ -101,24 +120,8 @@

for link in links:
pretty_url = format_url(link['href'].lstrip(), url, self.scheme)
if not is_valid_url(pretty_url):
logger.debug("Invalid url: %s", pretty_url)
continue

if pretty_url in self.crawl_result[url]['urls']:
continue

if self.url_regex:
if not re.compile(self.url_regex).match(pretty_url):
logger.debug("Skipping: URL didn't match regex: %s", pretty_url)
continue

if self.internal_links_only and self.root_netloc != urllib.parse.urlparse(pretty_url).netloc:
logger.debug("Skipping: External link: %s", pretty_url)
continue

if self.external_links_only and self.root_netloc == urllib.parse.urlparse(pretty_url).netloc:
logger.debug("Skipping: Internal link: %s", pretty_url)
if self._should_skip_link(pretty_url, url):
continue

self.crawl_result[url]['urls'].append(pretty_url)
@@ -129,6 +132,48 @@ def crawl(self, url: str) -> None:
self.link_count += 1
logger.debug("Links crawled: %s", self.link_count)

def _should_skip_link(self, pretty_url: str, url: str) -> bool:
if not is_valid_url(pretty_url):
logger.debug("Invalid url: %s", pretty_url)
return True

if pretty_url in self.crawl_result[url]['urls']:
return True

if self.url_regex and not re.compile(self.url_regex).match(pretty_url):
logger.debug("Skipping: URL didn't match regex: %s", pretty_url)
return True

if self.internal_links_only and self.root_netloc != urllib.parse.urlparse(pretty_url).netloc:
logger.debug("Skipping: External link: %s", pretty_url)
return True

if self.external_links_only and self.root_netloc == urllib.parse.urlparse(pretty_url).netloc:
logger.debug("Skipping: Internal link: %s", pretty_url)
return True

return False

def _handle_robots_txt(self, url: str) -> bool:
user_agent = requests.utils.default_user_agent()
robots_url = get_robots_txt_url(url)

if robots_url in self.robots:
robot_parser = self.robots[robots_url]
else:
robot_parser = setup_robots_txt_parser(robots_url)

self.robots[robots_url] = robot_parser

if not is_robots_txt_allowed(url, robot_parser):
return False

crawl_delay = robot_parser.crawl_delay(user_agent)
if crawl_delay is not None:
time.sleep(float(crawl_delay))

return True

def start(self) -> Dict[str, Dict[str, List[str]]]:
"""
Starts the crawling process from the root URL. Crawls up to max_links URLs.

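The helpers imported from tiny_web_crawler.networking.robots_txt (get_robots_txt_url, setup_robots_txt_parser, is_robots_txt_allowed) are not shown in this diff. A minimal sketch of what such a module could look like, built on urllib.robotparser and matching the way _handle_robots_txt calls these functions, is below; it is an assumption for illustration, not the project's actual implementation:

# Hypothetical sketch of tiny_web_crawler/networking/robots_txt.py, inferred
# from how _handle_robots_txt calls these helpers; the real module may differ.
import urllib.parse
import urllib.robotparser
from typing import Optional

import requests


def get_robots_txt_url(url: str) -> str:
    # Build the site-root robots.txt URL for the given page URL.
    parts = urllib.parse.urlparse(url)
    return f"{parts.scheme}://{parts.netloc}/robots.txt"


def setup_robots_txt_parser(robots_url: str) -> urllib.robotparser.RobotFileParser:
    # Create a parser bound to robots_url and fetch the rules it contains.
    parser = urllib.robotparser.RobotFileParser(robots_url)
    try:
        parser.read()
    except OSError:
        # Network failure: an unparsed parser conservatively reports every
        # URL as disallowed until a later read() succeeds.
        pass
    return parser


def is_robots_txt_allowed(
    url: str,
    parser: Optional[urllib.robotparser.RobotFileParser] = None,
) -> bool:
    # Check whether the default requests user agent may fetch this URL.
    if parser is None:
        parser = setup_robots_txt_parser(get_robots_txt_url(url))
    return parser.can_fetch(requests.utils.default_user_agent(), url)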