Check robots.txt file before crawling
Mews committed Jun 19, 2024
1 parent 003c30b commit 448e2ec
Showing 1 changed file with 64 additions and 19 deletions.
83 changes: 64 additions & 19 deletions src/tiny_web_crawler/core/spider.py
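For context, a minimal usage sketch of the respect_robots_txt option introduced by this commit. The constructor arguments (root_url, max_links) are inferred from fields referenced elsewhere in this diff, so the exact Spider signature is an assumption, not part of the commit itself:

# Usage sketch (assumed Spider signature, inferred from dataclass fields
# referenced in this diff; not part of the commit).
from tiny_web_crawler.core.spider import Spider

# robots.txt rules are respected by default after this change
spider = Spider(root_url="https://example.com", max_links=25)
results = spider.start()

# Opting out triggers the warning added in __post_init__
spider = Spider(
    root_url="https://example.com",
    max_links=25,
    respect_robots_txt=False,
)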
@@ -8,10 +8,13 @@
from typing import Dict, List, Optional, Set, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
import urllib.parse
import urllib.robotparser
import requests

from tiny_web_crawler.networking.fetcher import fetch_url
from tiny_web_crawler.networking.validator import is_valid_url
from tiny_web_crawler.networking.formatter import format_url
from tiny_web_crawler.networking.robots_txt import is_robots_txt_allowed, setup_robots_txt_parser, get_robots_txt_url
from tiny_web_crawler.logging import get_logger, set_logging_level, INFO, DEBUG

DEFAULT_SCHEME: str = 'http://'
@@ -51,11 +54,14 @@ class Spider:
include_body: bool = False
internal_links_only: bool = False
external_links_only: bool = False
respect_robots_txt: bool = True

def __post_init__(self) -> None:
self.scheme = DEFAULT_SCHEME
self.scheme: str = DEFAULT_SCHEME

self.root_netloc = urllib.parse.urlparse(self.root_url).netloc
self.robots: Dict[str, urllib.robotparser.RobotFileParser] = {}

self.root_netloc: str = urllib.parse.urlparse(self.root_url).netloc

if self.internal_links_only and self.external_links_only:
raise ValueError("Only one of internal_links_only and external_links_only can be set to True")
@@ -65,6 +71,15 @@ def __post_init__(self) -> None:
else:
set_logging_level(INFO)

if not self.respect_robots_txt:
logger.warning(
"Ignoring robots.txt files! You might be at risk of:\n"+
"Agent/IP bans;\n"+
"Disrupted operation;\n"+
"Increased suspicion from anti-bot services;\n"+
"Potential legal action;"
)

def save_results(self) -> None:
"""
Saves the crawl results into a JSON file.
@@ -88,6 +103,10 @@ def crawl(self, url: str) -> None:
logger.debug("URL already crawled: %s", url)
return

if self.respect_robots_txt and not self._handle_robots_txt(url):
logger.debug("Skipped: Url doesn't allow crawling: %s", url)
return

logger.debug("Crawling: %s", url)
soup = fetch_url(url)
if not soup:
@@ -101,24 +120,8 @@

for link in links:
pretty_url = format_url(link['href'].lstrip(), url, self.scheme)
if not is_valid_url(pretty_url):
logger.debug("Invalid url: %s", pretty_url)
continue

if pretty_url in self.crawl_result[url]['urls']:
continue

if self.url_regex:
if not re.compile(self.url_regex).match(pretty_url):
logger.debug("Skipping: URL didn't match regex: %s", pretty_url)
continue

if self.internal_links_only and self.root_netloc != urllib.parse.urlparse(pretty_url).netloc:
logger.debug("Skipping: External link: %s", pretty_url)
continue

if self.external_links_only and self.root_netloc == urllib.parse.urlparse(pretty_url).netloc:
logger.debug("Skipping: Internal link: %s", pretty_url)
if self._should_skip_link(pretty_url, url):
continue

self.crawl_result[url]['urls'].append(pretty_url)
@@ -129,6 +132,48 @@ def crawl(self, url: str) -> None:
self.link_count += 1
logger.debug("Links crawled: %s", self.link_count)

def _should_skip_link(self, pretty_url: str, url: str) -> bool:
if not is_valid_url(pretty_url):
logger.debug("Invalid url: %s", pretty_url)
return True

if pretty_url in self.crawl_result[url]['urls']:
return True

if self.url_regex and not re.compile(self.url_regex).match(pretty_url):
logger.debug("Skipping: URL didn't match regex: %s", pretty_url)
return True

if self.internal_links_only and self.root_netloc != urllib.parse.urlparse(pretty_url).netloc:
logger.debug("Skipping: External link: %s", pretty_url)
return True

if self.external_links_only and self.root_netloc == urllib.parse.urlparse(pretty_url).netloc:
logger.debug("Skipping: Internal link: %s", pretty_url)
return True

return False

def _handle_robots_txt(self, url: str) -> bool:
user_agent = requests.utils.default_user_agent()
robots_url = get_robots_txt_url(url)

if robots_url in self.robots:
robot_parser = self.robots[robots_url]
else:
robot_parser = setup_robots_txt_parser(robots_url)

self.robots[robots_url] = robot_parser

if not is_robots_txt_allowed(url, robot_parser):
return False

crawl_delay = robot_parser.crawl_delay(user_agent)
if crawl_delay is not None:
time.sleep(float(crawl_delay))

return True

def start(self) -> Dict[str, Dict[str, List[str]]]:
"""
Starts the crawling process from the root URL. Crawls up to max_links URLs.

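The helpers imported from tiny_web_crawler.networking.robots_txt (get_robots_txt_url, setup_robots_txt_parser, is_robots_txt_allowed) are not shown in this diff. A minimal sketch of what such a module could look like, built on urllib.robotparser and matching the way _handle_robots_txt calls these functions, is below; it is an assumption for illustration, not the project's actual implementation:

# Hypothetical sketch of tiny_web_crawler/networking/robots_txt.py, inferred
# from how _handle_robots_txt calls these helpers; the real module may differ.
import urllib.parse
import urllib.robotparser
from typing import Optional

import requests


def get_robots_txt_url(url: str) -> str:
    # Build the site-root robots.txt URL for the given page URL.
    parts = urllib.parse.urlparse(url)
    return f"{parts.scheme}://{parts.netloc}/robots.txt"


def setup_robots_txt_parser(robots_url: str) -> urllib.robotparser.RobotFileParser:
    # Create a parser bound to robots_url and fetch the rules it contains.
    parser = urllib.robotparser.RobotFileParser(robots_url)
    try:
        parser.read()
    except OSError:
        # Network failure: an unparsed parser conservatively reports every
        # URL as disallowed until a later read() succeeds.
        pass
    return parser


def is_robots_txt_allowed(
    url: str,
    parser: Optional[urllib.robotparser.RobotFileParser] = None,
) -> bool:
    # Check whether the default requests user agent may fetch this URL.
    if parser is None:
        parser = setup_robots_txt_parser(get_robots_txt_url(url))
    return parser.can_fetch(requests.utils.default_user_agent(), url)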