diff --git a/tests/networking/test_robots_txt.py b/tests/networking/test_robots_txt.py
new file mode 100644
index 0000000..aa122f0
--- /dev/null
+++ b/tests/networking/test_robots_txt.py
@@ -0,0 +1,83 @@
+from unittest.mock import patch, MagicMock
+from io import BytesIO
+import urllib.robotparser
+
+from tiny_web_crawler.networking.robots_txt import get_robots_txt_url, is_robots_txt_allowed, setup_robots_txt_parser
+
+def test_get_robots_txt_url() -> None:
+    assert (
+        get_robots_txt_url("http://example")
+        == "http://example/robots.txt"
+    )
+
+    assert (
+        get_robots_txt_url("http://example/path")
+        == "http://example/robots.txt"
+    )
+
+    assert (
+        get_robots_txt_url("https://example/")
+        == "https://example/robots.txt"
+    )
+
+    assert (
+        get_robots_txt_url("http://example/path1/path2/path3/path4")
+        == "http://example/robots.txt"
+    )
+
+    assert (
+        get_robots_txt_url("http://example/path1/path2/path3/path4")
+        == "http://example/robots.txt"
+    )
+
+    assert (
+        get_robots_txt_url("http://example/path#fragment")
+        == "http://example/robots.txt"
+    )
+
+    assert (
+        get_robots_txt_url("http://example/path?query=test")
+        == "http://example/robots.txt"
+    )
+
+
+
+@patch('urllib.request.urlopen')
+def test_is_robots_txt_allowed_true(mock_urlopen: MagicMock) -> None:
+    # Mock the response content of robots.txt
+    mock_response = b"User-agent: *\nAllow: /"
+    mock_urlopen.return_value = BytesIO(mock_response)
+
+    assert is_robots_txt_allowed("http://example.com")
+
+
+@patch('urllib.request.urlopen')
+def test_is_robots_txt_allowed_false(mock_urlopen: MagicMock) -> None:
+    # Mock the response content of robots.txt
+    mock_response = b"User-agent: *\nDisallow: /"
+    mock_urlopen.return_value = BytesIO(mock_response)
+
+    assert not is_robots_txt_allowed("http://example.com")
+
+
+@patch('urllib.request.urlopen')
+def test_is_robots_txt_allowed_mixed(mock_urlopen: MagicMock) -> None:
+    # Mock the response content of robots.txt
+    mock_response = b"User-agent: *\nDisallow: /private"
+
+    mock_urlopen.return_value = BytesIO(mock_response)
+    assert is_robots_txt_allowed("http://example.com")
+
+    mock_urlopen.return_value = BytesIO(mock_response)
+    assert not is_robots_txt_allowed("http://example.com/private")
+
+
+def test_is_robots_txt_allowed_no_robots_txt() -> None:
+    # Check that websites with no robots.txt are set as crawlable
+    assert is_robots_txt_allowed("http://example.com")
+
+
+def test_setup_robots_txt_parser() -> None:
+    robot_parser = setup_robots_txt_parser("http://example.com")
+
+    assert isinstance(robot_parser, urllib.robotparser.RobotFileParser)
diff --git a/tests/test_crawler.py b/tests/test_crawler.py
index 54d1828..4b4777b 100644
--- a/tests/test_crawler.py
+++ b/tests/test_crawler.py
@@ -1,11 +1,13 @@
+from io import BytesIO
 from unittest.mock import MagicMock, mock_open, patch
+import urllib.error
 
 import responses
 import pytest
 
 from tiny_web_crawler.core.spider import Spider
-from tiny_web_crawler.logging import DEBUG
+from tiny_web_crawler.logging import DEBUG, WARNING
 from tests.utils import setup_mock_response
 
 
 @responses.activate
@@ -269,3 +271,160 @@ def test_start_with_save_to_file(
     ]
 
     mock_save_results.assert_called_once()
+
+
+@responses.activate
+@patch('urllib.request.urlopen')
+def test_respect_robots_txt(mock_urlopen, caplog) -> None: # type: ignore
+    setup_mock_response(
+        url="http://crawlable.com",
+        body="<a href='http://notcrawlable.com'>link</a>",
+        status=200
+    )
+    setup_mock_response(
+        url="http://notcrawlable.com",
+        body="<a href='http://crawlable.com'>link</a>",
+        status=200
+    )
+
+    mock_urlopen.side_effect = lambda url: (
+        BytesIO(b"User-agent: *\nAllow: /") if url == "http://crawlable.com/robots.txt" else
+        BytesIO(b"User-agent: *\nDisallow: /") if url == "http://notcrawlable.com/robots.txt" else
+        urllib.error.URLError(f"No mock for {url}"))
+
+    spider = Spider("http://crawlable.com", respect_robots_txt=True)
+
+    with caplog.at_level(DEBUG):
+        spider.start()
+
+    assert spider.crawl_result == {
+        "http://crawlable.com": {
+            "urls": ["http://notcrawlable.com"]
+        }
+    }
+
+    assert "Skipped: Url doesn't allow crawling:" in caplog.text
+
+    assert "http://notcrawlable.com/robots.txt" in spider.robots
+
+
+@responses.activate
+@patch('urllib.request.urlopen')
+def test_respect_robots_txt_allowed(mock_urlopen, caplog) -> None: # type: ignore
+    setup_mock_response(
+        url="http://crawlable.com",
+        body="<a href='http://crawlable.com'>link</a>",
+        status=200
+    )
+
+    mock_urlopen.side_effect = lambda url: (
+        BytesIO(b"User-agent: *\nAllow: /") if url == "http://crawlable.com/robots.txt" else
+        urllib.error.URLError(f"No mock for {url}"))
+
+    spider = Spider("http://crawlable.com", respect_robots_txt=True)
+
+    with caplog.at_level(DEBUG):
+        spider.crawl("http://crawlable.com")
+
+    assert spider.crawl_result == {
+        "http://crawlable.com": {
+            "urls": ["http://crawlable.com"]
+        }
+    }
+
+
+
+@responses.activate
+@patch('urllib.request.urlopen')
+def test_respect_robots_txt_not_allowed(mock_urlopen, caplog) -> None: # type: ignore
+    setup_mock_response(
+        url="http://notcrawlable.com",
+        body="<a href='http://crawlable.com'>link</a>",
+        status=200
+    )
+
+    mock_urlopen.side_effect = lambda url: (
+        BytesIO(b"User-agent: *\nDisallow: /") if url == "http://notcrawlable.com/robots.txt" else
+        urllib.error.URLError(f"No mock for {url}"))
+
+    spider = Spider("http://notcrawlable.com", respect_robots_txt=True)
+
+    with caplog.at_level(DEBUG):
+        spider.crawl("http://notcrawlable.com")
+
+    assert spider.crawl_result == {}
+
+    assert "Skipped: Url doesn't allow crawling:" in caplog.text
+
+    assert "http://notcrawlable.com/robots.txt" in spider.robots
+
+
+@responses.activate
+@patch('urllib.request.urlopen')
+def test_respect_robots_txt_disabled(mock_urlopen, caplog) -> None: # type: ignore
+    setup_mock_response(
+        url="http://crawlable.com",
+        body="<a href='http://notcrawlable.com'>link</a>",
+        status=200
+    )
+    setup_mock_response(
+        url="http://notcrawlable.com",
+        body="<a href='http://crawlable.com'>link</a>",
+        status=200
+    )
+
+    mock_urlopen.side_effect = lambda url: (
+        BytesIO(b"User-agent: *\nAllow: /") if url == "http://crawlable.com/robots.txt" else
+        BytesIO(b"User-agent: *\nDisallow: /") if url == "http://notcrawlable.com/robots.txt" else
+        urllib.error.URLError(f"No mock for {url}"))
+
+    with caplog.at_level(WARNING):
+        spider = Spider("http://crawlable.com", respect_robots_txt=False)
+
+    assert "Ignoring robots.txt files! You might be at risk of:" in caplog.text
+
+
+    with caplog.at_level(DEBUG):
+        spider.start()
+
+    assert spider.crawl_result == {
+        "http://crawlable.com": {
+            "urls": ["http://notcrawlable.com"]
+        },
+        "http://notcrawlable.com": {
+            "urls": ["http://crawlable.com"]
+        }
+    }
+
+    assert "Skipped: Url doesn't allow crawling:" not in caplog.text
+
+    assert "http://notcrawlable.com/robots.txt" not in spider.robots
+
+
+@responses.activate
+@patch('urllib.request.urlopen')
+@patch('time.sleep', return_value=None)
+def test_respect_robots_txt_crawl_delay(mock_sleep, mock_urlopen, caplog) -> None: # type: ignore
+    setup_mock_response(
+        url="http://crawlable.com",
+        body="<a href='http://notcrawlable.com'>link</a>",
+        status=200
+    )
+
+    mock_urlopen.side_effect = lambda url: (
+        BytesIO(b"User-agent: *\nAllow: /\ncrawl-delay: 1") if url == "http://crawlable.com/robots.txt" else
+        urllib.error.URLError(f"No mock for {url}"))
+
+    spider = Spider("http://crawlable.com", respect_robots_txt=True)
+
+    with caplog.at_level(DEBUG):
+        spider.crawl("http://crawlable.com")
+
+    assert mock_sleep.call_count == 1
+    mock_sleep.assert_called_with(1.0)
+
+    assert spider.crawl_result == {
+        "http://crawlable.com": {
+            "urls": ["http://notcrawlable.com"]
+        }
+    }
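For reviewers, a rough sketch of the helpers these tests exercise, with signatures inferred from the test calls above; the PR's actual tiny_web_crawler.networking.robots_txt implementation may differ:

# Sketch only -- inferred from the tests, not taken from the PR itself.
import urllib.error
import urllib.robotparser
from urllib.parse import urlparse, urlunparse


def get_robots_txt_url(url: str) -> str:
    # robots.txt always lives at the site root, so drop path/query/fragment.
    parts = urlparse(url)
    return urlunparse((parts.scheme, parts.netloc, "/robots.txt", "", "", ""))


def setup_robots_txt_parser(url: str) -> urllib.robotparser.RobotFileParser:
    # Build a parser pointed at the site's robots.txt; nothing is fetched yet.
    parser = urllib.robotparser.RobotFileParser()
    parser.set_url(get_robots_txt_url(url))
    return parser


def is_robots_txt_allowed(url: str) -> bool:
    # RobotFileParser.read() fetches via urllib.request.urlopen, which is why
    # the tests patch 'urllib.request.urlopen' with a BytesIO payload.
    parser = setup_robots_txt_parser(url)
    try:
        parser.read()
    except urllib.error.URLError:
        # Unreachable or missing robots.txt: treat the site as crawlable.
        return True
    return parser.can_fetch("*", url)

The Spider tests additionally rely on a respect_robots_txt flag, a spider.robots cache keyed by robots.txt URL, and a crawl-delay honoured via time.sleep, all of which are exercised but not defined in this diff.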