Added test cases for new robots_txt submodule
Mews committed Jun 19, 2024
1 parent 448e2ec commit 8c3ac23
Showing 2 changed files with 243 additions and 1 deletion.
83 changes: 83 additions & 0 deletions tests/networking/test_robots_txt.py
@@ -0,0 +1,83 @@
from unittest.mock import patch, MagicMock
from io import BytesIO
import urllib.robotparser

from tiny_web_crawler.networking.robots_txt import get_robots_txt_url, is_robots_txt_allowed, setup_robots_txt_parser

def test_get_robots_txt_url() -> None:
assert (
get_robots_txt_url("http://example")
== "http://example/robots.txt"
)

assert (
get_robots_txt_url("http://example/path")
== "http://example/robots.txt"
)

assert (
get_robots_txt_url("https://example/")
== "https://example/robots.txt"
)

assert (
get_robots_txt_url("http://example/path1/path2/path3/path4")
== "http://example/robots.txt"
)

assert (
get_robots_txt_url("http://example/path#fragment")
== "http://example/robots.txt"
)

assert (
get_robots_txt_url("http://example/path?query=test")
== "http://example/robots.txt"
)



@patch('urllib.request.urlopen')
def test_is_robots_txt_allowed_true(mock_urlopen: MagicMock) -> None:
# Mock the response content of robots.txt
mock_response = b"User-agent: *\nAllow: /"
mock_urlopen.return_value = BytesIO(mock_response)

assert is_robots_txt_allowed("http://example.com")


@patch('urllib.request.urlopen')
def test_is_robots_txt_allowed_false(mock_urlopen: MagicMock) -> None:
# Mock the response content of robots.txt
mock_response = b"User-agent: *\nDisallow: /"
mock_urlopen.return_value = BytesIO(mock_response)

assert not is_robots_txt_allowed("http://example.com")


@patch('urllib.request.urlopen')
def test_is_robots_txt_allowed_mixed(mock_urlopen: MagicMock) -> None:
# Mock the response content of robots.txt
mock_response = b"User-agent: *\nDisallow: /private"

mock_urlopen.return_value = BytesIO(mock_response)
assert is_robots_txt_allowed("http://example.com")

mock_urlopen.return_value = BytesIO(mock_response)
assert not is_robots_txt_allowed("http://example.com/private")


def test_is_robots_txt_allowed_no_robots_txt() -> None:
# Check that websites with no robots.txt are set as crawlable
assert is_robots_txt_allowed("http://example.com")


def test_setup_robots_txt_parser() -> None:
robot_parser = setup_robots_txt_parser("http://example.com")

assert isinstance(robot_parser, urllib.robotparser.RobotFileParser)
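
The helpers exercised above are not shown in this diff. For context, here is a rough sketch of what the tests imply they do; only the three names on the import line come from the source, and everything else (structure, default user agent, error handling) is an assumption that may differ from the actual tiny_web_crawler.networking.robots_txt module.

# Hypothetical sketch of the helpers under test; the real module may differ.
import urllib.error
import urllib.robotparser
from urllib.parse import urlparse, urlunparse

def get_robots_txt_url(url: str) -> str:
    # Keep only scheme and host; drop path, query and fragment,
    # as asserted in test_get_robots_txt_url above.
    parts = urlparse(url)
    return urlunparse((parts.scheme, parts.netloc, "/robots.txt", "", "", ""))

def setup_robots_txt_parser(url: str) -> urllib.robotparser.RobotFileParser:
    parser = urllib.robotparser.RobotFileParser()
    parser.set_url(get_robots_txt_url(url))
    return parser

def is_robots_txt_allowed(url: str, user_agent: str = "*") -> bool:
    parser = setup_robots_txt_parser(url)
    try:
        # RobotFileParser.read() fetches the file with urllib.request.urlopen,
        # which is why the tests patch 'urllib.request.urlopen'.
        parser.read()
    except urllib.error.URLError:
        # No reachable robots.txt: treat the site as crawlable.
        return True
    return parser.can_fetch(user_agent, url)

Because the tests replace urllib.request.urlopen with a BytesIO object, RobotFileParser.read() parses the mocked bytes exactly as it would a real response body.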
161 changes: 160 additions & 1 deletion tests/test_crawler.py
@@ -1,11 +1,13 @@
from io import BytesIO
from unittest.mock import MagicMock, mock_open, patch
import urllib.error

import responses

import pytest

from tiny_web_crawler.core.spider import Spider
from tiny_web_crawler.logging import DEBUG
from tiny_web_crawler.logging import DEBUG, WARNING
from tests.utils import setup_mock_response

@responses.activate
@@ -269,3 +271,160 @@ def test_start_with_save_to_file(
]

mock_save_results.assert_called_once()


@responses.activate
@patch('urllib.request.urlopen')
def test_respect_robots_txt(mock_urlopen, caplog) -> None: # type: ignore
setup_mock_response(
url="http://crawlable.com",
body="<html><body><a href='http://notcrawlable.com'>link</a></body></html>",
status=200
)
setup_mock_response(
url="http://notcrawlable.com",
body="<html><body><a href='http://crawlable.com'>link</a></body></html>",
status=200
)

mock_urlopen.side_effect = lambda url: (
BytesIO(b"User-agent: *\nAllow: /") if url == "http://crawlable.com/robots.txt" else
BytesIO(b"User-agent: *\nDisallow: /") if url == "http://notcrawlable.com/robots.txt" else
urllib.error.URLError(f"No mock for {url}"))

spider = Spider("http://crawlable.com", respect_robots_txt=True)

with caplog.at_level(DEBUG):
spider.start()

assert spider.crawl_result == {
"http://crawlable.com": {
"urls": ["http://notcrawlable.com"]
}
}

assert "Skipped: Url doesn't allow crawling:" in caplog.text

assert "http://notcrawlable.com/robots.txt" in spider.robots


@responses.activate
@patch('urllib.request.urlopen')
def test_respect_robots_txt_allowed(mock_urlopen, caplog) -> None: # type: ignore
setup_mock_response(
url="http://crawlable.com",
body="<html><body><a href='http://crawlable.com'>link</a></body></html>",
status=200
)

mock_urlopen.side_effect = lambda url: (
BytesIO(b"User-agent: *\nAllow: /") if url == "http://crawlable.com/robots.txt" else
urllib.error.URLError(f"No mock for {url}"))

spider = Spider("http://crawlable.com", respect_robots_txt=True)

with caplog.at_level(DEBUG):
spider.crawl("http://crawlable.com")

assert spider.crawl_result == {
"http://crawlable.com":{
"urls": ["http://crawlable.com"]
}
}



@responses.activate
@patch('urllib.request.urlopen')
def test_respect_robots_txt_not_allowed(mock_urlopen, caplog) -> None: # type: ignore
setup_mock_response(
url="http://notcrawlable.com",
body="<html><body><a href='http://crawlable.com'>link</a></body></html>",
status=200
)

mock_urlopen.side_effect = lambda url: (
BytesIO(b"User-agent: *\nDisallow: /") if url == "http://notcrawlable.com/robots.txt" else
urllib.error.URLError(f"No mock for {url}"))

spider = Spider("http://notcrawlable.com", respect_robots_txt=True)

with caplog.at_level(DEBUG):
spider.crawl("http://notcrawlable.com")

assert spider.crawl_result == {}

assert "Skipped: Url doesn't allow crawling:" in caplog.text

assert "http://notcrawlable.com/robots.txt" in spider.robots


@responses.activate
@patch('urllib.request.urlopen')
def test_respect_robots_txt_disabled(mock_urlopen, caplog) -> None: # type: ignore
setup_mock_response(
url="http://crawlable.com",
body="<html><body><a href='http://notcrawlable.com'>link</a></body></html>",
status=200
)
setup_mock_response(
url="http://notcrawlable.com",
body="<html><body><a href='http://crawlable.com'>link</a></body></html>",
status=200
)

mock_urlopen.side_effect = lambda url: (
BytesIO(b"User-agent: *\nAllow: /") if url == "http://crawlable.com/robots.txt" else
BytesIO(b"User-agent: *\nDisallow: /") if url == "http://notcrawlable.com/robots.txt" else
urllib.error.URLError(f"No mock for {url}"))

with caplog.at_level(WARNING):
spider = Spider("http://crawlable.com", respect_robots_txt=False)

assert "Ignoring robots.txt files! You might be at risk of:" in caplog.text


with caplog.at_level(DEBUG):
spider.start()

assert spider.crawl_result == {
"http://crawlable.com": {
"urls": ["http://notcrawlable.com"]
},
"http://notcrawlable.com": {
"urls": ["http://crawlable.com"]
}
}

assert not "Skipped: Url doesn't allow crawling:" in caplog.text

assert "http://notcrawlable.com/robots.txt" not in spider.robots


@responses.activate
@patch('urllib.request.urlopen')
@patch('time.sleep', return_value=None)
def test_respect_robots_txt_crawl_delay(mock_sleep, mock_urlopen, caplog) -> None: # type: ignore
setup_mock_response(
url="http://crawlable.com",
body="<html><body><a href='http://notcrawlable.com'>link</a></body></html>",
status=200
)

mock_urlopen.side_effect = lambda url: (
BytesIO(b"User-agent: *\nAllow: /\ncrawl-delay: 1") if url == "http://crawlable.com/robots.txt" else
urllib.error.URLError(f"No mock for {url}"))

spider = Spider("http://crawlable.com", respect_robots_txt=True)

with caplog.at_level(DEBUG):
spider.crawl("http://crawlable.com")

assert mock_sleep.call_count == 1
mock_sleep.assert_called_with(1.0)

assert spider.crawl_result == {
"http://crawlable.com": {
"urls": ["http://notcrawlable.com"]
}
}
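
The last test implies that the spider honours a Crawl-delay directive by sleeping before a fetch. Below is a minimal sketch of that behaviour, assuming one RobotFileParser per host; the Spider internals are not part of this diff, so the helper is hypothetical and only the respect_robots_txt flag, the spider.robots cache and the 1-second delay come from the tests above.

# Hypothetical sketch of crawl-delay handling; the real Spider may differ.
import time
import urllib.robotparser

def wait_for_crawl_delay(parser: urllib.robotparser.RobotFileParser,
                         user_agent: str = "*") -> None:
    # RobotFileParser.crawl_delay() returns the Crawl-delay value for the
    # given user agent, or None when the directive is absent.
    delay = parser.crawl_delay(user_agent)
    if delay is not None:
        time.sleep(float(delay))

With "crawl-delay: 1" in the mocked robots.txt, the test expects exactly one time.sleep call with 1.0 for the single page fetched.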
