Added test cases for new robots_txt submodule
Mews committed Jun 19, 2024
1 parent 448e2ec commit 8c3ac23
Showing 2 changed files with 243 additions and 1 deletion.
83 changes: 83 additions & 0 deletions tests/networking/test_robots_txt.py
@@ -0,0 +1,83 @@
from unittest.mock import patch, MagicMock
from io import BytesIO
import urllib.robotparser

from tiny_web_crawler.networking.robots_txt import get_robots_txt_url, is_robots_txt_allowed, setup_robots_txt_parser

def test_get_robots_txt_url() -> None:
assert (
get_robots_txt_url("http://example")
== "http://example/robots.txt"
)

assert (
get_robots_txt_url("http://example/path")
== "http://example/robots.txt"
)

assert (
get_robots_txt_url("https://example/")
== "https://example/robots.txt"
)

assert (
get_robots_txt_url("http://example/path1/path2/path3/path4")
== "http://example/robots.txt"
)

assert (
get_robots_txt_url("http://example/path#fragment")
== "http://example/robots.txt"
)

assert (
get_robots_txt_url("http://example/path?query=test")
== "http://example/robots.txt"
)



@patch('urllib.request.urlopen')
def test_is_robots_txt_allowed_true(mock_urlopen: MagicMock) -> None:
# Mock the response content of robots.txt
mock_response = b"User-agent: *\nAllow: /"
mock_urlopen.return_value = BytesIO(mock_response)

assert is_robots_txt_allowed("http://example.com")


@patch('urllib.request.urlopen')
def test_is_robots_txt_allowed_false(mock_urlopen: MagicMock) -> None:
# Mock the response content of robots.txt
mock_response = b"User-agent: *\nDisallow: /"
mock_urlopen.return_value = BytesIO(mock_response)

assert not is_robots_txt_allowed("http://example.com")


@patch('urllib.request.urlopen')
def test_is_robots_txt_allowed_mixed(mock_urlopen: MagicMock) -> None:
# Mock the response content of robots.txt
mock_response = b"User-agent: *\nDisallow: /private"

mock_urlopen.return_value = BytesIO(mock_response)
assert is_robots_txt_allowed("http://example.com")

mock_urlopen.return_value = BytesIO(mock_response)
assert not is_robots_txt_allowed("http://example.com/private")


def test_is_robots_txt_allowed_no_robots_txt() -> None:
# Check that websites with no robots.txt are set as crawlable
assert is_robots_txt_allowed("http://example.com")


def test_setup_robots_txt_parser() -> None:
robot_parser = setup_robots_txt_parser("http://example.com")

assert isinstance(robot_parser, urllib.robotparser.RobotFileParser)
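
The helpers exercised above are not shown in this diff. For context, here is a rough sketch of what the tests imply they do; only the three names on the import line come from the source, and everything else (structure, default user agent, error handling) is an assumption that may differ from the actual tiny_web_crawler.networking.robots_txt module.

# Hypothetical sketch of the helpers under test; the real module may differ.
import urllib.error
import urllib.robotparser
from urllib.parse import urlparse, urlunparse

def get_robots_txt_url(url: str) -> str:
    # Keep only scheme and host; drop path, query and fragment,
    # as asserted in test_get_robots_txt_url above.
    parts = urlparse(url)
    return urlunparse((parts.scheme, parts.netloc, "/robots.txt", "", "", ""))

def setup_robots_txt_parser(url: str) -> urllib.robotparser.RobotFileParser:
    parser = urllib.robotparser.RobotFileParser()
    parser.set_url(get_robots_txt_url(url))
    return parser

def is_robots_txt_allowed(url: str, user_agent: str = "*") -> bool:
    parser = setup_robots_txt_parser(url)
    try:
        # RobotFileParser.read() fetches the file with urllib.request.urlopen,
        # which is why the tests patch 'urllib.request.urlopen'.
        parser.read()
    except urllib.error.URLError:
        # No reachable robots.txt: treat the site as crawlable.
        return True
    return parser.can_fetch(user_agent, url)

Because the tests replace urllib.request.urlopen with a BytesIO object, RobotFileParser.read() parses the mocked bytes exactly as it would a real response body.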
161 changes: 160 additions & 1 deletion tests/test_crawler.py
@@ -1,11 +1,13 @@
from io import BytesIO
from unittest.mock import MagicMock, mock_open, patch
import urllib.error

import responses

import pytest

from tiny_web_crawler.core.spider import Spider
from tiny_web_crawler.logging import DEBUG
from tiny_web_crawler.logging import DEBUG, WARNING
from tests.utils import setup_mock_response

@responses.activate
@@ -269,3 +271,160 @@ def test_start_with_save_to_file(
]

mock_save_results.assert_called_once()


@responses.activate
@patch('urllib.request.urlopen')
def test_respect_robots_txt(mock_urlopen, caplog) -> None: # type: ignore
setup_mock_response(
url="http://crawlable.com",
body="<html><body><a href='http://notcrawlable.com'>link</a></body></html>",
status=200
)
setup_mock_response(
url="http://notcrawlable.com",
body="<html><body><a href='http://crawlable.com'>link</a></body></html>",
status=200
)

mock_urlopen.side_effect = lambda url: (
BytesIO(b"User-agent: *\nAllow: /") if url == "http://crawlable.com/robots.txt" else
BytesIO(b"User-agent: *\nDisallow: /") if url == "http://notcrawlable.com/robots.txt" else
urllib.error.URLError(f"No mock for {url}"))

spider = Spider("http://crawlable.com", respect_robots_txt=True)

with caplog.at_level(DEBUG):
spider.start()

assert spider.crawl_result == {
"http://crawlable.com": {
"urls": ["http://notcrawlable.com"]
}
}

assert "Skipped: Url doesn't allow crawling:" in caplog.text

assert "http://notcrawlable.com/robots.txt" in spider.robots


@responses.activate
@patch('urllib.request.urlopen')
def test_respect_robots_txt_allowed(mock_urlopen, caplog) -> None: # type: ignore
setup_mock_response(
url="http://crawlable.com",
body="<html><body><a href='http://crawlable.com'>link</a></body></html>",
status=200
)

mock_urlopen.side_effect = lambda url: (
BytesIO(b"User-agent: *\nAllow: /") if url == "http://crawlable.com/robots.txt" else
urllib.error.URLError(f"No mock for {url}"))

spider = Spider("http://crawlable.com", respect_robots_txt=True)

with caplog.at_level(DEBUG):
spider.crawl("http://crawlable.com")

assert spider.crawl_result == {
"http://crawlable.com":{
"urls": ["http://crawlable.com"]
}
}



@responses.activate
@patch('urllib.request.urlopen')
def test_respect_robots_txt_not_allowed(mock_urlopen, caplog) -> None: # type: ignore
setup_mock_response(
url="http://notcrawlable.com",
body="<html><body><a href='http://crawlable.com'>link</a></body></html>",
status=200
)

mock_urlopen.side_effect = lambda url: (
BytesIO(b"User-agent: *\nDisallow: /") if url == "http://notcrawlable.com/robots.txt" else
urllib.error.URLError(f"No mock for {url}"))

spider = Spider("http://notcrawlable.com", respect_robots_txt=True)

with caplog.at_level(DEBUG):
spider.crawl("http://notcrawlable.com")

assert spider.crawl_result == {}

assert "Skipped: Url doesn't allow crawling:" in caplog.text

assert "http://notcrawlable.com/robots.txt" in spider.robots


@responses.activate
@patch('urllib.request.urlopen')
def test_respect_robots_txt_disabled(mock_urlopen, caplog) -> None: # type: ignore
setup_mock_response(
url="http://crawlable.com",
body="<html><body><a href='http://notcrawlable.com'>link</a></body></html>",
status=200
)
setup_mock_response(
url="http://notcrawlable.com",
body="<html><body><a href='http://crawlable.com'>link</a></body></html>",
status=200
)

mock_urlopen.side_effect = lambda url: (
BytesIO(b"User-agent: *\nAllow: /") if url == "http://crawlable.com/robots.txt" else
BytesIO(b"User-agent: *\nDisallow: /") if url == "http://notcrawlable.com/robots.txt" else
urllib.error.URLError(f"No mock for {url}"))

with caplog.at_level(WARNING):
spider = Spider("http://crawlable.com", respect_robots_txt=False)

assert "Ignoring robots.txt files! You might be at risk of:" in caplog.text


with caplog.at_level(DEBUG):
spider.start()

assert spider.crawl_result == {
"http://crawlable.com": {
"urls": ["http://notcrawlable.com"]
},
"http://notcrawlable.com": {
"urls": ["http://crawlable.com"]
}
}

assert not "Skipped: Url doesn't allow crawling:" in caplog.text

assert "http://notcrawlable.com/robots.txt" not in spider.robots


@responses.activate
@patch('urllib.request.urlopen')
@patch('time.sleep', return_value=None)
def test_respect_robots_txt_crawl_delay(mock_sleep, mock_urlopen, caplog) -> None: # type: ignore
setup_mock_response(
url="http://crawlable.com",
body="<html><body><a href='http://notcrawlable.com'>link</a></body></html>",
status=200
)

mock_urlopen.side_effect = lambda url: (
BytesIO(b"User-agent: *\nAllow: /\ncrawl-delay: 1") if url == "http://crawlable.com/robots.txt" else
urllib.error.URLError(f"No mock for {url}"))

spider = Spider("http://crawlable.com", respect_robots_txt=True)

with caplog.at_level(DEBUG):
spider.crawl("http://crawlable.com")

assert mock_sleep.call_count == 1
mock_sleep.assert_called_with(1.0)

assert spider.crawl_result == {
"http://crawlable.com": {
"urls": ["http://notcrawlable.com"]
}
}
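
The last test implies that the spider honours a Crawl-delay directive by sleeping before a fetch. Below is a minimal sketch of that behaviour, assuming one RobotFileParser per host; the Spider internals are not part of this diff, so the helper is hypothetical and only the respect_robots_txt flag, the spider.robots cache and the 1-second delay come from the tests above.

# Hypothetical sketch of crawl-delay handling; the real Spider may differ.
import time
import urllib.robotparser

def wait_for_crawl_delay(parser: urllib.robotparser.RobotFileParser,
                         user_agent: str = "*") -> None:
    # RobotFileParser.crawl_delay() returns the Crawl-delay value for the
    # given user agent, or None when the directive is absent.
    delay = parser.crawl_delay(user_agent)
    if delay is not None:
        time.sleep(float(delay))

With "crawl-delay: 1" in the mocked robots.txt, the test expects exactly one time.sleep call with 1.0 for the single page fetched.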
