Skip to content

Commit

Permalink
Merge pull request #112 from scrapinghub/registry-features
Browse files Browse the repository at this point in the history
move some registry functionalities from scrapy-poet
  • Loading branch information
kmike authored Jan 16, 2023
2 parents ddc6937 + e86bf37 commit d5f63d8
Show file tree
Hide file tree
Showing 4 changed files with 338 additions and 16 deletions.
15 changes: 15 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,21 @@
Changelog
=========

TBR
---

* Introduce new methods for :class:`web_poet.rules.RulesRegistry`:

* :meth:`web_poet.rules.RulesRegistry.add_rule`
* :meth:`web_poet.rules.RulesRegistry.overrides_for`
* :meth:`web_poet.rules.RulesRegistry.page_cls_for_item`

* Improved the performance of :meth:`web_poet.rules.RulesRegistry.search`:
  passing only ``instead_of`` or only ``to_return`` now results in an *O(1)*
  look-up instead of *O(N)*. Additionally, when either ``instead_of`` or
  ``to_return`` is present in a multi-parameter search call, it is used to
  filter the initial candidate results, resulting in a faster search.

0.6.0 (2022-11-08)
------------------

Expand Down
52 changes: 52 additions & 0 deletions docs/page-objects/rules.rst
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,58 @@ in the :class:`~.ApplyRule` to be written into ``web_poet.default_registry``.
The next section explores this caveat further.

Using URLs against the registered rules
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

One of the important aspects of :class:`~.ApplyRule` is dictating which URLs it's
able to work with, using its ``for_patterns`` attribute. There are a few methods
available in :class:`~.RulesRegistry` which accept a URL value (:class:`str`,
:class:`~.RequestUrl`, or :class:`~.ResponseUrl`) to find specific information
from the registered rules.

.. _rules-overrides_for-example:

Find the page object overrides
""""""""""""""""""""""""""""""

If you want to see which :ref:`rules-intro-overrides` are available for a
given webpage, you can use :meth:`~.RulesRegistry.overrides_for` and pass it
the webpage URL. For example:

.. code-block:: python

    from web_poet import default_registry

    overrides = default_registry.overrides_for("http://books.toscrape.com/")
    print(overrides)
    # {
    #     <class 'OldProductPage'>: <class 'NewProductPage'>,
    #     <class 'OverriddenPage'>: <class 'UseThisPage'>,
    # }

It returns a :class:`Mapping` where the *key* represents the page object class
that is overridden or replaced by the page object class in the *value*.

.. _rules-page_cls_for_item-example:

Identify the page object that could create the item
"""""""""""""""""""""""""""""""""""""""""""""""""""

If you want to retrieve the page object class that is able to create the
item class that you want for a given webpage, you can use
:meth:`~.RulesRegistry.page_cls_for_item`. For example:

.. code-block:: python

    from web_poet import default_registry

    page_cls = default_registry.page_cls_for_item(
        "http://books.toscrape.com/catalogue/sapiens-a-brief-history-of-humankind_996/index.html",
        Book
    )
    print(page_cls)  # BookPage

Using rules from External Packages
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
133 changes: 127 additions & 6 deletions tests/test_rules.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import warnings

import attrs
import pytest
from url_matcher import Patterns
Expand All @@ -9,8 +11,11 @@
POTopLevelOverriden2,
)
from tests.po_lib.a_module import POModule, POModuleOverriden
from tests.po_lib.nested_package import PONestedPkg
from tests.po_lib.nested_package.a_nested_module import PONestedModule
from tests.po_lib.nested_package import PONestedPkg, PONestedPkgOverriden
from tests.po_lib.nested_package.a_nested_module import (
PONestedModule,
PONestedModuleOverriden,
)
from tests.po_lib_sub import POLibSub
from tests.po_lib_to_return import (
CustomProductPage,
Expand All @@ -20,7 +25,10 @@
LessProductPage,
MoreProductPage,
Product,
ProductFewerFields,
ProductMoreFields,
ProductPage,
ProductSeparate,
ProductSimilar,
SeparateProductPage,
SimilarProductPage,
Expand All @@ -34,6 +42,7 @@
default_registry,
handle_urls,
)
from web_poet.page_inputs.url import RequestUrl, ResponseUrl

POS = {
CustomProductPage,
Expand Down Expand Up @@ -154,7 +163,7 @@ def test_apply_rule_kwargs_only() -> None:
ApplyRule(
"example.com",
*[params[r] for r in remove],
**{k: v for k, v in params.items() if k not in remove} # type: ignore[arg-type]
**{k: v for k, v in params.items() if k not in remove}, # type: ignore[arg-type]
)


Expand Down Expand Up @@ -245,6 +254,10 @@ def test_registry_search() -> None:
assert len(rules) == 1
assert rules[0].instead_of == POTopLevelOverriden2

rules = default_registry.search(instead_of=None)
for rule in rules:
assert rule.instead_of is None

# param: to_return
rules = default_registry.search(to_return=Product)
assert rules == [
Expand All @@ -264,12 +277,34 @@ def test_registry_search() -> None:
),
]

rules = default_registry.search(to_return=None)
for rule in rules:
assert rule.to_return is None

# params: to_return and use
rules = default_registry.search(to_return=Product, use=ImprovedProductPage)
assert len(rules) == 1
assert rules[0].to_return == Product
assert rules[0].use == ImprovedProductPage

# params: to_return and instead_of
rules = default_registry.search(to_return=Product, instead_of=None)
assert len(rules) == 2
assert rules[0].to_return == Product
assert rules[0].instead_of is None
assert rules[1].to_return == Product
assert rules[1].instead_of is None

rules = default_registry.search(to_return=None, instead_of=ProductPage)
for rule in rules:
assert rule.to_return is None
assert rule.instead_of is None

rules = default_registry.search(to_return=None, instead_of=None)
assert len(rules) == 1
assert rules[0].to_return is None
assert rules[0].instead_of is None

# Such rules don't exist
rules = default_registry.search(use=POModuleOverriden)
assert len(rules) == 0
Expand All @@ -292,7 +327,7 @@ def test_registry_search_overrides_deprecation() -> None:
def test_init_rules() -> None:
rules = (
ApplyRule(
for_patterns=Patterns(include=["sample.com"]),
for_patterns=Patterns(include=["example.com"]),
use=POTopLevel1,
instead_of=POTopLevelOverriden2,
),
Expand All @@ -305,10 +340,96 @@ def test_init_rules() -> None:
assert default_registry.get_rules() != rules


def test_add_rule() -> None:
    """Exercise ``RulesRegistry.add_rule`` on a fresh registry.

    Checks that rules are kept in insertion order and that a warning is
    emitted only when a newly added rule has the same URL pattern and the same
    ``to_return`` value as a previously added rule.
    """
    registry = RulesRegistry()

    # Basic case of adding a rule
    rule_1 = ApplyRule(
        for_patterns=Patterns(include=["example.com"]),
        use=POTopLevel1,
        instead_of=POTopLevelOverriden1,
        to_return=Product,
    )
    registry.add_rule(rule_1)
    assert registry.get_rules() == [rule_1]

    # Adding a second rule should not emit a warning as long as the URL
    # pattern and the `.to_return` value are not both the same as those of an
    # existing rule.
    rule_2 = ApplyRule(
        for_patterns=Patterns(include=["example.com"]),
        use=POTopLevel1,
        instead_of=POTopLevelOverriden2,
        to_return=ProductSimilar,
    )
    with warnings.catch_warnings(record=True) as warnings_emitted:
        registry.add_rule(rule_2)
    assert not warnings_emitted
    assert registry.get_rules() == [rule_1, rule_2]

    # Warnings should be raised for this case since it's the same URL pattern
    # and `.to_return` value from one of the past rules.
    rule_3 = ApplyRule(
        for_patterns=Patterns(include=["example.com"]),
        use=POTopLevel1,
        instead_of=POTopLevelOverriden2,
        to_return=Product,
    )
    # Since we're using f-strings to compare the warning emitted, don't use
    # ``pytest.warns()`` here since it treats the msg as regex which translates
    # the "(" and ")" characters differently from the expected message.
    with warnings.catch_warnings(record=True) as warnings_emitted:
        registry.add_rule(rule_3)

    expected_msg = f"Consider updating the priority of these rules: {[rule_1, rule_3]}."
    # A generator lets ``any`` short-circuit instead of building a list of
    # ``True`` values first.
    assert any(expected_msg in str(w.message) for w in warnings_emitted)
    assert registry.get_rules() == [rule_1, rule_2, rule_3]


def test_overrides_for() -> None:
    """``overrides_for`` must return the expected mapping for each URL type."""
    # Expected {overridden page object: replacement page object} per URL.
    expected_by_url = {
        "https://example.com": {
            POTopLevelOverriden1: POTopLevel1,
            POTopLevelOverriden2: POTopLevel2,
            POModuleOverriden: POModule,
            PONestedPkgOverriden: PONestedPkg,
            PONestedModuleOverriden: PONestedModule,
            ProductPage: CustomProductPageNoReturns,
        },
        "https://example.org": {
            PONestedModuleOverriden: PONestedModule,
            PONestedPkgOverriden: PONestedPkg,
        },
    }

    # The same raw URL is wrapped in every accepted URL type.
    for url_type in (str, RequestUrl, ResponseUrl):
        for raw_url, expected in expected_by_url.items():
            overrides = default_registry.overrides_for(url_type(raw_url))
            assert overrides == expected


def test_page_cls_for_item() -> None:
    """``page_cls_for_item`` must map item classes to page objects for each URL type."""

    # This is not associated with any rule.
    class FakeItem:
        pass

    lookup = default_registry.page_cls_for_item

    # (item class, page object class expected for https://example.com)
    expected_pages = (
        (ProductSimilar, CustomProductPageNoReturns),
        (Product, CustomProductPageDataTypeOnly),
        (ProductSeparate, SeparateProductPage),
        (ProductFewerFields, LessProductPage),
        (ProductMoreFields, MoreProductPage),
    )

    for url_type in (str, RequestUrl, ResponseUrl):
        url = url_type("https://example.com")
        for item_cls, page_cls in expected_pages:
            assert lookup(url, item_cls) == page_cls

        # Type is ignored since item_cls shouldn't be None
        assert lookup(url, None) is None  # type: ignore[arg-type]

        # When there's no rule specifying to return this FakeItem
        assert lookup(url, FakeItem) is None

        # When the URL itself doesn't have any ``to_return`` in any of its rules
        assert lookup(url_type("https://example.org"), FakeItem) is None


def test_from_override_rules_deprecation_using_ApplyRule() -> None:
rules = [
ApplyRule(
for_patterns=Patterns(include=["sample.com"]),
for_patterns=Patterns(include=["example.com"]),
use=POTopLevel1,
instead_of=POTopLevelOverriden2,
)
Expand All @@ -325,7 +446,7 @@ def test_from_override_rules_deprecation_using_ApplyRule() -> None:
def test_from_override_rules_deprecation_using_OverrideRule() -> None:
rules = [
OverrideRule(
for_patterns=Patterns(include=["sample.com"]),
for_patterns=Patterns(include=["example.com"]),
use=POTopLevel1,
instead_of=POTopLevelOverriden2,
)
Expand Down
Loading

0 comments on commit d5f63d8

Please sign in to comment.