diff --git a/paperqa/types.py b/paperqa/types.py
index d1ab001a9..ed1384c8f 100644
--- a/paperqa/types.py
+++ b/paperqa/types.py
@@ -54,6 +54,9 @@ class Doc(Embeddable):
     citation: str
     dockey: DocKey
 
+    def __hash__(self) -> int:
+        return hash((self.docname, self.dockey))
+
 
 class Text(Embeddable):
     text: str
diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py
index dc6733e77..189fc7347 100644
--- a/tests/test_paperqa.py
+++ b/tests/test_paperqa.py
@@ -641,21 +641,31 @@ async def my_callback(result):
     assert len(my_results) > 1
 
 
-def test_duplicate():
+def test_duplicate() -> None:
+    """Check Docs doesn't store duplicates, while checking nonduplicate docs are stored."""
     docs = Docs()
     assert docs.add_url(
         "https://en.wikipedia.org/wiki/Frederick_Bates_(politician)",
         citation="WikiMedia Foundation, 2023, Accessed now",
-        dockey="test",
+        dockey="test1",
     )
     assert (
         docs.add_url(
             "https://en.wikipedia.org/wiki/Frederick_Bates_(politician)",
             citation="WikiMedia Foundation, 2023, Accessed now",
-            dockey="test",
+            dockey="test1",
         )
         is None
     )
+    assert len(docs.docs) == 1, "Should have added only one document"
+    assert docs.add_url(
+        "https://en.wikipedia.org/wiki/National_Flag_of_Canada_Day",
+        citation="WikiMedia Foundation, 2023, Accessed now",
+        dockey="test2",
+    )
+    assert (
+        len(set(docs.docs.values())) == 2
+    ), "Unique documents should be hashed as unique"
 
 
 def test_custom_embedding():