Skip to content

Commit

Permalink
Merge pull request #12 from zzstoatzz/trafilatura
Browse files Browse the repository at this point in the history
add license and `trafilatura`
  • Loading branch information
zzstoatzz authored Nov 14, 2024
2 parents f5382ed + ed64288 commit eb721bf
Show file tree
Hide file tree
Showing 8 changed files with 53 additions and 77 deletions.
21 changes: 21 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Nate Nowack

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
20 changes: 19 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,21 @@
## `raggy`
# raggy

A Python library for scraping and document processing.

## Installation

```bash
pip install raggy
```

For additional features:
```bash
pip install raggy[scrapling] # Enhanced web scraping via Scrapling
pip install raggy[chroma] # ChromaDB support
pip install raggy[tpuf] # TurboPuffer support
pip install raggy[pdf] # PDF processing
```

Read the [docs](https://zzstoatzz.github.io/raggy/)

### What is it?
Expand All @@ -16,6 +28,12 @@ A Python library for:

See this [example](https://github.com/zzstoatzz/raggy/blob/main/examples/chat_with_X/website.py) to chat with any website, or this [example](https://github.com/zzstoatzz/raggy/blob/main/examples/chat_with_X/repo.py) to chat with any GitHub repo.

### License and Dependencies

This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

When installing the optional `[scrapling]` dependency, please note that Scrapling is licensed under the BSD-3-Clause license. By using this optional feature, you agree to comply with [Scrapling's license terms](https://github.com/D4Vinci/Scrapling/blob/main/LICENSE).

### Contributing

We welcome contributions! See our [contributing guide](https://zzstoatzz.github.io/raggy/contributing) for details.
10 changes: 0 additions & 10 deletions examples/chat_with_X/website.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,27 +14,17 @@
from datetime import timedelta

import httpx
import trafilatura
from bs4 import BeautifulSoup
from marvin.beta.assistants import Assistant
from prefect import flow, task
from rich.status import Status

import raggy
from raggy.documents import Document
from raggy.loaders.web import SitemapLoader
from raggy.vectorstores.tpuf import TurboPuffer, multi_query_tpuf

TPUF_NS = "demo"


def html_parser(html: str) -> str:
    """Extract readable text from HTML, preferring trafilatura over bs4.

    Args:
        html: Raw HTML markup.

    Returns:
        The main-content text extracted by trafilatura, or the full
        BeautifulSoup text if trafilatura returns nothing.
    """
    trafilatura_config = trafilatura.settings.use_config()  # type: ignore
    # disable the signal-based timeout so extraction can run in a worker thread
    # https://github.com/adbar/trafilatura/issues/202
    trafilatura_config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")
    return (
        trafilatura.extract(html, config=trafilatura_config)
        or BeautifulSoup(html, "html.parser").get_text()
    )


raggy.settings.html_parser = html_parser


def get_last_modified(context, parameters):
"""Cache based on Last-Modified header of the first URL."""
try:
Expand Down
13 changes: 0 additions & 13 deletions examples/refresh_vectorstore/chroma_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,35 +2,22 @@
# dependencies = [
# "prefect",
# "raggy[chroma]",
# "trafilatura",
# ]
# ///

from datetime import timedelta
from typing import Literal

from bs4 import BeautifulSoup
from chromadb.api.models.Collection import Document as ChromaDocument
from prefect import flow, task
from prefect.tasks import task_input_hash

import raggy
from raggy.documents import Document
from raggy.loaders.base import Loader
from raggy.loaders.github import GitHubRepoLoader
from raggy.loaders.web import SitemapLoader
from raggy.vectorstores.chroma import Chroma, ChromaClientType


def html_parser(html: str) -> str:
    """Extract readable text from HTML, preferring trafilatura over bs4.

    Args:
        html: Raw HTML markup.

    Returns:
        The main-content text extracted by trafilatura, or the full
        BeautifulSoup text if trafilatura returns nothing.
    """
    import trafilatura

    trafilatura_config = trafilatura.settings.use_config()  # type: ignore
    # disable the signal-based timeout so extraction can run in a worker thread
    # https://github.com/adbar/trafilatura/issues/202
    trafilatura_config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")
    return (
        trafilatura.extract(html, config=trafilatura_config)
        or BeautifulSoup(html, "html.parser").get_text()
    )


raggy.settings.html_parser = html_parser


prefect_loaders = [
SitemapLoader(
urls=[
Expand Down
20 changes: 0 additions & 20 deletions examples/refresh_vectorstore/tpuf_namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,41 +2,21 @@
# dependencies = [
# "prefect",
# "raggy[tpuf]",
# "trafilatura",
# ]
# ///

from datetime import timedelta

from bs4 import BeautifulSoup
from prefect import flow, task
from prefect.tasks import task_input_hash
from prefect.utilities.annotations import quote

import raggy
from raggy.documents import Document
from raggy.loaders.base import Loader
from raggy.loaders.github import GitHubRepoLoader
from raggy.loaders.web import SitemapLoader
from raggy.vectorstores.tpuf import TurboPuffer


def html_parser(html: str) -> str:
    """Parse HTML into plain text using trafilatura, falling back to bs4.

    Args:
        html: Raw HTML markup.

    Returns:
        The extracted text content.
    """
    import trafilatura

    config = trafilatura.settings.use_config()  # type: ignore
    # turn off the signal-based timeout so this can run in a worker thread
    # https://github.com/adbar/trafilatura/issues/202
    config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")
    extracted = trafilatura.extract(html, config=config)
    if extracted:
        return extracted
    return BeautifulSoup(html, "html.parser").get_text()


raggy.settings.html_parser = html_parser


loaders = {
"prefect-2": [
SitemapLoader(
Expand Down
16 changes: 0 additions & 16 deletions examples/scrape_sitemap.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,23 @@
# /// script
# dependencies = [
# "raggy",
# "trafilatura",
# "rich",
# ]
# ///

import asyncio

from bs4 import BeautifulSoup
from rich.console import Console
from rich.panel import Panel
from rich.text import Text

import raggy
from raggy.documents import Document, DocumentMetadata
from raggy.loaders.web import SitemapLoader

console = Console()


def html_parser(html: str) -> str:
    """Parse HTML into plain text using trafilatura, falling back to bs4.

    Args:
        html: Raw HTML markup.

    Returns:
        The extracted text content.
    """
    import trafilatura

    config = trafilatura.settings.use_config()  # type: ignore
    # disable trafilatura's signal-based timeout (incompatible with worker threads)
    config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")
    extracted = trafilatura.extract(html, config=config)
    if extracted:
        return extracted
    return BeautifulSoup(html, "html.parser").get_text()


async def main(urls: list[str]) -> list[Document]:
raggy.settings.html_parser = html_parser

loader = SitemapLoader(urls=urls, create_excerpts=False)
docs = await loader.load()
console.print(f"\n[bold green]✓[/] Scraped {len(docs)} documents\n")
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ dependencies = [
"pypdf",
"tenacity",
"tiktoken",
"trafilatura",
"xxhash",
"yake",
]
Expand Down Expand Up @@ -55,6 +56,7 @@ tests = [
chroma = ["chromadb"]
tpuf = ["turbopuffer"]
pdf = ["pypdf"]
scrapling = ["scrapling"]

[project.scripts]
raggy = "raggy.cli:app"
Expand Down
28 changes: 11 additions & 17 deletions src/raggy/settings.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,28 @@
from typing import Callable

from bs4 import BeautifulSoup
from pydantic import Field, SecretStr, field_validator
from pydantic_settings import BaseSettings, SettingsConfigDict


def default_html_parser(html: str) -> str:
"""The default HTML parser. This uses `bs4`'s `html.parser`, which is not very good.
Like, at all.
In fact it's really bad. You should definitely set `raggy.settings.html_parser` to a
`Callable[[str], str]` that parses HTML well.
"""The default HTML parser using trafilatura or bs4 as a fallback.
Args:
html: The HTML to parse.
Returns:
The parsed HTML.
"""
from raggy.utilities.logging import get_logger

get_logger().warning_kv(
"USING DEFAULT HTML PARSER",
(
"BeautifulSoup's html.parser is the default parser and is not very good. "
"Consider setting `raggy.settings.html_parser` to a `Callable[[str], str]` that parses HTML well."
),
"red",
import trafilatura
from bs4 import BeautifulSoup

trafilatura_config = trafilatura.settings.use_config() # type: ignore
# disable signal, so it can run in a worker thread
# https://github.com/adbar/trafilatura/issues/202
trafilatura_config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")
return (
trafilatura.extract(html, config=trafilatura_config)
or BeautifulSoup(html, "html.parser").get_text()
)
return BeautifulSoup(html, "html.parser").get_text()


class ChromaSettings(BaseSettings):
Expand Down

0 comments on commit eb721bf

Please sign in to comment.