Skip to content

Commit

Permalink
Merge pull request #12 from zzstoatzz/trafilatura
Browse files Browse the repository at this point in the history
add license and `trafilatura`
  • Loading branch information
zzstoatzz authored Nov 14, 2024
2 parents f5382ed + ed64288 commit eb721bf
Show file tree
Hide file tree
Showing 8 changed files with 53 additions and 77 deletions.
21 changes: 21 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Nate Nowack

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
20 changes: 19 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,21 @@
## `raggy`
# raggy

A Python library for scraping and document processing.

## Installation

```bash
pip install raggy
```

For additional features:
```bash
pip install raggy[scrapling] # Enhanced web scraping via Scrapling
pip install raggy[chroma] # ChromaDB support
pip install raggy[tpuf] # TurboPuffer support
pip install raggy[pdf] # PDF processing
```

Read the [docs](https://zzstoatzz.github.io/raggy/)

### What is it?
Expand All @@ -16,6 +28,12 @@ A Python library for:

See this [example](https://github.com/zzstoatzz/raggy/blob/main/examples/chat_with_X/website.py) to chat with any website, or this [example](https://github.com/zzstoatzz/raggy/blob/main/examples/chat_with_X/repo.py) to chat with any GitHub repo.

### License and Dependencies

This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

When installing the optional `[scrapling]` dependency, please note that Scrapling is licensed under the BSD-3-Clause license. By using this optional feature, you agree to comply with [Scrapling's license terms](https://github.com/D4Vinci/Scrapling/blob/main/LICENSE).

### Contributing

We welcome contributions! See our [contributing guide](https://zzstoatzz.github.io/raggy/contributing) for details.
10 changes: 0 additions & 10 deletions examples/chat_with_X/website.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,27 +14,17 @@
from datetime import timedelta

import httpx
import trafilatura
from bs4 import BeautifulSoup
from marvin.beta.assistants import Assistant
from prefect import flow, task
from rich.status import Status

import raggy
from raggy.documents import Document
from raggy.loaders.web import SitemapLoader
from raggy.vectorstores.tpuf import TurboPuffer, multi_query_tpuf

TPUF_NS = "demo"


def html_parser(html: str) -> str:
    """Extract readable text from HTML, preferring trafilatura over bs4.

    Args:
        html: Raw HTML markup.

    Returns:
        The main-content text extracted by trafilatura, or the full
        BeautifulSoup text if trafilatura returns nothing.
    """
    trafilatura_config = trafilatura.settings.use_config()  # type: ignore
    # disable the signal-based timeout so extraction can run in a worker thread
    # https://github.com/adbar/trafilatura/issues/202
    trafilatura_config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")
    return (
        trafilatura.extract(html, config=trafilatura_config)
        or BeautifulSoup(html, "html.parser").get_text()
    )


raggy.settings.html_parser = html_parser


def get_last_modified(context, parameters):
"""Cache based on Last-Modified header of the first URL."""
try:
Expand Down
13 changes: 0 additions & 13 deletions examples/refresh_vectorstore/chroma_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,35 +2,22 @@
# dependencies = [
# "prefect",
# "raggy[chroma]",
# "trafilatura",
# ]
# ///

from datetime import timedelta
from typing import Literal

from bs4 import BeautifulSoup
from chromadb.api.models.Collection import Document as ChromaDocument
from prefect import flow, task
from prefect.tasks import task_input_hash

import raggy
from raggy.documents import Document
from raggy.loaders.base import Loader
from raggy.loaders.github import GitHubRepoLoader
from raggy.loaders.web import SitemapLoader
from raggy.vectorstores.chroma import Chroma, ChromaClientType


def html_parser(html: str) -> str:
    """Extract readable text from HTML, preferring trafilatura over bs4.

    Args:
        html: Raw HTML markup.

    Returns:
        The main-content text extracted by trafilatura, or the full
        BeautifulSoup text if trafilatura returns nothing.
    """
    import trafilatura

    trafilatura_config = trafilatura.settings.use_config()  # type: ignore
    # disable the signal-based timeout so extraction can run in a worker thread
    # https://github.com/adbar/trafilatura/issues/202
    trafilatura_config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")
    return (
        trafilatura.extract(html, config=trafilatura_config)
        or BeautifulSoup(html, "html.parser").get_text()
    )


raggy.settings.html_parser = html_parser


prefect_loaders = [
SitemapLoader(
urls=[
Expand Down
20 changes: 0 additions & 20 deletions examples/refresh_vectorstore/tpuf_namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,41 +2,21 @@
# dependencies = [
# "prefect",
# "raggy[tpuf]",
# "trafilatura",
# ]
# ///

from datetime import timedelta

from bs4 import BeautifulSoup
from prefect import flow, task
from prefect.tasks import task_input_hash
from prefect.utilities.annotations import quote

import raggy
from raggy.documents import Document
from raggy.loaders.base import Loader
from raggy.loaders.github import GitHubRepoLoader
from raggy.loaders.web import SitemapLoader
from raggy.vectorstores.tpuf import TurboPuffer


def html_parser(html: str) -> str:
    """Parse HTML into plain text using trafilatura, falling back to bs4.

    Args:
        html: Raw HTML markup.

    Returns:
        The extracted text content.
    """
    import trafilatura

    config = trafilatura.settings.use_config()  # type: ignore
    # turn off the signal-based timeout so this can run in a worker thread
    # https://github.com/adbar/trafilatura/issues/202
    config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")
    extracted = trafilatura.extract(html, config=config)
    if extracted:
        return extracted
    return BeautifulSoup(html, "html.parser").get_text()


raggy.settings.html_parser = html_parser


loaders = {
"prefect-2": [
SitemapLoader(
Expand Down
16 changes: 0 additions & 16 deletions examples/scrape_sitemap.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,23 @@
# /// script
# dependencies = [
# "raggy",
# "trafilatura",
# "rich",
# ]
# ///

import asyncio

from bs4 import BeautifulSoup
from rich.console import Console
from rich.panel import Panel
from rich.text import Text

import raggy
from raggy.documents import Document, DocumentMetadata
from raggy.loaders.web import SitemapLoader

console = Console()


def html_parser(html: str) -> str:
    """Parse HTML into plain text using trafilatura, falling back to bs4.

    Args:
        html: Raw HTML markup.

    Returns:
        The extracted text content.
    """
    import trafilatura

    config = trafilatura.settings.use_config()  # type: ignore
    # disable trafilatura's signal-based timeout (incompatible with worker threads)
    config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")
    extracted = trafilatura.extract(html, config=config)
    if extracted:
        return extracted
    return BeautifulSoup(html, "html.parser").get_text()


async def main(urls: list[str]) -> list[Document]:
raggy.settings.html_parser = html_parser

loader = SitemapLoader(urls=urls, create_excerpts=False)
docs = await loader.load()
console.print(f"\n[bold green]✓[/] Scraped {len(docs)} documents\n")
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ dependencies = [
"pypdf",
"tenacity",
"tiktoken",
"trafilatura",
"xxhash",
"yake",
]
Expand Down Expand Up @@ -55,6 +56,7 @@ tests = [
chroma = ["chromadb"]
tpuf = ["turbopuffer"]
pdf = ["pypdf"]
scrapling = ["scrapling"]

[project.scripts]
raggy = "raggy.cli:app"
Expand Down
28 changes: 11 additions & 17 deletions src/raggy/settings.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,28 @@
from typing import Callable

from bs4 import BeautifulSoup
from pydantic import Field, SecretStr, field_validator
from pydantic_settings import BaseSettings, SettingsConfigDict


def default_html_parser(html: str) -> str:
"""The default HTML parser. This uses `bs4`'s `html.parser`, which is not very good.
Like, at all.
In fact it's really bad. You should definitely set `raggy.settings.html_parser` to a
`Callable[[str], str]` that parses HTML well.
"""The default HTML parser using trafilatura or bs4 as a fallback.
Args:
html: The HTML to parse.
Returns:
The parsed HTML.
"""
from raggy.utilities.logging import get_logger

get_logger().warning_kv(
"USING DEFAULT HTML PARSER",
(
"BeautifulSoup's html.parser is the default parser and is not very good. "
"Consider setting `raggy.settings.html_parser` to a `Callable[[str], str]` that parses HTML well."
),
"red",
import trafilatura
from bs4 import BeautifulSoup

trafilatura_config = trafilatura.settings.use_config() # type: ignore
# disable signal, so it can run in a worker thread
# https://github.com/adbar/trafilatura/issues/202
trafilatura_config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")
return (
trafilatura.extract(html, config=trafilatura_config)
or BeautifulSoup(html, "html.parser").get_text()
)
return BeautifulSoup(html, "html.parser").get_text()


class ChromaSettings(BaseSettings):
Expand Down

0 comments on commit eb721bf

Please sign in to comment.