Skip to content

Commit

Permalink
Pre-commit fix
Browse files (browse the repository at this point in the history)
  • Loading branch information
skrawcz committed Mar 4, 2024
1 parent 59c28ca commit 2e06300
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 4 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -6,10 +6,12 @@

# from langchain_core import documents


def article_regex() -> str:
    """Return the regex used to pull the main article HTML out of a docs page.

    This assumes you're using the furo theme for sphinx: the pattern targets the
    ``<article role="main" id="furo-main-content">`` element and captures
    everything between its opening and closing tags in group 1.

    The capture is non-greedy (``.*?``), so it stops at the first
    ``</article>``. It is meant to be applied with ``re.DOTALL`` so the
    capture can span multiple lines of HTML.

    Returns:
        The regular expression pattern as a string.
    """
    return r'<article role="main" id="furo-main-content">(.*?)</article>'


def article_text(url: str, article_regex: str) -> str:
"""Pulls URL and takes out relevant HTML.
Expand All @@ -20,13 +22,14 @@ def article_text(url: str, article_regex: str) -> str:
try:
html = requests.get(url)
except requests.exceptions.RequestException:
raise Exception(f'Failed to get URL: {url}')
raise Exception(f"Failed to get URL: {url}")
article = re.findall(article_regex, html.text, re.DOTALL)
if not article:
raise ValueError(f"No article found in {url}")
text = article[0].strip()
return text


def html_chunker() -> text_splitter.HTMLHeaderTextSplitter:
"""Return HTML chunker object.
Expand All @@ -39,6 +42,7 @@ def html_chunker() -> text_splitter.HTMLHeaderTextSplitter:
]
return text_splitter.HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)


def text_chunker(
chunk_size: int = 256, chunk_overlap: int = 32
) -> text_splitter.RecursiveCharacterTextSplitter:
Expand All @@ -52,6 +56,7 @@ def text_chunker(
chunk_size=chunk_size, chunk_overlap=chunk_overlap
)


def chunked_text(
article_text: str,
html_chunker: text_splitter.HTMLHeaderTextSplitter,
Expand All @@ -76,4 +81,4 @@ def chunked_text(
article_regex,
article_text,
chunked_text,
]
]
Original file line number | Diff line number | Diff line change
Expand Up @@ -27,7 +27,7 @@ def sitemap_text(sitemap_url: str = "https://hamilton.dagworks.io/en/latest/site
try:
sitemap = requests.get(sitemap_url)
except Exception as e:
raise RuntimeError(f'Failed to fetch sitemap from {sitemap_url}. Original error: {str(e)}')
raise RuntimeError(f"Failed to fetch sitemap from {sitemap_url}. Original error: {str(e)}")
return sitemap.text


Expand Down Expand Up @@ -82,4 +82,4 @@ def chunked_url_text(urls_from_sitemap: ps.DataFrame) -> ps.DataFrame:
["chunked_url_text"],
inputs={"app_name": "chunking_spark_job", "num_partitions": 4},
)
print(result["chunked_url_text"].show())
print(result["chunked_url_text"].show())

0 comments on commit 2e06300

Please sign in to comment.