feat: enable web scraping to parse and save pdf content
Rob-Powell committed May 12, 2024
1 parent b2ea91c commit 599c59e
Showing 13 changed files with 217 additions and 21 deletions.
8 changes: 8 additions & 0 deletions lib/chatbot-api/functions/api-handler/routes/documents.py
@@ -35,6 +35,7 @@ class WebsiteDocumentRequest(BaseModel):
address: str
followLinks: bool
limit: int
contentTypes: Optional[list]


class RssFeedDocumentRequest(BaseModel):
@@ -44,12 +45,14 @@ class RssFeedDocumentRequest(BaseModel):
limit: int
title: Optional[str]
followLinks: bool
contentTypes: Optional[list]


class RssFeedCrawlerUpdateRequest(BaseModel):
documentType: str
followLinks: bool
limit: int
contentTypes: Optional[str]


class ListDocumentsRequest(BaseModel):
@@ -237,6 +240,7 @@ def add_website(input: dict):
crawler_properties={
"follow_links": request.followLinks,
"limit": limit,
"content_types": request.contentTypes,
},
)

@@ -263,6 +267,7 @@ def add_rss_feed(
crawler_properties={
"follow_links": request.followLinks,
"limit": request.limit,
"content_types": request.contentTypes,
},
)

@@ -282,6 +287,7 @@ def update_rss_feed(input: dict):
document_type="rssfeed",
follow_links=request.followLinks,
limit=request.limit,
content_types=request.contentTypes,
)
return {
"workspaceId": result["workspace_id"],
@@ -295,6 +301,7 @@ def _convert_document(document: dict):
document["crawler_properties"] = {
"followLinks": document["crawler_properties"]["follow_links"],
"limit": document["crawler_properties"]["limit"],
"contentTypes": document["crawler_properties"]["content_types"],
}
return {
"id": document["document_id"],
@@ -315,6 +322,7 @@ def _convert_document(document: dict):
"crawlerProperties": {
"followLinks": document.get("crawler_properties").get("follow_links", None),
"limit": document.get("crawler_properties").get("limit", None),
"contentTypes": document.get("crawler_properties").get("content_types", None),
}
if document.get("crawler_properties", None) != None
else None,
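
For reference, a minimal sketch of the input the add_website route now accepts with the new contentTypes field (the workspace id, address, and other values below are illustrative assumptions, not taken from the commit):

    # Hypothetical payload for the add_website route; all values are assumed.
    example_input = {
        "workspaceId": "example-workspace-id",
        "sitemap": False,
        "address": "https://example.com",
        "followLinks": True,
        "limit": 250,
        "contentTypes": ["text/html", "application/pdf"],  # new field introduced by this commit
    }
    # add_website(example_input) forwards contentTypes into crawler_properties as content_types.
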
3 changes: 3 additions & 0 deletions lib/chatbot-api/schema/schema.graphql
@@ -47,6 +47,7 @@ input CalculateEmbeddingsInput {
type CrawlerProperties @aws_cognito_user_pools {
followLinks: Boolean
limit: Int
contentTypes: [String!]!
}

type CrossEncoderData @aws_cognito_user_pools {
@@ -190,6 +191,7 @@ input RssFeedInput {
limit: Int!
title: String
followLinks: Boolean!
contentTypes: [String!]!
}

input SemanticSearchInput {
@@ -261,6 +263,7 @@ input WebsiteInput {
address: String!
followLinks: Boolean!
limit: Int!
contentTypes: [String!]!
}

type Workspace @aws_cognito_user_pools {
2 changes: 1 addition & 1 deletion lib/shared/file-import-batch-job/requirements.txt
@@ -8,7 +8,7 @@ langchain==0.1.5
opensearch-py==2.3.1
psycopg2-binary==2.9.7
pgvector==0.2.2
pydantic==2.3.0
pydantic==2.4.0
urllib3<2
openai==0.28.0
beautifulsoup4==4.12.2
7 changes: 7 additions & 0 deletions lib/shared/layers/python-sdk/python/genai_core/documents.py
@@ -359,6 +359,7 @@ def update_document(workspace_id: str, document_id: str, document_type: str, **k
if "limit" in kwargs and "follow_links" in kwargs:
follow_links = kwargs["follow_links"]
limit = kwargs["limit"]
content_types = kwargs["content_types"]
response = documents_table.update_item(
Key={"workspace_id": workspace_id, "document_id": document_id},
UpdateExpression="SET #crawler_properties=:crawler_properties, updated_at=:timestampValue",
@@ -367,6 +368,7 @@ def update_document(workspace_id: str, document_id: str, document_type: str, **k
":crawler_properties": {
"follow_links": follow_links,
"limit": limit,
"content_types": content_types,
},
":timestampValue": timestamp,
},
@@ -479,6 +481,7 @@ def _process_document(
crawler_properties = kwargs["crawler_properties"]
follow_links = crawler_properties["follow_links"]
limit = crawler_properties["limit"]
content_types = crawler_properties["content_types"]

if document_sub_type == "sitemap":
follow_links = False
@@ -514,6 +517,7 @@ def _process_document(
"processed_urls": [],
"follow_links": follow_links,
"limit": limit,
"content_types": content_types,
"done": False,
},
cls=genai_core.utils.json.CustomEncoder,
@@ -712,6 +716,9 @@ def batch_crawl_websites():
"limit": int(post["crawler_properties"]["M"]["limit"]["N"])
if "crawler_properties" in post
else 250,
"content_types": post["crawler_properties"]["M"]["content_types"]["L"]
if "crawler_properties" in post and "content_types" in post["crawler_properties"]["M"]
else ["text/html"],
},
)
set_status(workspace_id, document_id, "processed")
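
The batch_crawl_websites change above reads content_types from the DynamoDB-typed item and falls back to HTML-only crawling. A minimal sketch of that lookup in isolation (the item shape is an assumption inferred from the attribute types shown in the diff):

    def resolve_content_types(post: dict) -> list:
        """Return the stored content_types list attribute, or default to HTML only."""
        props = post.get("crawler_properties", {}).get("M", {})
        if "content_types" in props:
            # DynamoDB list attribute, e.g. {"L": [{"S": "text/html"}, {"S": "application/pdf"}]}
            return props["content_types"]["L"]
        return ["text/html"]
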
43 changes: 33 additions & 10 deletions lib/shared/layers/python-sdk/python/genai_core/websites/crawler.py
@@ -5,6 +5,8 @@
import requests
import genai_core.chunks
import genai_core.documents
import pdfplumber
import io
from typing import List
from bs4 import BeautifulSoup
from urllib.parse import urlparse
@@ -21,6 +23,7 @@ def crawl_urls(
processed_urls: List[str],
follow_links: bool,
limit: int,
content_types: List[str],
):
workspace_id = workspace["workspace_id"]
document_id = document["document_id"]
@@ -47,7 +50,7 @@
print(f"Processing url {document_sub_id}: {current_url}")

try:
content, local_links, _ = parse_url(current_url)
content, local_links, _ = parse_url(current_url, content_types)
except:
print(f"Failed to parse url: {current_url}")
continue
@@ -96,7 +99,7 @@ def crawl_urls(
}


def parse_url(url: str):
def parse_url(url: str, content_types_supported: list):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
}
@@ -105,14 +108,34 @@ def parse_url(url: str):
base_url = f"{root_url_parse.scheme}://{root_url_parse.netloc}"

response = requests.get(url, headers=headers, timeout=20)
if "text/html" not in response.headers["Content-Type"]:
raise Exception(
f"Invalid content type {response.headers['Content-Type']}")
soup = BeautifulSoup(response.content, "html.parser")
content = soup.get_text(separator=' ')
content = re.sub(r"[ \n]+", " ", content)

links = list(set([a["href"] for a in soup.find_all("a", href=True)]))
content_type = response.headers["Content-Type"]
links = []

if ("text/html" in content_type) and ("text/html" in content_types_supported):
soup = BeautifulSoup(response.content, "html.parser")
content = soup.get_text(separator=' ')
content = re.sub(r"[ \n]+", " ", content)
links = [a["href"] for a in soup.find_all("a", href=True)]

elif ("application/pdf" in content_type) and ("application/pdf" in content_types_supported):
pdf_bytes = response.content # Get the bytes content of the response
pdf_stream = io.BytesIO(pdf_bytes) # Create a BytesIO stream from the bytes
with pdfplumber.open(pdf_stream) as pdf:
content = []
for page in pdf.pages:
if page.extract_text():
content.append(page.extract_text().replace('\n', ' '))

# Extract links from annotations
annotations = page.annots
if annotations:
for annot in annotations:
if annot['uri']:
links.append(annot['uri'])
content = ' '.join(content)
else:
raise Exception(f"Unsupported content type {content_type} found at: {url}")

local_links = []
external_links = []

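
As a standalone illustration of the new PDF branch in parse_url, here is a minimal sketch of pulling text and link annotations out of a remote PDF with pdfplumber (the URL and the simplified error handling are assumptions for illustration, not part of the commit):

    import io
    import requests
    import pdfplumber

    def extract_pdf_text_and_links(url: str) -> tuple:
        """Fetch a PDF over HTTP and return (text, links) using pdfplumber."""
        response = requests.get(url, timeout=20)
        pages = []
        links = []
        with pdfplumber.open(io.BytesIO(response.content)) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    pages.append(text.replace("\n", " "))
                # Link targets, where present, are exposed via the 'uri' entry of each annotation.
                for annot in page.annots or []:
                    if annot.get("uri"):
                        links.append(annot["uri"])
        return " ".join(pages), links

    # Example usage (hypothetical URL):
    # text, links = extract_pdf_text_and_links("https://example.com/report.pdf")
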
2 changes: 2 additions & 0 deletions lib/shared/web-crawler-batch-job/index.py
@@ -23,6 +23,7 @@ def main():
processed_urls = data["processed_urls"]
follow_links = data["follow_links"]
limit = data["limit"]
content_types = data["content_types"]

return genai_core.websites.crawler.crawl_urls(
workspace=workspace,
@@ -31,6 +32,7 @@ def main():
processed_urls=processed_urls,
follow_links=follow_links,
limit=limit,
content_types=content_types,
)

if __name__ == "__main__":
5 changes: 3 additions & 2 deletions lib/shared/web-crawler-batch-job/requirements.txt
@@ -8,12 +8,13 @@ langchain==0.1.5
opensearch-py==2.3.1
psycopg2-binary==2.9.7
pgvector==0.2.2
pydantic==2.3.0
pydantic==2.4.0
urllib3<2
openai==0.28.0
beautifulsoup4==4.12.2
requests==2.31.0
attrs==23.1.0
feedparser==6.0.10
aws_xray_sdk==2.12.1
defusedxml==0.7.1
defusedxml==0.7.1
pdfplumber==0.11.0
@@ -119,7 +119,8 @@ export class DocumentsClient {
sitemap: boolean,
address: string,
followLinks: boolean,
limit: number
limit: number,
contentTypes: string[]
): Promise<GraphQLResult<GraphQLQuery<AddWebsiteMutation>>> {
const result = API.graphql<GraphQLQuery<AddWebsiteMutation>>({
query: addWebsite,
@@ -130,6 +131,7 @@
address,
followLinks,
limit,
contentTypes,
},
},
});
@@ -141,7 +143,8 @@
address: string,
title: string,
limit: number,
followLinks: boolean
followLinks: boolean,
contentTypes: string[]
): Promise<GraphQLResult<GraphQLQuery<AddRssFeedMutation>>> {
const result = API.graphql<GraphQLQuery<AddRssFeedMutation>>({
query: addRssFeed,
@@ -152,6 +155,7 @@
title,
limit,
followLinks,
contentTypes,
},
},
});
@@ -222,7 +226,8 @@ export class DocumentsClient {
workspaceId: string,
feedId: string,
followLinks: boolean,
limit: number
limit: number,
contentTypes: string[]
): Promise<GraphQLResult<GraphQLQuery<UpdateRssFeedMutation>>> {
const result = API.graphql<GraphQLQuery<UpdateRssFeedMutation>>({
query: addRssFeed,
@@ -232,6 +237,7 @@
documentId: feedId,
followLinks,
limit,
contentTypes,
},
},
});
@@ -8,8 +8,10 @@ import {
Input,
SpaceBetween,
Toggle,
Multiselect,
} from "@cloudscape-design/components";
import { AddDataData } from "./types";
import { AddDataData, SelectOption, multiselectOptions } from "./types";
import { generateSelectedOptions } from "./utils";
import { useForm } from "../../../common/hooks/use-form";
import { useContext, useState } from "react";
import { AppContext } from "../../../common/app-context";
@@ -31,6 +33,7 @@ interface AddRssSubscriptionData {
rssFeedTitle: string;
linkLimit: number;
followLinks: boolean;
contentTypes: (string | undefined)[];
}

export default function AddRssSubscription(props: AddRssSubscriptionProps) {
@@ -46,6 +49,7 @@ export default function AddRssSubscription(props: AddRssSubscriptionProps) {
rssFeedTitle: "",
linkLimit: 250,
followLinks: true,
contentTypes: ["text/html"],
};
},
validate: (form) => {
@@ -77,13 +81,15 @@
setGlobalError(undefined);

const apiClient = new ApiClient(appContext);
const contentTypesToUse = data.contentTypes.filter((ct): ct is string => ct !== undefined);
try {
await apiClient.documents.addRssFeedSubscription(
props.data.workspace.value,
data.rssFeedUrl,
data.rssFeedTitle,
data.linkLimit,
data.followLinks
data.followLinks,
contentTypesToUse
);

setFlashbarItem({
@@ -109,6 +115,20 @@
props.setSubmitting(false);
};

const handleContentTypeChange = (selectedOptions: ReadonlyArray<SelectOption>) => {
const options: SelectOption[] = selectedOptions.map(option => {
if (option.value === undefined) {
throw new Error(`Option value cannot be undefined`);
}
return {
label: option.label,
value: option.value,
description: option.description
};
});
onChange({ contentTypes: options.map(option => option.value) });
};

const hasReadyWorkspace =
typeof props.data.workspace?.value !== "undefined" &&
typeof props.selectedWorkspace !== "undefined" &&
@@ -191,6 +211,18 @@
}
/>
</FormField>
<FormField
label="Enabled Content Types"
errorText={errors.contentTypes}
description="Content Types to Enable for crawlingl"
>
<Multiselect
disabled={props.submitting}
selectedOptions={generateSelectedOptions(data.contentTypes)}
options={multiselectOptions}
onChange={({ detail }) => handleContentTypeChange(detail.selectedOptions)}
/>
</FormField>
</SpaceBetween>
</Container>
{flashbarItem !== null && <Flashbar items={[flashbarItem]} />}