feat: enable web scraping to parse and save pdf content #474

Merged · 2 commits · Jun 10, 2024
8 changes: 8 additions & 0 deletions lib/chatbot-api/functions/api-handler/routes/documents.py
@@ -35,6 +35,7 @@ class WebsiteDocumentRequest(BaseModel):
address: str
followLinks: bool
limit: int
contentTypes: Optional[list]


class RssFeedDocumentRequest(BaseModel):
@@ -44,12 +45,14 @@ class RssFeedDocumentRequest(BaseModel):
limit: int
title: Optional[str] = None
followLinks: bool
contentTypes: Optional[list]


class RssFeedCrawlerUpdateRequest(BaseModel):
documentType: str
followLinks: bool
limit: int
contentTypes: Optional[str]


class ListDocumentsRequest(BaseModel):
@@ -237,6 +240,7 @@ def add_website(input: dict):
crawler_properties={
"follow_links": request.followLinks,
"limit": limit,
"content_types": request.contentTypes,
},
)

@@ -263,6 +267,7 @@ def add_rss_feed(
crawler_properties={
"follow_links": request.followLinks,
"limit": request.limit,
"content_types": request.contentTypes,
},
)

@@ -282,6 +287,7 @@ def update_rss_feed(input: dict):
document_type="rssfeed",
follow_links=request.followLinks,
limit=request.limit,
content_types=request.contentTypes,
)
return {
"workspaceId": result["workspace_id"],
@@ -295,6 +301,7 @@ def _convert_document(document: dict):
document["crawler_properties"] = {
"followLinks": document["crawler_properties"]["follow_links"],
"limit": document["crawler_properties"]["limit"],
"contentTypes": document["crawler_properties"]["content_types"],
}
return {
"id": document["document_id"],
@@ -315,6 +322,7 @@ def _convert_document(document: dict):
"crawlerProperties": {
"followLinks": document.get("crawler_properties").get("follow_links", None),
"limit": document.get("crawler_properties").get("limit", None),
"contentTypes": document.get("crawler_properties").get("content_types", None),
}
if document.get("crawler_properties", None) != None
else None,
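For orientation, here is a minimal sketch of how the new `contentTypes` field travels through this handler: a Pydantic model with the fields visible in the hunks above, and the snake_case `crawler_properties` dict that `add_website` forwards. The `= None` default and the example values are assumptions, not part of the change.

```python
from typing import Optional

from pydantic import BaseModel


class WebsiteDocumentRequest(BaseModel):
    # Fields shown in the hunk above; the full model may define more.
    address: str
    followLinks: bool
    limit: int
    contentTypes: Optional[list] = None  # new: MIME types the crawler may ingest


# Mirroring add_website: camelCase request fields become snake_case
# crawler properties before being handed to the document service.
request = WebsiteDocumentRequest(
    address="https://example.com",
    followLinks=True,
    limit=250,
    contentTypes=["text/html", "application/pdf"],
)
crawler_properties = {
    "follow_links": request.followLinks,
    "limit": request.limit,
    "content_types": request.contentTypes,
}
```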
3 changes: 3 additions & 0 deletions lib/chatbot-api/schema/schema.graphql
@@ -47,6 +47,7 @@ input CalculateEmbeddingsInput {
type CrawlerProperties @aws_cognito_user_pools {
followLinks: Boolean
limit: Int
contentTypes: [String!]!
}

type CrossEncoderData @aws_cognito_user_pools {
@@ -190,6 +191,7 @@ input RssFeedInput {
limit: Int!
title: String
followLinks: Boolean!
contentTypes: [String!]!
}

input SemanticSearchInput {
@@ -261,6 +263,7 @@ input WebsiteInput {
address: String!
followLinks: Boolean!
limit: Int!
contentTypes: [String!]!
}

type Workspace @aws_cognito_user_pools {
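Purely as an illustration of the schema change, the variables a client would now send for the `addWebsite` mutation might look like the sketch below; `workspaceId` and `sitemap` are assumed from the client code further down in this PR and are not part of this hunk.

```python
# Hypothetical GraphQL variables for the addWebsite mutation. Field names
# follow the updated WebsiteInput above, where contentTypes is non-nullable.
variables = {
    "input": {
        "workspaceId": "my-workspace-id",  # assumed field, not shown in this hunk
        "sitemap": False,                  # assumed field, not shown in this hunk
        "address": "https://example.com",
        "followLinks": True,
        "limit": 250,
        "contentTypes": ["text/html", "application/pdf"],
    }
}
```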
7 changes: 7 additions & 0 deletions lib/shared/layers/python-sdk/python/genai_core/documents.py
@@ -359,6 +359,7 @@ def update_document(workspace_id: str, document_id: str, document_type: str, **k
if "limit" in kwargs and "follow_links" in kwargs:
follow_links = kwargs["follow_links"]
limit = kwargs["limit"]
content_types = kwargs["content_types"]
response = documents_table.update_item(
Key={"workspace_id": workspace_id, "document_id": document_id},
UpdateExpression="SET #crawler_properties=:crawler_properties, updated_at=:timestampValue",
@@ -367,6 +368,7 @@ def update_document(workspace_id: str, document_id: str, document_type: str, **k
":crawler_properties": {
"follow_links": follow_links,
"limit": limit,
"content_types": content_types,
},
":timestampValue": timestamp,
},
@@ -479,6 +481,7 @@ def _process_document(
crawler_properties = kwargs["crawler_properties"]
follow_links = crawler_properties["follow_links"]
limit = crawler_properties["limit"]
content_types = crawler_properties["content_types"]

if document_sub_type == "sitemap":
follow_links = False
@@ -514,6 +517,7 @@ def _process_document(
"processed_urls": [],
"follow_links": follow_links,
"limit": limit,
"content_types": content_types,
"done": False,
},
cls=genai_core.utils.json.CustomEncoder,
@@ -712,6 +716,9 @@ def batch_crawl_websites():
"limit": int(post["crawler_properties"]["M"]["limit"]["N"])
if "crawler_properties" in post
else 250,
"content_types": post["crawler_properties"]["M"]["content_types"]["L"]
if "crawler_properties" in post and "content_types" in post["crawler_properties"]["M"]
else ["text/html"],
},
)
set_status(workspace_id, document_id, "processed")
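As a side note on the `batch_crawl_websites` hunk above: it reads `content_types` from the DynamoDB AttributeValue form of `crawler_properties` and falls back to `["text/html"]` for older records. A small sketch of that defaulting logic follows; unlike the code above, it unwraps the AttributeValue entries into plain strings, which is an assumption about the shape the crawler expects.

```python
# Sketch of a content_types fallback over a raw DynamoDB item
# (AttributeValue format). Unwrapping to plain strings is an assumption.
def resolve_content_types(post: dict) -> list:
    crawler_props = post.get("crawler_properties", {}).get("M", {})
    if "content_types" in crawler_props:
        # DynamoDB lists arrive as {"L": [{"S": "text/html"}, ...]}
        return [entry["S"] for entry in crawler_props["content_types"]["L"]]
    return ["text/html"]  # default keeps pre-existing records crawling HTML only


item = {"crawler_properties": {"M": {"content_types": {"L": [{"S": "application/pdf"}]}}}}
assert resolve_content_types(item) == ["application/pdf"]
assert resolve_content_types({}) == ["text/html"]
```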
43 changes: 33 additions & 10 deletions lib/shared/layers/python-sdk/python/genai_core/websites/crawler.py
@@ -5,6 +5,8 @@
import requests
import genai_core.chunks
import genai_core.documents
import pdfplumber
import io
from typing import List
from bs4 import BeautifulSoup
from urllib.parse import urlparse
@@ -21,6 +23,7 @@ def crawl_urls(
processed_urls: List[str],
follow_links: bool,
limit: int,
content_types: List[str],
):
workspace_id = workspace["workspace_id"]
document_id = document["document_id"]
@@ -47,7 +50,7 @@ def crawl_urls(
print(f"Processing url {document_sub_id}: {current_url}")

try:
content, local_links, _ = parse_url(current_url)
content, local_links, _ = parse_url(current_url, content_types)
except:
print(f"Failed to parse url: {current_url}")
continue
@@ -96,7 +99,7 @@ def crawl_urls(
}


def parse_url(url: str):
def parse_url(url: str, content_types_supported: list):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
}
@@ -105,14 +108,34 @@ def parse_url(url: str):
base_url = f"{root_url_parse.scheme}://{root_url_parse.netloc}"

response = requests.get(url, headers=headers, timeout=20)
if "text/html" not in response.headers["Content-Type"]:
raise Exception(
f"Invalid content type {response.headers['Content-Type']}")
soup = BeautifulSoup(response.content, "html.parser")
content = soup.get_text(separator=' ')
content = re.sub(r"[ \n]+", " ", content)

links = list(set([a["href"] for a in soup.find_all("a", href=True)]))
content_type = response.headers["Content-Type"]
links = []

if ("text/html" in content_type) and ("text/html" in content_types_supported):
soup = BeautifulSoup(response.content, "html.parser")
content = soup.get_text(separator=' ')
content = re.sub(r"[ \n]+", " ", content)
links = [a["href"] for a in soup.find_all("a", href=True)]

elif ("application/pdf" in content_type) and ("application/pdf" in content_types_supported):
pdf_bytes = response.content # Get the bytes content of the response
pdf_stream = io.BytesIO(pdf_bytes) # Create a BytesIO stream from the bytes
with pdfplumber.open(pdf_stream) as pdf:
content = []
for page in pdf.pages:
if page.extract_text():
content.append(page.extract_text().replace('\n', ' '))

# Extract links from annotations
annotations = page.annots
if annotations:
for annot in annotations:
if annot['uri']:
links.append(annot['uri'])
content = ' '.join(content)
else:
raise Exception(f"Unsupported content type {content_type} found at: {url}")

local_links = []
external_links = []

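To make the new PDF branch in `parse_url` easier to verify in isolation, here is a standalone sketch of the same pdfplumber usage: fetch the bytes, extract page text, and collect URI annotations as outgoing links. The URL and the `.get("uri")` guard are assumptions, not part of the PR.

```python
import io

import pdfplumber
import requests

# Standalone sketch of the PDF path in parse_url above.
response = requests.get("https://example.com/whitepaper.pdf", timeout=20)  # hypothetical URL
links = []
pages = []
with pdfplumber.open(io.BytesIO(response.content)) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if text:
            pages.append(text.replace("\n", " "))
        # Link annotations carry a "uri" entry when they point somewhere.
        for annot in page.annots or []:
            if annot.get("uri"):
                links.append(annot["uri"])
content = " ".join(pages)
print(f"{len(content)} characters of text, {len(links)} links")
```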
2 changes: 2 additions & 0 deletions lib/shared/web-crawler-batch-job/index.py
@@ -23,6 +23,7 @@ def main():
processed_urls = data["processed_urls"]
follow_links = data["follow_links"]
limit = data["limit"]
content_types = data["content_types"]

return genai_core.websites.crawler.crawl_urls(
workspace=workspace,
Expand All @@ -31,6 +32,7 @@ def main():
processed_urls=processed_urls,
follow_links=follow_links,
limit=limit,
content_types=content_types,
)

if __name__ == "__main__":
3 changes: 2 additions & 1 deletion lib/shared/web-crawler-batch-job/requirements.txt
@@ -16,4 +16,5 @@ requests==2.31.0
attrs==23.1.0
feedparser==6.0.10
aws_xray_sdk==2.12.1
defusedxml==0.7.1
defusedxml==0.7.1
pdfplumber==0.11.0
@@ -119,7 +119,8 @@ export class DocumentsClient {
sitemap: boolean,
address: string,
followLinks: boolean,
limit: number
limit: number,
contentTypes: string[]
): Promise<GraphQLResult<GraphQLQuery<AddWebsiteMutation>>> {
const result = API.graphql<GraphQLQuery<AddWebsiteMutation>>({
query: addWebsite,
@@ -130,6 +131,7 @@
address,
followLinks,
limit,
contentTypes,
},
},
});
@@ -141,7 +143,8 @@
address: string,
title: string,
limit: number,
followLinks: boolean
followLinks: boolean,
contentTypes: string[]
): Promise<GraphQLResult<GraphQLQuery<AddRssFeedMutation>>> {
const result = API.graphql<GraphQLQuery<AddRssFeedMutation>>({
query: addRssFeed,
Expand All @@ -152,6 +155,7 @@ export class DocumentsClient {
title,
limit,
followLinks,
contentTypes,
},
},
});
@@ -222,7 +226,8 @@ export class DocumentsClient {
workspaceId: string,
feedId: string,
followLinks: boolean,
limit: number
limit: number,
contentTypes: string[]
): Promise<GraphQLResult<GraphQLQuery<UpdateRssFeedMutation>>> {
const result = API.graphql<GraphQLQuery<UpdateRssFeedMutation>>({
query: addRssFeed,
Expand All @@ -232,6 +237,7 @@ export class DocumentsClient {
documentId: feedId,
followLinks,
limit,
contentTypes,
},
},
});
@@ -8,8 +8,10 @@ import {
Input,
SpaceBetween,
Toggle,
Multiselect,
} from "@cloudscape-design/components";
import { AddDataData } from "./types";
import { AddDataData, SelectOption, multiselectOptions } from "./types";
import { generateSelectedOptions } from "./utils";
import { useForm } from "../../../common/hooks/use-form";
import { useContext, useState } from "react";
import { AppContext } from "../../../common/app-context";
@@ -31,6 +33,7 @@ interface AddRssSubscriptionData {
rssFeedTitle: string;
linkLimit: number;
followLinks: boolean;
contentTypes: (string | undefined)[];
}

export default function AddRssSubscription(props: AddRssSubscriptionProps) {
@@ -46,6 +49,7 @@ export default function AddRssSubscription(props: AddRssSubscriptionProps) {
rssFeedTitle: "",
linkLimit: 250,
followLinks: true,
contentTypes: ["text/html"],
};
},
validate: (form) => {
@@ -77,13 +81,15 @@ export default function AddRssSubscription(props: AddRssSubscriptionProps) {
setGlobalError(undefined);

const apiClient = new ApiClient(appContext);
const contentTypesToUse = data.contentTypes.filter((ct): ct is string => ct !== undefined);
try {
await apiClient.documents.addRssFeedSubscription(
props.data.workspace.value,
data.rssFeedUrl,
data.rssFeedTitle,
data.linkLimit,
data.followLinks
data.followLinks,
contentTypesToUse
);

setFlashbarItem({
@@ -109,6 +115,20 @@
props.setSubmitting(false);
};

const handleContentTypeChange = (selectedOptions: ReadonlyArray<SelectOption>) => {
const options: SelectOption[] = selectedOptions.map(option => {
if (option.value === undefined) {
throw new Error(`Option value cannot be undefined`);
}
return {
label: option.label,
value: option.value,
description: option.description
};
});
onChange({ contentTypes: options.map(option => option.value) });
};

const hasReadyWorkspace =
typeof props.data.workspace?.value !== "undefined" &&
typeof props.selectedWorkspace !== "undefined" &&
@@ -191,6 +211,18 @@
}
/>
</FormField>
<FormField
label="Enabled Content Types"
errorText={errors.contentTypes}
description="Content Types to Enable for crawling"
>
<Multiselect
disabled={props.submitting}
selectedOptions={generateSelectedOptions(data.contentTypes)}
options={multiselectOptions}
onChange={({ detail }) => handleContentTypeChange(detail.selectedOptions)}
/>
</FormField>
</SpaceBetween>
</Container>
{flashbarItem !== null && <Flashbar items={[flashbarItem]} />}