diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py
index e807f381e87..6402bd744bc 100644
--- a/backend/danswer/configs/constants.py
+++ b/backend/danswer/configs/constants.py
@@ -102,6 +102,7 @@ class DocumentSource(str, Enum):
     S3 = "s3"
     R2 = "r2"
     GOOGLE_CLOUD_STORAGE = "google_cloud_storage"
+    GITHUB_PAGES = "github_pages"
     OCI_STORAGE = "oci_storage"
 
     NOT_APPLICABLE = "not_applicable"
diff --git a/backend/danswer/connectors/factory.py b/backend/danswer/connectors/factory.py
index 1a3d605d3a5..a8ce792bfda 100644
--- a/backend/danswer/connectors/factory.py
+++ b/backend/danswer/connectors/factory.py
@@ -44,6 +44,7 @@
 from danswer.connectors.zendesk.connector import ZendeskConnector
 from danswer.connectors.zulip.connector import ZulipConnector
 from danswer.db.credentials import backend_update_credential_json
+from danswer.connectors.github_pages.connector import GitHubPagesConnector
 from danswer.db.models import Credential
@@ -95,6 +96,7 @@ def identify_connector_class(
         DocumentSource.R2: BlobStorageConnector,
         DocumentSource.GOOGLE_CLOUD_STORAGE: BlobStorageConnector,
         DocumentSource.OCI_STORAGE: BlobStorageConnector,
+        DocumentSource.GITHUB_PAGES: GitHubPagesConnector,
     }
 
     connector_by_source = connector_map.get(source, {})
diff --git a/backend/danswer/connectors/github_pages/__init__.py b/backend/danswer/connectors/github_pages/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/backend/danswer/connectors/github_pages/connector.py b/backend/danswer/connectors/github_pages/connector.py
new file mode 100644
index 00000000000..6437e62c7ad
--- /dev/null
+++ b/backend/danswer/connectors/github_pages/connector.py
@@ -0,0 +1,128 @@
+import os
+from typing import Any, Optional
+from urllib.parse import urljoin
+
+import requests
+from bs4 import BeautifulSoup
+from requests.auth import HTTPBasicAuth
+
+from danswer.configs.app_configs import INDEX_BATCH_SIZE
+from danswer.configs.constants import DocumentSource
+from danswer.connectors.interfaces import GenerateDocumentsOutput
+from danswer.connectors.interfaces import LoadConnector
+from danswer.connectors.interfaces import PollConnector
+from danswer.connectors.interfaces import SecondsSinceUnixEpoch
+from danswer.connectors.models import Document
+from danswer.connectors.models import Section
+from danswer.utils.logger import setup_logger
+
+logger = setup_logger()
+
+_TIMEOUT = 60
+
+
+class GitHubPagesConnector(LoadConnector, PollConnector):
+    def __init__(
+        self,
+        base_url: str,
+        batch_size: int = INDEX_BATCH_SIZE,
+    ) -> None:
+        self.base_url = base_url
+        self.batch_size = batch_size
+        self.visited_urls: set[str] = set()
+        # The crawl frontier persists across batches so _pull_all_pages can
+        # resume where the previous batch stopped.
+        self.to_visit: list[str] = [base_url]
+        self.auth: Optional[HTTPBasicAuth] = None  # Set by load_credentials
+
+    def load_credentials(self, credentials: dict[str, Any]) -> None:
+        # Use basic auth when both values are present; otherwise crawl
+        # unauthenticated, which is fine for public GitHub Pages sites.
+        github_username = credentials.get("github_username")
+        github_token = credentials.get("github_personal_access_token")
+        if github_username and github_token:
+            self.auth = HTTPBasicAuth(github_username, github_token)
+        else:
+            self.auth = None
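+
+    # For reference, the credentials payload this connector reads looks like
+    # the following (key names as used above; the values are hypothetical
+    # placeholders):
+    #     {
+    #         "github_username": "octocat",
+    #         "github_personal_access_token": "ghp_xxxxxxxx",
+    #     }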
+
+    def _crawl_github_pages(self, batch_size: int) -> list[tuple[str, str]]:
+        """Crawl up to batch_size unvisited same-site pages and return
+        (url, extracted_text) pairs."""
+        crawled: list[tuple[str, str]] = []
+        while self.to_visit and len(crawled) < batch_size:
+            current_url = self.to_visit.pop()
+            if current_url in self.visited_urls:
+                continue
+            try:
+                response = requests.get(current_url, timeout=_TIMEOUT, auth=self.auth)
+                response.raise_for_status()
+                soup = BeautifulSoup(response.text, "html.parser")
+
+                self.visited_urls.add(current_url)
+                crawled.append((current_url, soup.get_text(separator="\n", strip=True)))
+
+                # Queue same-site links; a relative href resolves against the
+                # page it appears on, not against the site root.
+                for link in soup.find_all("a"):
+                    href = link.get("href")
+                    if not href:
+                        continue
+                    full_url = urljoin(current_url, href)
+                    if full_url.startswith(self.base_url) and full_url not in self.visited_urls:
+                        self.to_visit.append(full_url)
+            except Exception as e:
+                logger.error(f"Error while accessing {current_url}: {e}")
+
+        return crawled
+
+    def _index_pages(self, pages: list[tuple[str, str]]) -> list[Document]:
+        return [
+            Document(
+                id=url,
+                sections=[Section(link=url, text=text)],
+                source=DocumentSource.GITHUB_PAGES,
+                semantic_identifier=url,
+                metadata={"url": url},
+            )
+            for url, text in pages
+        ]
+
+    def _pull_all_pages(self) -> GenerateDocumentsOutput:
+        while True:
+            pages = self._crawl_github_pages(self.batch_size)
+            if not pages:
+                break
+            yield self._index_pages(pages)
+
+    def load_from_state(self) -> GenerateDocumentsOutput:
+        yield from self._pull_all_pages()
+
+    def poll_source(
+        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
+    ) -> GenerateDocumentsOutput:
+        # GitHub Pages exposes no change feed, so each poll re-crawls the
+        # site; start/end are accepted for interface compatibility.
+        yield from self._pull_all_pages()
+
+
+if __name__ == "__main__":
+    connector = GitHubPagesConnector(base_url=os.environ["GITHUB_PAGES_BASE_URL"])
+
+    # Credentials are optional; without them the crawl is unauthenticated.
+    connector.load_credentials(
+        {
+            "github_username": os.getenv("GITHUB_USERNAME", ""),
+            "github_personal_access_token": os.getenv("GITHUB_PERSONAL_ACCESS_TOKEN", ""),
+        }
+    )
+
+    document_batches = connector.poll_source(0, 0)
+    print(next(document_batches))
diff --git a/web/src/components/icons/icons.tsx b/web/src/components/icons/icons.tsx
index b5e735b0e65..6225fe6de62 100644
--- a/web/src/components/icons/icons.tsx
+++ b/web/src/components/icons/icons.tsx
@@ -1068,6 +1068,20 @@ export const GithubIcon = ({
   );
 };
 
+export const GitHubPagesIcon = ({
+  size = 16,
+  className = defaultTailwindCSS,
+}: IconProps) => {
+  // GitHub Pages icon; spacing adjusted for surrounding whitespace
+  return (
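
Note on link resolution in the crawler: urljoin resolves a relative href
against the URL of the page it was found on, which is why
_crawl_github_pages joins each href against current_url rather than
self.base_url; joining against the site root would silently mis-resolve
links on nested pages. A minimal sketch of the difference (the URLs are
hypothetical):

    from urllib.parse import urljoin

    base_url = "https://example.github.io/project/"
    current_url = "https://example.github.io/project/docs/intro.html"

    # Resolved against the page it appears on, a sibling link keeps its path:
    print(urljoin(current_url, "setup.html"))
    # https://example.github.io/project/docs/setup.html

    # Resolved against the site root, the same href loses the /docs/ prefix:
    print(urljoin(base_url, "setup.html"))
    # https://example.github.io/project/setup.html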