diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py index e807f381e87..6402bd744bc 100644 --- a/backend/danswer/configs/constants.py +++ b/backend/danswer/configs/constants.py @@ -102,6 +102,7 @@ class DocumentSource(str, Enum): S3 = "s3" R2 = "r2" GOOGLE_CLOUD_STORAGE = "google_cloud_storage" + GITHUB_PAGES = "github_pages" OCI_STORAGE = "oci_storage" NOT_APPLICABLE = "not_applicable" diff --git a/backend/danswer/connectors/factory.py b/backend/danswer/connectors/factory.py index 1a3d605d3a5..a8ce792bfda 100644 --- a/backend/danswer/connectors/factory.py +++ b/backend/danswer/connectors/factory.py @@ -44,6 +44,7 @@ from danswer.connectors.zendesk.connector import ZendeskConnector from danswer.connectors.zulip.connector import ZulipConnector from danswer.db.credentials import backend_update_credential_json +from danswer.connectors.github_pages.connector import GitHubPagesConnector from danswer.db.models import Credential @@ -95,6 +96,7 @@ def identify_connector_class( DocumentSource.R2: BlobStorageConnector, DocumentSource.GOOGLE_CLOUD_STORAGE: BlobStorageConnector, DocumentSource.OCI_STORAGE: BlobStorageConnector, + DocumentSource.GITHUB_PAGES: GitHubPagesConnector, } connector_by_source = connector_map.get(source, {}) diff --git a/backend/danswer/connectors/github_pages/__init__.py b/backend/danswer/connectors/github_pages/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/backend/danswer/connectors/github_pages/connector.py b/backend/danswer/connectors/github_pages/connector.py new file mode 100644 index 00000000000..6437e62c7ad --- /dev/null +++ b/backend/danswer/connectors/github_pages/connector.py @@ -0,0 +1,117 @@ +import os +import requests +from bs4 import BeautifulSoup +from urllib.parse import urljoin +from typing import Any, List, Optional +from requests.auth import HTTPBasicAuth +from danswer.configs.app_configs import INDEX_BATCH_SIZE +from danswer.configs.constants import DocumentSource 
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger

logger = setup_logger()

# Per-request timeout, in seconds, for every page fetch.
_TIMEOUT = 60


class GitHubPagesConnector(LoadConnector, PollConnector):
    """Crawl a GitHub Pages site and emit one ``Document`` per fetched page.

    The crawl starts at ``base_url`` and follows ``<a href>`` links that stay
    under that URL prefix on the same host. Pages are yielded in batches of
    at most ``batch_size`` documents.
    """

    def __init__(
        self,
        base_url: str,
        batch_size: int = INDEX_BATCH_SIZE,
    ) -> None:
        """Store the crawl configuration; no network access happens here.

        Args:
            base_url: Root URL of the site. Only links that start with this
                prefix (and share its host) are followed.
            batch_size: Maximum number of pages per yielded batch.

        NOTE(review): the web form in ``connectors.ts`` names its fields
        ``github_pages_base_url`` / ``github_pages_batch_size``; presumably
        those names have to match these parameter names -- confirm.
        """
        self.base_url = base_url
        self.batch_size = batch_size
        self.visited_urls: set[str] = set()
        # Optional HTTP basic auth, set by load_credentials().
        self.auth: Optional[HTTPBasicAuth] = None
        # Crawl frontier. Kept on the instance so that it survives between
        # successive batches of the same crawl.
        self._pending_urls: List[str] = []
        # Extracted text of pages fetched but not yet turned into Documents.
        self._page_text: dict[str, str] = {}

    def load_credentials(self, credentials: dict[str, Any]) -> None:
        """Enable HTTP basic auth when both a username and a token are given.

        The token is looked up under ``github_personal_access_token`` first
        and ``github_access_token`` second, because the frontend credential
        interface in this change declares the latter key. With either value
        missing or empty the connector stays unauthenticated.
        """
        github_username = credentials.get("github_username")
        github_token = credentials.get(
            "github_personal_access_token"
        ) or credentials.get("github_access_token")
        if github_username and github_token:
            self.auth = HTTPBasicAuth(github_username, github_token)
        else:
            self.auth = None

    def _resolve_link(self, page_url: str, href: str) -> Optional[str]:
        """Return the absolute, fragment-free URL for ``href``, or ``None``.

        ``href`` is resolved against the page it was found on, so relative
        links on nested pages point to the right place. ``None`` is returned
        for links that leave the site.
        """
        from urllib.parse import urldefrag, urlsplit

        # "#section" anchors address the same page; drop them to avoid
        # indexing one page several times.
        full_url = urldefrag(urljoin(page_url, href)).url
        if not full_url.startswith(self.base_url):
            return None
        # A bare prefix test would accept "https://x.github.io.evil.com/" for
        # base "https://x.github.io" and send the credentials there.
        if urlsplit(full_url).netloc != urlsplit(self.base_url).netloc:
            return None
        return full_url

    def _crawl_github_pages(self, url: str, batch_size: int) -> List[str]:
        """Fetch up to ``batch_size`` not-yet-visited pages.

        ``url`` seeds the frontier if it has not been visited yet; otherwise
        the crawl resumes from the links discovered by earlier calls.

        Args:
            url: Seed URL for the crawl.
            batch_size: Maximum number of pages to fetch in this call.

        Returns:
            URLs fetched successfully during this call. An empty list means
            there is nothing left to crawl.
        """
        if url not in self.visited_urls and url not in self._pending_urls:
            self._pending_urls.append(url)

        crawled_urls: List[str] = []
        while self._pending_urls and len(crawled_urls) < batch_size:
            current_url = self._pending_urls.pop()
            if current_url in self.visited_urls:
                continue
            # Mark before fetching so that a URL which errors out is not
            # queued again each time another page links to it.
            self.visited_urls.add(current_url)

            try:
                # auth=None is requests' default, i.e. an unauthenticated call.
                response = requests.get(
                    current_url, timeout=_TIMEOUT, auth=self.auth
                )
                response.raise_for_status()
                soup = BeautifulSoup(response.text, "html.parser")
                page_text = soup.get_text(separator="\n", strip=True)
                hrefs = [link.get("href") for link in soup.find_all("a")]
            except Exception as e:
                # Best effort: one broken page must not abort the whole crawl.
                logger.error(f"Error while accessing {current_url}: {e}")
                continue

            self._page_text[current_url] = page_text
            crawled_urls.append(current_url)

            for href in hrefs:
                if not href:
                    continue
                full_url = self._resolve_link(current_url, href)
                if full_url is not None and full_url not in self.visited_urls:
                    self._pending_urls.append(full_url)

        return crawled_urls

    def _index_pages(self, urls: List[str]) -> List[Document]:
        """Build one ``Document`` per URL from the text captured while crawling.

        The cached text is removed as it is consumed, so memory use is
        bounded by one batch. A URL with no cached text gets an empty section.
        """
        return [
            Document(
                id=url,
                sections=[Section(link=url, text=self._page_text.pop(url, ""))],
                source=DocumentSource.GITHUB_PAGES,
                semantic_identifier=url,
                metadata={"url": url},
            )
            for url in urls
        ]

    def _pull_all_pages(self) -> GenerateDocumentsOutput:
        """Crawl the whole site from ``base_url`` and yield document batches."""
        # Start every run from a clean slate so that calling the connector a
        # second time crawls the site again instead of yielding nothing.
        self.visited_urls = set()
        self._pending_urls = []
        self._page_text = {}

        while True:
            crawled_urls = self._crawl_github_pages(self.base_url, self.batch_size)
            if not crawled_urls:
                break
            yield self._index_pages(crawled_urls)

    def load_from_state(self) -> GenerateDocumentsOutput:
        """Yield every page of the site in batches.

        NOTE(review): ``LoadConnector`` is expected to declare this method as
        abstract, in which case the class cannot be instantiated without it --
        confirm against ``danswer.connectors.interfaces``.
        """
        yield from self._pull_all_pages()

    def poll_source(
        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
    ) -> GenerateDocumentsOutput:
        """Yield every page of the site in batches.

        ``start`` and ``end`` are ignored: the crawl records no modification
        times, so each poll re-crawls the full site.
        """
        yield from self._pull_all_pages()


if __name__ == "__main__":
    connector = GitHubPagesConnector(
        base_url=os.environ["GITHUB_PAGES_BASE_URL"]
    )

    # Load credentials if provided (otherwise unauthenticated)
    credentials = {
        "github_username": os.getenv("GITHUB_USERNAME", ""),
        "github_personal_access_token": os.getenv("GITHUB_PERSONAL_ACCESS_TOKEN", ""),
    }

    connector.load_credentials(credentials)

    document_batches = connector.poll_source(0, 0)
    # An empty crawl yields no batch; print an empty list instead of raising
    # StopIteration.
    print(next(document_batches, []))
+ GitHub Pages Logo +
+ ); +}; export const GmailIcon = ({ size = 16, diff --git a/web/src/lib/connectors/connectors.ts b/web/src/lib/connectors/connectors.ts index 21d7802bfce..09f1cf2894d 100644 --- a/web/src/lib/connectors/connectors.ts +++ b/web/src/lib/connectors/connectors.ts @@ -141,6 +141,27 @@ export const connectorConfigs: Record< }, ], }, + github_pages: { + description: "Configure GitHub Pages connector", + values: [ + { + type: "text", + query: "Enter the base URL of your GitHub Pages site:", + label: "GitHub Pages Base URL", + name: "github_pages_base_url", + optional: false, + }, + { + type: "number", + query: "Enter the batch size for indexing (number of pages per batch):", + label: "Index Batch Size", + name: "github_pages_batch_size", + optional: true, + default: 10, // Default batch size + }, + ], + }, + gitlab: { description: "Configure GitLab connector", values: [ diff --git a/web/src/lib/connectors/credentials.ts b/web/src/lib/connectors/credentials.ts index 424a07c82fe..51948d5c936 100644 --- a/web/src/lib/connectors/credentials.ts +++ b/web/src/lib/connectors/credentials.ts @@ -18,6 +18,11 @@ export interface Credential extends CredentialBase { export interface GithubCredentialJson { github_access_token: string; } +export interface GitthubPagesCredentialJson { + github_username: string; + github_access_token: string; + github_repo: string; +} export interface GitlabCredentialJson { gitlab_url: string; diff --git a/web/src/lib/sources.ts b/web/src/lib/sources.ts index bbc63847adb..95e36e67cf3 100644 --- a/web/src/lib/sources.ts +++ b/web/src/lib/sources.ts @@ -8,6 +8,7 @@ import { DropboxIcon, FileIcon, GithubIcon, + GitHubPagesIcon, GitlabIcon, GlobeIcon, GmailIcon, @@ -93,6 +94,12 @@ const SOURCE_METADATA_MAP: SourceMap = { category: SourceCategory.CodeRepository, docs: "https://docs.danswer.dev/connectors/github", }, + github_pages: { + icon: GitHubPagesIcon, + displayName: "Github Pages", + category: SourceCategory.Wiki, + docs: 
"https://docs.danswer.dev/connectors/github_pages", + }, gitlab: { icon: GitlabIcon, displayName: "Gitlab", diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts index 23fab57648f..81a59d04c6d 100644 --- a/web/src/lib/types.ts +++ b/web/src/lib/types.ts @@ -211,6 +211,7 @@ export interface UserGroup { const validSources = [ "web", "github", + "github_pages", "gitlab", "slack", "google_drive",