Review notes (text before the first "diff --git" header is skipped by git apply):
- This patch was recovered from a copy whose line breaks were collapsed and whose
  angle-bracket spans were stripped. The indentation of context lines, the
  Connector<...> generics in ConnectorTitle.tsx and the <LogoIcon ... /> lines in
  icons.tsx are reconstructions - confirm them against the working tree.
- connector.py was revised during review (class renamed to match the factory
  import, load_from_state added, crawl rewritten), so the blob id on its "index"
  line no longer matches the content.
- TODO(review): connectors.tsx collects the personal access token as a plain
  connector field; consider moving it to the credential form instead.

diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py
index 7b3ea8e81bb..40c783ed3ac 100644
--- a/backend/danswer/configs/constants.py
+++ b/backend/danswer/configs/constants.py
@@ -92,6 +92,7 @@ class DocumentSource(str, Enum):
     GMAIL = "gmail"
     REQUESTTRACKER = "requesttracker"
     GITHUB = "github"
+    GITHUB_PAGES = "github_pages"
     GITLAB = "gitlab"
     GURU = "guru"
     BOOKSTACK = "bookstack"
diff --git a/backend/danswer/connectors/factory.py b/backend/danswer/connectors/factory.py
index 40f926b31d1..16f94ca2653 100644
--- a/backend/danswer/connectors/factory.py
+++ b/backend/danswer/connectors/factory.py
@@ -19,6 +19,7 @@
 from danswer.connectors.fireflies.connector import FirefliesConnector
 from danswer.connectors.freshdesk.connector import FreshdeskConnector
 from danswer.connectors.github.connector import GithubConnector
+from danswer.connectors.github_pages.connector import GithubPagesConnector
 from danswer.connectors.gitlab.connector import GitlabConnector
 from danswer.connectors.gmail.connector import GmailConnector
 from danswer.connectors.gong.connector import GongConnector
@@ -68,6 +69,7 @@ def identify_connector_class(
             InputType.SLIM_RETRIEVAL: SlackPollConnector,
         },
         DocumentSource.GITHUB: GithubConnector,
+        DocumentSource.GITHUB_PAGES: GithubPagesConnector,
         DocumentSource.GMAIL: GmailConnector,
         DocumentSource.GITLAB: GitlabConnector,
         DocumentSource.GOOGLE_DRIVE: GoogleDriveConnector,
diff --git a/backend/danswer/connectors/github_pages/__init__.py b/backend/danswer/connectors/github_pages/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/backend/danswer/connectors/github_pages/connector.py b/backend/danswer/connectors/github_pages/connector.py
new file mode 100644
index 00000000000..694bd112869
--- /dev/null
+++ b/backend/danswer/connectors/github_pages/connector.py
@@ -0,0 +1,178 @@
+import os
+from collections import deque
+from typing import Any, List, Optional
+from urllib.parse import urldefrag, urljoin
+
+import requests
+from bs4 import BeautifulSoup
+from requests.auth import HTTPBasicAuth
+
+from danswer.configs.app_configs import INDEX_BATCH_SIZE
+from danswer.configs.constants import DocumentSource
+from danswer.connectors.interfaces import GenerateDocumentsOutput
+from danswer.connectors.interfaces import LoadConnector
+from danswer.connectors.interfaces import PollConnector
+from danswer.connectors.interfaces import SecondsSinceUnixEpoch
+from danswer.connectors.models import Document
+from danswer.connectors.models import Section
+from danswer.utils.logger import setup_logger
+
+logger = setup_logger()
+
+# Per-request timeout, in seconds (passed to requests.get).
+_TIMEOUT = 60
+# Maximum number of link hops followed away from the base URL.
+_MAX_DEPTH = 5
+
+
+class GithubPagesConnector(LoadConnector, PollConnector):
+    """Crawl a GitHub Pages site and index the text of each reachable page.
+
+    Crawling starts at ``base_url`` and follows ``<a href>`` links that stay
+    under ``base_url``, at most ``_MAX_DEPTH`` hops away. Each page is
+    fetched once and becomes one ``Document``.
+    """
+
+    def __init__(
+        self,
+        base_url: str,
+        batch_size: int = INDEX_BATCH_SIZE,
+        use_authentication: bool = False,
+        github_username: Optional[str] = None,
+        github_personal_access_token: Optional[str] = None,
+    ) -> None:
+        """Store the crawl settings.
+
+        Args:
+            base_url: Root of the site; only URLs starting with it are crawled.
+            batch_size: Maximum number of documents per yielded batch.
+            use_authentication: Send the username/token below with requests.
+            github_username: Optional HTTP basic-auth user name.
+            github_personal_access_token: Optional HTTP basic-auth password.
+
+        The three authentication arguments mirror the fields declared for
+        this connector in ``web/src/lib/connectors/connectors.tsx``.
+        NOTE(review): presumably those form values reach this constructor as
+        keyword arguments - confirm against the connector factory.
+        """
+        self.base_url = base_url
+        self.batch_size = batch_size
+        self.visited_urls: set[str] = set()
+        self.auth: Optional[HTTPBasicAuth] = None
+        if use_authentication and github_username and github_personal_access_token:
+            self.auth = HTTPBasicAuth(github_username, github_personal_access_token)
+
+    def load_credentials(self, credentials: dict[str, Any]) -> None:
+        """Enable basic auth when ``credentials`` holds a username and token.
+
+        If either value is missing, authentication configured through the
+        constructor is left untouched.
+        """
+        github_username = credentials.get("github_username")
+        github_token = credentials.get("github_personal_access_token")
+        if github_username and github_token:
+            self.auth = HTTPBasicAuth(github_username, github_token)
+
+    def _fetch_page(self, url: str) -> Optional[BeautifulSoup]:
+        """Download and parse ``url``; return None if it cannot be indexed."""
+        try:
+            response = requests.get(url, timeout=_TIMEOUT, auth=self.auth)
+            response.raise_for_status()
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Error accessing {url}: {e}")
+            return None
+
+        # Links can point at images, PDFs, etc.; only HTML is worth parsing.
+        content_type = response.headers.get("Content-Type", "")
+        if content_type and "html" not in content_type.lower():
+            return None
+
+        return BeautifulSoup(response.text, "html.parser")
+
+    def _extract_links(self, page_url: str, soup: BeautifulSoup) -> List[str]:
+        """Return the in-site URLs linked from ``soup``, fragments removed."""
+        links = []
+        for anchor in soup.find_all("a"):
+            href = anchor.get("href")
+            if not href:
+                continue
+            # A relative href is relative to the page it appears on, not to
+            # the site root, and "#section" fragments address the same page.
+            full_url, _ = urldefrag(urljoin(page_url, href))
+            if full_url.startswith(self.base_url):
+                links.append(full_url)
+        return links
+
+    @staticmethod
+    def _build_document(url: str, soup: BeautifulSoup) -> Document:
+        """Wrap the text of a parsed page in a ``Document``."""
+        return Document(
+            id=url,
+            sections=[Section(link=url, text=soup.get_text())],
+            source=DocumentSource.GITHUB_PAGES,
+            semantic_identifier=url,
+            metadata={"url": url},
+        )
+
+    def _pull_all_pages(self) -> GenerateDocumentsOutput:
+        """Crawl the site breadth-first and yield documents in batches."""
+        start_url, _ = urldefrag(self.base_url)
+        # Reset on every run so a reused connector instance re-crawls the site.
+        # URLs are recorded when queued, so each one is fetched at most once.
+        self.visited_urls = {start_url}
+        to_visit: deque[tuple[str, int]] = deque([(start_url, 0)])
+        batch: List[Document] = []
+
+        while to_visit:
+            current_url, depth = to_visit.popleft()
+            soup = self._fetch_page(current_url)
+            if soup is None:
+                continue
+
+            batch.append(self._build_document(current_url, soup))
+
+            if depth < _MAX_DEPTH:
+                for link in self._extract_links(current_url, soup):
+                    if link not in self.visited_urls:
+                        self.visited_urls.add(link)
+                        to_visit.append((link, depth + 1))
+
+            if len(batch) >= self.batch_size:
+                yield batch
+                batch = []
+
+        if batch:
+            yield batch
+
+    def load_from_state(self) -> GenerateDocumentsOutput:
+        """Index the whole site.
+
+        NOTE(review): ``LoadConnector`` is expected to require this method;
+        confirm against ``danswer.connectors.interfaces``.
+        """
+        yield from self._pull_all_pages()
+
+    def poll_source(
+        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
+    ) -> GenerateDocumentsOutput:
+        """Re-index the whole site.
+
+        ``start`` and ``end`` are ignored: no modification time is read from
+        the fetched pages, so every poll is a full crawl.
+        """
+        yield from self._pull_all_pages()
+
+
+if __name__ == "__main__":
+    connector = GithubPagesConnector(base_url=os.environ["GITHUB_PAGES_BASE_URL"])
+
+    credentials = {
+        "github_username": os.getenv("GITHUB_USERNAME", ""),
+        "github_personal_access_token": os.getenv("GITHUB_PERSONAL_ACCESS_TOKEN", ""),
+    }
+
+    connector.load_credentials(credentials)
+
+    document_batches = connector.poll_source(0, 0)
+    # A site with no reachable pages yields nothing; don't raise StopIteration.
+    print(next(document_batches, None))
diff --git a/web/public/GithubPages.png b/web/public/GithubPages.png
new file mode 100644
index 00000000000..b69cabe8ed6
Binary files /dev/null and b/web/public/GithubPages.png differ
diff --git a/web/src/components/admin/connectors/ConnectorTitle.tsx b/web/src/components/admin/connectors/ConnectorTitle.tsx
index 6e2da252aec..530d951d941 100644
--- a/web/src/components/admin/connectors/ConnectorTitle.tsx
+++ b/web/src/components/admin/connectors/ConnectorTitle.tsx
@@ -2,6 +2,7 @@ import {
   ConfluenceConfig,
   Connector,
   GithubConfig,
+  GithubPagesConfig,
   GitlabConfig,
   GoogleDriveConfig,
   JiraConfig,
@@ -40,6 +41,9 @@ export const ConnectorTitle = ({
       "Repo",
       `${typedConnector.connector_specific_config.repo_owner}/${typedConnector.connector_specific_config.repo_name}`
     );
+  } else if (connector.source === "github_pages") {
+    const typedConnector = connector as Connector<GithubPagesConfig>;
+    additionalMetadata.set("Site URL", typedConnector.connector_specific_config.base_url);
   } else if (connector.source === "gitlab") {
     const typedConnector = connector as Connector<GitlabConfig>;
     additionalMetadata.set(
diff --git a/web/src/components/icons/icons.tsx b/web/src/components/icons/icons.tsx
index a2f01084fd7..34eb082c0b2 100644
--- a/web/src/components/icons/icons.tsx
+++ b/web/src/components/icons/icons.tsx
@@ -1036,6 +1036,13 @@ export const GithubIcon = ({
   <LogoIcon size={size} className={className} src={githubIcon} />
 );
 
+export const GithubPagesIcon = ({
+  size = 16,
+  className = defaultTailwindCSS,
+}: IconProps) => (
+  <LogoIcon size={size} className={className} src="/GithubPages.png" />
+);
+
 export const GmailIcon = ({
   size = 16,
   className = defaultTailwindCSS,
diff --git a/web/src/lib/connectors/connectors.tsx b/web/src/lib/connectors/connectors.tsx
index 89dca2f287f..7c3249ef8cc 100644
--- a/web/src/lib/connectors/connectors.tsx
+++ b/web/src/lib/connectors/connectors.tsx
@@ -171,6 +171,50 @@ export const connectorConfigs: Record<
     ],
     advanced_values: [],
   },
+  github_pages: {
+    description: "Configure GitHub Pages connector",
+    values: [
+      {
+        type: "text",
+        query: "Enter the base URL of the GitHub Pages site:",
+        label: "Base URL",
+        name: "base_url",
+        optional: false,
+      },
+      {
+        type: "checkbox",
+        query: "Authenticate requests with GitHub credentials?",
+        label: "Use GitHub Authentication",
+        description: "Enable this if your GitHub Pages site requires authentication",
+        name: "use_authentication",
+        optional: true,
+      },
+    ],
+    advanced_values: [
+      {
+        type: "text",
+        query: "Enter your GitHub username (if using authentication):",
+        label: "GitHub Username",
+        name: "github_username",
+        optional: true,
+      },
+      {
+        type: "text",
+        query: "Enter your GitHub personal access token:",
+        label: "GitHub Personal Access Token",
+        name: "github_personal_access_token",
+        optional: true,
+      },
+      {
+        type: "number",
+        query: "Set the batch size for indexing (default is 10):",
+        label: "Batch Size",
+        name: "batch_size",
+        optional: true,
+        default: 10,
+      },
+    ],
+  },
   gitlab: {
     description: "Configure GitLab connector",
     values: [
@@ -1055,6 +1099,14 @@ export interface GithubConfig {
   include_issues: boolean;
 }
 
+export interface GithubPagesConfig {
+  base_url: string;
+  use_authentication?: boolean;
+  github_username?: string;
+  github_personal_access_token?: string;
+  batch_size?: number;
+}
+
 export interface GitlabConfig {
   project_owner: string;
   project_name: string;
diff --git a/web/src/lib/sources.ts b/web/src/lib/sources.ts
index fde648db209..47f5495795d 100644
--- a/web/src/lib/sources.ts
+++ b/web/src/lib/sources.ts
@@ -38,6 +38,7 @@ import {
   XenforoIcon,
   FreshdeskIcon,
   FirefliesIcon,
+  GithubPagesIcon,
 } from "@/components/icons/icons";
 import { ValidSources } from "./types";
 import {
@@ -95,6 +96,12 @@ const SOURCE_METADATA_MAP: SourceMap = {
     category: SourceCategory.CodeRepository,
     docs: "https://docs.danswer.dev/connectors/github",
   },
+  github_pages: {
+    icon: GithubPagesIcon,
+    displayName: "Github Pages",
+    category: SourceCategory.Wiki,
+    docs: "https://docs.danswer.dev/connectors/github_pages",
+  },
   gitlab: {
     icon: GitlabIcon,
     displayName: "Gitlab",
diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts
index c9f8d46b220..8cc46426c9b 100644
--- a/web/src/lib/types.ts
+++ b/web/src/lib/types.ts
@@ -228,6 +228,7 @@ export interface UserGroup {
 const validSources = [
   "web",
   "github",
+  "github_pages",
   "gitlab",
   "slack",
   "google_drive",