Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add Github Pages Connector #3009

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions backend/danswer/configs/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ class DocumentSource(str, Enum):
GMAIL = "gmail"
REQUESTTRACKER = "requesttracker"
GITHUB = "github"
GITHUB_PAGES = "github_pages"
GITLAB = "gitlab"
GURU = "guru"
BOOKSTACK = "bookstack"
Expand Down
2 changes: 2 additions & 0 deletions backend/danswer/connectors/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from danswer.connectors.fireflies.connector import FirefliesConnector
from danswer.connectors.freshdesk.connector import FreshdeskConnector
from danswer.connectors.github.connector import GithubConnector
from danswer.connectors.github_pages.connector import GithubPagesConnector
from danswer.connectors.gitlab.connector import GitlabConnector
from danswer.connectors.gmail.connector import GmailConnector
from danswer.connectors.gong.connector import GongConnector
Expand Down Expand Up @@ -68,6 +69,7 @@ def identify_connector_class(
InputType.SLIM_RETRIEVAL: SlackPollConnector,
},
DocumentSource.GITHUB: GithubConnector,
DocumentSource.GITHUB_PAGES: GithubPagesConnector,
DocumentSource.GMAIL: GmailConnector,
DocumentSource.GITLAB: GitlabConnector,
DocumentSource.GOOGLE_DRIVE: GoogleDriveConnector,
Expand Down
Empty file.
121 changes: 121 additions & 0 deletions backend/danswer/connectors/github_pages/connector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import os
from collections import deque
from typing import Any, List, Optional
from urllib.parse import urldefrag
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from requests.auth import HTTPBasicAuth

from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger

# Module-level logger obtained from danswer's setup_logger helper.
logger = setup_logger()

# Timeout handed to every requests.get call in this module
# (requests interprets the value in seconds).
_TIMEOUT = 60
# Depth limit consulted by GitHubPagesConnector._crawl_github_pages.
_MAX_DEPTH = 5


class GitHubPagesConnector(LoadConnector, PollConnector):
    """Crawl a GitHub Pages site and emit one ``Document`` per reachable page.

    Starting at ``base_url`` the crawler follows ``<a href>`` links whose
    resolved URL still starts with ``base_url``, breadth-first and at most
    ``_MAX_DEPTH`` links away from the start page. Pages are handed out in
    batches of at most ``batch_size`` documents.

    Requests are sent unauthenticated unless ``load_credentials`` received both
    a ``github_username`` and a ``github_personal_access_token``, in which case
    HTTP basic auth is used.

    Args:
        base_url: Root URL of the site; only URLs starting with it are crawled.
        batch_size: Maximum number of pages fetched and yielded per batch.
    """

    def __init__(
        self,
        base_url: str,
        batch_size: int = INDEX_BATCH_SIZE,
    ) -> None:
        self.base_url = base_url
        self.batch_size = batch_size
        self.auth: Optional[HTTPBasicAuth] = None
        self._reset_crawl_state()

    def _reset_crawl_state(self) -> None:
        """Drop everything remembered from an earlier crawl."""
        # URLs that were already requested (successfully or not).
        self.visited_urls: set[str] = set()
        # URLs that were ever put on the frontier; prevents duplicate entries.
        self._queued_urls: set[str] = set()
        # FIFO frontier of (url, depth) pairs. It lives on the instance so that
        # consecutive batches continue where the previous batch stopped.
        self._pending: deque[tuple[str, int]] = deque()
        # HTML downloaded while crawling, kept until the page is indexed so the
        # same page is not downloaded a second time.
        self._html_cache: dict[str, str] = {}

    def load_credentials(self, credentials: dict[str, Any]) -> None:
        """Configure HTTP basic auth from ``credentials``.

        Auth is enabled only when both ``github_username`` and
        ``github_personal_access_token`` are present and non-empty; otherwise
        requests are sent without authentication.
        """
        github_username = credentials.get("github_username")
        github_token = credentials.get("github_personal_access_token")
        if github_username and github_token:
            self.auth = HTTPBasicAuth(github_username, github_token)
        else:
            self.auth = None

    def _fetch_html(self, url: str) -> str:
        """GET ``url`` and return the response body.

        Raises:
            requests.exceptions.RequestException: on connection errors,
                timeouts, or a 4xx/5xx status.
        """
        response = requests.get(url, timeout=_TIMEOUT, auth=self.auth)
        response.raise_for_status()
        return response.text

    def _enqueue(self, url: str, depth: int) -> None:
        """Add ``url`` to the frontier unless it was already seen."""
        if url in self.visited_urls or url in self._queued_urls:
            return
        self._queued_urls.add(url)
        self._pending.append((url, depth))

    def _crawl_github_pages(self, url: str, batch_size: int, depth: int = 0) -> List[str]:
        """Fetch the next batch of pages and return their URLs.

        Args:
            url: Seed page; queued at ``depth`` if it has not been seen yet.
            batch_size: Maximum number of pages to fetch in this call.
            depth: Link distance assigned to ``url``.

        Returns:
            URLs fetched successfully during this call. An empty list means the
            frontier is exhausted (or ``depth`` exceeds ``_MAX_DEPTH``).
        """
        if depth > _MAX_DEPTH:
            return []

        self._enqueue(url, depth)

        crawled_urls: List[str] = []
        while self._pending and len(crawled_urls) < batch_size:
            current_url, current_depth = self._pending.popleft()
            if current_url in self.visited_urls:
                continue
            # Mark before fetching so a broken page is requested only once,
            # no matter how many other pages link to it.
            self.visited_urls.add(current_url)

            try:
                html = self._fetch_html(current_url)
            except requests.exceptions.RequestException as e:
                logger.error(f"Error accessing {current_url}: {e}")
                continue

            crawled_urls.append(current_url)
            self._html_cache[current_url] = html

            if current_depth >= _MAX_DEPTH:
                # The page itself is indexed, but its links would be too deep.
                continue

            soup = BeautifulSoup(html, "html.parser")
            for link in soup.find_all("a"):
                href = link.get("href")
                if not href:
                    continue
                # Relative links are relative to the page they appear on, and
                # "#fragment" variants point at the same document.
                full_url = urldefrag(urljoin(current_url, href)).url
                if full_url.startswith(self.base_url):
                    self._enqueue(full_url, current_depth + 1)

        return crawled_urls

    def _index_pages(self, urls: List[str]) -> List[Document]:
        """Turn each URL into a ``Document`` holding the page's visible text.

        HTML cached by the crawler is reused; URLs without a cached body are
        downloaded. URLs that cannot be downloaded are logged and skipped.
        """
        documents: List[Document] = []
        for url in urls:
            html = self._html_cache.pop(url, None)
            if html is None:
                try:
                    html = self._fetch_html(url)
                except requests.exceptions.RequestException as e:
                    logger.error(f"Failed to fetch content for indexing from {url}: {e}")
                    continue

            text_content = BeautifulSoup(html, "html.parser").get_text()
            documents.append(
                Document(
                    id=url,
                    sections=[Section(link=url, text=text_content)],
                    source=DocumentSource.GITHUB_PAGES,
                    semantic_identifier=url,
                    metadata={"url": url},
                )
            )

        return documents

    def _pull_all_pages(self) -> GenerateDocumentsOutput:
        """Crawl the whole site from scratch, yielding one document batch at a time."""
        # Start clean so the connector can be run more than once per instance.
        self._reset_crawl_state()
        while True:
            crawled_urls = self._crawl_github_pages(self.base_url, self.batch_size)
            if not crawled_urls:
                break
            documents = self._index_pages(crawled_urls)
            if documents:
                yield documents

    def load_from_state(self) -> GenerateDocumentsOutput:
        """LoadConnector entry point: index every reachable page."""
        yield from self._pull_all_pages()

    def poll_source(self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch) -> GenerateDocumentsOutput:
        """PollConnector entry point.

        The ``start``/``end`` window is not applied; every poll re-crawls the
        whole site.
        """
        yield from self._pull_all_pages()


# danswer.connectors.factory imports this class as ``GithubPagesConnector``;
# expose that spelling as well so the import resolves.
GithubPagesConnector = GitHubPagesConnector


if __name__ == "__main__":
    # Manual smoke test: crawl the site named by GITHUB_PAGES_BASE_URL and
    # print the first batch of documents.
    site_url = os.environ["GITHUB_PAGES_BASE_URL"]
    pages_connector = GitHubPagesConnector(base_url=site_url)

    # Empty strings (the fallback when a variable is unset) leave the
    # connector unauthenticated.
    pages_connector.load_credentials(
        {
            "github_username": os.getenv("GITHUB_USERNAME", ""),
            "github_personal_access_token": os.getenv("GITHUB_PERSONAL_ACCESS_TOKEN", ""),
        }
    )

    first_batch = next(pages_connector.poll_source(0, 0))
    print(first_batch)
Binary file added web/public/GithubPages.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 4 additions & 0 deletions web/src/components/admin/connectors/ConnectorTitle.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import {
ConfluenceConfig,
Connector,
GithubConfig,
GithubPagesConfig,
GitlabConfig,
GoogleDriveConfig,
JiraConfig,
Expand Down Expand Up @@ -40,6 +41,9 @@ export const ConnectorTitle = ({
"Repo",
`${typedConnector.connector_specific_config.repo_owner}/${typedConnector.connector_specific_config.repo_name}`
);
} else if (connector.source === "github_pages") {
const typedConnector = connector as Connector<GithubPagesConfig>;
additionalMetadata.set("Site URL", typedConnector.connector_specific_config.base_url);
} else if (connector.source === "gitlab") {
const typedConnector = connector as Connector<GitlabConfig>;
additionalMetadata.set(
Expand Down
7 changes: 7 additions & 0 deletions web/src/components/icons/icons.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -1036,6 +1036,13 @@ export const GithubIcon = ({
<LogoIcon size={size} className={className} src="/Github.png" />
);

// Logo icon for the GitHub Pages source, rendered from /GithubPages.png.
export const GithubPagesIcon = ({
  size = 16,
  className = defaultTailwindCSS,
}: IconProps) => {
  return <LogoIcon size={size} className={className} src="/GithubPages.png" />;
};

export const GmailIcon = ({
size = 16,
className = defaultTailwindCSS,
Expand Down
52 changes: 52 additions & 0 deletions web/src/lib/connectors/connectors.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,50 @@ export const connectorConfigs: Record<
],
advanced_values: [],
},
// Form definition for the "github_pages" source.
// NOTE(review): the Python GitHubPagesConnector.__init__ in this change accepts
// only base_url and batch_size, and reads github_username /
// github_personal_access_token from the credentials dict in load_credentials.
// Confirm how use_authentication and the two credential fields below reach the
// backend before relying on them as connector config.
github_pages: {
description: "Configure GitHub Pages connector",
values: [
{
type: "text",
query: "Enter the base URL of the GitHub Pages site:",
label: "Base URL",
name: "base_url",
optional: false,
},
{
type: "checkbox",
query: "Authenticate requests with GitHub credentials?",
label: "Use GitHub Authentication",
description: "Enable this if your GitHub Pages site requires authentication",
name: "use_authentication",
optional: true,
},
],
advanced_values: [
{
type: "text",
query: "Enter your GitHub username (if using authentication):",
label: "GitHub Username",
name: "github_username",
optional: true,
},
{
// NOTE(review): declared as a plain "text" field although it holds a secret.
type: "text",
query: "Enter your GitHub personal access token:",
label: "GitHub Personal Access Token",
name: "github_personal_access_token",
optional: true,
},
{
// NOTE(review): the backend default is INDEX_BATCH_SIZE; confirm it equals the
// 10 advertised here.
type: "number",
query: "Set the batch size for indexing (default is 10):",
label: "Batch Size",
name: "batch_size",
optional: true,
default: 10,
},
],
},
gitlab: {
description: "Configure GitLab connector",
values: [
Expand Down Expand Up @@ -1055,6 +1099,14 @@ export interface GithubConfig {
include_issues: boolean;
}

// Connector-specific settings for the "github_pages" source. The field names
// match the `name` values of the github_pages form fields in connectorConfigs.
export interface GithubPagesConfig {
// Base URL of the GitHub Pages site; the only required field.
base_url: string;
// Whether requests should be authenticated with GitHub credentials.
use_authentication?: boolean;
// GitHub username used when authentication is enabled.
github_username?: string;
// GitHub personal access token used when authentication is enabled.
github_personal_access_token?: string;
// Batch size for indexing.
batch_size?: number;
}

export interface GitlabConfig {
project_owner: string;
project_name: string;
Expand Down
7 changes: 7 additions & 0 deletions web/src/lib/sources.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ import {
XenforoIcon,
FreshdeskIcon,
FirefliesIcon,
GithubPagesIcon,
} from "@/components/icons/icons";
import { ValidSources } from "./types";
import {
Expand Down Expand Up @@ -95,6 +96,12 @@ const SOURCE_METADATA_MAP: SourceMap = {
category: SourceCategory.CodeRepository,
docs: "https://docs.danswer.dev/connectors/github",
},
github_pages: {
icon: GithubPagesIcon,
displayName: "Github Pages",
category: SourceCategory.Wiki,
docs: "https://docs.danswer.dev/connectors/github_pages",
},
gitlab: {
icon: GitlabIcon,
displayName: "Gitlab",
Expand Down
1 change: 1 addition & 0 deletions web/src/lib/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ export interface UserGroup {
const validSources = [
"web",
"github",
"github_pages",
"gitlab",
"slack",
"google_drive",
Expand Down