Skip to content

Commit

Permalink
feat: add Github Pages Connector
Browse files Browse the repository at this point in the history
Signed-off-by: Akhilender <akhilenderb9@gmail.com>
  • Loading branch information
akhilender-bongirwar committed Nov 3, 2024
1 parent 938d578 commit 59bc907
Show file tree
Hide file tree
Showing 10 changed files with 195 additions and 0 deletions.
1 change: 1 addition & 0 deletions backend/danswer/configs/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ class DocumentSource(str, Enum):
GMAIL = "gmail"
REQUESTTRACKER = "requesttracker"
GITHUB = "github"
GITHUB_PAGES = "github_pages"
GITLAB = "gitlab"
GURU = "guru"
BOOKSTACK = "bookstack"
Expand Down
2 changes: 2 additions & 0 deletions backend/danswer/connectors/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from danswer.connectors.file.connector import LocalFileConnector
from danswer.connectors.freshdesk.connector import FreshdeskConnector
from danswer.connectors.github.connector import GithubConnector
from danswer.connectors.github_pages.connector import GithubPagesConnector
from danswer.connectors.gitlab.connector import GitlabConnector
from danswer.connectors.gmail.connector import GmailConnector
from danswer.connectors.gong.connector import GongConnector
Expand Down Expand Up @@ -67,6 +68,7 @@ def identify_connector_class(
InputType.SLIM_RETRIEVAL: SlackPollConnector,
},
DocumentSource.GITHUB: GithubConnector,
DocumentSource.GITHUB_PAGES: GithubPagesConnector,
DocumentSource.GMAIL: GmailConnector,
DocumentSource.GITLAB: GitlabConnector,
DocumentSource.GOOGLE_DRIVE: GoogleDriveConnector,
Expand Down
Empty file.
121 changes: 121 additions & 0 deletions backend/danswer/connectors/github_pages/connector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import os
from collections import deque
from typing import Any, List, Optional
from urllib.parse import urldefrag
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from requests.auth import HTTPBasicAuth

from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger

logger = setup_logger()

_TIMEOUT = 60
_MAX_DEPTH = 5


class GithubPagesConnector(LoadConnector, PollConnector):
    """Crawl a GitHub Pages site and turn every reachable page into a Document.

    Starting at ``base_url`` the connector follows ``<a href>`` links
    breadth-first. Only URLs that start with ``base_url`` are followed, and
    only up to ``_MAX_DEPTH`` links away from the start page. Each page that
    is fetched successfully becomes one ``Document`` whose id, section link
    and semantic identifier are the page URL.

    The class is spelled ``GithubPagesConnector`` because that is the name the
    connector factory imports; ``GitHubPagesConnector`` remains available as
    an alias below the class.
    """

    def __init__(
        self,
        base_url: str,
        batch_size: int = INDEX_BATCH_SIZE,
        use_authentication: bool = False,
        github_username: Optional[str] = None,
        github_personal_access_token: Optional[str] = None,
    ) -> None:
        """Store the crawl configuration.

        Args:
            base_url: Root URL of the site; also the prefix every followed
                link must start with.
            batch_size: Maximum number of pages crawled and yielded per batch.
            use_authentication: When true, ``github_username`` and
                ``github_personal_access_token`` are used for HTTP basic auth.
            github_username: Username for basic auth (optional).
            github_personal_access_token: Token for basic auth (optional).

        NOTE(review): the three authentication parameters exist because the
        web form for this connector defines fields with these names;
        presumably the form values are passed to this constructor - confirm.
        Secrets are better supplied through ``load_credentials``.
        """
        self.base_url = base_url
        self.batch_size = batch_size
        # URLs that have already been requested (successfully or not).
        self.visited_urls: set[str] = set()
        # Pending (url, depth) pairs; kept on the instance so the crawl can
        # resume where the previous batch stopped.
        self._to_visit: deque[tuple[str, int]] = deque()

        # Auth derived from constructor arguments; used as the fallback when
        # load_credentials() receives no usable credentials.
        self._config_auth: Optional[HTTPBasicAuth] = None
        if use_authentication and github_username and github_personal_access_token:
            self._config_auth = HTTPBasicAuth(
                github_username, github_personal_access_token
            )
        self.auth: Optional[HTTPBasicAuth] = self._config_auth

    def load_credentials(self, credentials: dict[str, Any]) -> None:
        """Configure HTTP basic auth from a credentials mapping.

        Reads the ``github_username`` and ``github_personal_access_token``
        keys. When either is missing or empty, authentication falls back to
        whatever was configured in the constructor (``None`` by default).
        """
        github_username = credentials.get("github_username")
        github_token = credentials.get("github_personal_access_token")
        if github_username and github_token:
            self.auth = HTTPBasicAuth(github_username, github_token)
        else:
            self.auth = self._config_auth

    def _crawl_github_pages(
        self, url: str, batch_size: int, depth: int = 0
    ) -> List[str]:
        """Fetch up to ``batch_size`` not-yet-visited pages and return their URLs.

        ``url`` is queued at ``depth`` unless it was already requested; links
        left over from earlier calls are processed as well, so calling this
        repeatedly with the same ``url`` walks the whole site batch by batch.
        An empty list means there is nothing left to crawl.
        """
        if depth > _MAX_DEPTH:
            return []

        if url not in self.visited_urls:
            self._to_visit.append((url, depth))

        crawled_urls: List[str] = []
        while self._to_visit and len(crawled_urls) < batch_size:
            current_url, current_depth = self._to_visit.popleft()
            if current_url in self.visited_urls:
                continue
            # Mark before requesting so a failing URL is not retried once for
            # every page that links to it.
            self.visited_urls.add(current_url)

            try:
                response = requests.get(
                    current_url, timeout=_TIMEOUT, auth=self.auth
                )
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                logger.error(f"Error accessing {current_url}: {e}")
                continue

            # After a redirect (e.g. "/docs" -> "/docs/") the final URL is the
            # real location of the page; skip it if it was handled already.
            final_url = urldefrag(response.url).url
            if final_url != current_url:
                if final_url in self.visited_urls:
                    continue
                self.visited_urls.add(final_url)

            crawled_urls.append(current_url)

            if current_depth >= _MAX_DEPTH:
                # Index this page but do not follow its links any deeper.
                continue

            soup = BeautifulSoup(response.text, "html.parser")
            for link in soup.find_all("a"):
                href = link.get("href")
                if not href:
                    continue
                # Resolve relative links against the page they appear on and
                # drop "#fragment" so anchors do not count as separate pages.
                full_url = urldefrag(urljoin(response.url, href)).url
                if (
                    full_url.startswith(self.base_url)
                    and full_url not in self.visited_urls
                ):
                    self._to_visit.append((full_url, current_depth + 1))

        return crawled_urls

    def _index_pages(self, urls: List[str]) -> List[Document]:
        """Download each URL and wrap its visible text in a ``Document``.

        URLs that cannot be fetched are logged and left out of the result.
        """
        documents: List[Document] = []
        for url in urls:
            try:
                response = requests.get(url, timeout=_TIMEOUT, auth=self.auth)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                logger.error(f"Failed to fetch content for indexing from {url}: {e}")
                continue

            soup = BeautifulSoup(response.text, "html.parser")
            text_content = soup.get_text()
            documents.append(
                Document(
                    id=url,
                    sections=[Section(link=url, text=text_content)],
                    source=DocumentSource.GITHUB_PAGES,
                    semantic_identifier=url,
                    metadata={"url": url},
                )
            )

        return documents

    def _pull_all_pages(self) -> GenerateDocumentsOutput:
        """Crawl the whole site from ``base_url`` and yield documents in batches."""
        # Start every run from a clean slate; otherwise a second run on the
        # same instance would consider every page already visited.
        self.visited_urls = set()
        self._to_visit = deque()

        while True:
            crawled_urls = self._crawl_github_pages(self.base_url, self.batch_size)
            if not crawled_urls:
                break
            documents = self._index_pages(crawled_urls)
            if documents:
                yield documents

    def load_from_state(self) -> GenerateDocumentsOutput:
        """Yield document batches for the complete site."""
        return self._pull_all_pages()

    def poll_source(
        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
    ) -> GenerateDocumentsOutput:
        """Yield document batches for the complete site.

        ``start`` and ``end`` are not used: the crawl has no per-page
        modification time to filter on, so every poll re-crawls everything.
        """
        yield from self._pull_all_pages()


# Backward-compatible alias for the original spelling of the class name.
GitHubPagesConnector = GithubPagesConnector


if __name__ == "__main__":
    # Manual smoke test: crawl the site named by GITHUB_PAGES_BASE_URL and
    # print the first batch of documents.
    connector = GitHubPagesConnector(
        base_url=os.environ["GITHUB_PAGES_BASE_URL"]
    )

    # Both values are optional; empty strings leave the requests unauthenticated.
    credentials = {
        "github_username": os.getenv("GITHUB_USERNAME", ""),
        "github_personal_access_token": os.getenv("GITHUB_PERSONAL_ACCESS_TOKEN", ""),
    }

    connector.load_credentials(credentials)

    document_batches = connector.poll_source(0, 0)
    # The default avoids an unhandled StopIteration when no page could be
    # crawled and the generator therefore yields nothing.
    print(next(document_batches, None))
Binary file added web/public/GithubPages.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 4 additions & 0 deletions web/src/components/admin/connectors/ConnectorTitle.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import {
ConfluenceConfig,
Connector,
GithubConfig,
GithubPagesConfig,
GitlabConfig,
GoogleDriveConfig,
JiraConfig,
Expand Down Expand Up @@ -40,6 +41,9 @@ export const ConnectorTitle = ({
"Repo",
`${typedConnector.connector_specific_config.repo_owner}/${typedConnector.connector_specific_config.repo_name}`
);
} else if (connector.source === "github_pages") {
const typedConnector = connector as Connector<GithubPagesConfig>;
additionalMetadata.set("Site URL", typedConnector.connector_specific_config.base_url);
} else if (connector.source === "gitlab") {
const typedConnector = connector as Connector<GitlabConfig>;
additionalMetadata.set(
Expand Down
7 changes: 7 additions & 0 deletions web/src/components/icons/icons.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -1035,6 +1035,13 @@ export const GithubIcon = ({
<LogoIcon size={size} className={className} src="/Github.png" />
);

/** GitHub Pages logo, rendered through `LogoIcon` from `/GithubPages.png`. */
export const GithubPagesIcon = (props: IconProps) => {
  const { size = 16, className = defaultTailwindCSS } = props;
  return <LogoIcon size={size} className={className} src="/GithubPages.png" />;
};

export const GmailIcon = ({
size = 16,
className = defaultTailwindCSS,
Expand Down
52 changes: 52 additions & 0 deletions web/src/lib/connectors/connectors.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,50 @@ export const connectorConfigs: Record<
],
advanced_values: [],
},
github_pages: {
description: "Configure GitHub Pages connector",
values: [
{
type: "text",
query: "Enter the base URL of the GitHub Pages site:",
label: "Base URL",
name: "base_url",
optional: false,
},
{
type: "checkbox",
query: "Authenticate requests with GitHub credentials?",
label: "Use GitHub Authentication",
description: "Enable this if your GitHub Pages site requires authentication",
name: "use_authentication",
optional: true,
},
],
advanced_values: [
{
type: "text",
query: "Enter your GitHub username (if using authentication):",
label: "GitHub Username",
name: "github_username",
optional: true,
},
{
type: "text",
query: "Enter your GitHub personal access token:",
label: "GitHub Personal Access Token",
name: "github_personal_access_token",
optional: true,
},
{
type: "number",
query: "Set the batch size for indexing (default is 10):",
label: "Batch Size",
name: "batch_size",
optional: true,
default: 10,
},
],
},
gitlab: {
description: "Configure GitLab connector",
values: [
Expand Down Expand Up @@ -1051,6 +1095,14 @@ export interface GithubConfig {
include_issues: boolean;
}

/** Connector-specific configuration collected by the GitHub Pages connector form. */
export interface GithubPagesConfig {
  /** Base URL of the GitHub Pages site (required form field). */
  base_url: string;
  /** Checkbox: enable if the GitHub Pages site requires authentication. */
  use_authentication?: boolean;
  /** GitHub username, used when authentication is enabled. */
  github_username?: string;
  /** GitHub personal access token, used when authentication is enabled. */
  github_personal_access_token?: string;
  /** Batch size for indexing; the form pre-fills 10. */
  batch_size?: number;
}

export interface GitlabConfig {
project_owner: string;
project_name: string;
Expand Down
7 changes: 7 additions & 0 deletions web/src/lib/sources.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import {
ColorSlackIcon,
XenforoIcon,
FreshdeskIcon,
GithubPagesIcon,
} from "@/components/icons/icons";
import { ValidSources } from "./types";
import {
Expand Down Expand Up @@ -94,6 +95,12 @@ const SOURCE_METADATA_MAP: SourceMap = {
category: SourceCategory.CodeRepository,
docs: "https://docs.danswer.dev/connectors/github",
},
github_pages: {
icon: GithubPagesIcon,
displayName: "Github Pages",
category: SourceCategory.Wiki,
docs: "https://docs.danswer.dev/connectors/github_pages",
},
gitlab: {
icon: GitlabIcon,
displayName: "Gitlab",
Expand Down
1 change: 1 addition & 0 deletions web/src/lib/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ export interface UserGroup {
const validSources = [
"web",
"github",
"github_pages",
"gitlab",
"slack",
"google_drive",
Expand Down

0 comments on commit 59bc907

Please sign in to comment.