Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#2282 fixed New Connector Added #2412

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions backend/danswer/configs/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ class DocumentSource(str, Enum):
S3 = "s3"
R2 = "r2"
GOOGLE_CLOUD_STORAGE = "google_cloud_storage"
GITHUB_PAGES = "github_pages"
OCI_STORAGE = "oci_storage"
NOT_APPLICABLE = "not_applicable"

Expand Down
2 changes: 2 additions & 0 deletions backend/danswer/connectors/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
from danswer.connectors.zendesk.connector import ZendeskConnector
from danswer.connectors.zulip.connector import ZulipConnector
from danswer.db.credentials import backend_update_credential_json
from danswer.connectors.github_pages.connector import GitHubPagesConnector
from danswer.db.models import Credential


Expand Down Expand Up @@ -95,6 +96,7 @@ def identify_connector_class(
DocumentSource.R2: BlobStorageConnector,
DocumentSource.GOOGLE_CLOUD_STORAGE: BlobStorageConnector,
DocumentSource.OCI_STORAGE: BlobStorageConnector,
DocumentSource.GITHUB_PAGES: GitHubPagesConnector,
}
connector_by_source = connector_map.get(source, {})

Expand Down
Empty file.
117 changes: 117 additions & 0 deletions backend/danswer/connectors/github_pages/connector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from typing import Any, List, Optional
from requests.auth import HTTPBasicAuth
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger

logger = setup_logger()

_TIMEOUT = 60


class GitHubPagesConnector(LoadConnector, PollConnector):
    """Connector that crawls a GitHub Pages site and indexes its pages.

    Starting from ``base_url``, same-site links are followed and each page's
    visible text is emitted in batches of Documents. The crawl frontier is
    kept on the instance so crawling resumes across batches instead of
    restarting from ``base_url`` (which previously dropped every link
    discovered beyond the first batch).
    """

    def __init__(
        self,
        base_url: str,
        batch_size: int = INDEX_BATCH_SIZE,
    ) -> None:
        self.base_url = base_url
        self.batch_size = batch_size
        self.visited_urls: set = set()
        # Frontier of discovered-but-not-yet-crawled URLs. Persisted across
        # batches so that _pull_all_pages can drain the whole site.
        self._to_visit: List[str] = [base_url]
        self.auth: Optional[HTTPBasicAuth] = None  # set via load_credentials

    def load_credentials(self, credentials: dict[str, Any]) -> None:
        """Configure HTTP basic auth when a username and token are provided.

        Without credentials the connector crawls unauthenticated, which is
        sufficient for public GitHub Pages sites.
        """
        github_username = credentials.get("github_username")
        github_token = credentials.get("github_personal_access_token")
        if github_username and github_token:
            self.auth = HTTPBasicAuth(github_username, github_token)
        else:
            self.auth = None  # No authentication if credentials are not provided

    def _fetch_page(self, url: str) -> Optional[BeautifulSoup]:
        """GET a single page and return its parsed HTML, or None on failure.

        Best-effort crawl: failures are logged and the page is skipped.
        """
        try:
            # requests accepts auth=None, so no branching is needed here.
            response = requests.get(url, timeout=_TIMEOUT, auth=self.auth)
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
        except Exception as e:
            logger.error(f"Error while accessing {url}: {e}")
            return None

    def _crawl_github_pages(self, url: str, batch_size: int) -> List[tuple]:
        """Crawl up to ``batch_size`` unvisited pages.

        Returns a list of ``(url, page_text)`` pairs. ``url`` seeds the
        frontier on construction; the frontier itself is instance state, so
        each call continues where the previous batch stopped.
        """
        crawled: List[tuple] = []

        while self._to_visit and len(crawled) < batch_size:
            current_url = self._to_visit.pop()
            if current_url in self.visited_urls:
                continue
            # Mark visited before fetching so a failing URL is not re-queued
            # endlessly by pages that link back to it.
            self.visited_urls.add(current_url)

            soup = self._fetch_page(current_url)
            if soup is None:
                continue

            # Extract the page's visible text (previously indexed as "").
            page_text = soup.get_text(separator="\n", strip=True)
            crawled.append((current_url, page_text))

            # Queue same-site links for later crawling.
            for link in soup.find_all('a'):
                href = link.get('href')
                if not href:
                    continue
                # Strip fragment identifiers so in-page anchors do not
                # register as distinct URLs.
                full_url = urljoin(self.base_url, href).split("#", 1)[0]
                if full_url.startswith(self.base_url) and full_url not in self.visited_urls:
                    self._to_visit.append(full_url)

        return crawled

    def _index_pages(self, pages: List[tuple]) -> List[Document]:
        """Convert crawled ``(url, text)`` pairs into Danswer Documents."""
        return [
            Document(
                id=url,
                sections=[Section(link=url, text=text)],
                source=DocumentSource.GITHUB_PAGES,
                semantic_identifier=url,
                metadata={"url": url},
            )
            for url, text in pages
        ]

    def _pull_all_pages(self) -> GenerateDocumentsOutput:
        """Yield document batches until the crawl frontier is exhausted."""
        while True:
            pages = self._crawl_github_pages(self.base_url, self.batch_size)
            if not pages:
                break
            yield self._index_pages(pages)

    def load_from_state(self) -> GenerateDocumentsOutput:
        """LoadConnector entry point: perform a full crawl of the site."""
        yield from self._pull_all_pages()

    def poll_source(self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch) -> GenerateDocumentsOutput:
        """PollConnector entry point.

        NOTE(review): GitHub Pages exposes no modification timestamps, so
        ``start``/``end`` cannot be honored; polling simply crawls whatever
        has not been visited yet.
        """
        yield from self._pull_all_pages()


if __name__ == "__main__":
    # Minimal manual test: crawl the site at GITHUB_PAGES_BASE_URL and print
    # the first batch of documents.
    connector = GitHubPagesConnector(
        base_url=os.environ["GITHUB_PAGES_BASE_URL"]
    )

    # Credentials are optional; empty values leave the connector unauthenticated.
    credentials = {
        "github_username": os.getenv("GITHUB_USERNAME", ""),
        "github_personal_access_token": os.getenv("GITHUB_PERSONAL_ACCESS_TOKEN", ""),
    }

    connector.load_credentials(credentials)

    document_batches = connector.poll_source(0, 0)
    # next(..., None) avoids an unhandled StopIteration when the crawl yields
    # no batches (e.g. the site is unreachable).
    print(next(document_batches, None))
14 changes: 14 additions & 0 deletions web/src/components/icons/icons.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -1068,6 +1068,20 @@ export const GithubIcon = ({
</div>
);
};
// Icon for the GitHub Pages connector, rendered slightly larger than `size`
// to compensate for whitespace baked into the source image.
// NOTE(review): `githubPagesSVG` is not imported anywhere visible in this
// diff — confirm the import exists at the top of icons.tsx, otherwise this
// component fails to compile.
// NOTE(review): the template-literal Tailwind classes below
// (`w-[${size + 4}px]`) are built at runtime and presumably cannot be seen
// by Tailwind's static class scanner; the inline `style` prop already sets
// the element size, so the classes look redundant — verify before relying
// on them.
export const GitHubPagesIcon = ({
  size = 16,
  className = defaultTailwindCSS,
}: IconProps) => {
  // GitHub Pages Icon adjustment for surrounding whitespace
  return (
    <div
      style={{ width: `${size + 4}px`, height: `${size + 4}px` }}
      className={`w-[${size + 4}px] h-[${size + 4}px] -m-0.5 ` + className}
    >
      <Image src={githubPagesSVG} alt="GitHub Pages Logo" width="96" height="96" />
    </div>
  );
};

export const GmailIcon = ({
size = 16,
Expand Down
21 changes: 21 additions & 0 deletions web/src/lib/connectors/connectors.ts
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,27 @@ export const connectorConfigs: Record<
},
],
},
github_pages: {
  description: "Configure GitHub Pages connector",
  values: [
    {
      type: "text",
      query: "Enter the base URL of your GitHub Pages site:",
      label: "GitHub Pages Base URL",
      // Field names must match the backend connector's constructor kwargs:
      // GitHubPagesConnector(base_url=..., batch_size=...). The previous
      // names (github_pages_base_url / github_pages_batch_size) would be
      // passed as unknown keyword arguments and fail instantiation.
      name: "base_url",
      optional: false,
    },
    {
      type: "number",
      query: "Enter the batch size for indexing (number of pages per batch):",
      label: "Index Batch Size",
      name: "batch_size",
      optional: true,
      default: 10, // Default batch size
    },
  ],
},

gitlab: {
description: "Configure GitLab connector",
values: [
Expand Down
5 changes: 5 additions & 0 deletions web/src/lib/connectors/credentials.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ export interface Credential<T> extends CredentialBase<T> {
export interface GithubCredentialJson {
github_access_token: string;
}
// Credentials for the GitHub Pages connector. Keys mirror exactly what the
// backend's GitHubPagesConnector.load_credentials() reads: github_username
// and github_personal_access_token (the previous github_access_token /
// github_repo fields were never consumed by the backend).
export interface GithubPagesCredentialJson {
  github_username: string;
  github_personal_access_token: string;
}

export interface GitlabCredentialJson {
gitlab_url: string;
Expand Down
7 changes: 7 additions & 0 deletions web/src/lib/sources.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import {
DropboxIcon,
FileIcon,
GithubIcon,
GitHubPagesIcon,
GitlabIcon,
GlobeIcon,
GmailIcon,
Expand Down Expand Up @@ -93,6 +94,12 @@ const SOURCE_METADATA_MAP: SourceMap = {
category: SourceCategory.CodeRepository,
docs: "https://docs.danswer.dev/connectors/github",
},
github_pages: {
icon: GitHubPagesIcon,
displayName: "Github Pages",
category: SourceCategory.Wiki,
docs: "https://docs.danswer.dev/connectors/github_pages",
},
gitlab: {
icon: GitlabIcon,
displayName: "Gitlab",
Expand Down
1 change: 1 addition & 0 deletions web/src/lib/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ export interface UserGroup {
const validSources = [
"web",
"github",
"github_pages",
"gitlab",
"slack",
"google_drive",
Expand Down