-
Notifications
You must be signed in to change notification settings - Fork 1.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add slim connector description #3303
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,12 +12,15 @@ | |
from danswer.configs.app_configs import INDEX_BATCH_SIZE | ||
from danswer.configs.constants import DocumentSource | ||
from danswer.connectors.interfaces import GenerateDocumentsOutput | ||
from danswer.connectors.interfaces import GenerateSlimDocumentOutput | ||
from danswer.connectors.interfaces import LoadConnector | ||
from danswer.connectors.interfaces import PollConnector | ||
from danswer.connectors.interfaces import SecondsSinceUnixEpoch | ||
from danswer.connectors.interfaces import SlimConnector | ||
from danswer.connectors.models import ConnectorMissingCredentialError | ||
from danswer.connectors.models import Document | ||
from danswer.connectors.models import Section | ||
from danswer.connectors.models import SlimDocument | ||
from danswer.utils.logger import setup_logger | ||
|
||
|
||
|
@@ -28,6 +31,8 @@ | |
SLAB_GRAPHQL_MAX_TRIES = 10 | ||
SLAB_API_URL = "https://api.slab.com/v1/graphql" | ||
|
||
_SLIM_BATCH_SIZE = 1000 | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ^1000 is typical for this |
||
|
||
def run_graphql_request( | ||
graphql_query: dict, bot_token: str, max_tries: int = SLAB_GRAPHQL_MAX_TRIES | ||
|
@@ -158,21 +163,26 @@ def get_slab_url_from_title_id(base_url: str, title: str, page_id: str) -> str: | |
return urljoin(urljoin(base_url, "posts/"), url_id) | ||
|
||
|
||
class SlabConnector(LoadConnector, PollConnector): | ||
class SlabConnector(LoadConnector, PollConnector, SlimConnector): | ||
def __init__(
    self,
    base_url: str,
    batch_size: int = INDEX_BATCH_SIZE,
    slab_bot_token: str | None = None,
) -> None:
    """Store the connector configuration.

    Args:
        base_url: Base URL of the Slab workspace to index.
        batch_size: Kept on the instance as ``batch_size``.
        slab_bot_token: Optional bot token. When omitted, the token has to
            be supplied later through ``load_credentials``.
    """
    self.base_url = base_url
    self.batch_size = batch_size
    # ``slab_bot_token`` is a getter-only property on this class, so the
    # value must go to the private backing attribute; assigning to
    # ``self.slab_bot_token`` would raise AttributeError.
    self._slab_bot_token: str | None = slab_bot_token
|
||
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
    """Store the Slab bot token taken from ``credentials``.

    Args:
        credentials: Mapping that must contain the key ``"slab_bot_token"``.

    Returns:
        Always ``None``.

    Raises:
        KeyError: If ``"slab_bot_token"`` is missing from ``credentials``.
    """
    # Only the private backing attribute is written: ``slab_bot_token`` is a
    # getter-only property, so assigning to it would raise AttributeError.
    self._slab_bot_token = credentials["slab_bot_token"]
    return None
|
||
@property | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was added for cleanliness. However, it's specific to this connector, so it can be ignored |
||
def slab_bot_token(self) -> str:
    """Return the stored Slab bot token.

    Raises:
        ConnectorMissingCredentialError: If no token has been stored yet.
    """
    token = self._slab_bot_token
    if token is not None:
        return token
    raise ConnectorMissingCredentialError("Slab")
|
||
def _iterate_posts( | ||
self, time_filter: Callable[[datetime], bool] | None = None | ||
) -> GenerateDocumentsOutput: | ||
|
@@ -227,3 +237,21 @@ def poll_source( | |
yield from self._iterate_posts( | ||
time_filter=lambda t: start_time <= t <= end_time | ||
) | ||
|
||
def retrieve_all_slim_documents(
    self,
    start: SecondsSinceUnixEpoch | None = None,
    end: SecondsSinceUnixEpoch | None = None,
) -> GenerateSlimDocumentOutput:
    """Yield batches of ``SlimDocument`` objects, one per Slab post ID.

    Only ``get_all_post_ids(self.slab_bot_token)`` is called here. None of
    the additional per-post information that the load and poll paths
    retrieve is fetched, which keeps this a much lighter and faster process.

    Args:
        start: Accepted for interface compatibility; not used by this method.
        end: Accepted for interface compatibility; not used by this method.

    Yields:
        Lists of at most ``_SLIM_BATCH_SIZE`` slim documents; the final list
        may be shorter and is never empty.
    """
    pending: list[SlimDocument] = []
    for post_id in get_all_post_ids(self.slab_bot_token):
        pending.append(SlimDocument(id=post_id))
        if len(pending) < _SLIM_BATCH_SIZE:
            continue
        # Batch is full: hand it off and start a fresh list.
        yield pending
        pending = []

    # Flush whatever is left over after the last full batch.
    if pending:
        yield pending
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
import json | ||
import os | ||
import time | ||
from pathlib import Path | ||
|
||
import pytest | ||
|
||
from danswer.configs.constants import DocumentSource | ||
from danswer.connectors.models import Document | ||
from danswer.connectors.slab.connector import SlabConnector | ||
|
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is only necessary if loading data from an outside folder |
||
def load_test_data(file_name: str = "test_slab_data.json") -> dict[str, str]:
    """Load expected test values from a JSON file located next to this module.

    Args:
        file_name: Name of the JSON file, resolved relative to the directory
            that contains this file.

    Returns:
        The parsed JSON content.
    """
    fixture_path = Path(__file__).parent / file_name
    # Decode explicitly as UTF-8: the fixture contains non-ASCII text (emoji)
    # and the platform default encoding is not UTF-8 everywhere.
    with open(fixture_path, "r", encoding="utf-8") as f:
        return json.load(f)
|
||
|
||
@pytest.fixture
def slab_connector() -> SlabConnector:
    """Build a ``SlabConnector`` for the test workspace with credentials loaded.

    The bot token is read from the ``SLAB_BOT_TOKEN`` environment variable;
    a missing variable raises ``KeyError``.
    """
    credentials = {"slab_bot_token": os.environ["SLAB_BOT_TOKEN"]}
    slab = SlabConnector(base_url="https://onyx-test.slab.com/")
    slab.load_credentials(credentials)
    return slab
|
||
|
||
@pytest.mark.xfail(
    reason=(
        # Trailing space keeps the two sentences apart once the adjacent
        # string literals are concatenated.
        "Need a test account with a slab subscription to run this test. "
        "Trial only lasts 14 days."
    )
)
def test_slab_connector_basic(slab_connector: SlabConnector) -> None:
    """Poll all documents and verify the content of one known post.

    Review guidance for connector tests: there should be at least one test
    that verifies the content of at least one retrieved document. This test
    does so for the post with id ``jcp6cohu``.
    """
    all_docs: list[Document] = []
    target_test_doc_id = "jcp6cohu"
    target_test_doc: Document | None = None
    for doc_batch in slab_connector.poll_source(0, time.time()):
        for doc in doc_batch:
            all_docs.append(doc)
            if doc.id == target_test_doc_id:
                target_test_doc = doc

    assert len(all_docs) == 6
    assert target_test_doc is not None

    desired_test_data = load_test_data()
    assert (
        target_test_doc.semantic_identifier == desired_test_data["semantic_identifier"]
    )
    assert target_test_doc.source == DocumentSource.SLAB
    assert target_test_doc.metadata == {}
    assert target_test_doc.primary_owners is None
    assert target_test_doc.secondary_owners is None
    assert target_test_doc.title is None
    assert target_test_doc.from_ingestion_api is False
    assert target_test_doc.additional_info is None

    assert len(target_test_doc.sections) == 1
    section = target_test_doc.sections[0]
    # Need to replace the weird apostrophe with a normal one
    assert section.text.replace("\u2019", "'") == desired_test_data["section_text"]
    assert section.link == desired_test_data["link"]
|
||
|
||
@pytest.mark.xfail(
    reason=(
        # Trailing space keeps the two sentences apart once the adjacent
        # string literals are concatenated.
        "Need a test account with a slab subscription to run this test. "
        "Trial only lasts 14 days."
    )
)
def test_slab_connector_slim(slab_connector: SlabConnector) -> None:
    """Check that the slim connector covers every fully loaded document.

    The ids retrieved through ``load_from_state()`` must be a subset of the
    ids retrieved from ``retrieve_all_slim_documents()``. Direct equality is
    deliberately not asserted: in some circumstances the slim connector may
    find an id that the load/poll connector filters out. That case is
    handled downstream and is therefore acceptable.
    """
    # Get all doc IDs from the full connector
    all_full_doc_ids = set()
    for doc_batch in slab_connector.load_from_state():
        all_full_doc_ids.update(doc.id for doc in doc_batch)

    # Get all doc IDs from the slim connector
    all_slim_doc_ids = set()
    for slim_doc_batch in slab_connector.retrieve_all_slim_documents():
        all_slim_doc_ids.update(doc.id for doc in slim_doc_batch)

    # The set of full doc IDs should always be a subset of the slim doc IDs
    assert all_full_doc_ids.issubset(all_slim_doc_ids)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
{ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We keep long texts to check against in another file as a json for code cleanliness |
||
"section_text": "Learn about Posts\nWelcome\nThis is a post, where you can edit, share, and collaborate in real time with your team. We'd love to show you how it works!\nReading and editing\nClick the mode button to toggle between read and edit modes. You can only make changes to a post when editing.\nOrganize your posts\nWhen in edit mode, you can add topics to a post, which will keep it organized for the right 👀 to see.\nSmart mentions\nMentions are references to users, posts, topics and third party tools that show details on hover. Paste in a link for automatic conversion.\nLook back in time\nYou are ready to begin writing. You can always bring back this tour in the help menu.\nGreat job!\nYou are ready to begin writing. You can always bring back this tour in the help menu.\n\n", | ||
"link": "https://onyx-test.slab.com/posts/learn-about-posts-jcp6cohu", | ||
"semantic_identifier": "Learn about Posts" | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
^ This is required to make the tests run in github.
When submitting the PR for merging, make sure to share this with one of the developers @ danswer