-
Notifications
You must be signed in to change notification settings - Fork 4.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
🎉 Source GitHub: Use CDK caching and convert PR-related streams to incremental #7250
Changes from 12 commits
4b098d0
624e42a
4acf0d4
a6cfa79
3895daa
19bc4e8
8ceac16
5a37198
b167b63
fa4214c
cc29d5f
1b88cf5
441eeb2
624e24c
378fa9d
081b112
7f1a9af
853a4e2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,43 +10,16 @@ | |
from urllib import parse | ||
|
||
import requests | ||
import vcr | ||
from airbyte_cdk.models import SyncMode | ||
from airbyte_cdk.sources.streams.http import HttpStream, HttpSubStream | ||
from requests.exceptions import HTTPError | ||
from vcr.cassette import Cassette | ||
|
||
|
||
def request_cache() -> Cassette: | ||
""" | ||
Builds VCR instance. | ||
It deletes the file every time we create it; normally it should be called only once. | ||
We can't use NamedTemporaryFile here because yaml serializer doesn't work well with empty files. | ||
""" | ||
filename = "request_cache.yml" | ||
try: | ||
os.remove(filename) | ||
except FileNotFoundError: | ||
pass | ||
|
||
return vcr.use_cassette(str(filename), record_mode="new_episodes", serializer="yaml") | ||
|
||
|
||
class GithubStream(HttpStream, ABC): | ||
cache = request_cache() | ||
url_base = "https://api.github.com/" | ||
|
||
# To prevent dangerous behavior, the `vcr` library prohibits the use of nested caching. | ||
# Here's an example of dangerous behavior: | ||
# cache = Cassette.use('whatever') | ||
# with cache: | ||
# with cache: | ||
# pass | ||
# | ||
# Therefore, we will only use `cache` for the top-level stream, so as not to cause possible difficulties. | ||
top_level_stream = True | ||
|
||
primary_key = "id" | ||
use_cache = True | ||
|
||
# The GitHub page size can range from 1 to 100. | ||
page_size = 100 | ||
|
@@ -100,11 +73,7 @@ def backoff_time(self, response: requests.Response) -> Union[int, float]: | |
|
||
def read_records(self, stream_slice: Mapping[str, any] = None, **kwargs) -> Iterable[Mapping[str, Any]]: | ||
try: | ||
if self.top_level_stream: | ||
with self.cache: | ||
yield from super().read_records(stream_slice=stream_slice, **kwargs) | ||
else: | ||
yield from super().read_records(stream_slice=stream_slice, **kwargs) | ||
yield from super().read_records(stream_slice=stream_slice, **kwargs) | ||
except HTTPError as e: | ||
error_msg = str(e) | ||
|
||
|
@@ -459,7 +428,7 @@ def is_sorted_descending(self) -> bool: | |
""" | ||
Depending on whether there is any saved state, we read the stream in ascending or descending order. | ||
""" | ||
return self._first_read | ||
return not self._first_read | ||
|
||
|
||
class CommitComments(SemiIncrementalGithubStream): | ||
|
@@ -686,23 +655,38 @@ def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str: | |
# Pull request substreams | ||
|
||
|
||
class PullRequestSubstream(HttpSubStream, GithubStream, ABC): | ||
top_level_stream = False | ||
class PullRequestSubstream(HttpSubStream, SemiIncrementalGithubStream, ABC): | ||
use_cache = False | ||
|
||
def __init__(self, parent: PullRequests, **kwargs): | ||
super().__init__(parent=parent, **kwargs) | ||
|
||
def stream_slices( | ||
self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None | ||
) -> Iterable[Optional[Mapping[str, Any]]]: | ||
parent_stream_slices = super().stream_slices(sync_mode=sync_mode, cursor_field=cursor_field, stream_state=stream_state) | ||
parent_stream_slices = list(super().stream_slices(sync_mode=sync_mode, cursor_field=cursor_field, stream_state=stream_state)) | ||
if self.parent.is_sorted_descending: | ||
parent_stream_slices.reverse() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If I understand this part of the code correctly, a list of all parent records is being created here. So if, for example, pull_requests stream has 5000 records then all of them will be placed in the list. We can't do that because it kills the idea of stream_slices being a generator function and the idea that we output one record at a time and not store all record for specific stream. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Changed to force the pull_requests stream to fetch records in ascending order via an override param in the stream_state. |
||
|
||
for parent_stream_slice in parent_stream_slices: | ||
yield { | ||
"pull_request_number": parent_stream_slice["parent"]["number"], | ||
"repository": parent_stream_slice["parent"]["repository"], | ||
} | ||
|
||
# We've already determined the list of pull requests to run the stream against. | ||
# Skip the start_point_map and cursor_field logic in SemiIncrementalGithubStream.read_records. | ||
cjwooo marked this conversation as resolved.
Show resolved
Hide resolved
|
||
def read_records( | ||
self, | ||
sync_mode: SyncMode, | ||
cursor_field: List[str] = None, | ||
stream_slice: Mapping[str, Any] = None, | ||
stream_state: Mapping[str, Any] = None, | ||
) -> Iterable[Mapping[str, Any]]: | ||
yield from super(SemiIncrementalGithubStream, self).read_records( | ||
sync_mode=sync_mode, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=stream_state | ||
) | ||
Zirochkaa marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
|
||
class PullRequestStats(PullRequestSubstream): | ||
""" | ||
|
@@ -731,19 +715,29 @@ class Reviews(PullRequestSubstream): | |
API docs: https://docs.github.com/en/rest/reference/pulls#list-reviews-for-a-pull-request | ||
""" | ||
|
||
cursor_field = "submitted_at" | ||
|
||
def path( | ||
self, stream_state: Mapping[str, Any] = None, stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None | ||
) -> str: | ||
return f"repos/{stream_slice['repository']}/pulls/{stream_slice['pull_request_number']}/reviews" | ||
|
||
# Set the parent stream state's cursor field before fetching its records | ||
def stream_slices(self, stream_state: Mapping[str, Any] = None, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]: | ||
parent_state = deepcopy(stream_state) or {} | ||
for repository in self.repositories: | ||
if repository in parent_state and self.cursor_field in parent_state[repository]: | ||
parent_state[repository][self.parent.cursor_field] = parent_state[repository][self.cursor_field] | ||
yield from super().stream_slices(stream_state=parent_state, **kwargs) | ||
|
||
|
||
# Reactions streams | ||
|
||
|
||
class ReactionStream(GithubStream, ABC): | ||
|
||
parent_key = "id" | ||
top_level_stream = False | ||
use_cache = False | ||
|
||
def __init__(self, **kwargs): | ||
self._stream_kwargs = deepcopy(kwargs) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't understand why? Did we have an error connected to this? Did we send the wrong
direction
parameter or what?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes it was sending the wrong direction. https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-github/source_github/streams.py#L495-L496 states we want to sort in ascending order for the first run, then descending order for subsequent runs to allow the incremental behavior in https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-github/source_github/streams.py#L701-L702. However, the current stream version is setting
is_sorted_descending
to true ifself._first_read
is true, which is the opposite behavior.