Skip to content

Commit

Permalink
#12013 source GA to Beta: always sync data from two days ago
Browse files Browse the repository at this point in the history
  • Loading branch information
davydov-d committed Apr 28, 2022
1 parent 42a58b0 commit 52eee69
Show file tree
Hide file tree
Showing 6 changed files with 93 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@ COPY main.py ./
ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]

LABEL io.airbyte.version=0.1.19
LABEL io.airbyte.version=0.1.20
LABEL io.airbyte.name=airbyte/source-google-analytics-v4
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{"stream": "new_users_per_day", "data": {"ga_date": "2021-12-10", "ga_country": "United States", "ga_region": "Washington", "ga_newUsers": 1, "view_id": "211669975"}, "emitted_at": 1639563255199}
{"stream": "devices", "data": {"ga_date": "2021-12-10", "ga_deviceCategory": "desktop", "ga_operatingSystem": "Macintosh", "ga_browser": "Firefox", "ga_users": 1, "ga_newUsers": 1, "ga_sessions": 1, "ga_sessionsPerUser": 1.0, "ga_avgSessionDuration": 0.0, "ga_pageviews": 1, "ga_pageviewsPerSession": 1.0, "ga_avgTimeOnPage": 0.0, "ga_bounceRate": 100.0, "ga_exitRate": 100.0, "view_id": "211669975"}, "emitted_at": 1639563253285}
{"stream": "daily_active_users", "data": {"ga_date": "2021-12-10", "ga_1dayUsers": 1, "view_id": "211669975"}, "emitted_at": 1639563251092}
{"stream": "weekly_active_users", "data": {"ga_date": "2021-12-15", "ga_7dayUsers": 1, "view_id": "211669975"}, "emitted_at": 1639563249172}
{"stream": "locations", "data": {"ga_date": "2021-12-10", "ga_continent": "Americas", "ga_subContinent": "Northern America", "ga_country": "United States", "ga_region": "Washington", "ga_metro": "Seattle-Tacoma WA", "ga_city": "Seattle", "ga_users": 1, "ga_newUsers": 1, "ga_sessions": 1, "ga_sessionsPerUser": 1.0, "ga_avgSessionDuration": 0.0, "ga_pageviews": 1, "ga_pageviewsPerSession": 1.0, "ga_avgTimeOnPage": 0.0, "ga_bounceRate": 100.0, "ga_exitRate": 100.0, "view_id": "211669975"}, "emitted_at": 1639563230934}
{"stream": "pages", "data": {"ga_date": "2021-12-10", "ga_hostname": "www.surveymonkey.com", "ga_pagePath": "/apps/NKI5TOTqk4tS5BZyJXU9YQ_3D_3D/preview", "ga_pageviews": 1, "ga_uniquePageviews": 1, "ga_avgTimeOnPage": 0.0, "ga_entrances": 1, "ga_entranceRate": 100.0, "ga_bounceRate": 100.0, "ga_exits": 1, "ga_exitRate": 100.0, "view_id": "211669975"}, "emitted_at": 1639563227527}
{"stream": "new_users_per_day", "data": {"ga_date": "2021-12-10", "ga_country": "United States", "ga_region": "Washington", "ga_newUsers": 1, "view_id": "211669975", "isDataGolden": true}, "emitted_at": 1639563255199}
{"stream": "devices", "data": {"ga_date": "2021-12-10", "ga_deviceCategory": "desktop", "ga_operatingSystem": "Macintosh", "ga_browser": "Firefox", "ga_users": 1, "ga_newUsers": 1, "ga_sessions": 1, "ga_sessionsPerUser": 1.0, "ga_avgSessionDuration": 0.0, "ga_pageviews": 1, "ga_pageviewsPerSession": 1.0, "ga_avgTimeOnPage": 0.0, "ga_bounceRate": 100.0, "ga_exitRate": 100.0, "view_id": "211669975", "isDataGolden": true}, "emitted_at": 1639563253285}
{"stream": "daily_active_users", "data": {"ga_date": "2021-12-10", "ga_1dayUsers": 1, "view_id": "211669975", "isDataGolden": true}, "emitted_at": 1639563251092}
{"stream": "weekly_active_users", "data": {"ga_date": "2021-12-15", "ga_7dayUsers": 1, "view_id": "211669975", "isDataGolden": true}, "emitted_at": 1639563249172}
{"stream": "locations", "data": {"ga_date": "2021-12-10", "ga_continent": "Americas", "ga_subContinent": "Northern America", "ga_country": "United States", "ga_region": "Washington", "ga_metro": "Seattle-Tacoma WA", "ga_city": "Seattle", "ga_users": 1, "ga_newUsers": 1, "ga_sessions": 1, "ga_sessionsPerUser": 1.0, "ga_avgSessionDuration": 0.0, "ga_pageviews": 1, "ga_pageviewsPerSession": 1.0, "ga_avgTimeOnPage": 0.0, "ga_bounceRate": 100.0, "ga_exitRate": 100.0, "view_id": "211669975", "isDataGolden": true}, "emitted_at": 1639563230934}
{"stream": "pages", "data": {"ga_date": "2021-12-10", "ga_hostname": "www.surveymonkey.com", "ga_pagePath": "/apps/NKI5TOTqk4tS5BZyJXU9YQ_3D_3D/preview", "ga_pageviews": 1, "ga_uniquePageviews": 1, "ga_avgTimeOnPage": 0.0, "ga_entrances": 1, "ga_entranceRate": 100.0, "ga_bounceRate": 100.0, "ga_exits": 1, "ga_exitRate": 100.0, "view_id": "211669975", "isDataGolden": true}, "emitted_at": 1639563227527}
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def get_json_schema(self) -> Mapping[str, Any]:
if data_format:
metric_data["format"] = data_format
schema["properties"][metric] = metric_data

schema["properties"]["isDataGolden"] = {"type": "boolean"}
return schema

def stream_slices(self, stream_state: Mapping[str, Any] = None, **kwargs: Any) -> Iterable[Optional[Mapping[str, Any]]]:
Expand All @@ -226,14 +226,15 @@ def stream_slices(self, stream_state: Mapping[str, Any] = None, **kwargs: Any) -
...]
"""

today = pendulum.now().date()
end_date = pendulum.now().date()
start_date = pendulum.parse(self.start_date).date()
if stream_state:
prev_end_date = pendulum.parse(stream_state.get(self.cursor_field)).date()
start_date = prev_end_date.add(days=1)
end_date = today
if start_date > end_date:
return [None]
start_date = prev_end_date.add(days=1) # do not include previous `end_date`
# always resync 2 previous days to be sure data is golden
# https://support.google.com/analytics/answer/1070983?hl=en#DataProcessingLatency&zippy=%2Cin-this-article
# https://github.com/airbytehq/airbyte/issues/12013#issuecomment-1111255503
start_date = start_date.subtract(days=2)

date_slices = []
slice_start_date = start_date
Expand Down Expand Up @@ -403,7 +404,7 @@ def parse_response(self, response: requests.Response, **kwargs: Any) -> Iterable
record[metric_name.replace("ga:", "ga_")] = value

record["view_id"] = self.view_id

record["isDataGolden"] = report.get("data", {}).get("isDataGolden", True)
yield record

def check_for_sampled_result(self, data: Mapping) -> None:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#
# Copyright (c) 2021 Airbyte, Inc., all rights reserved.
#

import pendulum


class StreamStateProxy(dict):
    """
    Dict-backed stream state with typed access to its date bookkeeping.

    The underlying mapping stores dates as ``DATE_FORMAT`` strings under:

    * ``golden_date`` - the cursor date (``ga_date`` is read as a legacy fallback);
    * ``non_golden_dates`` - dates whose records were reported as not golden.

    The properties below convert between those strings and date objects.
    """

    DATE_FORMAT = "%Y-%m-%d"

    @classmethod
    def str_to_dt(cls, _str):
        """Parse a date string into a date; a falsy value is returned unchanged."""
        return _str and pendulum.parse(_str).date()

    @classmethod
    def dt_to_str(cls, dt):
        """Format a date as a ``DATE_FORMAT`` string; a falsy value is returned unchanged."""
        return dt and dt.strftime(cls.DATE_FORMAT)

    @property
    def non_golden_dates(self):
        """Return the stored non-golden dates as a new list of date objects.

        The list is a copy: mutating it does not change the state. Assign to
        this property to persist a change.
        """
        return list(map(self.str_to_dt, self.get("non_golden_dates") or []))

    @non_golden_dates.setter
    def non_golden_dates(self, dates):
        # Persist as sorted, de-duplicated strings so the stored state stays
        # serializable and `non_golden_date_sequences` sees an ordered input.
        self["non_golden_dates"] = [self.dt_to_str(day) for day in sorted(set(dates))]

    @property
    def golden_date(self):
        """Return the cursor date, or a falsy value when none is stored."""
        return self.str_to_dt(self.get("golden_date") or self.get("ga_date"))  # support legacy format

    @golden_date.setter
    def golden_date(self, val):
        self["golden_date"] = self.dt_to_str(val)

    def non_golden_date_sequences(self, max_length):
        """Yield ``[start, end]`` pairs covering runs of consecutive non-golden dates.

        Dates later than today are ignored. A run is closed when the next date
        is not the day after the run's end, or as soon as ``end - start``
        equals ``max_length`` days.

        :param max_length: day difference between start and end at which a run is cut
        """
        seq_start, seq_end = None, None
        today = pendulum.now().date()
        # Sort and de-duplicate: run detection only works on ascending, unique dates.
        for current_day in sorted(set(self.non_golden_dates)):
            if current_day > today:
                continue
            if seq_start is None:
                seq_start = seq_end = current_day
                continue
            if current_day != seq_end.add(days=1):
                # Sequence broken: emit the finished run and start a new one at
                # the current day so that day is not lost.
                yield [seq_start, seq_end]
                seq_start = seq_end = current_day
                continue
            seq_end = current_day
            if (seq_end - seq_start).days == max_length:
                yield [seq_start, seq_end]
                seq_start, seq_end = None, None
        if seq_start is not None:
            yield [seq_start, seq_end]

    def update_with_record(self, date, is_data_golden):
        """Fold one record's date and golden flag into the state.

        :param date: the record's date as a string
        :param is_data_golden: whether the record was reported as golden
        """
        date = self.str_to_dt(date)
        # Work on a local set and write it back; the property getter returns a
        # copy, so in-place edits of its result would never reach the state.
        pending = set(self.non_golden_dates)
        if is_data_golden:
            pending.discard(date)
            current = self.golden_date
            self.golden_date = max(current, date) if current else date
        else:
            pending.add(date)
            # NOTE(review): the cursor is cleared when a non-golden record is
            # newer than it; behaviour kept as written - confirm this is intended.
            if self.golden_date and self.golden_date < date:
                self.golden_date = None
        self.non_golden_dates = pending
Original file line number Diff line number Diff line change
Expand Up @@ -310,13 +310,15 @@ def test_unknown_metrics_or_dimensions_error_validation(mock_metrics_dimensions_


@freeze_time("2021-11-30")
def test_stream_slices_limited_by_current_date(test_config, mock_metrics_dimensions_type_list_link):
def test_stream_slice_limits(test_config, mock_metrics_dimensions_type_list_link):
test_config["window_in_days"] = 14
g = GoogleAnalyticsV4IncrementalObjectsBase(config=test_config)
stream_state = {"ga_date": "2021-11-25"}
slices = g.stream_slices(stream_state=stream_state)
current_date = pendulum.now().date().strftime("%Y-%m-%d")
assert slices == [{"startDate": "2021-11-26", "endDate": current_date}]
expected_start_date = "2021-11-24" # always resync two days back
expected_end_date = current_date # do not try to sync future dates
assert slices == [{"startDate": expected_start_date, "endDate": expected_end_date}]


@freeze_time("2021-11-30")
Expand Down
2 changes: 2 additions & 0 deletions docs/integrations/sources/google-analytics-v4.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ When sampling occurs, a warning is logged to the sync log.
## IsDataGolden

Google Analytics API may return provisional or incomplete data. When this occurs, the returned data will set the flag `isDataGolden` to false, and the connector will log a warning to the sync log.
The connector always re-syncs data from the previous 2 days because it can [take](https://support.google.com/analytics/answer/1070983?hl=en#DataProcessingLatency&zippy=%2Cin-this-article) Google Analytics up to 48 hours to finish processing data. To determine whether a record's data has finished processing, use the exposed `isDataGolden` field.

## Reading Custom Reports

Expand Down Expand Up @@ -159,6 +160,7 @@ Incremental sync is supported only if you add `ga:date` dimension to your custom

| Version | Date | Pull Request | Subject |
|:--------|:-----------|:---------------------------------------------------------|:---------------------------------------------------------------------------------------------|
| 0.1.20 | 2022-04-28 | [12150](https://github.com/airbytehq/airbyte/pull/12150) | Expose `isDataGolden` field and always resync data two days back to make sure it is golden |
| 0.1.19 | 2022-04-19 | [12150](https://github.com/airbytehq/airbyte/pull/12150) | Minor changes to documentation |
| 0.1.18 | 2022-04-07 | [11803](https://github.com/airbytehq/airbyte/pull/11803) | Improved documentation |
| 0.1.17 | 2022-03-31 | [11512](https://github.com/airbytehq/airbyte/pull/11512) | Improved Unit and Acceptance tests coverage, fixed `read` with abnormally large state values |
Expand Down

0 comments on commit 52eee69

Please sign in to comment.