From 52eee69f6991ed32e9e0ca05d471d5fcded0b2d8 Mon Sep 17 00:00:00 2001 From: Denys Davydov Date: Thu, 28 Apr 2022 10:47:43 +0300 Subject: [PATCH 1/7] #12013 source GA to Beta: always sync data from two days ago --- .../source-google-analytics-v4/Dockerfile | 2 +- .../integration_tests/expected_records.txt | 12 ++-- .../source_google_analytics_v4/source.py | 15 ++-- .../source_google_analytics_v4/state.py | 72 +++++++++++++++++++ .../unit_tests/unit_test.py | 6 +- .../sources/google-analytics-v4.md | 2 + 6 files changed, 93 insertions(+), 16 deletions(-) create mode 100644 airbyte-integrations/connectors/source-google-analytics-v4/source_google_analytics_v4/state.py diff --git a/airbyte-integrations/connectors/source-google-analytics-v4/Dockerfile b/airbyte-integrations/connectors/source-google-analytics-v4/Dockerfile index fffcbe5b003c..56f7b74e9cd3 100644 --- a/airbyte-integrations/connectors/source-google-analytics-v4/Dockerfile +++ b/airbyte-integrations/connectors/source-google-analytics-v4/Dockerfile @@ -12,5 +12,5 @@ COPY main.py ./ ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=0.1.19 +LABEL io.airbyte.version=0.1.20 LABEL io.airbyte.name=airbyte/source-google-analytics-v4 diff --git a/airbyte-integrations/connectors/source-google-analytics-v4/integration_tests/expected_records.txt b/airbyte-integrations/connectors/source-google-analytics-v4/integration_tests/expected_records.txt index 3701178cfbe6..0a2dd91721f2 100644 --- a/airbyte-integrations/connectors/source-google-analytics-v4/integration_tests/expected_records.txt +++ b/airbyte-integrations/connectors/source-google-analytics-v4/integration_tests/expected_records.txt @@ -1,6 +1,6 @@ -{"stream": "new_users_per_day", "data": {"ga_date": "2021-12-10", "ga_country": "United States", "ga_region": "Washington", "ga_newUsers": 1, "view_id": "211669975"}, "emitted_at": 1639563255199} -{"stream": "devices", 
"data": {"ga_date": "2021-12-10", "ga_deviceCategory": "desktop", "ga_operatingSystem": "Macintosh", "ga_browser": "Firefox", "ga_users": 1, "ga_newUsers": 1, "ga_sessions": 1, "ga_sessionsPerUser": 1.0, "ga_avgSessionDuration": 0.0, "ga_pageviews": 1, "ga_pageviewsPerSession": 1.0, "ga_avgTimeOnPage": 0.0, "ga_bounceRate": 100.0, "ga_exitRate": 100.0, "view_id": "211669975"}, "emitted_at": 1639563253285} -{"stream": "daily_active_users", "data": {"ga_date": "2021-12-10", "ga_1dayUsers": 1, "view_id": "211669975"}, "emitted_at": 1639563251092} -{"stream": "weekly_active_users", "data": {"ga_date": "2021-12-15", "ga_7dayUsers": 1, "view_id": "211669975"}, "emitted_at": 1639563249172} -{"stream": "locations", "data": {"ga_date": "2021-12-10", "ga_continent": "Americas", "ga_subContinent": "Northern America", "ga_country": "United States", "ga_region": "Washington", "ga_metro": "Seattle-Tacoma WA", "ga_city": "Seattle", "ga_users": 1, "ga_newUsers": 1, "ga_sessions": 1, "ga_sessionsPerUser": 1.0, "ga_avgSessionDuration": 0.0, "ga_pageviews": 1, "ga_pageviewsPerSession": 1.0, "ga_avgTimeOnPage": 0.0, "ga_bounceRate": 100.0, "ga_exitRate": 100.0, "view_id": "211669975"}, "emitted_at": 1639563230934} -{"stream": "pages", "data": {"ga_date": "2021-12-10", "ga_hostname": "www.surveymonkey.com", "ga_pagePath": "/apps/NKI5TOTqk4tS5BZyJXU9YQ_3D_3D/preview", "ga_pageviews": 1, "ga_uniquePageviews": 1, "ga_avgTimeOnPage": 0.0, "ga_entrances": 1, "ga_entranceRate": 100.0, "ga_bounceRate": 100.0, "ga_exits": 1, "ga_exitRate": 100.0, "view_id": "211669975"}, "emitted_at": 1639563227527} \ No newline at end of file +{"stream": "new_users_per_day", "data": {"ga_date": "2021-12-10", "ga_country": "United States", "ga_region": "Washington", "ga_newUsers": 1, "view_id": "211669975", "isDataGolden": true}, "emitted_at": 1639563255199} +{"stream": "devices", "data": {"ga_date": "2021-12-10", "ga_deviceCategory": "desktop", "ga_operatingSystem": "Macintosh", "ga_browser": "Firefox", 
"ga_users": 1, "ga_newUsers": 1, "ga_sessions": 1, "ga_sessionsPerUser": 1.0, "ga_avgSessionDuration": 0.0, "ga_pageviews": 1, "ga_pageviewsPerSession": 1.0, "ga_avgTimeOnPage": 0.0, "ga_bounceRate": 100.0, "ga_exitRate": 100.0, "view_id": "211669975", "isDataGolden": true}, "emitted_at": 1639563253285} +{"stream": "daily_active_users", "data": {"ga_date": "2021-12-10", "ga_1dayUsers": 1, "view_id": "211669975", "isDataGolden": true}, "emitted_at": 1639563251092} +{"stream": "weekly_active_users", "data": {"ga_date": "2021-12-15", "ga_7dayUsers": 1, "view_id": "211669975", "isDataGolden": true}, "emitted_at": 1639563249172} +{"stream": "locations", "data": {"ga_date": "2021-12-10", "ga_continent": "Americas", "ga_subContinent": "Northern America", "ga_country": "United States", "ga_region": "Washington", "ga_metro": "Seattle-Tacoma WA", "ga_city": "Seattle", "ga_users": 1, "ga_newUsers": 1, "ga_sessions": 1, "ga_sessionsPerUser": 1.0, "ga_avgSessionDuration": 0.0, "ga_pageviews": 1, "ga_pageviewsPerSession": 1.0, "ga_avgTimeOnPage": 0.0, "ga_bounceRate": 100.0, "ga_exitRate": 100.0, "view_id": "211669975", "isDataGolden": true}, "emitted_at": 1639563230934} +{"stream": "pages", "data": {"ga_date": "2021-12-10", "ga_hostname": "www.surveymonkey.com", "ga_pagePath": "/apps/NKI5TOTqk4tS5BZyJXU9YQ_3D_3D/preview", "ga_pageviews": 1, "ga_uniquePageviews": 1, "ga_avgTimeOnPage": 0.0, "ga_entrances": 1, "ga_entranceRate": 100.0, "ga_bounceRate": 100.0, "ga_exits": 1, "ga_exitRate": 100.0, "view_id": "211669975", "isDataGolden": true}, "emitted_at": 1639563227527} \ No newline at end of file diff --git a/airbyte-integrations/connectors/source-google-analytics-v4/source_google_analytics_v4/source.py b/airbyte-integrations/connectors/source-google-analytics-v4/source_google_analytics_v4/source.py index b6fcc161d5f2..b3ed0813364e 100644 --- a/airbyte-integrations/connectors/source-google-analytics-v4/source_google_analytics_v4/source.py +++ 
b/airbyte-integrations/connectors/source-google-analytics-v4/source_google_analytics_v4/source.py @@ -209,7 +209,7 @@ def get_json_schema(self) -> Mapping[str, Any]: if data_format: metric_data["format"] = data_format schema["properties"][metric] = metric_data - + schema["properties"]["isDataGolden"] = {"type": "boolean"} return schema def stream_slices(self, stream_state: Mapping[str, Any] = None, **kwargs: Any) -> Iterable[Optional[Mapping[str, Any]]]: @@ -226,14 +226,15 @@ def stream_slices(self, stream_state: Mapping[str, Any] = None, **kwargs: Any) - ...] """ - today = pendulum.now().date() + end_date = pendulum.now().date() start_date = pendulum.parse(self.start_date).date() if stream_state: prev_end_date = pendulum.parse(stream_state.get(self.cursor_field)).date() - start_date = prev_end_date.add(days=1) - end_date = today - if start_date > end_date: - return [None] + start_date = prev_end_date.add(days=1) # do not include previous `end_date` + # always resync 2 previous days to be sure data is golden + # https://support.google.com/analytics/answer/1070983?hl=en#DataProcessingLatency&zippy=%2Cin-this-article + # https://github.com/airbytehq/airbyte/issues/12013#issuecomment-1111255503 + start_date = start_date.subtract(days=2) date_slices = [] slice_start_date = start_date @@ -403,7 +404,7 @@ def parse_response(self, response: requests.Response, **kwargs: Any) -> Iterable record[metric_name.replace("ga:", "ga_")] = value record["view_id"] = self.view_id - + record["isDataGolden"] = report.get("data", {}).get("isDataGolden", True) yield record def check_for_sampled_result(self, data: Mapping) -> None: diff --git a/airbyte-integrations/connectors/source-google-analytics-v4/source_google_analytics_v4/state.py b/airbyte-integrations/connectors/source-google-analytics-v4/source_google_analytics_v4/state.py new file mode 100644 index 000000000000..3e6ec082e54c --- /dev/null +++ 
b/airbyte-integrations/connectors/source-google-analytics-v4/source_google_analytics_v4/state.py @@ -0,0 +1,72 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# + +import pendulum + + +class StreamStateProxy(dict): + """ + docstring + """ + + DATE_FORMAT = "%Y-%m-%d" + + @classmethod + def str_to_dt(cls, _str): + return _str and pendulum.parse(_str).date() + + @classmethod + def dt_to_str(cls, dt): + return dt and dt.strftime(cls.DATE_FORMAT) + + @property + def non_golden_dates(self): + return list(map(self.str_to_dt, self.get("non_golden_dates", []))) + + @property + def golden_date(self): + return self.str_to_dt(self.get("golden_date") or self.get("ga_date")) # support legacy format + + @golden_date.setter + def golden_date(self, val): + self["golden_date"] = self.dt_to_str(val) + + def non_golden_date_sequences(self, max_length): + seq_start, seq_end = None, None + today = pendulum.now().date() + for current_day in self.non_golden_dates: + if current_day > today: + continue + if seq_start is None: + seq_start = seq_end = current_day + continue + next_day = seq_end.add(days=1) + if current_day != next_day: + # sequence broken + yield [seq_start, seq_end] + seq_start, seq_end = None, None + continue + seq_end = current_day + if (seq_end - seq_start).days == max_length: + yield [seq_start, seq_end] + seq_start, seq_end = None, None + continue + if seq_start and seq_end: + yield [seq_start, seq_end] + + def update_with_record(self, date, is_data_golden): + date = self.str_to_dt(date) + if is_data_golden: + if date in self.non_golden_dates: + self.non_golden_dates.remove(date) + self.golden_date = max(self.golden_date, date) if self.golden_date else date + if not is_data_golden: + self.non_golden_dates.append(date) + self.non_golden_dates.sort() + if self.golden_date and self.golden_date < date: + self.golden_date = None + # record_date, record_golden = + # if current_stream_state + # current_date, current_golden = 
current_stream_state.get(self.cursor_field, ["", False]) + # return {self.cursor_field: max(, )} diff --git a/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/unit_test.py b/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/unit_test.py index b95ed88cbdf3..60663d7087fe 100644 --- a/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/unit_test.py +++ b/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/unit_test.py @@ -310,13 +310,15 @@ def test_unknown_metrics_or_dimensions_error_validation(mock_metrics_dimensions_ @freeze_time("2021-11-30") -def test_stream_slices_limited_by_current_date(test_config, mock_metrics_dimensions_type_list_link): +def test_stream_slice_limits(test_config, mock_metrics_dimensions_type_list_link): test_config["window_in_days"] = 14 g = GoogleAnalyticsV4IncrementalObjectsBase(config=test_config) stream_state = {"ga_date": "2021-11-25"} slices = g.stream_slices(stream_state=stream_state) current_date = pendulum.now().date().strftime("%Y-%m-%d") - assert slices == [{"startDate": "2021-11-26", "endDate": current_date}] + expected_start_date = "2021-11-24" # always resync two days back + expected_end_date = current_date # do not try to sync future dates + assert slices == [{"startDate": expected_start_date, "endDate": expected_end_date}] @freeze_time("2021-11-30") diff --git a/docs/integrations/sources/google-analytics-v4.md b/docs/integrations/sources/google-analytics-v4.md index 7b57aabc8483..ef205684997b 100644 --- a/docs/integrations/sources/google-analytics-v4.md +++ b/docs/integrations/sources/google-analytics-v4.md @@ -97,6 +97,7 @@ When sampling occurs, a warning is logged to the sync log. ## IsDataGolden Google Analytics API may return provisional or incomplete data. When this occurs, the returned data will set the flag `isDataGolden` to false, and the connector will log a warning to the sync log. 
+The connector always syncs data from 2 days ago due to the fact it [takes](https://support.google.com/analytics/answer/1070983?hl=en#DataProcessingLatency&zippy=%2Cin-this-article) Google Analytics up to 48 hours to update the data. To determine whether data is finished processing or not, the `isDataGolden` flag is exposed and should be used. ## Reading Custom Reports @@ -159,6 +160,7 @@ Incremental sync is supported only if you add `ga:date` dimension to your custom | Version | Date | Pull Request | Subject | |:--------|:-----------|:---------------------------------------------------------|:---------------------------------------------------------------------------------------------| +| 0.1.20 | 2022-04-28 | [12150](https://github.com/airbytehq/airbyte/pull/12150) | Expose `isDataGOlden` field and always resync data two days back to make sure it is golden | | 0.1.19 | 2022-04-19 | [12150](https://github.com/airbytehq/airbyte/pull/12150) | Minor changes to documentation | | 0.1.18 | 2022-04-07 | [11803](https://github.com/airbytehq/airbyte/pull/11803) | Improved documentation | | 0.1.17 | 2022-03-31 | [11512](https://github.com/airbytehq/airbyte/pull/11512) | Improved Unit and Acceptance tests coverage, fixed `read` with abnormally large state values | From 3e371d57f392ee8edc6bbc60f410769905ef5ef7 Mon Sep 17 00:00:00 2001 From: Denys Davydov Date: Thu, 28 Apr 2022 10:50:28 +0300 Subject: [PATCH 2/7] #12013 GA to Beta: fix changelog --- docs/integrations/sources/google-analytics-v4.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/integrations/sources/google-analytics-v4.md b/docs/integrations/sources/google-analytics-v4.md index ef205684997b..8f52d238e6c2 100644 --- a/docs/integrations/sources/google-analytics-v4.md +++ b/docs/integrations/sources/google-analytics-v4.md @@ -160,7 +160,7 @@ Incremental sync is supported only if you add `ga:date` dimension to your custom | Version | Date | Pull Request | Subject | 
|:--------|:-----------|:---------------------------------------------------------|:---------------------------------------------------------------------------------------------| -| 0.1.20 | 2022-04-28 | [12150](https://github.com/airbytehq/airbyte/pull/12150) | Expose `isDataGOlden` field and always resync data two days back to make sure it is golden | +| 0.1.20 | 2022-04-28 | [12426](https://github.com/airbytehq/airbyte/pull/12426) | Expose `isDataGolden` field and always resync data two days back to make sure it is golden | | 0.1.19 | 2022-04-19 | [12150](https://github.com/airbytehq/airbyte/pull/12150) | Minor changes to documentation | | 0.1.18 | 2022-04-07 | [11803](https://github.com/airbytehq/airbyte/pull/11803) | Improved documentation | | 0.1.17 | 2022-03-31 | [11512](https://github.com/airbytehq/airbyte/pull/11512) | Improved Unit and Acceptance tests coverage, fixed `read` with abnormally large state values | From 3c2e7cf8be7dcfe989688c86806a2d57d8f9e5af Mon Sep 17 00:00:00 2001 From: Denys Davydov Date: Thu, 28 Apr 2022 10:53:03 +0300 Subject: [PATCH 3/7] #12013 source GA to Beta: rm odd file --- .../source_google_analytics_v4/state.py | 72 ------------------- 1 file changed, 72 deletions(-) delete mode 100644 airbyte-integrations/connectors/source-google-analytics-v4/source_google_analytics_v4/state.py diff --git a/airbyte-integrations/connectors/source-google-analytics-v4/source_google_analytics_v4/state.py b/airbyte-integrations/connectors/source-google-analytics-v4/source_google_analytics_v4/state.py deleted file mode 100644 index 3e6ec082e54c..000000000000 --- a/airbyte-integrations/connectors/source-google-analytics-v4/source_google_analytics_v4/state.py +++ /dev/null @@ -1,72 +0,0 @@ -# -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. 
-# - -import pendulum - - -class StreamStateProxy(dict): - """ - docstring - """ - - DATE_FORMAT = "%Y-%m-%d" - - @classmethod - def str_to_dt(cls, _str): - return _str and pendulum.parse(_str).date() - - @classmethod - def dt_to_str(cls, dt): - return dt and dt.strftime(cls.DATE_FORMAT) - - @property - def non_golden_dates(self): - return list(map(self.str_to_dt, self.get("non_golden_dates", []))) - - @property - def golden_date(self): - return self.str_to_dt(self.get("golden_date") or self.get("ga_date")) # support legacy format - - @golden_date.setter - def golden_date(self, val): - self["golden_date"] = self.dt_to_str(val) - - def non_golden_date_sequences(self, max_length): - seq_start, seq_end = None, None - today = pendulum.now().date() - for current_day in self.non_golden_dates: - if current_day > today: - continue - if seq_start is None: - seq_start = seq_end = current_day - continue - next_day = seq_end.add(days=1) - if current_day != next_day: - # sequence broken - yield [seq_start, seq_end] - seq_start, seq_end = None, None - continue - seq_end = current_day - if (seq_end - seq_start).days == max_length: - yield [seq_start, seq_end] - seq_start, seq_end = None, None - continue - if seq_start and seq_end: - yield [seq_start, seq_end] - - def update_with_record(self, date, is_data_golden): - date = self.str_to_dt(date) - if is_data_golden: - if date in self.non_golden_dates: - self.non_golden_dates.remove(date) - self.golden_date = max(self.golden_date, date) if self.golden_date else date - if not is_data_golden: - self.non_golden_dates.append(date) - self.non_golden_dates.sort() - if self.golden_date and self.golden_date < date: - self.golden_date = None - # record_date, record_golden = - # if current_stream_state - # current_date, current_golden = current_stream_state.get(self.cursor_field, ["", False]) - # return {self.cursor_field: max(, )} From 2a091ee10b7aa88e5af9bdaa9bc46860712008be Mon Sep 17 00:00:00 2001 From: Denys Davydov Date: Thu, 28 Apr 
2022 14:06:42 +0300 Subject: [PATCH 4/7] #12013 Source GA to Beta: comment out integration tests --- .../acceptance-test-config.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/airbyte-integrations/connectors/source-google-analytics-v4/acceptance-test-config.yml b/airbyte-integrations/connectors/source-google-analytics-v4/acceptance-test-config.yml index a697c0fc0955..920381712869 100644 --- a/airbyte-integrations/connectors/source-google-analytics-v4/acceptance-test-config.yml +++ b/airbyte-integrations/connectors/source-google-analytics-v4/acceptance-test-config.yml @@ -20,10 +20,12 @@ tests: empty_streams: [] expect_records: path: "integration_tests/expected_records.txt" - incremental: - - config_path: "secrets/service_config.json" - configured_catalog_path: "integration_tests/configured_catalog.json" - future_state_path: "integration_tests/abnormal_state.json" +# Since the connector applies a 2-day lookback window, it cannot pass SAT, which expects every record to produce a cursor value greater than or equal to the state value +# see https://github.com/airbytehq/airbyte/issues/12013 for details +# incremental: +# - config_path: "secrets/service_config.json" +# configured_catalog_path: "integration_tests/configured_catalog.json" +# future_state_path: "integration_tests/abnormal_state.json" full_refresh: - config_path: "secrets/service_config.json" configured_catalog_path: "integration_tests/configured_catalog.json" From 37fc7f1860cd1d6080b62f2254c821814fae543e Mon Sep 17 00:00:00 2001 From: Denys Davydov Date: Sun, 1 May 2022 12:05:00 +0300 Subject: [PATCH 5/7] #12013 expose isDataGolden field, assume missing field equals False --- .../source_google_analytics_v4/source.py | 12 ++++----- ...records.json => response_golden_data.json} | 1 + ...lse.json => response_non_golden_data.json} | 1 - .../unit_tests/unit_test.py | 27 ++++++++++++------- 4 files changed, 25 insertions(+), 16 deletions(-) rename 
airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/{response_with_records.json => response_golden_data.json} (96%) rename airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/{response_is_data_golden_false.json => response_non_golden_data.json} (96%) diff --git a/airbyte-integrations/connectors/source-google-analytics-v4/source_google_analytics_v4/source.py b/airbyte-integrations/connectors/source-google-analytics-v4/source_google_analytics_v4/source.py index b3ed0813364e..70a0f5cdc0de 100644 --- a/airbyte-integrations/connectors/source-google-analytics-v4/source_google_analytics_v4/source.py +++ b/airbyte-integrations/connectors/source-google-analytics-v4/source_google_analytics_v4/source.py @@ -231,10 +231,10 @@ def stream_slices(self, stream_state: Mapping[str, Any] = None, **kwargs: Any) - if stream_state: prev_end_date = pendulum.parse(stream_state.get(self.cursor_field)).date() start_date = prev_end_date.add(days=1) # do not include previous `end_date` - # always resync 2 previous days to be sure data is golden - # https://support.google.com/analytics/answer/1070983?hl=en#DataProcessingLatency&zippy=%2Cin-this-article - # https://github.com/airbytehq/airbyte/issues/12013#issuecomment-1111255503 - start_date = start_date.subtract(days=2) + # always resync 2 previous days to be sure data is golden + # https://support.google.com/analytics/answer/1070983?hl=en#DataProcessingLatency&zippy=%2Cin-this-article + # https://github.com/airbytehq/airbyte/issues/12013#issuecomment-1111255503 + start_date = start_date.subtract(days=2) date_slices = [] slice_start_date = start_date @@ -404,11 +404,11 @@ def parse_response(self, response: requests.Response, **kwargs: Any) -> Iterable record[metric_name.replace("ga:", "ga_")] = value record["view_id"] = self.view_id - record["isDataGolden"] = report.get("data", {}).get("isDataGolden", True) + record["isDataGolden"] = report.get("data", {}).get("isDataGolden", False) yield record def 
check_for_sampled_result(self, data: Mapping) -> None: - if not data.get("isDataGolden", True): + if not data.get("isDataGolden", False): self.logger.warning(DATA_IS_NOT_GOLDEN_MSG) if data.get("samplesReadCounts", False): self.logger.warning(RESULT_IS_SAMPLED_MSG) diff --git a/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/response_with_records.json b/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/response_golden_data.json similarity index 96% rename from airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/response_with_records.json rename to airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/response_golden_data.json index be89bd585876..38b5a1af404c 100644 --- a/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/response_with_records.json +++ b/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/response_golden_data.json @@ -23,6 +23,7 @@ ] } ], + "isDataGolden": true, "totals": [ { "values": ["158"] diff --git a/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/response_is_data_golden_false.json b/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/response_non_golden_data.json similarity index 96% rename from airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/response_is_data_golden_false.json rename to airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/response_non_golden_data.json index ff7e3d23ad23..486c27180ec3 100644 --- a/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/response_is_data_golden_false.json +++ b/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/response_non_golden_data.json @@ -23,7 +23,6 @@ ] } ], - "isDataGolden": false, "totals": [ { "values": ["158"] diff --git a/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/unit_test.py 
b/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/unit_test.py index 60663d7087fe..f5134467f412 100644 --- a/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/unit_test.py +++ b/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/unit_test.py @@ -11,7 +11,7 @@ import pendulum import pytest -from airbyte_cdk.models import ConfiguredAirbyteCatalog, SyncMode +from airbyte_cdk.models import ConfiguredAirbyteCatalog, SyncMode, Type from airbyte_cdk.sources.streams.http.auth import NoAuth from freezegun import freeze_time from source_google_analytics_v4.source import ( @@ -81,10 +81,10 @@ def mock_api_returns_no_records(requests_mock): @pytest.fixture def mock_api_returns_valid_records(requests_mock): """API returns valid data for given date based slice""" - yield requests_mock.post( - "https://analyticsreporting.googleapis.com/v4/reports:batchGet", - json=json.loads(read_file("response_with_records.json")), - ) + response = json.loads(read_file("response_golden_data.json")) + for report in response["reports"]: + assert report["data"]["isDataGolden"] is True + yield requests_mock.post("https://analyticsreporting.googleapis.com/v4/reports:batchGet", json=response) @pytest.fixture @@ -99,10 +99,10 @@ def mock_api_returns_sampled_results(requests_mock): @pytest.fixture def mock_api_returns_is_data_golden_false(requests_mock): """API returns valid data for given date based slice""" - yield requests_mock.post( - "https://analyticsreporting.googleapis.com/v4/reports:batchGet", - json=json.loads(read_file("response_is_data_golden_false.json")), - ) + response = json.loads(read_file("response_non_golden_data.json")) + for report in response["reports"]: + assert "isDataGolden" not in report["data"] + yield requests_mock.post("https://analyticsreporting.googleapis.com/v4/reports:batchGet", json=response) @pytest.fixture @@ -372,3 +372,12 @@ def test_connection_fail_due_to_http_status( assert "Please check the permissions 
for the requested view_id" in error assert test_config["view_id"] in error assert json_resp["error"] in error + + +def test_is_data_golden_flag_missing_equals_false( + mock_api_returns_is_data_golden_false, test_config, configured_catalog, mock_metrics_dimensions_type_list_link, mock_auth_call +): + source = SourceGoogleAnalyticsV4() + for message in source.read(logging.getLogger(), test_config, configured_catalog): + if message.type == Type.RECORD: + assert message.record.data["isDataGolden"] is False From cebb43b6b00c776384394a44cac3068b75aa6130 Mon Sep 17 00:00:00 2001 From: Denys Davydov Date: Tue, 3 May 2022 12:35:07 +0300 Subject: [PATCH 6/7] #12013 expose isDataGOlden flag: reword docs --- docs/integrations/sources/google-analytics-v4.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/integrations/sources/google-analytics-v4.md b/docs/integrations/sources/google-analytics-v4.md index 8f52d238e6c2..e62dd233d110 100644 --- a/docs/integrations/sources/google-analytics-v4.md +++ b/docs/integrations/sources/google-analytics-v4.md @@ -97,7 +97,9 @@ When sampling occurs, a warning is logged to the sync log. ## IsDataGolden Google Analytics API may return provisional or incomplete data. When this occurs, the returned data will set the flag `isDataGolden` to false, and the connector will log a warning to the sync log. -The connector always syncs data from 2 days ago due to the fact it [takes](https://support.google.com/analytics/answer/1070983?hl=en#DataProcessingLatency&zippy=%2Cin-this-article) Google Analytics up to 48 hours to update the data. To determine whether data is finished processing or not, the `isDataGolden` flag is exposed and should be used. +The connector adds a lookback window of 2 days to ensure any previously synced non-golden data is re-synced with its potential updates. 
This is done because [Google Analytics takes up to 48 hours](https://support.google.com/analytics/answer/1070983?hl=en#DataProcessingLatency&zippy=%2Cin-this-article) to update the data. For example: +- If your last sync occurred 5 days ago and a sync kicks off today, it will attempt to sync data from 7 days ago up to the latest data available. +To determine whether data is finished processing or not, the `isDataGolden` flag is exposed and should be used. ## Reading Custom Reports From 1dec82fa1aebbe7785a8d18a5913ba21c7ef5522 Mon Sep 17 00:00:00 2001 From: Octavia Squidington III Date: Tue, 3 May 2022 10:25:09 +0000 Subject: [PATCH 7/7] auto-bump connector version --- .../src/main/resources/seed/source_definitions.yaml | 2 +- .../init/src/main/resources/seed/source_specs.yaml | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/airbyte-config/init/src/main/resources/seed/source_definitions.yaml b/airbyte-config/init/src/main/resources/seed/source_definitions.yaml index 2da12aedfb70..80856857111a 100644 --- a/airbyte-config/init/src/main/resources/seed/source_definitions.yaml +++ b/airbyte-config/init/src/main/resources/seed/source_definitions.yaml @@ -279,7 +279,7 @@ - name: Google Analytics sourceDefinitionId: eff3616a-f9c3-11eb-9a03-0242ac130003 dockerRepository: airbyte/source-google-analytics-v4 - dockerImageTag: 0.1.19 + dockerImageTag: 0.1.20 documentationUrl: https://docs.airbyte.io/integrations/sources/google-analytics-v4 icon: google-analytics.svg sourceType: api diff --git a/airbyte-config/init/src/main/resources/seed/source_specs.yaml b/airbyte-config/init/src/main/resources/seed/source_specs.yaml index 95d0bd486068..49d781e978a9 100644 --- a/airbyte-config/init/src/main/resources/seed/source_specs.yaml +++ b/airbyte-config/init/src/main/resources/seed/source_specs.yaml @@ -2797,7 +2797,7 @@ oauthFlowOutputParameters: - - "access_token" - - "refresh_token" -- dockerImage: "airbyte/source-google-analytics-v4:0.1.19" +- 
dockerImage: "airbyte/source-google-analytics-v4:0.1.20" spec: documentationUrl: "https://docs.airbyte.io/integrations/sources/google-analytics-v4" connectionSpecification: @@ -2826,10 +2826,10 @@ - "2020-06-01" window_in_days: type: "integer" - title: "Window in days (Optional)" - description: "The amount of days each stream slice would consist of beginning\ - \ from start_date. Bigger the value - faster the fetch. (Min=1, as for\ - \ a Day; Max=364, as for a Year)." + title: "Data request window (Optional)" + description: "The amount of data batched by the number of days. The bigger\ + \ the value, the bigger the batch size and the lower the API requests\ + \ made. (Min=1, as for a Day; Max=364, as for a Year)." examples: - 30 - 60