Source Google Analytics: always sync data from two days ago (#12426)

* #12013 source GA to Beta: always sync data from two days ago * #12013 GA to Beta: fix changelog * #12013 source GA to Beta: rm odd file * #12013 Source GA to Beta: comment out integration tests * #12013 expose isDataGolden field, assume missing field equals False * #12013 expose isDataGOlden flag: reword docs * auto-bump connector version Co-authored-by: Octavia Squidington III <octavia-squidington-iii@users.noreply.github.com>
airbytehq · May 23, 2022 · 43b4bb4 · 43b4bb4
1 parent bbce6ed
commit 43b4bb4
Show file tree

Hide file tree

Showing 10 changed files with 55 additions and 37 deletions.
diff --git a/airbyte-config/init/src/main/resources/seed/source_definitions.yaml b/airbyte-config/init/src/main/resources/seed/source_definitions.yaml
@@ -279,7 +279,7 @@
 - name: Google Analytics
   sourceDefinitionId: eff3616a-f9c3-11eb-9a03-0242ac130003
   dockerRepository: airbyte/source-google-analytics-v4
-  dockerImageTag: 0.1.19
+  dockerImageTag: 0.1.20
   documentationUrl: https://docs.airbyte.io/integrations/sources/google-analytics-v4
   icon: google-analytics.svg
   sourceType: api

diff --git a/airbyte-config/init/src/main/resources/seed/source_specs.yaml b/airbyte-config/init/src/main/resources/seed/source_specs.yaml
@@ -2797,7 +2797,7 @@
         oauthFlowOutputParameters:
         - - "access_token"
         - - "refresh_token"
-- dockerImage: "airbyte/source-google-analytics-v4:0.1.19"
+- dockerImage: "airbyte/source-google-analytics-v4:0.1.20"
   spec:
     documentationUrl: "https://docs.airbyte.io/integrations/sources/google-analytics-v4"
     connectionSpecification:
@@ -2826,10 +2826,10 @@
           - "2020-06-01"
         window_in_days:
           type: "integer"
-          title: "Window in days (Optional)"
-          description: "The amount of days each stream slice would consist of beginning\
-            \ from start_date. Bigger the value - faster the fetch. (Min=1, as for\
-            \ a Day; Max=364, as for a Year)."
+          title: "Data request window (Optional)"
+          description: "The amount of data batched by the number of days. The bigger\
+            \ the value, the bigger the batch size and the lower the API requests\
+            \ made. (Min=1, as for a Day; Max=364, as for a Year)."
           examples:
           - 30
           - 60

diff --git a/airbyte-integrations/connectors/source-google-analytics-v4/Dockerfile b/airbyte-integrations/connectors/source-google-analytics-v4/Dockerfile
@@ -12,5 +12,5 @@ COPY main.py ./
 ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
 ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]
 
-LABEL io.airbyte.version=0.1.19
+LABEL io.airbyte.version=0.1.20
 LABEL io.airbyte.name=airbyte/source-google-analytics-v4
diff --git a/airbyte-integrations/connectors/source-google-analytics-v4/acceptance-test-config.yml b/airbyte-integrations/connectors/source-google-analytics-v4/acceptance-test-config.yml
@@ -20,10 +20,12 @@ tests:
       empty_streams: []
       expect_records:
         path: "integration_tests/expected_records.txt"
-  incremental:
-    - config_path: "secrets/service_config.json"
-      configured_catalog_path: "integration_tests/configured_catalog.json"
-      future_state_path: "integration_tests/abnormal_state.json"
+# Since the connector applies a 2-day lookback window, it cannot pass the SAT incremental test, which requires every record's cursor value to be greater than or equal to the state value
+# see https://github.com/airbytehq/airbyte/issues/12013 for details
+#  incremental:
+#    - config_path: "secrets/service_config.json"
+#      configured_catalog_path: "integration_tests/configured_catalog.json"
+#      future_state_path: "integration_tests/abnormal_state.json"
   full_refresh:
     - config_path: "secrets/service_config.json"
       configured_catalog_path: "integration_tests/configured_catalog.json"
diff --git a/...integrations/connectors/source-google-analytics-v4/integration_tests/expected_records.txt b/...integrations/connectors/source-google-analytics-v4/integration_tests/expected_records.txt
@@ -1,6 +1,6 @@
-{"stream": "new_users_per_day", "data": {"ga_date": "2021-12-10", "ga_country": "United States", "ga_region": "Washington", "ga_newUsers": 1, "view_id": "211669975"}, "emitted_at": 1639563255199}
-{"stream": "devices", "data": {"ga_date": "2021-12-10", "ga_deviceCategory": "desktop", "ga_operatingSystem": "Macintosh", "ga_browser": "Firefox", "ga_users": 1, "ga_newUsers": 1, "ga_sessions": 1, "ga_sessionsPerUser": 1.0, "ga_avgSessionDuration": 0.0, "ga_pageviews": 1, "ga_pageviewsPerSession": 1.0, "ga_avgTimeOnPage": 0.0, "ga_bounceRate": 100.0, "ga_exitRate": 100.0, "view_id": "211669975"}, "emitted_at": 1639563253285}
-{"stream": "daily_active_users", "data": {"ga_date": "2021-12-10", "ga_1dayUsers": 1, "view_id": "211669975"}, "emitted_at": 1639563251092}
-{"stream": "weekly_active_users", "data": {"ga_date": "2021-12-15", "ga_7dayUsers": 1, "view_id": "211669975"}, "emitted_at": 1639563249172}
-{"stream": "locations", "data": {"ga_date": "2021-12-10", "ga_continent": "Americas", "ga_subContinent": "Northern America", "ga_country": "United States", "ga_region": "Washington", "ga_metro": "Seattle-Tacoma WA", "ga_city": "Seattle", "ga_users": 1, "ga_newUsers": 1, "ga_sessions": 1, "ga_sessionsPerUser": 1.0, "ga_avgSessionDuration": 0.0, "ga_pageviews": 1, "ga_pageviewsPerSession": 1.0, "ga_avgTimeOnPage": 0.0, "ga_bounceRate": 100.0, "ga_exitRate": 100.0, "view_id": "211669975"}, "emitted_at": 1639563230934}
-{"stream": "pages", "data": {"ga_date": "2021-12-10", "ga_hostname": "www.surveymonkey.com", "ga_pagePath": "/apps/NKI5TOTqk4tS5BZyJXU9YQ_3D_3D/preview", "ga_pageviews": 1, "ga_uniquePageviews": 1, "ga_avgTimeOnPage": 0.0, "ga_entrances": 1, "ga_entranceRate": 100.0, "ga_bounceRate": 100.0, "ga_exits": 1, "ga_exitRate": 100.0, "view_id": "211669975"}, "emitted_at": 1639563227527}
+{"stream": "new_users_per_day", "data": {"ga_date": "2021-12-10", "ga_country": "United States", "ga_region": "Washington", "ga_newUsers": 1, "view_id": "211669975", "isDataGolden": true}, "emitted_at": 1639563255199}
+{"stream": "devices", "data": {"ga_date": "2021-12-10", "ga_deviceCategory": "desktop", "ga_operatingSystem": "Macintosh", "ga_browser": "Firefox", "ga_users": 1, "ga_newUsers": 1, "ga_sessions": 1, "ga_sessionsPerUser": 1.0, "ga_avgSessionDuration": 0.0, "ga_pageviews": 1, "ga_pageviewsPerSession": 1.0, "ga_avgTimeOnPage": 0.0, "ga_bounceRate": 100.0, "ga_exitRate": 100.0, "view_id": "211669975", "isDataGolden": true}, "emitted_at": 1639563253285}
+{"stream": "daily_active_users", "data": {"ga_date": "2021-12-10", "ga_1dayUsers": 1, "view_id": "211669975", "isDataGolden": true}, "emitted_at": 1639563251092}
+{"stream": "weekly_active_users", "data": {"ga_date": "2021-12-15", "ga_7dayUsers": 1, "view_id": "211669975", "isDataGolden": true}, "emitted_at": 1639563249172}
+{"stream": "locations", "data": {"ga_date": "2021-12-10", "ga_continent": "Americas", "ga_subContinent": "Northern America", "ga_country": "United States", "ga_region": "Washington", "ga_metro": "Seattle-Tacoma WA", "ga_city": "Seattle", "ga_users": 1, "ga_newUsers": 1, "ga_sessions": 1, "ga_sessionsPerUser": 1.0, "ga_avgSessionDuration": 0.0, "ga_pageviews": 1, "ga_pageviewsPerSession": 1.0, "ga_avgTimeOnPage": 0.0, "ga_bounceRate": 100.0, "ga_exitRate": 100.0, "view_id": "211669975", "isDataGolden": true}, "emitted_at": 1639563230934}
+{"stream": "pages", "data": {"ga_date": "2021-12-10", "ga_hostname": "www.surveymonkey.com", "ga_pagePath": "/apps/NKI5TOTqk4tS5BZyJXU9YQ_3D_3D/preview", "ga_pageviews": 1, "ga_uniquePageviews": 1, "ga_avgTimeOnPage": 0.0, "ga_entrances": 1, "ga_entranceRate": 100.0, "ga_bounceRate": 100.0, "ga_exits": 1, "ga_exitRate": 100.0, "view_id": "211669975", "isDataGolden": true}, "emitted_at": 1639563227527}
diff --git a/...e-integrations/connectors/source-google-analytics-v4/source_google_analytics_v4/source.py b/...e-integrations/connectors/source-google-analytics-v4/source_google_analytics_v4/source.py
@@ -209,7 +209,7 @@ def get_json_schema(self) -> Mapping[str, Any]:
             if data_format:
                 metric_data["format"] = data_format
             schema["properties"][metric] = metric_data
-
+        schema["properties"]["isDataGolden"] = {"type": "boolean"}
         return schema
 
     def stream_slices(self, stream_state: Mapping[str, Any] = None, **kwargs: Any) -> Iterable[Optional[Mapping[str, Any]]]:
@@ -226,14 +226,15 @@ def stream_slices(self, stream_state: Mapping[str, Any] = None, **kwargs: Any) -
             ...]
         """
 
-        today = pendulum.now().date()
+        end_date = pendulum.now().date()
         start_date = pendulum.parse(self.start_date).date()
         if stream_state:
             prev_end_date = pendulum.parse(stream_state.get(self.cursor_field)).date()
-            start_date = prev_end_date.add(days=1)
-        end_date = today
-        if start_date > end_date:
-            return [None]
+            start_date = prev_end_date.add(days=1)  # do not include previous `end_date`
+        # always resync 2 previous days to be sure data is golden
+        # https://support.google.com/analytics/answer/1070983?hl=en#DataProcessingLatency&zippy=%2Cin-this-article
+        # https://github.com/airbytehq/airbyte/issues/12013#issuecomment-1111255503
+        start_date = start_date.subtract(days=2)
 
         date_slices = []
         slice_start_date = start_date
@@ -403,11 +404,11 @@ def parse_response(self, response: requests.Response, **kwargs: Any) -> Iterable
                         record[metric_name.replace("ga:", "ga_")] = value
 
                 record["view_id"] = self.view_id
-
+                record["isDataGolden"] = report.get("data", {}).get("isDataGolden", False)
                 yield record
 
     def check_for_sampled_result(self, data: Mapping) -> None:
-        if not data.get("isDataGolden", True):
+        if not data.get("isDataGolden", False):
             self.logger.warning(DATA_IS_NOT_GOLDEN_MSG)
         if data.get("samplesReadCounts", False):
             self.logger.warning(RESULT_IS_SAMPLED_MSG)

diff --git a/...-v4/unit_tests/response_with_records.json → ...s-v4/unit_tests/response_golden_data.json b/...-v4/unit_tests/response_with_records.json → ...s-v4/unit_tests/response_golden_data.json
@@ -23,6 +23,7 @@
             ]
           }
         ],
+        "isDataGolden": true,
         "totals": [
           {
             "values": ["158"]

diff --git a/..._tests/response_is_data_golden_false.json → .../unit_tests/response_non_golden_data.json b/..._tests/response_is_data_golden_false.json → .../unit_tests/response_non_golden_data.json
@@ -23,7 +23,6 @@
             ]
           }
         ],
-        "isDataGolden": false,
         "totals": [
           {
             "values": ["158"]

diff --git a/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/unit_test.py b/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/unit_test.py
@@ -11,7 +11,7 @@
 
 import pendulum
 import pytest
-from airbyte_cdk.models import ConfiguredAirbyteCatalog, SyncMode
+from airbyte_cdk.models import ConfiguredAirbyteCatalog, SyncMode, Type
 from airbyte_cdk.sources.streams.http.auth import NoAuth
 from freezegun import freeze_time
 from source_google_analytics_v4.source import (
@@ -81,10 +81,10 @@ def mock_api_returns_no_records(requests_mock):
 @pytest.fixture
 def mock_api_returns_valid_records(requests_mock):
     """API returns valid data for given date based slice"""
-    yield requests_mock.post(
-        "https://analyticsreporting.googleapis.com/v4/reports:batchGet",
-        json=json.loads(read_file("response_with_records.json")),
-    )
+    response = json.loads(read_file("response_golden_data.json"))
+    for report in response["reports"]:
+        assert report["data"]["isDataGolden"] is True
+    yield requests_mock.post("https://analyticsreporting.googleapis.com/v4/reports:batchGet", json=response)
 
 
 @pytest.fixture
@@ -99,10 +99,10 @@ def mock_api_returns_sampled_results(requests_mock):
 @pytest.fixture
 def mock_api_returns_is_data_golden_false(requests_mock):
     """API returns valid data for given date based slice"""
-    yield requests_mock.post(
-        "https://analyticsreporting.googleapis.com/v4/reports:batchGet",
-        json=json.loads(read_file("response_is_data_golden_false.json")),
-    )
+    response = json.loads(read_file("response_non_golden_data.json"))
+    for report in response["reports"]:
+        assert "isDataGolden" not in report["data"]
+    yield requests_mock.post("https://analyticsreporting.googleapis.com/v4/reports:batchGet", json=response)
 
 
 @pytest.fixture
@@ -310,13 +310,15 @@ def test_unknown_metrics_or_dimensions_error_validation(mock_metrics_dimensions_
 
 
 @freeze_time("2021-11-30")
-def test_stream_slices_limited_by_current_date(test_config, mock_metrics_dimensions_type_list_link):
+def test_stream_slice_limits(test_config, mock_metrics_dimensions_type_list_link):
     test_config["window_in_days"] = 14
     g = GoogleAnalyticsV4IncrementalObjectsBase(config=test_config)
     stream_state = {"ga_date": "2021-11-25"}
     slices = g.stream_slices(stream_state=stream_state)
     current_date = pendulum.now().date().strftime("%Y-%m-%d")
-    assert slices == [{"startDate": "2021-11-26", "endDate": current_date}]
+    expected_start_date = "2021-11-24"  # always resync two days back
+    expected_end_date = current_date  # do not try to sync future dates
+    assert slices == [{"startDate": expected_start_date, "endDate": expected_end_date}]
 
 
 @freeze_time("2021-11-30")
@@ -370,3 +372,12 @@ def test_connection_fail_due_to_http_status(
         assert "Please check the permissions for the requested view_id" in error
         assert test_config["view_id"] in error
     assert json_resp["error"] in error
+
+
+def test_is_data_golden_flag_missing_equals_false(
+    mock_api_returns_is_data_golden_false, test_config, configured_catalog, mock_metrics_dimensions_type_list_link, mock_auth_call
+):
+    source = SourceGoogleAnalyticsV4()
+    for message in source.read(logging.getLogger(), test_config, configured_catalog):
+        if message.type == Type.RECORD:
+            assert message.record.data["isDataGolden"] is False
diff --git a/docs/integrations/sources/google-analytics-v4.md b/docs/integrations/sources/google-analytics-v4.md
@@ -97,6 +97,9 @@ When sampling occurs, a warning is logged to the sync log.
 ## IsDataGolden
 
 Google Analytics API may return provisional or incomplete data. When this occurs, the returned data will set the flag `isDataGolden` to false, and the connector will log a warning to the sync log.
+The connector adds a lookback window of 2 days to ensure any previously synced non-golden data is re-synced with its potential updates. This is done because [Google Analytics takes up to 48 hours](https://support.google.com/analytics/answer/1070983?hl=en#DataProcessingLatency&zippy=%2Cin-this-article) to update the data. For example:
+- If your last sync occurred 5 days ago and a sync kicks off today, it will attempt to sync data from 7 days ago up to the latest data available.
+To determine whether the data has finished processing, use the exposed `isDataGolden` flag.
 
 ## Reading Custom Reports
 
@@ -159,6 +162,7 @@ Incremental sync is supported only if you add `ga:date` dimension to your custom
 
 | Version | Date       | Pull Request                                             | Subject                                                                                      |
 |:--------|:-----------|:---------------------------------------------------------|:---------------------------------------------------------------------------------------------|
+| 0.1.20  | 2022-04-28 | [12426](https://github.com/airbytehq/airbyte/pull/12426) | Expose `isDataGolden` field and always resync data two days back to make sure it is golden   |
 | 0.1.19  | 2022-04-19 | [12150](https://github.com/airbytehq/airbyte/pull/12150) | Minor changes to documentation                                                               |
 | 0.1.18  | 2022-04-07 | [11803](https://github.com/airbytehq/airbyte/pull/11803) | Improved documentation                                                                       |
 | 0.1.17  | 2022-03-31 | [11512](https://github.com/airbytehq/airbyte/pull/11512) | Improved Unit and Acceptance tests coverage, fixed `read` with abnormally large state values |