diff --git a/airbyte-config/init/src/main/resources/seed/source_definitions.yaml b/airbyte-config/init/src/main/resources/seed/source_definitions.yaml index a9b096d3782c..a4d19c49fe09 100644 --- a/airbyte-config/init/src/main/resources/seed/source_definitions.yaml +++ b/airbyte-config/init/src/main/resources/seed/source_definitions.yaml @@ -279,7 +279,7 @@ - name: Google Analytics sourceDefinitionId: eff3616a-f9c3-11eb-9a03-0242ac130003 dockerRepository: airbyte/source-google-analytics-v4 - dockerImageTag: 0.1.19 + dockerImageTag: 0.1.20 documentationUrl: https://docs.airbyte.io/integrations/sources/google-analytics-v4 icon: google-analytics.svg sourceType: api diff --git a/airbyte-config/init/src/main/resources/seed/source_specs.yaml b/airbyte-config/init/src/main/resources/seed/source_specs.yaml index aafa7185f3cd..0012903832a0 100644 --- a/airbyte-config/init/src/main/resources/seed/source_specs.yaml +++ b/airbyte-config/init/src/main/resources/seed/source_specs.yaml @@ -2797,7 +2797,7 @@ oauthFlowOutputParameters: - - "access_token" - - "refresh_token" -- dockerImage: "airbyte/source-google-analytics-v4:0.1.19" +- dockerImage: "airbyte/source-google-analytics-v4:0.1.20" spec: documentationUrl: "https://docs.airbyte.io/integrations/sources/google-analytics-v4" connectionSpecification: @@ -2826,10 +2826,10 @@ - "2020-06-01" window_in_days: type: "integer" - title: "Window in days (Optional)" - description: "The amount of days each stream slice would consist of beginning\ - \ from start_date. Bigger the value - faster the fetch. (Min=1, as for\ - \ a Day; Max=364, as for a Year)." + title: "Data request window (Optional)" + description: "The amount of data batched by the number of days. The bigger\ + \ the value, the bigger the batch size and the lower the API requests\ + \ made. (Min=1, as for a Day; Max=364, as for a Year)." 
examples: - 30 - 60 diff --git a/airbyte-integrations/connectors/source-google-analytics-v4/Dockerfile b/airbyte-integrations/connectors/source-google-analytics-v4/Dockerfile index fffcbe5b003c..56f7b74e9cd3 100644 --- a/airbyte-integrations/connectors/source-google-analytics-v4/Dockerfile +++ b/airbyte-integrations/connectors/source-google-analytics-v4/Dockerfile @@ -12,5 +12,5 @@ COPY main.py ./ ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=0.1.19 +LABEL io.airbyte.version=0.1.20 LABEL io.airbyte.name=airbyte/source-google-analytics-v4 diff --git a/airbyte-integrations/connectors/source-google-analytics-v4/acceptance-test-config.yml b/airbyte-integrations/connectors/source-google-analytics-v4/acceptance-test-config.yml index a697c0fc0955..920381712869 100644 --- a/airbyte-integrations/connectors/source-google-analytics-v4/acceptance-test-config.yml +++ b/airbyte-integrations/connectors/source-google-analytics-v4/acceptance-test-config.yml @@ -20,10 +20,12 @@ tests: empty_streams: [] expect_records: path: "integration_tests/expected_records.txt" - incremental: - - config_path: "secrets/service_config.json" - configured_catalog_path: "integration_tests/configured_catalog.json" - future_state_path: "integration_tests/abnormal_state.json" +# Since the connector applies a 2-day lookback window, it cannot pass SAT, which requires all records to produce a cursor value greater than or equal to the state value +# see https://github.com/airbytehq/airbyte/issues/12013 for details +# incremental: +# - config_path: "secrets/service_config.json" +# configured_catalog_path: "integration_tests/configured_catalog.json" +# future_state_path: "integration_tests/abnormal_state.json" full_refresh: - config_path: "secrets/service_config.json" configured_catalog_path: "integration_tests/configured_catalog.json" diff --git 
a/airbyte-integrations/connectors/source-google-analytics-v4/integration_tests/expected_records.txt b/airbyte-integrations/connectors/source-google-analytics-v4/integration_tests/expected_records.txt index 3701178cfbe6..0a2dd91721f2 100644 --- a/airbyte-integrations/connectors/source-google-analytics-v4/integration_tests/expected_records.txt +++ b/airbyte-integrations/connectors/source-google-analytics-v4/integration_tests/expected_records.txt @@ -1,6 +1,6 @@ -{"stream": "new_users_per_day", "data": {"ga_date": "2021-12-10", "ga_country": "United States", "ga_region": "Washington", "ga_newUsers": 1, "view_id": "211669975"}, "emitted_at": 1639563255199} -{"stream": "devices", "data": {"ga_date": "2021-12-10", "ga_deviceCategory": "desktop", "ga_operatingSystem": "Macintosh", "ga_browser": "Firefox", "ga_users": 1, "ga_newUsers": 1, "ga_sessions": 1, "ga_sessionsPerUser": 1.0, "ga_avgSessionDuration": 0.0, "ga_pageviews": 1, "ga_pageviewsPerSession": 1.0, "ga_avgTimeOnPage": 0.0, "ga_bounceRate": 100.0, "ga_exitRate": 100.0, "view_id": "211669975"}, "emitted_at": 1639563253285} -{"stream": "daily_active_users", "data": {"ga_date": "2021-12-10", "ga_1dayUsers": 1, "view_id": "211669975"}, "emitted_at": 1639563251092} -{"stream": "weekly_active_users", "data": {"ga_date": "2021-12-15", "ga_7dayUsers": 1, "view_id": "211669975"}, "emitted_at": 1639563249172} -{"stream": "locations", "data": {"ga_date": "2021-12-10", "ga_continent": "Americas", "ga_subContinent": "Northern America", "ga_country": "United States", "ga_region": "Washington", "ga_metro": "Seattle-Tacoma WA", "ga_city": "Seattle", "ga_users": 1, "ga_newUsers": 1, "ga_sessions": 1, "ga_sessionsPerUser": 1.0, "ga_avgSessionDuration": 0.0, "ga_pageviews": 1, "ga_pageviewsPerSession": 1.0, "ga_avgTimeOnPage": 0.0, "ga_bounceRate": 100.0, "ga_exitRate": 100.0, "view_id": "211669975"}, "emitted_at": 1639563230934} -{"stream": "pages", "data": {"ga_date": "2021-12-10", "ga_hostname": "www.surveymonkey.com", 
"ga_pagePath": "/apps/NKI5TOTqk4tS5BZyJXU9YQ_3D_3D/preview", "ga_pageviews": 1, "ga_uniquePageviews": 1, "ga_avgTimeOnPage": 0.0, "ga_entrances": 1, "ga_entranceRate": 100.0, "ga_bounceRate": 100.0, "ga_exits": 1, "ga_exitRate": 100.0, "view_id": "211669975"}, "emitted_at": 1639563227527} \ No newline at end of file +{"stream": "new_users_per_day", "data": {"ga_date": "2021-12-10", "ga_country": "United States", "ga_region": "Washington", "ga_newUsers": 1, "view_id": "211669975", "isDataGolden": true}, "emitted_at": 1639563255199} +{"stream": "devices", "data": {"ga_date": "2021-12-10", "ga_deviceCategory": "desktop", "ga_operatingSystem": "Macintosh", "ga_browser": "Firefox", "ga_users": 1, "ga_newUsers": 1, "ga_sessions": 1, "ga_sessionsPerUser": 1.0, "ga_avgSessionDuration": 0.0, "ga_pageviews": 1, "ga_pageviewsPerSession": 1.0, "ga_avgTimeOnPage": 0.0, "ga_bounceRate": 100.0, "ga_exitRate": 100.0, "view_id": "211669975", "isDataGolden": true}, "emitted_at": 1639563253285} +{"stream": "daily_active_users", "data": {"ga_date": "2021-12-10", "ga_1dayUsers": 1, "view_id": "211669975", "isDataGolden": true}, "emitted_at": 1639563251092} +{"stream": "weekly_active_users", "data": {"ga_date": "2021-12-15", "ga_7dayUsers": 1, "view_id": "211669975", "isDataGolden": true}, "emitted_at": 1639563249172} +{"stream": "locations", "data": {"ga_date": "2021-12-10", "ga_continent": "Americas", "ga_subContinent": "Northern America", "ga_country": "United States", "ga_region": "Washington", "ga_metro": "Seattle-Tacoma WA", "ga_city": "Seattle", "ga_users": 1, "ga_newUsers": 1, "ga_sessions": 1, "ga_sessionsPerUser": 1.0, "ga_avgSessionDuration": 0.0, "ga_pageviews": 1, "ga_pageviewsPerSession": 1.0, "ga_avgTimeOnPage": 0.0, "ga_bounceRate": 100.0, "ga_exitRate": 100.0, "view_id": "211669975", "isDataGolden": true}, "emitted_at": 1639563230934} +{"stream": "pages", "data": {"ga_date": "2021-12-10", "ga_hostname": "www.surveymonkey.com", "ga_pagePath": 
"/apps/NKI5TOTqk4tS5BZyJXU9YQ_3D_3D/preview", "ga_pageviews": 1, "ga_uniquePageviews": 1, "ga_avgTimeOnPage": 0.0, "ga_entrances": 1, "ga_entranceRate": 100.0, "ga_bounceRate": 100.0, "ga_exits": 1, "ga_exitRate": 100.0, "view_id": "211669975", "isDataGolden": true}, "emitted_at": 1639563227527} \ No newline at end of file diff --git a/airbyte-integrations/connectors/source-google-analytics-v4/source_google_analytics_v4/source.py b/airbyte-integrations/connectors/source-google-analytics-v4/source_google_analytics_v4/source.py index b6fcc161d5f2..70a0f5cdc0de 100644 --- a/airbyte-integrations/connectors/source-google-analytics-v4/source_google_analytics_v4/source.py +++ b/airbyte-integrations/connectors/source-google-analytics-v4/source_google_analytics_v4/source.py @@ -209,7 +209,7 @@ def get_json_schema(self) -> Mapping[str, Any]: if data_format: metric_data["format"] = data_format schema["properties"][metric] = metric_data - + schema["properties"]["isDataGolden"] = {"type": "boolean"} return schema def stream_slices(self, stream_state: Mapping[str, Any] = None, **kwargs: Any) -> Iterable[Optional[Mapping[str, Any]]]: @@ -226,14 +226,15 @@ def stream_slices(self, stream_state: Mapping[str, Any] = None, **kwargs: Any) - ...] 
""" - today = pendulum.now().date() + end_date = pendulum.now().date() start_date = pendulum.parse(self.start_date).date() if stream_state: prev_end_date = pendulum.parse(stream_state.get(self.cursor_field)).date() - start_date = prev_end_date.add(days=1) - end_date = today - if start_date > end_date: - return [None] + start_date = prev_end_date.add(days=1) # do not include previous `end_date` + # always resync 2 previous days to be sure data is golden + # https://support.google.com/analytics/answer/1070983?hl=en#DataProcessingLatency&zippy=%2Cin-this-article + # https://github.com/airbytehq/airbyte/issues/12013#issuecomment-1111255503 + start_date = start_date.subtract(days=2) date_slices = [] slice_start_date = start_date @@ -403,11 +404,11 @@ def parse_response(self, response: requests.Response, **kwargs: Any) -> Iterable record[metric_name.replace("ga:", "ga_")] = value record["view_id"] = self.view_id - + record["isDataGolden"] = report.get("data", {}).get("isDataGolden", False) yield record def check_for_sampled_result(self, data: Mapping) -> None: - if not data.get("isDataGolden", True): + if not data.get("isDataGolden", False): self.logger.warning(DATA_IS_NOT_GOLDEN_MSG) if data.get("samplesReadCounts", False): self.logger.warning(RESULT_IS_SAMPLED_MSG) diff --git a/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/response_with_records.json b/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/response_golden_data.json similarity index 96% rename from airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/response_with_records.json rename to airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/response_golden_data.json index be89bd585876..38b5a1af404c 100644 --- a/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/response_with_records.json +++ b/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/response_golden_data.json @@ -23,6 +23,7 @@ ] } ], + 
"isDataGolden": true, "totals": [ { "values": ["158"] diff --git a/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/response_is_data_golden_false.json b/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/response_non_golden_data.json similarity index 96% rename from airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/response_is_data_golden_false.json rename to airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/response_non_golden_data.json index ff7e3d23ad23..486c27180ec3 100644 --- a/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/response_is_data_golden_false.json +++ b/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/response_non_golden_data.json @@ -23,7 +23,6 @@ ] } ], - "isDataGolden": false, "totals": [ { "values": ["158"] diff --git a/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/unit_test.py b/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/unit_test.py index b95ed88cbdf3..f5134467f412 100644 --- a/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/unit_test.py +++ b/airbyte-integrations/connectors/source-google-analytics-v4/unit_tests/unit_test.py @@ -11,7 +11,7 @@ import pendulum import pytest -from airbyte_cdk.models import ConfiguredAirbyteCatalog, SyncMode +from airbyte_cdk.models import ConfiguredAirbyteCatalog, SyncMode, Type from airbyte_cdk.sources.streams.http.auth import NoAuth from freezegun import freeze_time from source_google_analytics_v4.source import ( @@ -81,10 +81,10 @@ def mock_api_returns_no_records(requests_mock): @pytest.fixture def mock_api_returns_valid_records(requests_mock): """API returns valid data for given date based slice""" - yield requests_mock.post( - "https://analyticsreporting.googleapis.com/v4/reports:batchGet", - json=json.loads(read_file("response_with_records.json")), - ) + response = 
json.loads(read_file("response_golden_data.json")) + for report in response["reports"]: + assert report["data"]["isDataGolden"] is True + yield requests_mock.post("https://analyticsreporting.googleapis.com/v4/reports:batchGet", json=response) @pytest.fixture @@ -99,10 +99,10 @@ def mock_api_returns_sampled_results(requests_mock): @pytest.fixture def mock_api_returns_is_data_golden_false(requests_mock): """API returns valid data for given date based slice""" - yield requests_mock.post( - "https://analyticsreporting.googleapis.com/v4/reports:batchGet", - json=json.loads(read_file("response_is_data_golden_false.json")), - ) + response = json.loads(read_file("response_non_golden_data.json")) + for report in response["reports"]: + assert "isDataGolden" not in report["data"] + yield requests_mock.post("https://analyticsreporting.googleapis.com/v4/reports:batchGet", json=response) @pytest.fixture @@ -310,13 +310,15 @@ def test_unknown_metrics_or_dimensions_error_validation(mock_metrics_dimensions_ @freeze_time("2021-11-30") -def test_stream_slices_limited_by_current_date(test_config, mock_metrics_dimensions_type_list_link): +def test_stream_slice_limits(test_config, mock_metrics_dimensions_type_list_link): test_config["window_in_days"] = 14 g = GoogleAnalyticsV4IncrementalObjectsBase(config=test_config) stream_state = {"ga_date": "2021-11-25"} slices = g.stream_slices(stream_state=stream_state) current_date = pendulum.now().date().strftime("%Y-%m-%d") - assert slices == [{"startDate": "2021-11-26", "endDate": current_date}] + expected_start_date = "2021-11-24" # always resync two days back + expected_end_date = current_date # do not try to sync future dates + assert slices == [{"startDate": expected_start_date, "endDate": expected_end_date}] @freeze_time("2021-11-30") @@ -370,3 +372,12 @@ def test_connection_fail_due_to_http_status( assert "Please check the permissions for the requested view_id" in error assert test_config["view_id"] in error assert json_resp["error"] in 
error + + +def test_is_data_golden_flag_missing_equals_false( + mock_api_returns_is_data_golden_false, test_config, configured_catalog, mock_metrics_dimensions_type_list_link, mock_auth_call +): + source = SourceGoogleAnalyticsV4() + for message in source.read(logging.getLogger(), test_config, configured_catalog): + if message.type == Type.RECORD: + assert message.record.data["isDataGolden"] is False diff --git a/docs/integrations/sources/google-analytics-v4.md b/docs/integrations/sources/google-analytics-v4.md index 7b57aabc8483..e62dd233d110 100644 --- a/docs/integrations/sources/google-analytics-v4.md +++ b/docs/integrations/sources/google-analytics-v4.md @@ -97,6 +97,9 @@ When sampling occurs, a warning is logged to the sync log. ## IsDataGolden Google Analytics API may return provisional or incomplete data. When this occurs, the returned data will set the flag `isDataGolden` to false, and the connector will log a warning to the sync log. +The connector adds a lookback window of 2 days to ensure any previously synced non-golden data is re-synced with its potential updates. This is done because [Google Analytics takes up to 48 hours](https://support.google.com/analytics/answer/1070983?hl=en#DataProcessingLatency&zippy=%2Cin-this-article) to update the data. For example: +- If your last sync occurred 5 days ago and a sync kicks off today, it will attempt to sync data from 7 days ago up to the latest data available. +To determine whether data is finished processing or not, the `isDataGolden` flag is exposed and should be used. 
## Reading Custom Reports @@ -159,6 +162,7 @@ Incremental sync is supported only if you add `ga:date` dimension to your custom | Version | Date | Pull Request | Subject | |:--------|:-----------|:---------------------------------------------------------|:---------------------------------------------------------------------------------------------| +| 0.1.20 | 2022-04-28 | [12426](https://github.com/airbytehq/airbyte/pull/12426) | Expose `isDataGolden` field and always resync data two days back to make sure it is golden | | 0.1.19 | 2022-04-19 | [12150](https://github.com/airbytehq/airbyte/pull/12150) | Minor changes to documentation | | 0.1.18 | 2022-04-07 | [11803](https://github.com/airbytehq/airbyte/pull/11803) | Improved documentation | | 0.1.17 | 2022-03-31 | [11512](https://github.com/airbytehq/airbyte/pull/11512) | Improved Unit and Acceptance tests coverage, fixed `read` with abnormally large state values |