-
Notifications
You must be signed in to change notification settings - Fork 4.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
🎉 Google Ads improvement: Support user-specified queries #5302
Changes from all commits
7ba083b
0b57c89
7ba9a5c
0b343da
cbb274a
7c4a212
a4ddc06
5a27b95
00b9fa1
6a2d3d7
7e4c397
2c9efa3
b297a6a
80c6e2d
99c8e02
4e17d84
c7256c6
c31c3fd
78adddf
e17a320
14e80e1
f43c29f
5775511
a7abb83
dd737dd
ad8b2c0
baae471
1ce8bf7
a4685dc
bbd678f
7fa232f
a7b8916
0ba4690
9aaab72
2260ca2
3ee960b
c5d55de
612ffc3
22be66e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
{
  "streams": [
    {
      "stream": {
        "name": "ad_group_custom",
        "json_schema": {},
        "supported_sync_modes": ["full_refresh", "incremental"],
        "source_defined_cursor": true,
        "default_cursor_field": ["segments.date"]
      },
      "sync_mode": "incremental",
      "destination_sync_mode": "overwrite",
      "cursor_field": ["segments.date"]
    }
  ]
}
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,165 @@ | ||
# | ||
# MIT License | ||
# | ||
# Copyright (c) 2020 Airbyte | ||
# | ||
# Permission is hereby granted, free of charge, to any person obtaining a copy | ||
# of this software and associated documentation files (the "Software"), to deal | ||
# in the Software without restriction, including without limitation the rights | ||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
# copies of the Software, and to permit persons to whom the Software is | ||
# furnished to do so, subject to the following conditions: | ||
# | ||
# The above copyright notice and this permission notice shall be included in all | ||
# copies or substantial portions of the Software. | ||
# | ||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
# SOFTWARE. | ||
# | ||
|
||
import re
from functools import lru_cache
from typing import Any, Dict, List, Mapping, Optional

from .streams import IncrementalGoogleAdsStream
|
||
|
||
class CustomQuery(IncrementalGoogleAdsStream):
    """
    Incremental stream built from a user-defined GAQL query.

    The config supplies a raw GAQL query and a table name; the stream derives
    its JSON schema from the Google Ads field metadata service and injects a
    `segments.date` window into the query for incremental slicing.
    """

    def __init__(self, custom_query_config, **kwargs):
        """
        :param custom_query_config: dict with "query" (GAQL string) and
            "table_name" (stream name) keys.
        """
        self.custom_query_config = custom_query_config
        self.user_defined_query = custom_query_config["query"]
        super().__init__(**kwargs)

    @property
    def primary_key(self) -> Optional[str]:
        """
        The primary_key option is disabled. Config should not provide the primary key.
        It will be ignored if provided.
        If you need to enable it, uncomment the next line instead of `return None` and modify your config
        """
        # return self.custom_query_config.get("primary_key") or None
        return None

    @property
    def name(self) -> str:
        """Stream name, taken from the user config."""
        return self.custom_query_config["table_name"]

    def get_query(self, stream_slice: Mapping[str, Any] = None) -> str:
        """Return the user query with the slice's segments.date window injected."""
        start_date, end_date = self.get_date_params(stream_slice, self.cursor_field)
        return self.insert_segments_date_expr(self.user_defined_query, start_date, end_date)

    def get_json_schema(self) -> Dict[str, Any]:
        """
        Compose json schema based on user defined query.

        IncrementalGoogleAdsStream uses get_json_schema a lot while parsing
        responses, so the computed schema is memoized per instance. (An
        instance attribute is used instead of functools.lru_cache on the
        method, which would pin every CustomQuery instance in a global cache.)

        :return Dict object representing jsonschema
        """
        cached = getattr(self, "_json_schema_cache", None)
        if cached is not None:
            return cached

        local_json_schema = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {},
            "additionalProperties": True,
        }
        # full list {'ENUM', 'STRING', 'DATE', 'DOUBLE', 'RESOURCE_NAME', 'INT32', 'INT64', 'BOOLEAN', 'MESSAGE'}
        google_datatype_mapping = {
            "INT64": "integer",
            "INT32": "integer",
            "DOUBLE": "number",
            "STRING": "string",
            "BOOLEAN": "boolean",
            "DATE": "string",
        }
        fields = CustomQuery.get_query_fields(self.user_defined_query)
        # get_query always appends the cursor field to the emitted records;
        # avoid listing it twice when the user query already selects it.
        if self.cursor_field not in fields:
            fields.append(self.cursor_field)
        google_schema = self.google_ads_client.get_fields_metadata(fields)

        for field in fields:
            node = google_schema.get(field)
            if node is None:
                # Metadata lookup misses mean the column is not a valid Google
                # Ads field name; fail loudly instead of with AttributeError.
                raise Exception(f"Could not get metadata for field '{field}', check the query")
            # Data type is returned in enum format: "GoogleAdsFieldDataType.<data_type>"
            google_data_type = str(node.data_type).replace("GoogleAdsFieldDataType.", "")
            if google_data_type == "ENUM":
                field_value = {"type": "string", "enum": list(node.enum_values)}
            elif google_data_type == "MESSAGE":
                # Represents protobuf message and could be anything, set custom
                # attribute "protobuf_message" to convert it to a string (or
                # array of strings) later.
                # https://developers.google.com/google-ads/api/reference/rpc/v8/GoogleAdsFieldDataTypeEnum.GoogleAdsFieldDataType?hl=en#message
                if node.is_repeated:
                    output_type = ["array", "null"]
                else:
                    output_type = ["string", "null"]
                field_value = {"type": output_type, "protobuf_message": True}
            else:
                output_type = [google_datatype_mapping.get(google_data_type, "string"), "null"]
                field_value = {"type": output_type}
            local_json_schema["properties"][field] = field_value

        self._json_schema_cache = local_json_schema
        return local_json_schema

    # Regexp flags for parsing GAQL query
    RE_FLAGS = re.DOTALL | re.MULTILINE | re.IGNORECASE
    # Regexp for getting query columns
    SELECT_EXPR = re.compile("select(.*)from", flags=RE_FLAGS)
    WHERE_EXPR = re.compile("where.*", flags=RE_FLAGS)
    # list of keywords that can come after WHERE clause,
    # according to https://developers.google.com/google-ads/api/docs/query/grammar
    KEYWORDS_EXPR = re.compile("(order by|limit|parameters)", flags=RE_FLAGS)

    @staticmethod
    def get_query_fields(query: str) -> List[str]:
        """
        Extract the column names from a GAQL query's SELECT clause.
        Empty entries produced by stray/trailing commas are dropped.
        """
        match = CustomQuery.SELECT_EXPR.search(query)
        if not match:
            return []
        return [column.strip() for column in match.group(1).split(",") if column.strip()]

    @staticmethod
    def insert_segments_date_expr(query: str, start_date: str, end_date: str) -> str:
        """
        Insert segments.date condition to break query into slices for incremental stream.
        :param query Origin user defined query
        :param start_date start date for metric (inclusive)
        :param end_date end date for metric (inclusive)
        :return Modified query with date window condition included
        """
        # Insert segments.date into the column list. Splice by match span so
        # identical text elsewhere in the query is never touched (str.replace
        # would substitute every occurrence of the matched fragment).
        select_match = CustomQuery.SELECT_EXPR.search(query)
        if not select_match:
            raise Exception("Not valid GAQL expression")
        columns_end = select_match.end(1)
        result_query = query[:columns_end] + ", segments.date\n" + query[columns_end:]

        where_match = CustomQuery.WHERE_EXPR.search(result_query)
        if not where_match:
            # There is no WHERE condition: insert a new one before any trailing
            # keyword (ORDER BY / LIMIT / PARAMETERS), otherwise at the end.
            insert_at = len(result_query)
            keywords = CustomQuery.KEYWORDS_EXPR.search(result_query)
            if keywords:
                insert_at = keywords.start()
            return (
                result_query[:insert_at]
                + f"\nWHERE segments.date BETWEEN '{start_date}' AND '{end_date}'\n"
                + result_query[insert_at:]
            )

        # There is already a WHERE condition: append the date window to it,
        # just before any trailing keyword that follows the condition.
        where_text = where_match.group(0)
        keywords = CustomQuery.KEYWORDS_EXPR.search(where_text)
        condition_length = keywords.start() if keywords else len(where_text)
        condition_end = where_match.start() + condition_length
        return (
            result_query[:condition_end]
            + f" AND segments.date BETWEEN '{start_date}' AND '{end_date}'\n"
            + result_query[condition_end:]
        )
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,7 @@ | |
# SOFTWARE. | ||
# | ||
|
||
|
||
from enum import Enum | ||
from typing import Any, List, Mapping | ||
|
||
|
@@ -60,10 +61,32 @@ def send_request(self, query: str) -> SearchGoogleAdsResponse: | |
|
||
return self.ga_service.search(search_request) | ||
|
||
def get_fields_metadata(self, fields: List[str]) -> Mapping[str, Any]:
    """
    Query the GoogleAdsFieldService for data-type metadata on the columns
    of a user-defined query.
    :params fields list of columns for user defined query.
    :return dict of fields type info.
    """
    field_service = self.client.get_service("GoogleAdsFieldService")
    search_request = self.client.get_type("SearchGoogleAdsFieldsRequest")
    search_request.page_size = len(fields)
    quoted_fields = ",".join(f"'{field}'" for field in fields)
    search_request.query = f"""
    SELECT
      name,
      data_type,
      enum_values,
      is_repeated
    WHERE name in ({quoted_fields})
    """
    response = field_service.search_google_ads_fields(request=search_request)
    return {field_info.name: field_info for field_info in response}
|
||
@staticmethod | ||
def get_fields_from_schema(schema: Mapping[str, Any]) -> List[str]: | ||
properties = schema.get("properties") | ||
return [*properties] | ||
return list(properties.keys()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What if schema doesn't have properties? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the error will be raised, but this error will be raised as well in previous implementation ( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also, in case we have no properties in schema (like we used), I parse the raw query. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. outdated, now everything read is taken from get_json_schema |
||
|
||
@staticmethod | ||
def convert_schema_into_query( | ||
|
@@ -82,7 +105,7 @@ def convert_schema_into_query( | |
return query_template | ||
|
||
@staticmethod | ||
def get_field_value(field_value: GoogleAdsRow, field: str) -> str: | ||
def get_field_value(field_value: GoogleAdsRow, field: str, schema_type: Mapping[str, Any]) -> str: | ||
field_name = field.split(".") | ||
for level_attr in field_name: | ||
""" | ||
|
@@ -130,7 +153,6 @@ def get_field_value(field_value: GoogleAdsRow, field: str) -> str: | |
# In GoogleAdsRow there are attributes that add an underscore at the end in their name. | ||
# For example, 'ad_group_ad.ad.type' is replaced by 'ad_group_ad.ad.type_'. | ||
field_value = getattr(field_value, level_attr + "_", None) | ||
|
||
if isinstance(field_value, Enum): | ||
field_value = field_value.name | ||
elif isinstance(field_value, (Repeated, RepeatedComposite)): | ||
|
@@ -144,13 +166,23 @@ def get_field_value(field_value: GoogleAdsRow, field: str) -> str: | |
# For example: | ||
# 1. ad_group_ad.ad.responsive_display_ad.long_headline - type AdTextAsset (https://developers.google.com/google-ads/api/reference/rpc/v6/AdTextAsset?hl=en). | ||
# 2. ad_group_ad.ad.legacy_app_install_ad - type LegacyAppInstallAdInfo (https://developers.google.com/google-ads/api/reference/rpc/v7/LegacyAppInstallAdInfo?hl=en). | ||
# | ||
if not (isinstance(field_value, (list, int, float, str, bool, dict)) or field_value is None): | ||
field_value = str(field_value) | ||
# In case of custom query field has MESSAGE type it represents protobuf | ||
# message and could be anything, convert it to a string or array of | ||
# string if it has "repeated" flag on metadata | ||
if schema_type.get("protobuf_message"): | ||
if "array" in schema_type.get("type"): | ||
field_value = [str(field) for field in field_value] | ||
else: | ||
field_value = str(field_value) | ||
|
||
return field_value | ||
|
||
@staticmethod
def parse_single_result(schema: Mapping[str, Any], result: GoogleAdsRow):
    """
    Convert one GoogleAdsRow into a record dict keyed by schema field names.
    :param schema: jsonschema for the stream (must contain "properties").
    :param result: a GoogleAdsRow returned by the search API.
    :return dict mapping field name to its extracted value.
    """
    props = schema.get("properties")
    fields = GoogleAds.get_fields_from_schema(schema)
    # Pass the per-field schema entry so get_field_value can apply the
    # "protobuf_message" string/array conversion where required.
    single_record = {field: GoogleAds.get_field_value(result, field, props.get(field)) for field in fields}
    return single_record
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is this for?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
from the user side, catalog commandline argument is taken from the discover.
from our side for read we use console command python main.py read --config ... --catalog ...
This is to fill this command and be able to run it and get the data from source.