diff --git a/airbyte-integrations/bases/base-normalization/Dockerfile b/airbyte-integrations/bases/base-normalization/Dockerfile index c8619f21930d..ca4f1b51714f 100644 --- a/airbyte-integrations/bases/base-normalization/Dockerfile +++ b/airbyte-integrations/bases/base-normalization/Dockerfile @@ -28,5 +28,5 @@ WORKDIR /airbyte ENV AIRBYTE_ENTRYPOINT "/airbyte/entrypoint.sh" ENTRYPOINT ["/airbyte/entrypoint.sh"] -LABEL io.airbyte.version=0.1.65 +LABEL io.airbyte.version=0.1.66 LABEL io.airbyte.name=airbyte/normalization diff --git a/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py b/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py index 66a1372bfcdf..e5fc2df0ae84 100644 --- a/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py +++ b/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py @@ -146,7 +146,7 @@ def __normalize_non_column_identifier_name( ) -> str: # We force standard naming for non column names (see issue #1785) result = transform_standard_naming(input_name) - result = self.__normalize_naming_conventions(result) + result = self.__normalize_naming_conventions(result, is_column=False) if truncate: result = self.truncate_identifier_name(input_name=result, conflict=conflict, conflict_level=conflict_level) result = self.__normalize_identifier_case(result, is_quoted=False) @@ -160,7 +160,7 @@ def __normalize_non_column_identifier_name( def __normalize_identifier_name( self, column_name: str, in_jinja: bool = False, truncate: bool = True, conflict: bool = False, conflict_level: int = 0 ) -> str: - result = self.__normalize_naming_conventions(column_name) + result = self.__normalize_naming_conventions(column_name, is_column=True) if truncate: result = self.truncate_identifier_name(input_name=result, conflict=conflict, conflict_level=conflict_level) if self.needs_quotes(result): @@ -189,14 +189,16 @@ def apply_quote(self, input: str) -> str: return f"quote('{input}')" return f"adapter.quote('{input}')" - def __normalize_naming_conventions(self, input_name: str) -> str: + def __normalize_naming_conventions(self, input_name: str, is_column: bool = False) -> str: result = input_name if self.destination_type.value == DestinationType.ORACLE.value: return transform_standard_naming(result) elif self.destination_type.value == DestinationType.BIGQUERY.value: + # Can start with number: datasetId, table + # Can not start with number: column result = transform_standard_naming(result) doesnt_start_with_alphaunderscore = match("[^A-Za-z_]", result[0]) is not None - if doesnt_start_with_alphaunderscore: + if is_column and doesnt_start_with_alphaunderscore: result = f"_{result}" return result diff --git a/airbyte-integrations/bases/base-normalization/unit_tests/test_destination_name_transformer.py b/airbyte-integrations/bases/base-normalization/unit_tests/test_destination_name_transformer.py index 61754fc2fc59..d6a435777ee1 100644 --- a/airbyte-integrations/bases/base-normalization/unit_tests/test_destination_name_transformer.py +++ b/airbyte-integrations/bases/base-normalization/unit_tests/test_destination_name_transformer.py @@ -98,6 +98,62 @@ def test_transform_standard_naming(input_str: str, expected: str): assert transform_standard_naming(input_str) == expected +@pytest.mark.parametrize( + "input_str, destination_type, expected, expected_in_jinja", + [ + # Case sensitive names + ("Identifier Name", "Postgres", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"), + ("Identifier Name", "BigQuery", "Identifier_Name", "'Identifier_Name'"), + ("Identifier Name", "Snowflake", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"), + ("Identifier Name", "Redshift", "{{ adapter.quote('identifier name') }}", "adapter.quote('identifier name')"), + ("Identifier Name", "MySQL", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"), + ("Identifier Name", "MSSQL", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"), + # Reserved Word for BigQuery and MySQL only + ("Groups", "Postgres", "groups", "'groups'"), + ("Groups", "BigQuery", "{{ adapter.quote('Groups') }}", "adapter.quote('Groups')"), + ("Groups", "Snowflake", "GROUPS", "'GROUPS'"), + ("Groups", "Redshift", "groups", "'groups'"), + ("Groups", "MySQL", "{{ adapter.quote('Groups') }}", "adapter.quote('Groups')"), + ("Groups", "MSSQL", "groups", "'groups'"), + ], +) +def test_normalize_column_name(input_str: str, destination_type: str, expected: str, expected_in_jinja: str): + t = DestinationType.from_string(destination_type) + assert DestinationNameTransformer(t).normalize_column_name(input_str, in_jinja=False) == expected + assert DestinationNameTransformer(t).normalize_column_name(input_str, in_jinja=True) == expected_in_jinja + + +@pytest.mark.parametrize( + "input_str, expected", + [ + # below the limit + ("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh", "Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh"), + # at the limit + ("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iii", "Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iii"), + # over the limit + ("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii", "Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii"), + ("Aaaa_Bbbb_Cccc_Dddd_Eeee_a_very_long_name_Ffff_Gggg_Hhhh_Iiii", "Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii"), + ("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii_Jjjj_Kkkk", "Aaaa_Bbbb_Cccc_Dddd___g_Hhhh_Iiii_Jjjj_Kkkk"), + ("ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz_0123456789", "ABCDEFGHIJKLMNOPQRST__qrstuvwxyz_0123456789"), + ], +) +def test_truncate_identifier(input_str: str, expected: str): + """ + Rules about truncations, for example for both of these strings which are too long for the postgres 64 limit: + - `Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii` + - `Aaaa_Bbbb_Cccc_Dddd_Eeee_a_very_long_name_Ffff_Gggg_Hhhh_Iiii` + + Deciding on how to truncate (in the middle) are being verified in these tests. + In this instance, both strings ends up as:`Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii` + and can potentially cause a collision in table names. + + Note that dealing with such collisions is not part of `destination_name_transformer` but of the `stream_processor`. + """ + name_transformer = DestinationNameTransformer(DestinationType.POSTGRES) + print(f"Truncating from #{len(input_str)} to #{len(expected)}") + assert name_transformer.truncate_identifier_name(input_str) == expected + + @pytest.mark.parametrize( "input_str, destination_type, expected, expected_column", [ @@ -117,7 +173,8 @@ def test_transform_standard_naming(input_str: str, expected: str): ("a-Unicode_name_文6", "MSSQL", "a_unicode_name__6", "{{ adapter.quote('a-Unicode_name_文6') }}"), # Doesnt start with alpha or underscore ("100x2001", "Postgres", "100x2001", "{{ adapter.quote('100x2001') }}"), - ("100x2002", "BigQuery", "_100x2002", "_100x2002"), + ("100x2002", "BigQuery", "100x2002", "_100x2002"), + ("文2_a-Unicode_name", "BigQuery", "_2_a_Unicode_name", "_2_a_Unicode_name"), ("100x2003", "Snowflake", "100x2003", "{{ adapter.quote('100x2003') }}"), ("100x2004", "Redshift", "100x2004", "{{ adapter.quote('100x2004') }}"), ("100x2005", "MySQL", "100x2005", "{{ adapter.quote('100x2005') }}"), @@ -164,59 +221,3 @@ def test_normalize_name(input_str: str, destination_type: str, expected: str, ex assert DestinationNameTransformer(t).normalize_schema_name(input_str) == expected assert DestinationNameTransformer(t).normalize_table_name(input_str) == expected assert DestinationNameTransformer(t).normalize_column_name(input_str) == expected_column - - -@pytest.mark.parametrize( - "input_str, destination_type, expected, expected_in_jinja", - [ - # Case sensitive names - ("Identifier Name", "Postgres", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"), - ("Identifier Name", "BigQuery", "Identifier_Name", "'Identifier_Name'"), - ("Identifier Name", "Snowflake", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"), - ("Identifier Name", "Redshift", "{{ adapter.quote('identifier name') }}", "adapter.quote('identifier name')"), - ("Identifier Name", "MySQL", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"), - ("Identifier Name", "MSSQL", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"), - # Reserved Word for BigQuery and MySQL only - ("Groups", "Postgres", "groups", "'groups'"), - ("Groups", "BigQuery", "{{ adapter.quote('Groups') }}", "adapter.quote('Groups')"), - ("Groups", "Snowflake", "GROUPS", "'GROUPS'"), - ("Groups", "Redshift", "groups", "'groups'"), - ("Groups", "MySQL", "{{ adapter.quote('Groups') }}", "adapter.quote('Groups')"), - ("Groups", "MSSQL", "groups", "'groups'"), - ], -) -def test_normalize_column_name(input_str: str, destination_type: str, expected: str, expected_in_jinja: str): - t = DestinationType.from_string(destination_type) - assert DestinationNameTransformer(t).normalize_column_name(input_str, in_jinja=False) == expected - assert DestinationNameTransformer(t).normalize_column_name(input_str, in_jinja=True) == expected_in_jinja - - -@pytest.mark.parametrize( - "input_str, expected", - [ - # below the limit - ("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh", "Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh"), - # at the limit - ("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iii", "Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iii"), - # over the limit - ("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii", "Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii"), - ("Aaaa_Bbbb_Cccc_Dddd_Eeee_a_very_long_name_Ffff_Gggg_Hhhh_Iiii", "Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii"), - ("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii_Jjjj_Kkkk", "Aaaa_Bbbb_Cccc_Dddd___g_Hhhh_Iiii_Jjjj_Kkkk"), - ("ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz_0123456789", "ABCDEFGHIJKLMNOPQRST__qrstuvwxyz_0123456789"), - ], -) -def test_truncate_identifier(input_str: str, expected: str): - """ - Rules about truncations, for example for both of these strings which are too long for the postgres 64 limit: - - `Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii` - - `Aaaa_Bbbb_Cccc_Dddd_Eeee_a_very_long_name_Ffff_Gggg_Hhhh_Iiii` - - Deciding on how to truncate (in the middle) are being verified in these tests. - In this instance, both strings ends up as:`Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii` - and can potentially cause a collision in table names. - - Note that dealing with such collisions is not part of `destination_name_transformer` but of the `stream_processor`. - """ - name_transformer = DestinationNameTransformer(DestinationType.POSTGRES) - print(f"Truncating from #{len(input_str)} to #{len(expected)}") - assert name_transformer.truncate_identifier_name(input_str) == expected diff --git a/airbyte-workers/src/main/java/io/airbyte/workers/normalization/NormalizationRunnerFactory.java b/airbyte-workers/src/main/java/io/airbyte/workers/normalization/NormalizationRunnerFactory.java index db1fa4aca23f..2d1208957e3e 100644 --- a/airbyte-workers/src/main/java/io/airbyte/workers/normalization/NormalizationRunnerFactory.java +++ b/airbyte-workers/src/main/java/io/airbyte/workers/normalization/NormalizationRunnerFactory.java @@ -14,7 +14,7 @@ public class NormalizationRunnerFactory { public static final String BASE_NORMALIZATION_IMAGE_NAME = "airbyte/normalization"; - public static final String NORMALIZATION_VERSION = "0.1.65"; + public static final String NORMALIZATION_VERSION = "0.1.66"; static final Map> NORMALIZATION_MAPPING = ImmutableMap.>builder() diff --git a/docs/understanding-airbyte/basic-normalization.md b/docs/understanding-airbyte/basic-normalization.md index bc2dc591bbd0..4fbedd3b4113 100644 --- a/docs/understanding-airbyte/basic-normalization.md +++ b/docs/understanding-airbyte/basic-normalization.md @@ -349,28 +349,29 @@ Note that Basic Normalization is packaged in a docker image `airbyte/normalizati Therefore, in order to "upgrade" to the desired normalization version, you need to use the corresponding Airbyte version that it's being released in: | Airbyte Version | Normalization Version | Date | Pull Request | Subject | -| :--- | :--- | :--- | :--- | :--- | -| 0.35.13-alpha | 0.1.65 | 2021-01-28 | [\#9846](https://github.com/airbytehq/airbyte/pull/9846) | Tweak dbt multi-thread parameter down | -| 0.35.12-alpha | 0.1.64 | 2021-01-28 | [\#9793](https://github.com/airbytehq/airbyte/pull/9793) | Support PEM format for ssh-tunnel keys | -| 0.35.4-alpha | 0.1.63 | 2021-01-07 | [\#9301](https://github.com/airbytehq/airbyte/pull/9301) | Fix Snowflake prefix tables starting with numbers | -| | 0.1.62 | 2021-01-07 | [\#9340](https://github.com/airbytehq/airbyte/pull/9340) | Use TCP-port support for clickhouse | -| | 0.1.62 | 2021-01-07 | [\#9063](https://github.com/airbytehq/airbyte/pull/9063) | Change Snowflake-specific materialization settings | -| | 0.1.62 | 2021-01-07 | [\#9317](https://github.com/airbytehq/airbyte/pull/9317) | Fix issue with quoted & case sensitive columns | -| | 0.1.62 | 2021-01-07 | [\#9281](https://github.com/airbytehq/airbyte/pull/9281) | Fix SCD partition by float columns in BigQuery | -| 0.32.11-alpha | 0.1.61 | 2021-12-02 | [\#8394](https://github.com/airbytehq/airbyte/pull/8394) | Fix incremental queries not updating empty tables | -| | 0.1.61 | 2021-12-01 | [\#8378](https://github.com/airbytehq/airbyte/pull/8378) | Fix un-nesting queries and add proper ref hints | -| 0.32.5-alpha | 0.1.60 | 2021-11-22 | [\#8088](https://github.com/airbytehq/airbyte/pull/8088) | Speed-up incremental queries for SCD table on Snowflake | -| 0.30.32-alpha | 0.1.59 | 2021-11-08 | [\#7669](https://github.com/airbytehq/airbyte/pull/7169) | Fix nested incremental dbt | -| 0.30.24-alpha | 0.1.57 | 2021-10-26 | [\#7162](https://github.com/airbytehq/airbyte/pull/7162) | Implement incremental dbt updates | -| 0.30.16-alpha | 0.1.52 | 2021-10-07 | [\#6379](https://github.com/airbytehq/airbyte/pull/6379) | Handle empty string for date and date-time format | -| | 0.1.51 | 2021-10-08 | [\#6799](https://github.com/airbytehq/airbyte/pull/6799) | Added support for ad\_cdc\_log\_pos while normalization | -| | 0.1.50 | 2021-10-07 | [\#6079](https://github.com/airbytehq/airbyte/pull/6079) | Added support for MS SQL Server normalization | -| | 0.1.49 | 2021-10-06 | [\#6709](https://github.com/airbytehq/airbyte/pull/6709) | Forward destination dataset location to dbt profiles | -| 0.29.17-alpha | 0.1.47 | 2021-09-20 | [\#6317](https://github.com/airbytehq/airbyte/pull/6317) | MySQL: updated MySQL normalization with using SSH tunnel | -| | 0.1.45 | 2021-09-18 | [\#6052](https://github.com/airbytehq/airbyte/pull/6052) | Snowflake: accept any date-time format | -| 0.29.8-alpha | 0.1.40 | 2021-08-18 | [\#5433](https://github.com/airbytehq/airbyte/pull/5433) | Allow optional credentials\_json for BigQuery | -| 0.29.5-alpha | 0.1.39 | 2021-08-11 | [\#4557](https://github.com/airbytehq/airbyte/pull/4557) | Handle date times and solve conflict name btw stream/field | -| 0.28.2-alpha | 0.1.38 | 2021-07-28 | [\#5027](https://github.com/airbytehq/airbyte/pull/5027) | Handle quotes in column names when parsing JSON blob | -| 0.27.5-alpha | 0.1.37 | 2021-07-22 | [\#3947](https://github.com/airbytehq/airbyte/pull/4881/) | Handle `NULL` cursor field values when deduping | -| 0.27.2-alpha | 0.1.36 | 2021-07-09 | [\#3947](https://github.com/airbytehq/airbyte/pull/4163/) | Enable normalization for MySQL destination | +|:----------------| :--- | :--- | :--- | :--- | +| | 0.1.66 | 2022-02-04 | [\#9341](https://github.com/airbytehq/airbyte/pull/9341) | Fix normalization for bigquery datasetId and tables | +| 0.35.13-alpha | 0.1.65 | 2021-01-28 | [\#9846](https://github.com/airbytehq/airbyte/pull/9846) | Tweak dbt multi-thread parameter down | +| 0.35.12-alpha | 0.1.64 | 2021-01-28 | [\#9793](https://github.com/airbytehq/airbyte/pull/9793) | Support PEM format for ssh-tunnel keys | +| 0.35.4-alpha | 0.1.63 | 2021-01-07 | [\#9301](https://github.com/airbytehq/airbyte/pull/9301) | Fix Snowflake prefix tables starting with numbers | +| | 0.1.62 | 2021-01-07 | [\#9340](https://github.com/airbytehq/airbyte/pull/9340) | Use TCP-port support for clickhouse | +| | 0.1.62 | 2021-01-07 | [\#9063](https://github.com/airbytehq/airbyte/pull/9063) | Change Snowflake-specific materialization settings | +| | 0.1.62 | 2021-01-07 | [\#9317](https://github.com/airbytehq/airbyte/pull/9317) | Fix issue with quoted & case sensitive columns | +| | 0.1.62 | 2021-01-07 | [\#9281](https://github.com/airbytehq/airbyte/pull/9281) | Fix SCD partition by float columns in BigQuery | +| 0.32.11-alpha | 0.1.61 | 2021-12-02 | [\#8394](https://github.com/airbytehq/airbyte/pull/8394) | Fix incremental queries not updating empty tables | +| | 0.1.61 | 2021-12-01 | [\#8378](https://github.com/airbytehq/airbyte/pull/8378) | Fix un-nesting queries and add proper ref hints | +| 0.32.5-alpha | 0.1.60 | 2021-11-22 | [\#8088](https://github.com/airbytehq/airbyte/pull/8088) | Speed-up incremental queries for SCD table on Snowflake | +| 0.30.32-alpha | 0.1.59 | 2021-11-08 | [\#7669](https://github.com/airbytehq/airbyte/pull/7169) | Fix nested incremental dbt | +| 0.30.24-alpha | 0.1.57 | 2021-10-26 | [\#7162](https://github.com/airbytehq/airbyte/pull/7162) | Implement incremental dbt updates | +| 0.30.16-alpha | 0.1.52 | 2021-10-07 | [\#6379](https://github.com/airbytehq/airbyte/pull/6379) | Handle empty string for date and date-time format | +| | 0.1.51 | 2021-10-08 | [\#6799](https://github.com/airbytehq/airbyte/pull/6799) | Added support for ad\_cdc\_log\_pos while normalization | +| | 0.1.50 | 2021-10-07 | [\#6079](https://github.com/airbytehq/airbyte/pull/6079) | Added support for MS SQL Server normalization | +| | 0.1.49 | 2021-10-06 | [\#6709](https://github.com/airbytehq/airbyte/pull/6709) | Forward destination dataset location to dbt profiles | +| 0.29.17-alpha | 0.1.47 | 2021-09-20 | [\#6317](https://github.com/airbytehq/airbyte/pull/6317) | MySQL: updated MySQL normalization with using SSH tunnel | +| | 0.1.45 | 2021-09-18 | [\#6052](https://github.com/airbytehq/airbyte/pull/6052) | Snowflake: accept any date-time format | +| 0.29.8-alpha | 0.1.40 | 2021-08-18 | [\#5433](https://github.com/airbytehq/airbyte/pull/5433) | Allow optional credentials\_json for BigQuery | +| 0.29.5-alpha | 0.1.39 | 2021-08-11 | [\#4557](https://github.com/airbytehq/airbyte/pull/4557) | Handle date times and solve conflict name btw stream/field | +| 0.28.2-alpha | 0.1.38 | 2021-07-28 | [\#5027](https://github.com/airbytehq/airbyte/pull/5027) | Handle quotes in column names when parsing JSON blob | +| 0.27.5-alpha | 0.1.37 | 2021-07-22 | [\#3947](https://github.com/airbytehq/airbyte/pull/4881/) | Handle `NULL` cursor field values when deduping | +| 0.27.2-alpha | 0.1.36 | 2021-07-09 | [\#3947](https://github.com/airbytehq/airbyte/pull/4163/) | Enable normalization for MySQL destination |