From 98cf84d61cc5e7dcdd544cbd5c40ac9f07392972 Mon Sep 17 00:00:00 2001 From: Tom Owers Date: Thu, 21 Nov 2024 18:18:07 +0000 Subject: [PATCH 01/22] WIP --- .../pipelines/pipeline_non_dlt.py | 392 ++++++++++++++++++ .../data_imports/pipelines/pipeline_sync.py | 89 +++- .../pipelines/sql_database/__init__.py | 1 + .../pipelines/sql_database_v2/__init__.py | 25 ++ .../pipelines/sql_database_v2/helpers.py | 9 +- .../workflow_activities/import_data_sync.py | 24 +- .../warehouse/models/external_data_schema.py | 18 +- requirements.in | 1 + requirements.txt | 4 + 9 files changed, 535 insertions(+), 28 deletions(-) create mode 100644 posthog/temporal/data_imports/pipelines/pipeline_non_dlt.py diff --git a/posthog/temporal/data_imports/pipelines/pipeline_non_dlt.py b/posthog/temporal/data_imports/pipelines/pipeline_non_dlt.py new file mode 100644 index 0000000000000..6ec26eb58b283 --- /dev/null +++ b/posthog/temporal/data_imports/pipelines/pipeline_non_dlt.py @@ -0,0 +1,392 @@ +import json +import time +from collections.abc import Sequence +from conditional_cache import lru_cache +from typing import Any +import pyarrow as pa +from dlt.common.libs.deltalake import ensure_delta_compatible_arrow_schema +from dlt.common.normalizers.naming.snake_case import NamingConvention +from dlt.sources import DltSource, DltResource +import deltalake as deltalake +from django.conf import settings +from django.db.models import F +from posthog.settings.base_variables import TEST +from posthog.temporal.common.logger import FilteringBoundLogger +from posthog.temporal.data_imports.pipelines.pipeline_sync import validate_schema_and_update_table_sync +from posthog.temporal.data_imports.util import prepare_s3_files_for_querying +from posthog.warehouse.models import DataWarehouseTable, ExternalDataJob, ExternalDataSchema +from posthog.hogql.database.models import ( + BooleanDatabaseField, + DatabaseField, + DateDatabaseField, + DateTimeDatabaseField, + FloatDatabaseField, + IntegerDatabaseField, + StringDatabaseField, + StringJSONDatabaseField, +) + + +class HogQLSchema: + schema: dict[str, str] + + def __init__(self): + self.schema = {} + + def add_pyarrow_table(self, table: pa.Table) -> None: + for field in table.schema: + self.add_field(field, table.column(field.name)) + + def add_field(self, field: pa.Field, column: pa.ChunkedArray) -> None: + existing_type = self.schema.get(field.name) + if existing_type is not None and existing_type != StringDatabaseField.__name__: + return + + hogql_type: type[DatabaseField] = DatabaseField + + if pa.types.is_time(field.type): + hogql_type = DateTimeDatabaseField + elif pa.types.is_timestamp(field.type): + hogql_type = DateTimeDatabaseField + elif pa.types.is_date(field.type): + hogql_type = DateDatabaseField + elif pa.types.is_decimal(field.type): + hogql_type = FloatDatabaseField + elif pa.types.is_floating(field.type): + hogql_type = FloatDatabaseField + elif pa.types.is_boolean(field.type): + hogql_type = BooleanDatabaseField + elif pa.types.is_integer(field.type): + hogql_type = IntegerDatabaseField + elif pa.types.is_binary(field.type): + raise Exception("Type 'binary' is not a supported column type") + elif pa.types.is_string(field.type): + hogql_type = StringDatabaseField + + # Checking for JSON string columns with the first non-null value in the column + for value in column: + value_str = value.as_py() + if value_str is not None: + assert isinstance(value_str, str) + if value_str.startswith("{") or value_str.startswith("["): + hogql_type = StringJSONDatabaseField + break + + 
self.schema[field.name] = hogql_type.__name__ + + def to_hogql_types(self) -> dict[str, str]: + return self.schema + + +class DeltaTableHelper: + _resource_name: str + _job: ExternalDataJob + + def __init__(self, resource_name: str, job: ExternalDataJob) -> None: + self._resource_name = resource_name + self._job = job + + def _get_credentials(self): + if TEST: + return { + "aws_access_key_id": settings.AIRBYTE_BUCKET_KEY, + "aws_secret_access_key": settings.AIRBYTE_BUCKET_SECRET, + "endpoint_url": settings.OBJECT_STORAGE_ENDPOINT, + "region_name": settings.AIRBYTE_BUCKET_REGION, + "AWS_ALLOW_HTTP": "true", + "AWS_S3_ALLOW_UNSAFE_RENAME": "true", + } + + return { + "aws_access_key_id": settings.AIRBYTE_BUCKET_KEY, + "aws_secret_access_key": settings.AIRBYTE_BUCKET_SECRET, + "region_name": settings.AIRBYTE_BUCKET_REGION, + "AWS_DEFAULT_REGION": settings.AIRBYTE_BUCKET_REGION, + "AWS_S3_ALLOW_UNSAFE_RENAME": "true", + } + + def _get_delta_table_uri(self) -> str: + normalized_resource_name = NamingConvention().normalize_identifier(self._resource_name) + return f"{settings.BUCKET_URL}/{self._job.folder_path()}/{normalized_resource_name}" + + def _evolve_delta_schema(self, schema: pa.Schema) -> deltalake.DeltaTable: + delta_table = self.get_delta_table() + if delta_table is None: + raise Exception("Deltalake table not found") + + delta_table_schema = delta_table.schema().to_pyarrow() + + new_fields = [ + deltalake.Field.from_pyarrow(field) + for field in ensure_delta_compatible_arrow_schema(schema) + if field.name not in delta_table_schema.names + ] + if new_fields: + delta_table.alter.add_columns(new_fields) + + return delta_table + + @lru_cache(maxsize=1, condition=lambda result: result is not None) + def get_delta_table(self) -> deltalake.DeltaTable | None: + delta_uri = self._get_delta_table_uri() + storage_options = self._get_credentials() + + if deltalake.DeltaTable.is_deltatable(table_uri=delta_uri, storage_options=storage_options): + return deltalake.DeltaTable(table_uri=delta_uri, storage_options=storage_options) + + return None + + def write_to_deltalake( + self, data: pa.Table, is_incremental: bool, chunk_index: int, primary_keys: Sequence[Any] | None + ) -> deltalake.DeltaTable: + delta_table = self.get_delta_table() + + if delta_table: + delta_table = self._evolve_delta_schema(data.schema) + + if is_incremental and delta_table is not None: + if not primary_keys or len(primary_keys) == 0: + raise Exception("Primary key required for incremental syncs") + + delta_table.merge( + source=data, + source_alias="source", + target_alias="target", + predicate=" AND ".join([f"source.{c} = target.{c}" for c in primary_keys]), + ).when_matched_update_all().when_not_matched_insert_all().execute() + else: + mode = "append" + schema_mode = "merge" + if chunk_index == 0 or delta_table is None: + mode = "overwrite" + schema_mode = "overwrite" + + if delta_table is None: + delta_table = deltalake.DeltaTable.create(table_uri=self._get_delta_table_uri(), schema=data.schema) + + deltalake.write_deltalake( + table_or_uri=delta_table, + data=data, + partition_by=None, + mode=mode, + schema_mode=schema_mode, + engine="rust", + ) # type: ignore + + delta_table = self.get_delta_table() + assert delta_table is not None + + return delta_table + + +class PipelineNonDLT: + _resource: DltResource + _resource_name: str + _job: ExternalDataJob + _schema: ExternalDataSchema + _logger: FilteringBoundLogger + _is_incremental: bool + _delta_table_helper: DeltaTableHelper + _internal_schema = HogQLSchema() + _load_id: int 
+ + def __init__(self, source: DltSource, logger: FilteringBoundLogger, job_id: str, is_incremental: bool) -> None: + resources = list(source.resources.items()) + assert len(resources) == 1 + resource_name, resource = resources[0] + + self._resource = resource + self._resource_name = resource_name + self._job = ExternalDataJob.objects.prefetch_related("schema").get(id=job_id) + self._is_incremental = is_incremental + self._logger = logger + self._load_id = time.time_ns() + + schema: ExternalDataSchema | None = self._job.schema + assert schema is not None + self._schema = schema + + self._delta_table_helper = DeltaTableHelper(resource_name, self._job) + self._internal_schema = HogQLSchema() + + def run(self): + buffer: list[Any] = [] + chunk_size = 5000 + row_count = 0 + chunk_index = 0 + + for item in self._resource: + py_table = None + + if isinstance(item, list): + if len(buffer) > 0: + buffer.extend(item) + if len(buffer) >= chunk_size: + py_table = pa.Table.from_pylist(buffer) + buffer = [] + else: + if len(item) >= chunk_size: + py_table = pa.Table.from_pylist(item) + else: + buffer.extend(item) + continue + elif isinstance(item, dict): + buffer.append(item) + if len(buffer) < chunk_size: + continue + + py_table = pa.Table.from_pylist(buffer) + buffer = [] + elif isinstance(item, pa.Table): + py_table = item + else: + raise Exception(f"Unhandled item type: {item.__class__.__name__}") + + assert py_table is not None + + self._process_pa_table(pa_table=py_table, index=chunk_index) + + row_count += py_table.num_rows + chunk_index += 1 + + if len(buffer) > 0: + py_table = pa.Table.from_pylist(buffer) + self._process_pa_table(pa_table=py_table, index=chunk_index) + row_count += py_table.num_rows + + self._post_run_operations(row_count=row_count) + + def _process_pa_table(self, pa_table: pa.Table, index: int): + delta_table = self._delta_table_helper.get_delta_table() + + pa_table = _append_debug_column_to_pyarrows_table(pa_table, self._load_id) + pa_table = _evolve_pyarrow_schema(pa_table, delta_table.schema() if delta_table is not None else None) + + table_primary_keys = self._get_primary_keys() + delta_table = self._delta_table_helper.write_to_deltalake( + pa_table, self._is_incremental, index, table_primary_keys + ) + + self._internal_schema.add_pyarrow_table(pa_table) + + _update_incrementality(self._schema, pa_table, self._logger) + _update_job_row_count(self._job.id, pa_table.num_rows, self._logger) + + def _post_run_operations(self, row_count: int): + delta_table = self._delta_table_helper.get_delta_table() + + assert delta_table is not None + + self._logger.info("Compacting delta table") + delta_table.optimize.compact() + delta_table.vacuum(retention_hours=24, enforce_retention_duration=False, dry_run=False) + + file_uris = delta_table.file_uris() + self._logger.info(f"Preparing S3 files - total parquet files: {len(file_uris)}") + prepare_s3_files_for_querying(self._job.folder_path(), self._resource_name, file_uris) + + self._logger.debug("Validating schema and updating table") + + validate_schema_and_update_table_sync( + run_id=str(self._job.id), + team_id=self._job.team_id, + schema_id=self._schema.id, + table_schema={}, + table_schema_dict=self._internal_schema.to_hogql_types(), + row_count=row_count, + table_format=DataWarehouseTable.TableFormat.DeltaS3Wrapper, + ) + + def _get_primary_keys(self) -> list[Any] | None: + primary_keys = self._resource._hints.get("primary_key") + + if primary_keys is None: + return None + + if isinstance(primary_keys, list): + return primary_keys + 
+ if isinstance(primary_keys, Sequence): + return list(primary_keys) + + raise Exception(f"primary_keys of type {primary_keys.__class__.__name__} are not supported") + + +def _evolve_pyarrow_schema(table: pa.Table, delta_schema: deltalake.Schema | None) -> pa.Table: + py_table_field_names = table.schema.names + + # Change pa.structs to JSON string + for column_name in table.column_names: + column = table.column(column_name) + if pa.types.is_struct(column.type) or pa.types.is_list(column.type): + json_column = pa.array([json.dumps(row.as_py()) if row.as_py() is not None else None for row in column]) + table = table.set_column(table.schema.get_field_index(column_name), column_name, json_column) + + if delta_schema: + for field in delta_schema.to_pyarrow(): + if field.name not in py_table_field_names: + if field.nullable: + new_column_data = pa.array([None] * table.num_rows, type=field.type) + else: + new_column_data = pa.array( + [_get_default_value_from_pyarrow_type(field.type)] * table.num_rows, type=field.type + ) + table = table.append_column(field, new_column_data) + + # Change types based on what deltalake tables support + return table.cast(ensure_delta_compatible_arrow_schema(table.schema)) + + +def _append_debug_column_to_pyarrows_table(table: pa.Table, load_id: int) -> pa.Table: + debug_info = f'{{"load_id": {load_id}}}' + + column = pa.array([debug_info] * table.num_rows, type=pa.string()) + return table.append_column("_ph_debug", column) + + +def _get_default_value_from_pyarrow_type(pyarrow_type: pa.DataType): + """ + Returns a default value for the given PyArrow type. + """ + if pa.types.is_integer(pyarrow_type): + return 0 + elif pa.types.is_floating(pyarrow_type): + return 0.0 + elif pa.types.is_string(pyarrow_type): + return "" + elif pa.types.is_boolean(pyarrow_type): + return False + elif pa.types.is_binary(pyarrow_type): + return b"" + elif pa.types.is_timestamp(pyarrow_type): + return pa.scalar(0, type=pyarrow_type).as_py() + elif pa.types.is_date(pyarrow_type): + return pa.scalar(0, type=pyarrow_type).as_py() + elif pa.types.is_time(pyarrow_type): + return pa.scalar(0, type=pyarrow_type).as_py() + else: + raise ValueError(f"No default value defined for type: {pyarrow_type}") + + +def _update_incrementality(schema: ExternalDataSchema | None, table: pa.Table, logger: FilteringBoundLogger) -> None: + if schema is None or schema.sync_type != ExternalDataSchema.SyncType.INCREMENTAL: + return + + incremental_field_name: str | None = schema.sync_type_config.get("incremental_field") + if incremental_field_name is None: + return + + column = table[incremental_field_name] + numpy_arr = column.combine_chunks().to_pandas().to_numpy() + + # TODO(@Gilbert09): support different operations here (e.g. 
min) + last_value = numpy_arr.max() + + logger.debug(f"Updating incremental_field_last_value with {last_value}") + + schema.update_incremental_field_last_value(last_value) + + +def _update_job_row_count(job_id: str, count: int, logger: FilteringBoundLogger) -> None: + logger.debug(f"Updating rows_synced with +{count}") + ExternalDataJob.objects.filter(id=job_id).update(rows_synced=F("rows_synced") + count) diff --git a/posthog/temporal/data_imports/pipelines/pipeline_sync.py b/posthog/temporal/data_imports/pipelines/pipeline_sync.py index bd48d9a53ec0e..581e84f2e476e 100644 --- a/posthog/temporal/data_imports/pipelines/pipeline_sync.py +++ b/posthog/temporal/data_imports/pipelines/pipeline_sync.py @@ -6,6 +6,7 @@ import dlt from django.conf import settings from django.db.models import Prefetch +import dlt.common from dlt.pipeline.exceptions import PipelineStepFailed from deltalake import DeltaTable @@ -345,6 +346,10 @@ def _run(self) -> dict[str, int]: job_id=self.inputs.run_id, schema_id=str(self.inputs.schema_id), team_id=self.inputs.team_id ) + if self._incremental: + self.logger.debug("Saving last incremental value...") + save_last_incremental_value(str(self.inputs.schema_id), str(self.inputs.team_id), self.source, self.logger) + # Cleanup: delete local state from the file system pipeline.drop() @@ -371,6 +376,28 @@ def update_last_synced_at_sync(job_id: str, schema_id: str, team_id: int) -> Non schema.save() +def save_last_incremental_value(schema_id: str, team_id: str, source: DltSource, logger: FilteringBoundLogger) -> None: + schema = ExternalDataSchema.objects.exclude(deleted=True).get(id=schema_id, team_id=team_id) + + incremental_field = schema.sync_type_config.get("incremental_field") + resource = next(iter(source.resources.values())) + + incremental: dict | None = resource.state.get("incremental") + + if incremental is None: + return + + incremental_object: dict | None = incremental.get(incremental_field) + if incremental_object is None: + return + + last_value = incremental_object.get("last_value") + + logger.debug(f"Updating incremental_field_last_value with {last_value}") + + schema.update_incremental_field_last_value(last_value) + + def validate_schema_and_update_table_sync( run_id: str, team_id: int, @@ -378,6 +405,7 @@ def validate_schema_and_update_table_sync( table_schema: TSchemaTables, row_count: int, table_format: DataWarehouseTable.TableFormat, + table_schema_dict: Optional[dict[str, str]] = None, ) -> None: """ @@ -465,27 +493,46 @@ def validate_schema_and_update_table_sync( else: raise - for schema in table_schema.values(): - if schema.get("resource") == _schema_name: - schema_columns = schema.get("columns") or {} - raw_db_columns: dict[str, dict[str, str]] = table_created.get_columns() - db_columns = {key: column.get("clickhouse", "") for key, column in raw_db_columns.items()} - - columns = {} - for column_name, db_column_type in db_columns.items(): - dlt_column = schema_columns.get(column_name) - if dlt_column is not None: - dlt_data_type = dlt_column.get("data_type") - hogql_type = dlt_to_hogql_type(dlt_data_type) - else: - hogql_type = dlt_to_hogql_type(None) - - columns[column_name] = { - "clickhouse": db_column_type, - "hogql": hogql_type, - } - table_created.columns = columns - break + # If using new non-DLT pipeline + if table_schema_dict is not None: + raw_db_columns: dict[str, dict[str, str]] = table_created.get_columns() + db_columns = {key: column.get("clickhouse", "") for key, column in raw_db_columns.items()} + + columns = {} + for column_name, 
db_column_type in db_columns.items(): + hogql_type = table_schema_dict.get(column_name) + + if hogql_type is None: + raise Exception(f"HogQL type not found for column: {column_name}") + + columns[column_name] = { + "clickhouse": db_column_type, + "hogql": hogql_type, + } + table_created.columns = columns + else: + # If using DLT pipeline + for schema in table_schema.values(): + if schema.get("resource") == _schema_name: + schema_columns = schema.get("columns") or {} + raw_db_columns: dict[str, dict[str, str]] = table_created.get_columns() + db_columns = {key: column.get("clickhouse", "") for key, column in raw_db_columns.items()} + + columns = {} + for column_name, db_column_type in db_columns.items(): + dlt_column = schema_columns.get(column_name) + if dlt_column is not None: + dlt_data_type = dlt_column.get("data_type") + hogql_type = dlt_to_hogql_type(dlt_data_type) + else: + hogql_type = dlt_to_hogql_type(None) + + columns[column_name] = { + "clickhouse": db_column_type, + "hogql": hogql_type, + } + table_created.columns = columns + break table_created.save() diff --git a/posthog/temporal/data_imports/pipelines/sql_database/__init__.py b/posthog/temporal/data_imports/pipelines/sql_database/__init__.py index 2d826b8ed71f6..3f852ac8a8607 100644 --- a/posthog/temporal/data_imports/pipelines/sql_database/__init__.py +++ b/posthog/temporal/data_imports/pipelines/sql_database/__init__.py @@ -51,6 +51,7 @@ def sql_source_for_type( team_id: Optional[int] = None, incremental_field: Optional[str] = None, incremental_field_type: Optional[IncrementalFieldType] = None, + db_incremental_field_last_value: Optional[Any] = None, ) -> DltSource: host = quote(host) user = quote(user) diff --git a/posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py b/posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py index 33c150e79998f..227b01dd6633f 100644 --- a/posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py +++ b/posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py @@ -1,6 +1,7 @@ """Source that loads tables form any SQLAlchemy supported database, supports batching requests and incremental loads.""" from datetime import datetime, date +from dateutil import parser from typing import Optional, Union, Any from collections.abc import Callable, Iterable @@ -54,6 +55,20 @@ def incremental_type_to_initial_value(field_type: IncrementalFieldType) -> Any: return date(1970, 1, 1) +def process_incremental_last_value(value: Any | None, field_type: IncrementalFieldType | None) -> Any | None: + if value is None or field_type is None: + return None + + if field_type == IncrementalFieldType.Integer or field_type == IncrementalFieldType.Numeric: + return value + + if field_type == IncrementalFieldType.DateTime or field_type == IncrementalFieldType.Timestamp: + return parser.parse(value) + + if field_type == IncrementalFieldType.Date: + return parser.parse(value).date() + + def sql_source_for_type( source_type: ExternalDataSource.Type, host: str, @@ -67,6 +82,7 @@ def sql_source_for_type( team_id: Optional[int] = None, incremental_field: Optional[str] = None, incremental_field_type: Optional[IncrementalFieldType] = None, + db_incremental_field_last_value: Optional[Any] = None, ) -> DltSource: host = quote(host) user = quote(user) @@ -105,11 +121,16 @@ def sql_source_for_type( else: raise Exception("Unsupported source_type") + processed_db_incremental_field_last_value = process_incremental_last_value( + db_incremental_field_last_value, incremental_field_type + ) + 
db_source = sql_database( credentials, schema=schema, table_names=table_names, incremental=incremental, + db_incremental_field_last_value=processed_db_incremental_field_last_value, team_id=team_id, connect_args=connect_args, ) @@ -203,6 +224,7 @@ def sql_database( include_views: bool = False, type_adapter_callback: Optional[TTypeAdapter] = None, incremental: Optional[dlt.sources.incremental] = None, + db_incremental_field_last_value: Optional[Any] = None, team_id: Optional[int] = None, connect_args: Optional[list[str]] = None, ) -> Iterable[DltResource]: @@ -275,6 +297,7 @@ def sql_database( backend_kwargs=backend_kwargs, type_adapter_callback=type_adapter_callback, incremental=incremental, + db_incremental_field_last_value=db_incremental_field_last_value, team_id=team_id, connect_args=connect_args, ) @@ -304,6 +327,7 @@ def sql_table( schema: Optional[str] = dlt.config.value, metadata: Optional[MetaData] = None, incremental: Optional[dlt.sources.incremental[Any]] = None, + db_incremental_field_last_value: Optional[Any] = None, chunk_size: int = 50000, backend: TableBackend = "sqlalchemy", detect_precision_hints: Optional[bool] = None, @@ -396,6 +420,7 @@ def query_adapter_callback(query: SelectAny, table: Table): chunk_size=chunk_size, backend=backend, incremental=incremental, + db_incremental_field_last_value=db_incremental_field_last_value, reflection_level=reflection_level, defer_table_reflect=defer_table_reflect, table_adapter_callback=table_adapter_callback, diff --git a/posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py b/posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py index 46f59929beb47..acd64c97aae99 100644 --- a/posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py +++ b/posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py @@ -46,6 +46,7 @@ def __init__( columns: TTableSchemaColumns, chunk_size: int = 1000, incremental: Optional[dlt.sources.incremental[Any]] = None, + db_incremental_field_last_value: Optional[Any] = None, query_adapter_callback: Optional[TQueryAdapter] = None, connect_args: Optional[list[str]] = None, ) -> None: @@ -64,7 +65,11 @@ def __init__( raise KeyError( f"Cursor column '{incremental.cursor_path}' does not exist in table '{table.name}'" ) from e - self.last_value = incremental.last_value + self.last_value = ( + db_incremental_field_last_value + if db_incremental_field_last_value is not None + else incremental.last_value + ) self.end_value = incremental.end_value self.row_order: TSortOrder = self.incremental.row_order else: @@ -183,6 +188,7 @@ def table_rows( chunk_size: int, backend: TableBackend, incremental: Optional[dlt.sources.incremental[Any]] = None, + db_incremental_field_last_value: Optional[Any] = None, defer_table_reflect: bool = False, table_adapter_callback: Optional[Callable[[Table], None]] = None, reflection_level: ReflectionLevel = "minimal", @@ -226,6 +232,7 @@ def table_rows( table, columns, incremental=incremental, + db_incremental_field_last_value=db_incremental_field_last_value, chunk_size=chunk_size, query_adapter_callback=query_adapter_callback, connect_args=connect_args, diff --git a/posthog/temporal/data_imports/workflow_activities/import_data_sync.py b/posthog/temporal/data_imports/workflow_activities/import_data_sync.py index 74244dcded195..20fdd467e717a 100644 --- a/posthog/temporal/data_imports/workflow_activities/import_data_sync.py +++ b/posthog/temporal/data_imports/workflow_activities/import_data_sync.py @@ -3,6 +3,7 @@ from datetime import datetime from typing 
import Any +from django.conf import settings from django.db import close_old_connections from django.db.models import Prefetch, F @@ -12,6 +13,7 @@ from posthog.temporal.common.heartbeat_sync import HeartbeaterSync from posthog.temporal.data_imports.pipelines.bigquery import delete_table +from posthog.temporal.data_imports.pipelines.pipeline_non_dlt import PipelineNonDLT from posthog.temporal.data_imports.pipelines.pipeline_sync import DataImportPipelineSync, PipelineInputs from posthog.temporal.data_imports.util import is_posthog_team from posthog.warehouse.models import ( @@ -176,6 +178,9 @@ def import_data_activity_sync(inputs: ImportDataActivityInputs): incremental_field_type=schema.sync_type_config.get("incremental_field_type") if schema.is_incremental else None, + db_incremental_field_last_value=schema.sync_type_config.get("incremental_field_last_value") + if schema.is_incremental + else None, team_id=inputs.team_id, ) @@ -202,6 +207,9 @@ def import_data_activity_sync(inputs: ImportDataActivityInputs): incremental_field_type=schema.sync_type_config.get("incremental_field_type") if schema.is_incremental else None, + db_incremental_field_last_value=schema.sync_type_config.get("incremental_field_last_value") + if schema.is_incremental + else None, team_id=inputs.team_id, ) @@ -425,12 +433,18 @@ def _run( schema: ExternalDataSchema, reset_pipeline: bool, ): - table_row_counts = DataImportPipelineSync(job_inputs, source, logger, reset_pipeline, schema.is_incremental).run() - total_rows_synced = sum(table_row_counts.values()) + if settings.DEBUG: + PipelineNonDLT(source, logger, job_inputs.run_id, schema.is_incremental).run() + else: + table_row_counts = DataImportPipelineSync( + job_inputs, source, logger, reset_pipeline, schema.is_incremental + ).run() + total_rows_synced = sum(table_row_counts.values()) + + ExternalDataJob.objects.filter(id=inputs.run_id, team_id=inputs.team_id).update( + rows_synced=F("rows_synced") + total_rows_synced + ) - ExternalDataJob.objects.filter(id=inputs.run_id, team_id=inputs.team_id).update( - rows_synced=F("rows_synced") + total_rows_synced - ) source = ExternalDataSource.objects.get(id=inputs.source_id) source.job_inputs.pop("reset_pipeline", None) source.save() diff --git a/posthog/warehouse/models/external_data_schema.py b/posthog/warehouse/models/external_data_schema.py index 3bcbc6c658f7f..beaad6ba8c408 100644 --- a/posthog/warehouse/models/external_data_schema.py +++ b/posthog/warehouse/models/external_data_schema.py @@ -1,6 +1,6 @@ from collections import defaultdict from datetime import datetime, timedelta -from typing import Optional +from typing import Any, Optional from django.db import models from django_deprecate_fields import deprecate_field import snowflake.connector @@ -48,6 +48,8 @@ class SyncFrequency(models.TextChoices): status = models.CharField(max_length=400, null=True, blank=True) last_synced_at = models.DateTimeField(null=True, blank=True) sync_type = models.CharField(max_length=128, choices=SyncType.choices, null=True, blank=True) + + # { "incremental_field": string, "incremental_field_type": string, "incremental_field_last_value": any } sync_type_config = models.JSONField( default=dict, blank=True, @@ -67,6 +69,20 @@ def folder_path(self) -> str: def is_incremental(self): return self.sync_type == self.SyncType.INCREMENTAL + def update_incremental_field_last_value(self, last_value: Any) -> None: + incremental_field_type = self.sync_type_config.get("incremental_field_type") + + if ( + incremental_field_type == 
IncrementalFieldType.Integer + or incremental_field_type == IncrementalFieldType.Numeric + ): + last_value_json = last_value + else: + last_value_json = str(last_value) + + self.sync_type_config["incremental_field_last_value"] = last_value_json + self.save() + def soft_delete(self): self.deleted = True self.deleted_at = datetime.now() diff --git a/requirements.in b/requirements.in index 3696df35d43d1..e1afbf34b108f 100644 --- a/requirements.in +++ b/requirements.in @@ -14,6 +14,7 @@ celery==5.3.4 celery-redbeat==2.1.1 clickhouse-driver==0.2.7 clickhouse-pool==0.5.3 +conditional-cache==1.2 cryptography==39.0.2 dj-database-url==0.5.0 Django~=4.2.15 diff --git a/requirements.txt b/requirements.txt index c276d7a792904..d5cac17a5ce4e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -93,6 +93,8 @@ charset-normalizer==2.1.0 # via # requests # snowflake-connector-python +circular-dict==1.9 + # via conditional-cache click==8.1.7 # via # celery @@ -113,6 +115,8 @@ clickhouse-driver==0.2.7 # sentry-sdk clickhouse-pool==0.5.3 # via -r requirements.in +conditional-cache==1.2 + # via -r requirements.in cryptography==39.0.2 # via # -r requirements.in From e4213535e838c268ec8046d654c5e3a8fc6d0489 Mon Sep 17 00:00:00 2001 From: Tom Owers Date: Fri, 22 Nov 2024 14:21:17 +0000 Subject: [PATCH 02/22] Restructure new pipeline files --- .../pipelines/pipeline/delta_table_helper.py | 111 +++++ .../pipelines/pipeline/hogql_schema.py | 63 +++ .../pipelines/pipeline/pipeline.py | 137 ++++++ .../data_imports/pipelines/pipeline/utils.py | 105 +++++ .../pipelines/pipeline_non_dlt.py | 392 ------------------ .../workflow_activities/import_data_sync.py | 2 +- 6 files changed, 417 insertions(+), 393 deletions(-) create mode 100644 posthog/temporal/data_imports/pipelines/pipeline/delta_table_helper.py create mode 100644 posthog/temporal/data_imports/pipelines/pipeline/hogql_schema.py create mode 100644 posthog/temporal/data_imports/pipelines/pipeline/pipeline.py create mode 100644 posthog/temporal/data_imports/pipelines/pipeline/utils.py delete mode 100644 posthog/temporal/data_imports/pipelines/pipeline_non_dlt.py diff --git a/posthog/temporal/data_imports/pipelines/pipeline/delta_table_helper.py b/posthog/temporal/data_imports/pipelines/pipeline/delta_table_helper.py new file mode 100644 index 0000000000000..30e3cf0e466d5 --- /dev/null +++ b/posthog/temporal/data_imports/pipelines/pipeline/delta_table_helper.py @@ -0,0 +1,111 @@ +from collections.abc import Sequence +from conditional_cache import lru_cache +from typing import Any +import pyarrow as pa +from dlt.common.libs.deltalake import ensure_delta_compatible_arrow_schema +from dlt.common.normalizers.naming.snake_case import NamingConvention +import deltalake as deltalake +from django.conf import settings +from posthog.settings.base_variables import TEST +from posthog.warehouse.models import ExternalDataJob + + +class DeltaTableHelper: + _resource_name: str + _job: ExternalDataJob + + def __init__(self, resource_name: str, job: ExternalDataJob) -> None: + self._resource_name = resource_name + self._job = job + + def _get_credentials(self): + if TEST: + return { + "aws_access_key_id": settings.AIRBYTE_BUCKET_KEY, + "aws_secret_access_key": settings.AIRBYTE_BUCKET_SECRET, + "endpoint_url": settings.OBJECT_STORAGE_ENDPOINT, + "region_name": settings.AIRBYTE_BUCKET_REGION, + "AWS_ALLOW_HTTP": "true", + "AWS_S3_ALLOW_UNSAFE_RENAME": "true", + } + + return { + "aws_access_key_id": settings.AIRBYTE_BUCKET_KEY, + "aws_secret_access_key": 
settings.AIRBYTE_BUCKET_SECRET, + "region_name": settings.AIRBYTE_BUCKET_REGION, + "AWS_DEFAULT_REGION": settings.AIRBYTE_BUCKET_REGION, + "AWS_S3_ALLOW_UNSAFE_RENAME": "true", + } + + def _get_delta_table_uri(self) -> str: + normalized_resource_name = NamingConvention().normalize_identifier(self._resource_name) + return f"{settings.BUCKET_URL}/{self._job.folder_path()}/{normalized_resource_name}" + + def _evolve_delta_schema(self, schema: pa.Schema) -> deltalake.DeltaTable: + delta_table = self.get_delta_table() + if delta_table is None: + raise Exception("Deltalake table not found") + + delta_table_schema = delta_table.schema().to_pyarrow() + + new_fields = [ + deltalake.Field.from_pyarrow(field) + for field in ensure_delta_compatible_arrow_schema(schema) + if field.name not in delta_table_schema.names + ] + if new_fields: + delta_table.alter.add_columns(new_fields) + + return delta_table + + @lru_cache(maxsize=1, condition=lambda result: result is not None) + def get_delta_table(self) -> deltalake.DeltaTable | None: + delta_uri = self._get_delta_table_uri() + storage_options = self._get_credentials() + + if deltalake.DeltaTable.is_deltatable(table_uri=delta_uri, storage_options=storage_options): + return deltalake.DeltaTable(table_uri=delta_uri, storage_options=storage_options) + + return None + + def write_to_deltalake( + self, data: pa.Table, is_incremental: bool, chunk_index: int, primary_keys: Sequence[Any] | None + ) -> deltalake.DeltaTable: + delta_table = self.get_delta_table() + + if delta_table: + delta_table = self._evolve_delta_schema(data.schema) + + if is_incremental and delta_table is not None: + if not primary_keys or len(primary_keys) == 0: + raise Exception("Primary key required for incremental syncs") + + delta_table.merge( + source=data, + source_alias="source", + target_alias="target", + predicate=" AND ".join([f"source.{c} = target.{c}" for c in primary_keys]), + ).when_matched_update_all().when_not_matched_insert_all().execute() + else: + mode = "append" + schema_mode = "merge" + if chunk_index == 0 or delta_table is None: + mode = "overwrite" + schema_mode = "overwrite" + + if delta_table is None: + delta_table = deltalake.DeltaTable.create(table_uri=self._get_delta_table_uri(), schema=data.schema) + + deltalake.write_deltalake( + table_or_uri=delta_table, + data=data, + partition_by=None, + mode=mode, + schema_mode=schema_mode, + engine="rust", + ) # type: ignore + + delta_table = self.get_delta_table() + assert delta_table is not None + + return delta_table diff --git a/posthog/temporal/data_imports/pipelines/pipeline/hogql_schema.py b/posthog/temporal/data_imports/pipelines/pipeline/hogql_schema.py new file mode 100644 index 0000000000000..383a3296f0435 --- /dev/null +++ b/posthog/temporal/data_imports/pipelines/pipeline/hogql_schema.py @@ -0,0 +1,63 @@ +import pyarrow as pa +import deltalake as deltalake +from posthog.hogql.database.models import ( + BooleanDatabaseField, + DatabaseField, + DateDatabaseField, + DateTimeDatabaseField, + FloatDatabaseField, + IntegerDatabaseField, + StringDatabaseField, + StringJSONDatabaseField, +) + + +class HogQLSchema: + schema: dict[str, str] + + def __init__(self): + self.schema = {} + + def add_pyarrow_table(self, table: pa.Table) -> None: + for field in table.schema: + self.add_field(field, table.column(field.name)) + + def add_field(self, field: pa.Field, column: pa.ChunkedArray) -> None: + existing_type = self.schema.get(field.name) + if existing_type is not None and existing_type != StringDatabaseField.__name__: + 
return + + hogql_type: type[DatabaseField] = DatabaseField + + if pa.types.is_time(field.type): + hogql_type = DateTimeDatabaseField + elif pa.types.is_timestamp(field.type): + hogql_type = DateTimeDatabaseField + elif pa.types.is_date(field.type): + hogql_type = DateDatabaseField + elif pa.types.is_decimal(field.type): + hogql_type = FloatDatabaseField + elif pa.types.is_floating(field.type): + hogql_type = FloatDatabaseField + elif pa.types.is_boolean(field.type): + hogql_type = BooleanDatabaseField + elif pa.types.is_integer(field.type): + hogql_type = IntegerDatabaseField + elif pa.types.is_binary(field.type): + raise Exception("Type 'binary' is not a supported column type") + elif pa.types.is_string(field.type): + hogql_type = StringDatabaseField + + # Checking for JSON string columns with the first non-null value in the column + for value in column: + value_str = value.as_py() + if value_str is not None: + assert isinstance(value_str, str) + if value_str.startswith("{") or value_str.startswith("["): + hogql_type = StringJSONDatabaseField + break + + self.schema[field.name] = hogql_type.__name__ + + def to_hogql_types(self) -> dict[str, str]: + return self.schema diff --git a/posthog/temporal/data_imports/pipelines/pipeline/pipeline.py b/posthog/temporal/data_imports/pipelines/pipeline/pipeline.py new file mode 100644 index 0000000000000..96f938a32e55f --- /dev/null +++ b/posthog/temporal/data_imports/pipelines/pipeline/pipeline.py @@ -0,0 +1,137 @@ +import time +from typing import Any +import pyarrow as pa +from dlt.sources import DltSource, DltResource +import deltalake as deltalake +from posthog.temporal.common.logger import FilteringBoundLogger +from posthog.temporal.data_imports.pipelines.pipeline.utils import ( + _update_incremental_state, + _get_primary_keys, + _evolve_pyarrow_schema, + _append_debug_column_to_pyarrows_table, + _update_job_row_count, +) +from posthog.temporal.data_imports.pipelines.pipeline.delta_table_helper import DeltaTableHelper +from posthog.temporal.data_imports.pipelines.pipeline.hogql_schema import HogQLSchema +from posthog.temporal.data_imports.pipelines.pipeline_sync import validate_schema_and_update_table_sync +from posthog.temporal.data_imports.util import prepare_s3_files_for_querying +from posthog.warehouse.models import DataWarehouseTable, ExternalDataJob, ExternalDataSchema + + +class PipelineNonDLT: + _resource: DltResource + _resource_name: str + _job: ExternalDataJob + _schema: ExternalDataSchema + _logger: FilteringBoundLogger + _is_incremental: bool + _delta_table_helper: DeltaTableHelper + _internal_schema = HogQLSchema() + _load_id: int + + def __init__(self, source: DltSource, logger: FilteringBoundLogger, job_id: str, is_incremental: bool) -> None: + resources = list(source.resources.items()) + assert len(resources) == 1 + resource_name, resource = resources[0] + + self._resource = resource + self._resource_name = resource_name + self._job = ExternalDataJob.objects.prefetch_related("schema").get(id=job_id) + self._is_incremental = is_incremental + self._logger = logger + self._load_id = time.time_ns() + + schema: ExternalDataSchema | None = self._job.schema + assert schema is not None + self._schema = schema + + self._delta_table_helper = DeltaTableHelper(resource_name, self._job) + self._internal_schema = HogQLSchema() + + def run(self): + buffer: list[Any] = [] + chunk_size = 5000 + row_count = 0 + chunk_index = 0 + + for item in self._resource: + py_table = None + + if isinstance(item, list): + if len(buffer) > 0: + 
buffer.extend(item) + if len(buffer) >= chunk_size: + py_table = pa.Table.from_pylist(buffer) + buffer = [] + else: + if len(item) >= chunk_size: + py_table = pa.Table.from_pylist(item) + else: + buffer.extend(item) + continue + elif isinstance(item, dict): + buffer.append(item) + if len(buffer) < chunk_size: + continue + + py_table = pa.Table.from_pylist(buffer) + buffer = [] + elif isinstance(item, pa.Table): + py_table = item + else: + raise Exception(f"Unhandled item type: {item.__class__.__name__}") + + assert py_table is not None + + self._process_pa_table(pa_table=py_table, index=chunk_index) + + row_count += py_table.num_rows + chunk_index += 1 + + if len(buffer) > 0: + py_table = pa.Table.from_pylist(buffer) + self._process_pa_table(pa_table=py_table, index=chunk_index) + row_count += py_table.num_rows + + self._post_run_operations(row_count=row_count) + + def _process_pa_table(self, pa_table: pa.Table, index: int): + delta_table = self._delta_table_helper.get_delta_table() + + pa_table = _append_debug_column_to_pyarrows_table(pa_table, self._load_id) + pa_table = _evolve_pyarrow_schema(pa_table, delta_table.schema() if delta_table is not None else None) + + table_primary_keys = _get_primary_keys(self._resource) + delta_table = self._delta_table_helper.write_to_deltalake( + pa_table, self._is_incremental, index, table_primary_keys + ) + + self._internal_schema.add_pyarrow_table(pa_table) + + _update_incremental_state(self._schema, pa_table, self._logger) + _update_job_row_count(self._job.id, pa_table.num_rows, self._logger) + + def _post_run_operations(self, row_count: int): + delta_table = self._delta_table_helper.get_delta_table() + + assert delta_table is not None + + self._logger.info("Compacting delta table") + delta_table.optimize.compact() + delta_table.vacuum(retention_hours=24, enforce_retention_duration=False, dry_run=False) + + file_uris = delta_table.file_uris() + self._logger.info(f"Preparing S3 files - total parquet files: {len(file_uris)}") + prepare_s3_files_for_querying(self._job.folder_path(), self._resource_name, file_uris) + + self._logger.debug("Validating schema and updating table") + + validate_schema_and_update_table_sync( + run_id=str(self._job.id), + team_id=self._job.team_id, + schema_id=self._schema.id, + table_schema={}, + table_schema_dict=self._internal_schema.to_hogql_types(), + row_count=row_count, + table_format=DataWarehouseTable.TableFormat.DeltaS3Wrapper, + ) diff --git a/posthog/temporal/data_imports/pipelines/pipeline/utils.py b/posthog/temporal/data_imports/pipelines/pipeline/utils.py new file mode 100644 index 0000000000000..d07a697b9b4ea --- /dev/null +++ b/posthog/temporal/data_imports/pipelines/pipeline/utils.py @@ -0,0 +1,105 @@ +import json +from collections.abc import Sequence +from typing import Any +import pyarrow as pa +from dlt.common.libs.deltalake import ensure_delta_compatible_arrow_schema +from dlt.sources import DltResource +import deltalake as deltalake +from django.db.models import F +from posthog.temporal.common.logger import FilteringBoundLogger +from posthog.warehouse.models import ExternalDataJob, ExternalDataSchema + + +def _get_primary_keys(resource: DltResource) -> list[Any] | None: + primary_keys = resource._hints.get("primary_key") + + if primary_keys is None: + return None + + if isinstance(primary_keys, list): + return primary_keys + + if isinstance(primary_keys, Sequence): + return list(primary_keys) + + raise Exception(f"primary_keys of type {primary_keys.__class__.__name__} are not supported") + + +def 
_evolve_pyarrow_schema(table: pa.Table, delta_schema: deltalake.Schema | None) -> pa.Table: + py_table_field_names = table.schema.names + + # Change pa.structs to JSON string + for column_name in table.column_names: + column = table.column(column_name) + if pa.types.is_struct(column.type) or pa.types.is_list(column.type): + json_column = pa.array([json.dumps(row.as_py()) if row.as_py() is not None else None for row in column]) + table = table.set_column(table.schema.get_field_index(column_name), column_name, json_column) + + if delta_schema: + for field in delta_schema.to_pyarrow(): + if field.name not in py_table_field_names: + if field.nullable: + new_column_data = pa.array([None] * table.num_rows, type=field.type) + else: + new_column_data = pa.array( + [_get_default_value_from_pyarrow_type(field.type)] * table.num_rows, type=field.type + ) + table = table.append_column(field, new_column_data) + + # Change types based on what deltalake tables support + return table.cast(ensure_delta_compatible_arrow_schema(table.schema)) + + +def _append_debug_column_to_pyarrows_table(table: pa.Table, load_id: int) -> pa.Table: + debug_info = f'{{"load_id": {load_id}}}' + + column = pa.array([debug_info] * table.num_rows, type=pa.string()) + return table.append_column("_ph_debug", column) + + +def _get_default_value_from_pyarrow_type(pyarrow_type: pa.DataType): + """ + Returns a default value for the given PyArrow type. + """ + if pa.types.is_integer(pyarrow_type): + return 0 + elif pa.types.is_floating(pyarrow_type): + return 0.0 + elif pa.types.is_string(pyarrow_type): + return "" + elif pa.types.is_boolean(pyarrow_type): + return False + elif pa.types.is_binary(pyarrow_type): + return b"" + elif pa.types.is_timestamp(pyarrow_type): + return pa.scalar(0, type=pyarrow_type).as_py() + elif pa.types.is_date(pyarrow_type): + return pa.scalar(0, type=pyarrow_type).as_py() + elif pa.types.is_time(pyarrow_type): + return pa.scalar(0, type=pyarrow_type).as_py() + else: + raise ValueError(f"No default value defined for type: {pyarrow_type}") + + +def _update_incremental_state(schema: ExternalDataSchema | None, table: pa.Table, logger: FilteringBoundLogger) -> None: + if schema is None or schema.sync_type != ExternalDataSchema.SyncType.INCREMENTAL: + return + + incremental_field_name: str | None = schema.sync_type_config.get("incremental_field") + if incremental_field_name is None: + return + + column = table[incremental_field_name] + numpy_arr = column.combine_chunks().to_pandas().to_numpy() + + # TODO(@Gilbert09): support different operations here (e.g. 
min) + last_value = numpy_arr.max() + + logger.debug(f"Updating incremental_field_last_value with {last_value}") + + schema.update_incremental_field_last_value(last_value) + + +def _update_job_row_count(job_id: str, count: int, logger: FilteringBoundLogger) -> None: + logger.debug(f"Updating rows_synced with +{count}") + ExternalDataJob.objects.filter(id=job_id).update(rows_synced=F("rows_synced") + count) diff --git a/posthog/temporal/data_imports/pipelines/pipeline_non_dlt.py b/posthog/temporal/data_imports/pipelines/pipeline_non_dlt.py deleted file mode 100644 index 6ec26eb58b283..0000000000000 --- a/posthog/temporal/data_imports/pipelines/pipeline_non_dlt.py +++ /dev/null @@ -1,392 +0,0 @@ -import json -import time -from collections.abc import Sequence -from conditional_cache import lru_cache -from typing import Any -import pyarrow as pa -from dlt.common.libs.deltalake import ensure_delta_compatible_arrow_schema -from dlt.common.normalizers.naming.snake_case import NamingConvention -from dlt.sources import DltSource, DltResource -import deltalake as deltalake -from django.conf import settings -from django.db.models import F -from posthog.settings.base_variables import TEST -from posthog.temporal.common.logger import FilteringBoundLogger -from posthog.temporal.data_imports.pipelines.pipeline_sync import validate_schema_and_update_table_sync -from posthog.temporal.data_imports.util import prepare_s3_files_for_querying -from posthog.warehouse.models import DataWarehouseTable, ExternalDataJob, ExternalDataSchema -from posthog.hogql.database.models import ( - BooleanDatabaseField, - DatabaseField, - DateDatabaseField, - DateTimeDatabaseField, - FloatDatabaseField, - IntegerDatabaseField, - StringDatabaseField, - StringJSONDatabaseField, -) - - -class HogQLSchema: - schema: dict[str, str] - - def __init__(self): - self.schema = {} - - def add_pyarrow_table(self, table: pa.Table) -> None: - for field in table.schema: - self.add_field(field, table.column(field.name)) - - def add_field(self, field: pa.Field, column: pa.ChunkedArray) -> None: - existing_type = self.schema.get(field.name) - if existing_type is not None and existing_type != StringDatabaseField.__name__: - return - - hogql_type: type[DatabaseField] = DatabaseField - - if pa.types.is_time(field.type): - hogql_type = DateTimeDatabaseField - elif pa.types.is_timestamp(field.type): - hogql_type = DateTimeDatabaseField - elif pa.types.is_date(field.type): - hogql_type = DateDatabaseField - elif pa.types.is_decimal(field.type): - hogql_type = FloatDatabaseField - elif pa.types.is_floating(field.type): - hogql_type = FloatDatabaseField - elif pa.types.is_boolean(field.type): - hogql_type = BooleanDatabaseField - elif pa.types.is_integer(field.type): - hogql_type = IntegerDatabaseField - elif pa.types.is_binary(field.type): - raise Exception("Type 'binary' is not a supported column type") - elif pa.types.is_string(field.type): - hogql_type = StringDatabaseField - - # Checking for JSON string columns with the first non-null value in the column - for value in column: - value_str = value.as_py() - if value_str is not None: - assert isinstance(value_str, str) - if value_str.startswith("{") or value_str.startswith("["): - hogql_type = StringJSONDatabaseField - break - - self.schema[field.name] = hogql_type.__name__ - - def to_hogql_types(self) -> dict[str, str]: - return self.schema - - -class DeltaTableHelper: - _resource_name: str - _job: ExternalDataJob - - def __init__(self, resource_name: str, job: ExternalDataJob) -> None: - 
self._resource_name = resource_name - self._job = job - - def _get_credentials(self): - if TEST: - return { - "aws_access_key_id": settings.AIRBYTE_BUCKET_KEY, - "aws_secret_access_key": settings.AIRBYTE_BUCKET_SECRET, - "endpoint_url": settings.OBJECT_STORAGE_ENDPOINT, - "region_name": settings.AIRBYTE_BUCKET_REGION, - "AWS_ALLOW_HTTP": "true", - "AWS_S3_ALLOW_UNSAFE_RENAME": "true", - } - - return { - "aws_access_key_id": settings.AIRBYTE_BUCKET_KEY, - "aws_secret_access_key": settings.AIRBYTE_BUCKET_SECRET, - "region_name": settings.AIRBYTE_BUCKET_REGION, - "AWS_DEFAULT_REGION": settings.AIRBYTE_BUCKET_REGION, - "AWS_S3_ALLOW_UNSAFE_RENAME": "true", - } - - def _get_delta_table_uri(self) -> str: - normalized_resource_name = NamingConvention().normalize_identifier(self._resource_name) - return f"{settings.BUCKET_URL}/{self._job.folder_path()}/{normalized_resource_name}" - - def _evolve_delta_schema(self, schema: pa.Schema) -> deltalake.DeltaTable: - delta_table = self.get_delta_table() - if delta_table is None: - raise Exception("Deltalake table not found") - - delta_table_schema = delta_table.schema().to_pyarrow() - - new_fields = [ - deltalake.Field.from_pyarrow(field) - for field in ensure_delta_compatible_arrow_schema(schema) - if field.name not in delta_table_schema.names - ] - if new_fields: - delta_table.alter.add_columns(new_fields) - - return delta_table - - @lru_cache(maxsize=1, condition=lambda result: result is not None) - def get_delta_table(self) -> deltalake.DeltaTable | None: - delta_uri = self._get_delta_table_uri() - storage_options = self._get_credentials() - - if deltalake.DeltaTable.is_deltatable(table_uri=delta_uri, storage_options=storage_options): - return deltalake.DeltaTable(table_uri=delta_uri, storage_options=storage_options) - - return None - - def write_to_deltalake( - self, data: pa.Table, is_incremental: bool, chunk_index: int, primary_keys: Sequence[Any] | None - ) -> deltalake.DeltaTable: - delta_table = self.get_delta_table() - - if delta_table: - delta_table = self._evolve_delta_schema(data.schema) - - if is_incremental and delta_table is not None: - if not primary_keys or len(primary_keys) == 0: - raise Exception("Primary key required for incremental syncs") - - delta_table.merge( - source=data, - source_alias="source", - target_alias="target", - predicate=" AND ".join([f"source.{c} = target.{c}" for c in primary_keys]), - ).when_matched_update_all().when_not_matched_insert_all().execute() - else: - mode = "append" - schema_mode = "merge" - if chunk_index == 0 or delta_table is None: - mode = "overwrite" - schema_mode = "overwrite" - - if delta_table is None: - delta_table = deltalake.DeltaTable.create(table_uri=self._get_delta_table_uri(), schema=data.schema) - - deltalake.write_deltalake( - table_or_uri=delta_table, - data=data, - partition_by=None, - mode=mode, - schema_mode=schema_mode, - engine="rust", - ) # type: ignore - - delta_table = self.get_delta_table() - assert delta_table is not None - - return delta_table - - -class PipelineNonDLT: - _resource: DltResource - _resource_name: str - _job: ExternalDataJob - _schema: ExternalDataSchema - _logger: FilteringBoundLogger - _is_incremental: bool - _delta_table_helper: DeltaTableHelper - _internal_schema = HogQLSchema() - _load_id: int - - def __init__(self, source: DltSource, logger: FilteringBoundLogger, job_id: str, is_incremental: bool) -> None: - resources = list(source.resources.items()) - assert len(resources) == 1 - resource_name, resource = resources[0] - - self._resource = resource - 
self._resource_name = resource_name - self._job = ExternalDataJob.objects.prefetch_related("schema").get(id=job_id) - self._is_incremental = is_incremental - self._logger = logger - self._load_id = time.time_ns() - - schema: ExternalDataSchema | None = self._job.schema - assert schema is not None - self._schema = schema - - self._delta_table_helper = DeltaTableHelper(resource_name, self._job) - self._internal_schema = HogQLSchema() - - def run(self): - buffer: list[Any] = [] - chunk_size = 5000 - row_count = 0 - chunk_index = 0 - - for item in self._resource: - py_table = None - - if isinstance(item, list): - if len(buffer) > 0: - buffer.extend(item) - if len(buffer) >= chunk_size: - py_table = pa.Table.from_pylist(buffer) - buffer = [] - else: - if len(item) >= chunk_size: - py_table = pa.Table.from_pylist(item) - else: - buffer.extend(item) - continue - elif isinstance(item, dict): - buffer.append(item) - if len(buffer) < chunk_size: - continue - - py_table = pa.Table.from_pylist(buffer) - buffer = [] - elif isinstance(item, pa.Table): - py_table = item - else: - raise Exception(f"Unhandled item type: {item.__class__.__name__}") - - assert py_table is not None - - self._process_pa_table(pa_table=py_table, index=chunk_index) - - row_count += py_table.num_rows - chunk_index += 1 - - if len(buffer) > 0: - py_table = pa.Table.from_pylist(buffer) - self._process_pa_table(pa_table=py_table, index=chunk_index) - row_count += py_table.num_rows - - self._post_run_operations(row_count=row_count) - - def _process_pa_table(self, pa_table: pa.Table, index: int): - delta_table = self._delta_table_helper.get_delta_table() - - pa_table = _append_debug_column_to_pyarrows_table(pa_table, self._load_id) - pa_table = _evolve_pyarrow_schema(pa_table, delta_table.schema() if delta_table is not None else None) - - table_primary_keys = self._get_primary_keys() - delta_table = self._delta_table_helper.write_to_deltalake( - pa_table, self._is_incremental, index, table_primary_keys - ) - - self._internal_schema.add_pyarrow_table(pa_table) - - _update_incrementality(self._schema, pa_table, self._logger) - _update_job_row_count(self._job.id, pa_table.num_rows, self._logger) - - def _post_run_operations(self, row_count: int): - delta_table = self._delta_table_helper.get_delta_table() - - assert delta_table is not None - - self._logger.info("Compacting delta table") - delta_table.optimize.compact() - delta_table.vacuum(retention_hours=24, enforce_retention_duration=False, dry_run=False) - - file_uris = delta_table.file_uris() - self._logger.info(f"Preparing S3 files - total parquet files: {len(file_uris)}") - prepare_s3_files_for_querying(self._job.folder_path(), self._resource_name, file_uris) - - self._logger.debug("Validating schema and updating table") - - validate_schema_and_update_table_sync( - run_id=str(self._job.id), - team_id=self._job.team_id, - schema_id=self._schema.id, - table_schema={}, - table_schema_dict=self._internal_schema.to_hogql_types(), - row_count=row_count, - table_format=DataWarehouseTable.TableFormat.DeltaS3Wrapper, - ) - - def _get_primary_keys(self) -> list[Any] | None: - primary_keys = self._resource._hints.get("primary_key") - - if primary_keys is None: - return None - - if isinstance(primary_keys, list): - return primary_keys - - if isinstance(primary_keys, Sequence): - return list(primary_keys) - - raise Exception(f"primary_keys of type {primary_keys.__class__.__name__} are not supported") - - -def _evolve_pyarrow_schema(table: pa.Table, delta_schema: deltalake.Schema | None) -> 
pa.Table: - py_table_field_names = table.schema.names - - # Change pa.structs to JSON string - for column_name in table.column_names: - column = table.column(column_name) - if pa.types.is_struct(column.type) or pa.types.is_list(column.type): - json_column = pa.array([json.dumps(row.as_py()) if row.as_py() is not None else None for row in column]) - table = table.set_column(table.schema.get_field_index(column_name), column_name, json_column) - - if delta_schema: - for field in delta_schema.to_pyarrow(): - if field.name not in py_table_field_names: - if field.nullable: - new_column_data = pa.array([None] * table.num_rows, type=field.type) - else: - new_column_data = pa.array( - [_get_default_value_from_pyarrow_type(field.type)] * table.num_rows, type=field.type - ) - table = table.append_column(field, new_column_data) - - # Change types based on what deltalake tables support - return table.cast(ensure_delta_compatible_arrow_schema(table.schema)) - - -def _append_debug_column_to_pyarrows_table(table: pa.Table, load_id: int) -> pa.Table: - debug_info = f'{{"load_id": {load_id}}}' - - column = pa.array([debug_info] * table.num_rows, type=pa.string()) - return table.append_column("_ph_debug", column) - - -def _get_default_value_from_pyarrow_type(pyarrow_type: pa.DataType): - """ - Returns a default value for the given PyArrow type. - """ - if pa.types.is_integer(pyarrow_type): - return 0 - elif pa.types.is_floating(pyarrow_type): - return 0.0 - elif pa.types.is_string(pyarrow_type): - return "" - elif pa.types.is_boolean(pyarrow_type): - return False - elif pa.types.is_binary(pyarrow_type): - return b"" - elif pa.types.is_timestamp(pyarrow_type): - return pa.scalar(0, type=pyarrow_type).as_py() - elif pa.types.is_date(pyarrow_type): - return pa.scalar(0, type=pyarrow_type).as_py() - elif pa.types.is_time(pyarrow_type): - return pa.scalar(0, type=pyarrow_type).as_py() - else: - raise ValueError(f"No default value defined for type: {pyarrow_type}") - - -def _update_incrementality(schema: ExternalDataSchema | None, table: pa.Table, logger: FilteringBoundLogger) -> None: - if schema is None or schema.sync_type != ExternalDataSchema.SyncType.INCREMENTAL: - return - - incremental_field_name: str | None = schema.sync_type_config.get("incremental_field") - if incremental_field_name is None: - return - - column = table[incremental_field_name] - numpy_arr = column.combine_chunks().to_pandas().to_numpy() - - # TODO(@Gilbert09): support different operations here (e.g. 
min) - last_value = numpy_arr.max() - - logger.debug(f"Updating incremental_field_last_value with {last_value}") - - schema.update_incremental_field_last_value(last_value) - - -def _update_job_row_count(job_id: str, count: int, logger: FilteringBoundLogger) -> None: - logger.debug(f"Updating rows_synced with +{count}") - ExternalDataJob.objects.filter(id=job_id).update(rows_synced=F("rows_synced") + count) diff --git a/posthog/temporal/data_imports/workflow_activities/import_data_sync.py b/posthog/temporal/data_imports/workflow_activities/import_data_sync.py index 20fdd467e717a..4c51014d46a63 100644 --- a/posthog/temporal/data_imports/workflow_activities/import_data_sync.py +++ b/posthog/temporal/data_imports/workflow_activities/import_data_sync.py @@ -13,7 +13,7 @@ from posthog.temporal.common.heartbeat_sync import HeartbeaterSync from posthog.temporal.data_imports.pipelines.bigquery import delete_table -from posthog.temporal.data_imports.pipelines.pipeline_non_dlt import PipelineNonDLT +from posthog.temporal.data_imports.pipelines.pipeline.pipeline import PipelineNonDLT from posthog.temporal.data_imports.pipelines.pipeline_sync import DataImportPipelineSync, PipelineInputs from posthog.temporal.data_imports.util import is_posthog_team from posthog.warehouse.models import ( From 4a7d7b45bddffe338bbcf32a7d60c990124765fe Mon Sep 17 00:00:00 2001 From: Tom Owers Date: Fri, 22 Nov 2024 14:46:52 +0000 Subject: [PATCH 03/22] Updated all sources to take new incremental value --- .../pipelines/chargebee/__init__.py | 10 +++- .../pipelines/rest_source/__init__.py | 20 ++++++-- .../pipelines/salesforce/__init__.py | 4 +- .../pipelines/sql_database/__init__.py | 25 ++++++++-- .../pipelines/sql_database/helpers.py | 17 ++++++- .../pipelines/sql_database_v2/__init__.py | 47 +++++++++---------- .../data_imports/pipelines/stripe/__init__.py | 10 +++- .../pipelines/vitally/__init__.py | 3 +- .../pipelines/zendesk/__init__.py | 3 +- .../workflow_activities/import_data_sync.py | 34 ++++++++++++-- 10 files changed, 126 insertions(+), 47 deletions(-) diff --git a/posthog/temporal/data_imports/pipelines/chargebee/__init__.py b/posthog/temporal/data_imports/pipelines/chargebee/__init__.py index 245afb6e5d880..7a093e65f7364 100644 --- a/posthog/temporal/data_imports/pipelines/chargebee/__init__.py +++ b/posthog/temporal/data_imports/pipelines/chargebee/__init__.py @@ -218,7 +218,13 @@ def update_request(self, request: Request) -> None: @dlt.source(max_table_nesting=0) def chargebee_source( - api_key: str, site_name: str, endpoint: str, team_id: int, job_id: str, is_incremental: bool = False + api_key: str, + site_name: str, + endpoint: str, + team_id: int, + job_id: str, + db_incremental_field_last_value: Optional[Any], + is_incremental: bool = False, ): config: RESTAPIConfig = { "client": { @@ -242,7 +248,7 @@ def chargebee_source( "resources": [get_resource(endpoint, is_incremental)], } - yield from rest_api_resources(config, team_id, job_id) + yield from rest_api_resources(config, team_id, job_id, db_incremental_field_last_value) def validate_credentials(api_key: str, site_name: str) -> bool: diff --git a/posthog/temporal/data_imports/pipelines/rest_source/__init__.py b/posthog/temporal/data_imports/pipelines/rest_source/__init__.py index 4fd019ce76753..9a8599882c652 100644 --- a/posthog/temporal/data_imports/pipelines/rest_source/__init__.py +++ b/posthog/temporal/data_imports/pipelines/rest_source/__init__.py @@ -46,6 +46,7 @@ def rest_api_source( config: RESTAPIConfig, team_id: int, job_id: str, + 
db_incremental_field_last_value: Optional[Any] = None, name: Optional[str] = None, section: Optional[str] = None, max_table_nesting: Optional[int] = None, @@ -108,10 +109,12 @@ def rest_api_source( spec, ) - return decorated(config, team_id, job_id) + return decorated(config, team_id, job_id, db_incremental_field_last_value) -def rest_api_resources(config: RESTAPIConfig, team_id: int, job_id: str) -> list[DltResource]: +def rest_api_resources( + config: RESTAPIConfig, team_id: int, job_id: str, db_incremental_field_last_value: Optional[Any] +) -> list[DltResource]: """Creates a list of resources from a REST API configuration. Args: @@ -193,6 +196,7 @@ def rest_api_resources(config: RESTAPIConfig, team_id: int, job_id: str) -> list resolved_param_map, team_id=team_id, job_id=job_id, + db_incremental_field_last_value=db_incremental_field_last_value, ) return list(resources.values()) @@ -205,6 +209,7 @@ def create_resources( resolved_param_map: dict[str, Optional[ResolvedParam]], team_id: int, job_id: str, + db_incremental_field_last_value: Optional[Any] = None, ) -> dict[str, DltResource]: resources = {} @@ -264,6 +269,7 @@ async def paginate_resource( incremental_object, incremental_param, incremental_cursor_transform, + db_incremental_field_last_value, ) yield client.paginate( @@ -317,6 +323,7 @@ async def paginate_dependent_resource( incremental_object, incremental_param, incremental_cursor_transform, + db_incremental_field_last_value, ) for item in items: @@ -358,6 +365,7 @@ def _set_incremental_params( incremental_object: Incremental[Any], incremental_param: Optional[IncrementalParam], transform: Optional[Callable[..., Any]], + db_incremental_field_last_value: Optional[Any] = None, ) -> dict[str, Any]: def identity_func(x: Any) -> Any: return x @@ -368,7 +376,13 @@ def identity_func(x: Any) -> Any: if incremental_param is None: return params - params[incremental_param.start] = transform(incremental_object.last_value) + last_value = ( + db_incremental_field_last_value + if db_incremental_field_last_value is not None + else incremental_object.last_value + ) + + params[incremental_param.start] = transform(last_value) if incremental_param.end: params[incremental_param.end] = transform(incremental_object.end_value) return params diff --git a/posthog/temporal/data_imports/pipelines/salesforce/__init__.py b/posthog/temporal/data_imports/pipelines/salesforce/__init__.py index cd206b6adcd4f..f01e17197e65f 100644 --- a/posthog/temporal/data_imports/pipelines/salesforce/__init__.py +++ b/posthog/temporal/data_imports/pipelines/salesforce/__init__.py @@ -6,7 +6,6 @@ from posthog.temporal.data_imports.pipelines.rest_source import RESTAPIConfig, rest_api_resources from posthog.temporal.data_imports.pipelines.rest_source.typing import EndpointResource from posthog.temporal.data_imports.pipelines.salesforce.auth import SalseforceAuth -import pendulum import re @@ -326,6 +325,7 @@ def salesforce_source( endpoint: str, team_id: int, job_id: str, + db_incremental_field_last_value: Optional[Any], is_incremental: bool = False, ): config: RESTAPIConfig = { @@ -340,4 +340,4 @@ def salesforce_source( "resources": [get_resource(endpoint, is_incremental)], } - yield from rest_api_resources(config, team_id, job_id) + yield from rest_api_resources(config, team_id, job_id, db_incremental_field_last_value) diff --git a/posthog/temporal/data_imports/pipelines/sql_database/__init__.py b/posthog/temporal/data_imports/pipelines/sql_database/__init__.py index 3f852ac8a8607..7593332f2d20a 100644 --- 
a/posthog/temporal/data_imports/pipelines/sql_database/__init__.py +++ b/posthog/temporal/data_imports/pipelines/sql_database/__init__.py @@ -48,10 +48,10 @@ def sql_source_for_type( sslmode: str, schema: str, table_names: list[str], + db_incremental_field_last_value: Optional[Any], team_id: Optional[int] = None, incremental_field: Optional[str] = None, incremental_field_type: Optional[IncrementalFieldType] = None, - db_incremental_field_last_value: Optional[Any] = None, ) -> DltSource: host = quote(host) user = quote(user) @@ -91,12 +91,13 @@ def sql_source_for_type( raise Exception("Unsupported source_type") db_source = sql_database( - credentials, + credentials=credentials, schema=schema, table_names=table_names, incremental=incremental, team_id=team_id, connect_args=connect_args, + db_incremental_field_last_value=db_incremental_field_last_value, ) return db_source @@ -110,6 +111,7 @@ def snowflake_source( warehouse: str, schema: str, table_names: list[str], + db_incremental_field_last_value: Optional[Any], role: Optional[str] = None, incremental_field: Optional[str] = None, incremental_field_type: Optional[IncrementalFieldType] = None, @@ -131,7 +133,13 @@ def snowflake_source( credentials = ConnectionStringCredentials( f"snowflake://{user}:{password}@{account_id}/{database}/{schema}?warehouse={warehouse}{f'&role={role}' if role else ''}" ) - db_source = sql_database(credentials, schema=schema, table_names=table_names, incremental=incremental) + db_source = sql_database( + credentials=credentials, + schema=schema, + table_names=table_names, + incremental=incremental, + db_incremental_field_last_value=db_incremental_field_last_value, + ) return db_source @@ -145,6 +153,7 @@ def bigquery_source( token_uri: str, table_name: str, bq_destination_table_id: str, + db_incremental_field_last_value: Optional[Any], incremental_field: Optional[str] = None, incremental_field_type: Optional[IncrementalFieldType] = None, ) -> DltSource: @@ -169,7 +178,13 @@ def bigquery_source( credentials_info=credentials_info, ) - return sql_database(engine, schema=None, table_names=[table_name], incremental=incremental) + return sql_database( + credentials=engine, + schema=None, + table_names=[table_name], + incremental=incremental, + db_incremental_field_last_value=db_incremental_field_last_value, + ) # Temp while DLT doesn't support `interval` columns @@ -190,6 +205,7 @@ def internal_remove(doc: dict) -> dict: @dlt.source(max_table_nesting=0) def sql_database( + db_incremental_field_last_value: Optional[Any], credentials: Union[ConnectionStringCredentials, Engine, str] = dlt.secrets.value, schema: Optional[str] = dlt.config.value, metadata: Optional[MetaData] = None, @@ -249,6 +265,7 @@ def sql_database( table=table, incremental=incremental, connect_args=connect_args, + db_incremental_field_last_value=db_incremental_field_last_value, ) ) diff --git a/posthog/temporal/data_imports/pipelines/sql_database/helpers.py b/posthog/temporal/data_imports/pipelines/sql_database/helpers.py index 50577b6b04d17..0400a60b32fd5 100644 --- a/posthog/temporal/data_imports/pipelines/sql_database/helpers.py +++ b/posthog/temporal/data_imports/pipelines/sql_database/helpers.py @@ -27,6 +27,7 @@ def __init__( chunk_size: int = 1000, incremental: Optional[dlt.sources.incremental[Any]] = None, connect_args: Optional[list[str]] = None, + db_incremental_field_last_value: Optional[Any] = None, ) -> None: self.engine = engine self.table = table @@ -43,7 +44,11 @@ def __init__( raise KeyError( f"Cursor column '{incremental.cursor_path}' 
does not exist in table '{table.name}'" ) from e - self.last_value = incremental.last_value + self.last_value = ( + db_incremental_field_last_value + if db_incremental_field_last_value is not None + else incremental.last_value + ) else: self.cursor_column = None self.last_value = None @@ -90,6 +95,7 @@ def table_rows( chunk_size: int = DEFAULT_CHUNK_SIZE, incremental: Optional[dlt.sources.incremental[Any]] = None, connect_args: Optional[list[str]] = None, + db_incremental_field_last_value: Optional[Any] = None, ) -> Iterator[TDataItem]: """ A DLT source which loads data from an SQL database using SQLAlchemy. @@ -106,7 +112,14 @@ def table_rows( """ yield dlt.mark.materialize_table_schema() # type: ignore - loader = TableLoader(engine, table, incremental=incremental, chunk_size=chunk_size, connect_args=connect_args) + loader = TableLoader( + engine, + table, + incremental=incremental, + chunk_size=chunk_size, + connect_args=connect_args, + db_incremental_field_last_value=db_incremental_field_last_value, + ) yield from loader.load_rows() engine.dispose() diff --git a/posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py b/posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py index 227b01dd6633f..0ec4abdc202c9 100644 --- a/posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py +++ b/posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py @@ -1,7 +1,6 @@ """Source that loads tables form any SQLAlchemy supported database, supports batching requests and incremental loads.""" from datetime import datetime, date -from dateutil import parser from typing import Optional, Union, Any from collections.abc import Callable, Iterable @@ -55,20 +54,6 @@ def incremental_type_to_initial_value(field_type: IncrementalFieldType) -> Any: return date(1970, 1, 1) -def process_incremental_last_value(value: Any | None, field_type: IncrementalFieldType | None) -> Any | None: - if value is None or field_type is None: - return None - - if field_type == IncrementalFieldType.Integer or field_type == IncrementalFieldType.Numeric: - return value - - if field_type == IncrementalFieldType.DateTime or field_type == IncrementalFieldType.Timestamp: - return parser.parse(value) - - if field_type == IncrementalFieldType.Date: - return parser.parse(value).date() - - def sql_source_for_type( source_type: ExternalDataSource.Type, host: str, @@ -79,10 +64,10 @@ def sql_source_for_type( sslmode: str, schema: str, table_names: list[str], + db_incremental_field_last_value: Optional[Any], team_id: Optional[int] = None, incremental_field: Optional[str] = None, incremental_field_type: Optional[IncrementalFieldType] = None, - db_incremental_field_last_value: Optional[Any] = None, ) -> DltSource: host = quote(host) user = quote(user) @@ -121,16 +106,12 @@ def sql_source_for_type( else: raise Exception("Unsupported source_type") - processed_db_incremental_field_last_value = process_incremental_last_value( - db_incremental_field_last_value, incremental_field_type - ) - db_source = sql_database( - credentials, + credentials=credentials, schema=schema, table_names=table_names, incremental=incremental, - db_incremental_field_last_value=processed_db_incremental_field_last_value, + db_incremental_field_last_value=db_incremental_field_last_value, team_id=team_id, connect_args=connect_args, ) @@ -146,6 +127,7 @@ def snowflake_source( warehouse: str, schema: str, table_names: list[str], + db_incremental_field_last_value: Optional[Any], role: Optional[str] = None, incremental_field: Optional[str] = 
None, incremental_field_type: Optional[IncrementalFieldType] = None, @@ -167,7 +149,13 @@ def snowflake_source( credentials = ConnectionStringCredentials( f"snowflake://{user}:{password}@{account_id}/{database}/{schema}?warehouse={warehouse}{f'&role={role}' if role else ''}" ) - db_source = sql_database(credentials, schema=schema, table_names=table_names, incremental=incremental) + db_source = sql_database( + credentials=credentials, + schema=schema, + table_names=table_names, + incremental=incremental, + db_incremental_field_last_value=db_incremental_field_last_value, + ) return db_source @@ -181,6 +169,7 @@ def bigquery_source( token_uri: str, table_name: str, bq_destination_table_id: str, + db_incremental_field_last_value: Optional[Any], incremental_field: Optional[str] = None, incremental_field_type: Optional[IncrementalFieldType] = None, ) -> DltSource: @@ -205,11 +194,18 @@ def bigquery_source( credentials_info=credentials_info, ) - return sql_database(engine, schema=None, table_names=[table_name], incremental=incremental) + return sql_database( + credentials=engine, + schema=None, + table_names=[table_name], + incremental=incremental, + db_incremental_field_last_value=db_incremental_field_last_value, + ) @dlt.source(max_table_nesting=0) def sql_database( + db_incremental_field_last_value: Optional[Any], credentials: Union[ConnectionStringCredentials, Engine, str] = dlt.secrets.value, schema: Optional[str] = dlt.config.value, metadata: Optional[MetaData] = None, @@ -224,7 +220,6 @@ def sql_database( include_views: bool = False, type_adapter_callback: Optional[TTypeAdapter] = None, incremental: Optional[dlt.sources.incremental] = None, - db_incremental_field_last_value: Optional[Any] = None, team_id: Optional[int] = None, connect_args: Optional[list[str]] = None, ) -> Iterable[DltResource]: @@ -322,12 +317,12 @@ def internal_remove(table: pa.Table) -> pa.Table: @dlt.resource(name=lambda args: args["table"], standalone=True, spec=SqlTableResourceConfiguration) def sql_table( + db_incremental_field_last_value: Optional[Any], credentials: Union[ConnectionStringCredentials, Engine, str] = dlt.secrets.value, table: str = dlt.config.value, schema: Optional[str] = dlt.config.value, metadata: Optional[MetaData] = None, incremental: Optional[dlt.sources.incremental[Any]] = None, - db_incremental_field_last_value: Optional[Any] = None, chunk_size: int = 50000, backend: TableBackend = "sqlalchemy", detect_precision_hints: Optional[bool] = None, diff --git a/posthog/temporal/data_imports/pipelines/stripe/__init__.py b/posthog/temporal/data_imports/pipelines/stripe/__init__.py index 5b386aa10adba..da9af92c191dc 100644 --- a/posthog/temporal/data_imports/pipelines/stripe/__init__.py +++ b/posthog/temporal/data_imports/pipelines/stripe/__init__.py @@ -325,7 +325,13 @@ def update_request(self, request: Request) -> None: @dlt.source(max_table_nesting=0) def stripe_source( - api_key: str, account_id: Optional[str], endpoint: str, team_id: int, job_id: str, is_incremental: bool = False + api_key: str, + account_id: Optional[str], + endpoint: str, + team_id: int, + job_id: str, + db_incremental_field_last_value: Optional[Any], + is_incremental: bool = False, ): config: RESTAPIConfig = { "client": { @@ -355,7 +361,7 @@ def stripe_source( "resources": [get_resource(endpoint, is_incremental)], } - yield from rest_api_resources(config, team_id, job_id) + yield from rest_api_resources(config, team_id, job_id, db_incremental_field_last_value) def validate_credentials(api_key: str) -> bool: diff --git 
a/posthog/temporal/data_imports/pipelines/vitally/__init__.py b/posthog/temporal/data_imports/pipelines/vitally/__init__.py index 223513d439d7c..86ca0bfdf7ff4 100644 --- a/posthog/temporal/data_imports/pipelines/vitally/__init__.py +++ b/posthog/temporal/data_imports/pipelines/vitally/__init__.py @@ -323,6 +323,7 @@ def vitally_source( endpoint: str, team_id: int, job_id: str, + db_incremental_field_last_value: Optional[Any], is_incremental: bool = False, ): config: RESTAPIConfig = { @@ -347,7 +348,7 @@ def vitally_source( "resources": [get_resource(endpoint, is_incremental)], } - yield from rest_api_resources(config, team_id, job_id) + yield from rest_api_resources(config, team_id, job_id, db_incremental_field_last_value) def validate_credentials(secret_token: str, region: str, subdomain: Optional[str]) -> bool: diff --git a/posthog/temporal/data_imports/pipelines/zendesk/__init__.py b/posthog/temporal/data_imports/pipelines/zendesk/__init__.py index 36d842e4d3889..55b6be994f006 100644 --- a/posthog/temporal/data_imports/pipelines/zendesk/__init__.py +++ b/posthog/temporal/data_imports/pipelines/zendesk/__init__.py @@ -289,6 +289,7 @@ def zendesk_source( endpoint: str, team_id: int, job_id: str, + db_incremental_field_last_value: Optional[Any], is_incremental: bool = False, ): config: RESTAPIConfig = { @@ -312,7 +313,7 @@ def zendesk_source( "resources": [get_resource(endpoint, is_incremental)], } - yield from rest_api_resources(config, team_id, job_id) + yield from rest_api_resources(config, team_id, job_id, db_incremental_field_last_value) def validate_credentials(subdomain: str, api_key: str, email_address: str) -> bool: diff --git a/posthog/temporal/data_imports/workflow_activities/import_data_sync.py b/posthog/temporal/data_imports/workflow_activities/import_data_sync.py index 4c51014d46a63..85e21351a50b4 100644 --- a/posthog/temporal/data_imports/workflow_activities/import_data_sync.py +++ b/posthog/temporal/data_imports/workflow_activities/import_data_sync.py @@ -1,6 +1,7 @@ import dataclasses import uuid from datetime import datetime +from dateutil import parser from typing import Any from django.conf import settings @@ -24,6 +25,7 @@ from structlog.typing import FilteringBoundLogger from posthog.warehouse.models.external_data_schema import ExternalDataSchema from posthog.warehouse.models.ssh_tunnel import SSHTunnel +from posthog.warehouse.types import IncrementalFieldType @dataclasses.dataclass @@ -34,6 +36,20 @@ class ImportDataActivityInputs: run_id: str +def process_incremental_last_value(value: Any | None, field_type: IncrementalFieldType | None) -> Any | None: + if value is None or field_type is None: + return None + + if field_type == IncrementalFieldType.Integer or field_type == IncrementalFieldType.Numeric: + return value + + if field_type == IncrementalFieldType.DateTime or field_type == IncrementalFieldType.Timestamp: + return parser.parse(value) + + if field_type == IncrementalFieldType.Date: + return parser.parse(value).date() + + @activity.defn def import_data_activity_sync(inputs: ImportDataActivityInputs): logger = bind_temporal_worker_logger_sync(team_id=inputs.team_id) @@ -66,6 +82,11 @@ def import_data_activity_sync(inputs: ImportDataActivityInputs): endpoints = [schema.name] + processed_incremental_last_value = process_incremental_last_value( + schema.sync_type_config.get("incremental_field_last_value"), + schema.sync_type_config.get("incremental_field_type"), + ) + source = None if model.pipeline.source_type == ExternalDataSource.Type.STRIPE: from 
posthog.temporal.data_imports.pipelines.stripe import stripe_source @@ -82,6 +103,7 @@ def import_data_activity_sync(inputs: ImportDataActivityInputs): team_id=inputs.team_id, job_id=inputs.run_id, is_incremental=schema.is_incremental, + db_incremental_field_last_value=processed_incremental_last_value if schema.is_incremental else None, ) return _run( @@ -178,7 +200,7 @@ def import_data_activity_sync(inputs: ImportDataActivityInputs): incremental_field_type=schema.sync_type_config.get("incremental_field_type") if schema.is_incremental else None, - db_incremental_field_last_value=schema.sync_type_config.get("incremental_field_last_value") + db_incremental_field_last_value=processed_incremental_last_value if schema.is_incremental else None, team_id=inputs.team_id, @@ -207,9 +229,7 @@ def import_data_activity_sync(inputs: ImportDataActivityInputs): incremental_field_type=schema.sync_type_config.get("incremental_field_type") if schema.is_incremental else None, - db_incremental_field_last_value=schema.sync_type_config.get("incremental_field_last_value") - if schema.is_incremental - else None, + db_incremental_field_last_value=processed_incremental_last_value if schema.is_incremental else None, team_id=inputs.team_id, ) @@ -252,6 +272,7 @@ def import_data_activity_sync(inputs: ImportDataActivityInputs): incremental_field_type=schema.sync_type_config.get("incremental_field_type") if schema.is_incremental else None, + db_incremental_field_last_value=processed_incremental_last_value if schema.is_incremental else None, ) return _run( @@ -296,6 +317,7 @@ def import_data_activity_sync(inputs: ImportDataActivityInputs): team_id=inputs.team_id, job_id=inputs.run_id, is_incremental=schema.is_incremental, + db_incremental_field_last_value=processed_incremental_last_value if schema.is_incremental else None, ) return _run( @@ -318,6 +340,7 @@ def import_data_activity_sync(inputs: ImportDataActivityInputs): team_id=inputs.team_id, job_id=inputs.run_id, is_incremental=schema.is_incremental, + db_incremental_field_last_value=processed_incremental_last_value if schema.is_incremental else None, ) return _run( @@ -339,6 +362,7 @@ def import_data_activity_sync(inputs: ImportDataActivityInputs): team_id=inputs.team_id, job_id=inputs.run_id, is_incremental=schema.is_incremental, + db_incremental_field_last_value=processed_incremental_last_value if schema.is_incremental else None, ) return _run( @@ -376,6 +400,7 @@ def import_data_activity_sync(inputs: ImportDataActivityInputs): incremental_field_type=schema.sync_type_config.get("incremental_field_type") if schema.is_incremental else None, + db_incremental_field_last_value=processed_incremental_last_value if schema.is_incremental else None, ) _run( @@ -411,6 +436,7 @@ def import_data_activity_sync(inputs: ImportDataActivityInputs): team_id=inputs.team_id, job_id=inputs.run_id, is_incremental=schema.is_incremental, + db_incremental_field_last_value=processed_incremental_last_value if schema.is_incremental else None, ) return _run( From 39f3dc6217c6d3e0353a574352c1d019fd86c2de Mon Sep 17 00:00:00 2001 From: Tom Owers Date: Mon, 25 Nov 2024 11:30:51 +0000 Subject: [PATCH 04/22] Added new temporal queue and pipeline version --- posthog/constants.py | 1 + .../0520_externaldatajob_pipeline_version.py | 22 ++++++ posthog/migrations/max_migration.txt | 2 +- posthog/settings/data_warehouse.py | 4 + posthog/tasks/test/test_usage_report.py | 74 ++++++++++++++++++- posthog/tasks/test/test_warehouse.py | 21 +++++- posthog/tasks/usage_report.py | 1 + 
.../data_imports/external_data_job.py | 37 ++++++++++ .../pipelines/pipeline/delta_table_helper.py | 3 +- .../data_imports/pipelines/pipeline/utils.py | 2 +- .../pipelines/test/test_pipeline_sync.py | 1 + .../workflow_activities/create_job_model.py | 9 +++ .../workflow_activities/import_data_sync.py | 23 ++++-- .../tests/batch_exports/test_import_data.py | 1 + .../external_data/test_external_data_job.py | 5 ++ posthog/warehouse/api/external_data_source.py | 6 +- .../api/test/test_external_data_source.py | 26 +++++++ posthog/warehouse/api/test/test_log_entry.py | 8 +- posthog/warehouse/models/external_data_job.py | 6 ++ .../warehouse/models/external_data_schema.py | 9 ++- 20 files changed, 244 insertions(+), 17 deletions(-) create mode 100644 posthog/migrations/0520_externaldatajob_pipeline_version.py diff --git a/posthog/constants.py b/posthog/constants.py index 92b842e8c612a..6b2b8fc223661 100644 --- a/posthog/constants.py +++ b/posthog/constants.py @@ -303,6 +303,7 @@ class FlagRequestType(StrEnum): ENRICHED_DASHBOARD_INSIGHT_IDENTIFIER = "Feature Viewed" DATA_WAREHOUSE_TASK_QUEUE = "data-warehouse-task-queue" +DATA_WAREHOUSE_TASK_QUEUE_V2 = "data-warehouse-task-queue-v2" BATCH_EXPORTS_TASK_QUEUE = "no-sandbox-python-django" GENERAL_PURPOSE_TASK_QUEUE = "general-purpose-task-queue" diff --git a/posthog/migrations/0520_externaldatajob_pipeline_version.py b/posthog/migrations/0520_externaldatajob_pipeline_version.py new file mode 100644 index 0000000000000..c0475ae24aece --- /dev/null +++ b/posthog/migrations/0520_externaldatajob_pipeline_version.py @@ -0,0 +1,22 @@ +# Generated by Django 4.2.15 on 2024-11-23 14:49 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("posthog", "0519_errortrackingissue_description_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="externaldatajob", + name="pipeline_version", + field=models.CharField( + blank=True, + choices=[("v1-dlt-sync", "v1-dlt-sync"), ("v2-non-dlt", "v2-non-dlt")], + max_length=400, + null=True, + ), + ), + ] diff --git a/posthog/migrations/max_migration.txt b/posthog/migrations/max_migration.txt index 6a1c8da5c16d3..9043b7e660bcf 100644 --- a/posthog/migrations/max_migration.txt +++ b/posthog/migrations/max_migration.txt @@ -1 +1 @@ -0519_errortrackingissue_description_and_more +0520_externaldatajob_pipeline_version diff --git a/posthog/settings/data_warehouse.py b/posthog/settings/data_warehouse.py index a0a78a9621e3a..747bed9ceb2ed 100644 --- a/posthog/settings/data_warehouse.py +++ b/posthog/settings/data_warehouse.py @@ -1,5 +1,7 @@ import os +from posthog.settings.utils import get_from_env, str_to_bool + AIRBYTE_API_KEY = os.getenv("AIRBYTE_API_KEY", None) AIRBYTE_BUCKET_REGION = os.getenv("AIRBYTE_BUCKET_REGION", None) AIRBYTE_BUCKET_KEY = os.getenv("AIRBYTE_BUCKET_KEY", None) @@ -9,3 +11,5 @@ BUCKET_URL = os.getenv("BUCKET_URL", None) AIRBYTE_BUCKET_NAME = os.getenv("AIRBYTE_BUCKET_NAME", None) BUCKET = "test-pipeline" + +TEMPORAL_V2: bool = get_from_env("TEMPORAL_V2", False, type_cast=str_to_bool) diff --git a/posthog/tasks/test/test_usage_report.py b/posthog/tasks/test/test_usage_report.py index 41795a3bc372b..b69727fbd9f99 100644 --- a/posthog/tasks/test/test_usage_report.py +++ b/posthog/tasks/test/test_usage_report.py @@ -1265,11 +1265,23 @@ def test_external_data_rows_synced_response( for i in range(5): start_time = (now() - relativedelta(hours=i)).strftime("%Y-%m-%dT%H:%M:%SZ") - ExternalDataJob.objects.create(team_id=3, created_at=start_time, 
rows_synced=10, pipeline=source) + ExternalDataJob.objects.create( + team_id=3, + created_at=start_time, + rows_synced=10, + pipeline=source, + pipeline_version=ExternalDataJob.PipelineVersion.V1, + ) for i in range(5): start_time = (now() - relativedelta(hours=i)).strftime("%Y-%m-%dT%H:%M:%SZ") - ExternalDataJob.objects.create(team_id=4, created_at=start_time, rows_synced=10, pipeline=source) + ExternalDataJob.objects.create( + team_id=4, + created_at=start_time, + rows_synced=10, + pipeline=source, + pipeline_version=ExternalDataJob.PipelineVersion.V1, + ) period = get_previous_day(at=now() + relativedelta(days=1)) period_start, period_end = period @@ -1294,6 +1306,64 @@ def test_external_data_rows_synced_response( assert org_2_report["organization_name"] == "Org 2" assert org_2_report["rows_synced_in_period"] == 0 + @patch("posthog.tasks.usage_report.Client") + @patch("posthog.tasks.usage_report.send_report_to_billing_service") + def test_external_data_rows_synced_response_with_v2_jobs( + self, billing_task_mock: MagicMock, posthog_capture_mock: MagicMock + ) -> None: + self._setup_teams() + + source = ExternalDataSource.objects.create( + team=self.analytics_team, + source_id="source_id", + connection_id="connection_id", + status=ExternalDataSource.Status.COMPLETED, + source_type=ExternalDataSource.Type.STRIPE, + ) + + for i in range(5): + start_time = (now() - relativedelta(hours=i)).strftime("%Y-%m-%dT%H:%M:%SZ") + ExternalDataJob.objects.create( + team_id=3, + created_at=start_time, + rows_synced=10, + pipeline=source, + pipeline_version=ExternalDataJob.PipelineVersion.V1, + ) + + for i in range(5): + start_time = (now() - relativedelta(hours=i)).strftime("%Y-%m-%dT%H:%M:%SZ") + ExternalDataJob.objects.create( + team_id=4, + created_at=start_time, + rows_synced=10, + pipeline=source, + pipeline_version=ExternalDataJob.PipelineVersion.V2, + ) + + period = get_previous_day(at=now() + relativedelta(days=1)) + period_start, period_end = period + all_reports = _get_all_org_reports(period_start, period_end) + + assert len(all_reports) == 3 + + org_1_report = _get_full_org_usage_report_as_dict( + _get_full_org_usage_report(all_reports[str(self.org_1.id)], get_instance_metadata(period)) + ) + + org_2_report = _get_full_org_usage_report_as_dict( + _get_full_org_usage_report(all_reports[str(self.org_2.id)], get_instance_metadata(period)) + ) + + assert org_1_report["organization_name"] == "Org 1" + assert org_1_report["rows_synced_in_period"] == 50 + + assert org_1_report["teams"]["3"]["rows_synced_in_period"] == 50 + assert org_1_report["teams"]["4"]["rows_synced_in_period"] == 0 # V2 pipelines + + assert org_2_report["organization_name"] == "Org 2" + assert org_2_report["rows_synced_in_period"] == 0 + @freeze_time("2022-01-10T00:01:00Z") class TestHogFunctionUsageReports(ClickhouseDestroyTablesMixin, TestCase, ClickhouseTestMixin): diff --git a/posthog/tasks/test/test_warehouse.py b/posthog/tasks/test/test_warehouse.py index c6150ef565336..ec7bce8c7832f 100644 --- a/posthog/tasks/test/test_warehouse.py +++ b/posthog/tasks/test/test_warehouse.py @@ -36,7 +36,12 @@ def test_capture_workspace_rows_synced_by_team_month_cutoff(self, mock_get_ph_cl with freeze_time("2023-11-07T16:50:49Z"): job = ExternalDataJob.objects.create( - pipeline=source, workflow_id="fake_workflow_id", team=self.team, status="Running", rows_synced=100000 + pipeline=source, + workflow_id="fake_workflow_id", + team=self.team, + status="Running", + rows_synced=100000, + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) 
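For context on what the assertions above describe: the usage-report aggregation (changed further down in this patch in posthog/tasks/usage_report.py) sums rows_synced per team while excluding v2-non-dlt jobs, so rows synced by the shadow V2 pipeline are not counted twice. A minimal sketch of that kind of query, assuming the ExternalDataJob model with the new pipeline_version field shown in this series; the helper name here is illustrative, not part of the patch:

from datetime import datetime

from django.db.models import Sum

from posthog.warehouse.models import ExternalDataJob


def rows_synced_per_team(begin: datetime, end: datetime) -> list[dict]:
    # Shadow v2-non-dlt runs duplicate the v1 sync for the same schema, so exclude
    # them to avoid counting the same rows twice when reporting usage.
    return list(
        ExternalDataJob.objects.filter(created_at__gte=begin, created_at__lte=end)
        .exclude(pipeline_version=ExternalDataJob.PipelineVersion.V2)
        .values("team_id")
        .annotate(total=Sum("rows_synced"))
    )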
capture_workspace_rows_synced_by_team(self.team.pk) @@ -86,12 +91,22 @@ def test_capture_workspace_rows_synced_by_team_month_cutoff_field_set(self, mock with freeze_time("2023-10-30T18:32:41Z"): ExternalDataJob.objects.create( - pipeline=source, workflow_id="fake_workflow_id", team=self.team, status="Completed", rows_synced=97747 + pipeline=source, + workflow_id="fake_workflow_id", + team=self.team, + status="Completed", + rows_synced=97747, + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) with freeze_time("2023-11-07T16:50:49Z"): job2 = ExternalDataJob.objects.create( - pipeline=source, workflow_id="fake_workflow_id", team=self.team, status="Completed", rows_synced=93353 + pipeline=source, + workflow_id="fake_workflow_id", + team=self.team, + status="Completed", + rows_synced=93353, + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) capture_workspace_rows_synced_by_team(self.team.pk) diff --git a/posthog/tasks/usage_report.py b/posthog/tasks/usage_report.py index f9e3982409d82..61259ac04d0c8 100644 --- a/posthog/tasks/usage_report.py +++ b/posthog/tasks/usage_report.py @@ -656,6 +656,7 @@ def get_teams_with_survey_responses_count_in_period( def get_teams_with_rows_synced_in_period(begin: datetime, end: datetime) -> list: return list( ExternalDataJob.objects.filter(created_at__gte=begin, created_at__lte=end) + .exclude(pipeline_version=ExternalDataJob.PipelineVersion.V2) .values("team_id") .annotate(total=Sum("rows_synced")) ) diff --git a/posthog/temporal/data_imports/external_data_job.py b/posthog/temporal/data_imports/external_data_job.py index 84c798961e687..35e8685fd610b 100644 --- a/posthog/temporal/data_imports/external_data_job.py +++ b/posthog/temporal/data_imports/external_data_job.py @@ -1,14 +1,19 @@ +import asyncio import dataclasses import datetime as dt import json +from django.conf import settings from django.db import close_old_connections import posthoganalytics from temporalio import activity, exceptions, workflow from temporalio.common import RetryPolicy +from posthog.constants import DATA_WAREHOUSE_TASK_QUEUE_V2 + # TODO: remove dependency from posthog.temporal.batch_exports.base import PostHogWorkflow +from posthog.temporal.common.client import sync_connect from posthog.temporal.data_imports.workflow_activities.check_billing_limits import ( CheckBillingLimitsActivityInputs, check_billing_limits_activity, @@ -124,6 +129,30 @@ def update_external_data_job_model(inputs: UpdateExternalDataJobStatusInputs) -> ) +@activity.defn +def trigger_pipeline_v2(inputs: ExternalDataWorkflowInputs): + logger = bind_temporal_worker_logger_sync(team_id=inputs.team_id) + logger.debug("Triggering V2 pipeline") + + temporal = sync_connect() + + asyncio.run( + temporal.start_workflow( + workflow="external-data-job", + arg=dataclasses.asdict(inputs), + id=f"{inputs.external_data_schema_id}-V2", + task_queue=str(DATA_WAREHOUSE_TASK_QUEUE_V2), + retry_policy=RetryPolicy( + maximum_interval=dt.timedelta(seconds=60), + maximum_attempts=1, + non_retryable_error_types=["NondeterminismError"], + ), + ) + ) + + logger.debug("V2 pipeline triggered") + + @dataclasses.dataclass class CreateSourceTemplateInputs: team_id: int @@ -147,6 +176,14 @@ def parse_inputs(inputs: list[str]) -> ExternalDataWorkflowInputs: async def run(self, inputs: ExternalDataWorkflowInputs): assert inputs.external_data_schema_id is not None + if not settings.TEMPORAL_V2: + await workflow.execute_activity( + trigger_pipeline_v2, + inputs, + start_to_close_timeout=dt.timedelta(minutes=1), + 
retry_policy=RetryPolicy(maximum_attempts=1), + ) + update_inputs = UpdateExternalDataJobStatusInputs( job_id=None, status=ExternalDataJob.Status.COMPLETED, diff --git a/posthog/temporal/data_imports/pipelines/pipeline/delta_table_helper.py b/posthog/temporal/data_imports/pipelines/pipeline/delta_table_helper.py index 30e3cf0e466d5..af49e4b2799c9 100644 --- a/posthog/temporal/data_imports/pipelines/pipeline/delta_table_helper.py +++ b/posthog/temporal/data_imports/pipelines/pipeline/delta_table_helper.py @@ -39,7 +39,8 @@ def _get_credentials(self): def _get_delta_table_uri(self) -> str: normalized_resource_name = NamingConvention().normalize_identifier(self._resource_name) - return f"{settings.BUCKET_URL}/{self._job.folder_path()}/{normalized_resource_name}" + # Appended __V2 on to the end of the url so that data of the V2 pipeline isn't the same as V1 + return f"{settings.BUCKET_URL}/{self._job.folder_path()}/{normalized_resource_name}__V2" def _evolve_delta_schema(self, schema: pa.Schema) -> deltalake.DeltaTable: delta_table = self.get_delta_table() diff --git a/posthog/temporal/data_imports/pipelines/pipeline/utils.py b/posthog/temporal/data_imports/pipelines/pipeline/utils.py index d07a697b9b4ea..fadb6ec02a868 100644 --- a/posthog/temporal/data_imports/pipelines/pipeline/utils.py +++ b/posthog/temporal/data_imports/pipelines/pipeline/utils.py @@ -95,7 +95,7 @@ def _update_incremental_state(schema: ExternalDataSchema | None, table: pa.Table # TODO(@Gilbert09): support different operations here (e.g. min) last_value = numpy_arr.max() - logger.debug(f"Updating incremental_field_last_value with {last_value}") + logger.debug(f"Updating incremental_field_last_value_v2 with {last_value}") schema.update_incremental_field_last_value(last_value) diff --git a/posthog/temporal/data_imports/pipelines/test/test_pipeline_sync.py b/posthog/temporal/data_imports/pipelines/test/test_pipeline_sync.py index 601c157acc20a..c9060df15e2d0 100644 --- a/posthog/temporal/data_imports/pipelines/test/test_pipeline_sync.py +++ b/posthog/temporal/data_imports/pipelines/test/test_pipeline_sync.py @@ -66,6 +66,7 @@ def _create_pipeline(self, schema_name: str, incremental: bool): status=ExternalDataJob.Status.RUNNING, rows_synced=0, workflow_id=str(uuid.uuid4()), + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) pipeline = DataImportPipelineSync( diff --git a/posthog/temporal/data_imports/workflow_activities/create_job_model.py b/posthog/temporal/data_imports/workflow_activities/create_job_model.py index b62f0c9cc2063..905e2b8f7316c 100644 --- a/posthog/temporal/data_imports/workflow_activities/create_job_model.py +++ b/posthog/temporal/data_imports/workflow_activities/create_job_model.py @@ -1,6 +1,7 @@ import dataclasses import uuid +from django.conf import settings from django.db import close_old_connections from temporalio import activity @@ -20,6 +21,13 @@ class CreateExternalDataJobModelActivityInputs: source_id: uuid.UUID +def get_pipeline_version() -> str: + if settings.TEMPORAL_V2: + return ExternalDataJob.PipelineVersion.V2 + + return ExternalDataJob.PipelineVersion.V1 + + @activity.defn def create_external_data_job_model_activity( inputs: CreateExternalDataJobModelActivityInputs, @@ -37,6 +45,7 @@ def create_external_data_job_model_activity( rows_synced=0, workflow_id=activity.info().workflow_id, workflow_run_id=activity.info().workflow_run_id, + pipeline_version=get_pipeline_version(), ) schema = ExternalDataSchema.objects.get(team_id=inputs.team_id, id=inputs.schema_id) diff --git 
a/posthog/temporal/data_imports/workflow_activities/import_data_sync.py b/posthog/temporal/data_imports/workflow_activities/import_data_sync.py index 85e21351a50b4..7293e1cb75fc6 100644 --- a/posthog/temporal/data_imports/workflow_activities/import_data_sync.py +++ b/posthog/temporal/data_imports/workflow_activities/import_data_sync.py @@ -82,10 +82,23 @@ def import_data_activity_sync(inputs: ImportDataActivityInputs): endpoints = [schema.name] - processed_incremental_last_value = process_incremental_last_value( - schema.sync_type_config.get("incremental_field_last_value"), - schema.sync_type_config.get("incremental_field_type"), - ) + if settings.TEMPORAL_V2: + # Get the V2 last value, if it's not set yet (e.g. the first run), then fallback to the V1 value + processed_incremental_last_value = process_incremental_last_value( + schema.sync_type_config.get("incremental_field_last_value_v2"), + schema.sync_type_config.get("incremental_field_type"), + ) + + if processed_incremental_last_value is None: + processed_incremental_last_value = process_incremental_last_value( + schema.sync_type_config.get("incremental_field_last_value"), + schema.sync_type_config.get("incremental_field_type"), + ) + else: + processed_incremental_last_value = process_incremental_last_value( + schema.sync_type_config.get("incremental_field_last_value"), + schema.sync_type_config.get("incremental_field_type"), + ) source = None if model.pipeline.source_type == ExternalDataSource.Type.STRIPE: @@ -459,7 +472,7 @@ def _run( schema: ExternalDataSchema, reset_pipeline: bool, ): - if settings.DEBUG: + if settings.TEMPORAL_V2: PipelineNonDLT(source, logger, job_inputs.run_id, schema.is_incremental).run() else: table_row_counts = DataImportPipelineSync( diff --git a/posthog/temporal/tests/batch_exports/test_import_data.py b/posthog/temporal/tests/batch_exports/test_import_data.py index 93d20fbd44b23..baaacffb6cbf5 100644 --- a/posthog/temporal/tests/batch_exports/test_import_data.py +++ b/posthog/temporal/tests/batch_exports/test_import_data.py @@ -48,6 +48,7 @@ def _setup(team: Team, job_inputs: dict[Any, Any]) -> ImportDataActivityInputs: status=ExternalDataJob.Status.RUNNING, rows_synced=0, workflow_id="some_workflow_id", + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) return ImportDataActivityInputs(team_id=team.pk, schema_id=schema.pk, source_id=source.pk, run_id=str(job.pk)) diff --git a/posthog/temporal/tests/external_data/test_external_data_job.py b/posthog/temporal/tests/external_data/test_external_data_job.py index f931c97f93943..103513662daeb 100644 --- a/posthog/temporal/tests/external_data/test_external_data_job.py +++ b/posthog/temporal/tests/external_data/test_external_data_job.py @@ -149,6 +149,7 @@ def _create_external_data_job( rows_synced=0, workflow_id=workflow_id, workflow_run_id=workflow_run_id, + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) return job @@ -391,6 +392,7 @@ def setup_job_1(): status=ExternalDataJob.Status.RUNNING, rows_synced=0, schema=customer_schema, + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) new_job = ExternalDataJob.objects.get(id=new_job.id) @@ -423,6 +425,7 @@ def setup_job_2(): status=ExternalDataJob.Status.RUNNING, rows_synced=0, schema=charge_schema, + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) new_job = ExternalDataJob.objects.get(id=new_job.id) @@ -565,6 +568,7 @@ def setup_job_1(): status=ExternalDataJob.Status.RUNNING, rows_synced=0, schema=customer_schema, + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) new_job = ( @@ -764,6 
+768,7 @@ async def setup_job_1(): status=ExternalDataJob.Status.RUNNING, rows_synced=0, schema=posthog_test_schema, + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) new_job = await sync_to_async( diff --git a/posthog/warehouse/api/external_data_source.py b/posthog/warehouse/api/external_data_source.py index 8760a48f5fd13..32e4111b35e51 100644 --- a/posthog/warehouse/api/external_data_source.py +++ b/posthog/warehouse/api/external_data_source.py @@ -1167,7 +1167,11 @@ def jobs(self, request: Request, *arg: Any, **kwargs: Any): after = request.query_params.get("after", None) before = request.query_params.get("before", None) - jobs = instance.jobs.prefetch_related("schema").order_by("-created_at") + jobs = ( + instance.jobs.exclude(pipeline_version=ExternalDataJob.PipelineVersion.V2) + .prefetch_related("schema") + .order_by("-created_at") + ) if after: after_date = parser.parse(after) diff --git a/posthog/warehouse/api/test/test_external_data_source.py b/posthog/warehouse/api/test/test_external_data_source.py index b191f10e04785..b293cbba160dc 100644 --- a/posthog/warehouse/api/test/test_external_data_source.py +++ b/posthog/warehouse/api/test/test_external_data_source.py @@ -704,6 +704,7 @@ def test_source_jobs(self): status=ExternalDataJob.Status.COMPLETED, rows_synced=100, workflow_run_id="test_run_id", + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) response = self.client.get( @@ -720,6 +721,28 @@ def test_source_jobs(self): assert data[0]["schema"]["id"] == str(schema.pk) assert data[0]["workflow_run_id"] is not None + def test_source_jobs_v2_job(self): + source = self._create_external_data_source() + schema = self._create_external_data_schema(source.pk) + ExternalDataJob.objects.create( + team=self.team, + pipeline=source, + schema=schema, + status=ExternalDataJob.Status.COMPLETED, + rows_synced=100, + workflow_run_id="test_run_id", + pipeline_version=ExternalDataJob.PipelineVersion.V2, + ) + + response = self.client.get( + f"/api/projects/{self.team.pk}/external_data_sources/{source.pk}/jobs", + ) + + data = response.json() + + assert response.status_code, status.HTTP_200_OK + assert len(data) == 0 + def test_source_jobs_pagination(self): source = self._create_external_data_source() schema = self._create_external_data_schema(source.pk) @@ -731,6 +754,7 @@ def test_source_jobs_pagination(self): status=ExternalDataJob.Status.COMPLETED, rows_synced=100, workflow_run_id="test_run_id", + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) response = self.client.get( @@ -752,6 +776,7 @@ def test_source_jobs_pagination(self): status=ExternalDataJob.Status.COMPLETED, rows_synced=100, workflow_run_id="test_run_id", + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) response = self.client.get( @@ -773,6 +798,7 @@ def test_source_jobs_pagination(self): status=ExternalDataJob.Status.COMPLETED, rows_synced=100, workflow_run_id="test_run_id", + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) response = self.client.get( diff --git a/posthog/warehouse/api/test/test_log_entry.py b/posthog/warehouse/api/test/test_log_entry.py index c7ed98c572f72..14564015c230d 100644 --- a/posthog/warehouse/api/test/test_log_entry.py +++ b/posthog/warehouse/api/test/test_log_entry.py @@ -91,7 +91,13 @@ def external_data_resources(client, organization, team): # No status but should be completed because a data warehouse table already exists ) job = ExternalDataJob.objects.create( - pipeline=source, schema=schema, workflow_id="fake_workflow_id", team=team, status="Running", 
rows_synced=100000 + pipeline=source, + schema=schema, + workflow_id="fake_workflow_id", + team=team, + status="Running", + rows_synced=100000, + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) return { diff --git a/posthog/warehouse/models/external_data_job.py b/posthog/warehouse/models/external_data_job.py index ae7b642494966..409b0277a153f 100644 --- a/posthog/warehouse/models/external_data_job.py +++ b/posthog/warehouse/models/external_data_job.py @@ -15,6 +15,10 @@ class Status(models.TextChoices): COMPLETED = "Completed", "Completed" CANCELLED = "Cancelled", "Cancelled" + class PipelineVersion(models.TextChoices): + V1 = "v1-dlt-sync", "v1-dlt-sync" + V2 = "v2-non-dlt", "v2-non-dlt" + team = models.ForeignKey(Team, on_delete=models.CASCADE) pipeline = models.ForeignKey("posthog.ExternalDataSource", related_name="jobs", on_delete=models.CASCADE) schema = models.ForeignKey("posthog.ExternalDataSchema", on_delete=models.CASCADE, null=True, blank=True) @@ -25,6 +29,8 @@ class Status(models.TextChoices): workflow_id = models.CharField(max_length=400, null=True, blank=True) workflow_run_id = models.CharField(max_length=400, null=True, blank=True) + pipeline_version = models.CharField(max_length=400, choices=PipelineVersion.choices, null=True, blank=True) + __repr__ = sane_repr("id") def folder_path(self) -> str: diff --git a/posthog/warehouse/models/external_data_schema.py b/posthog/warehouse/models/external_data_schema.py index beaad6ba8c408..93fb939378993 100644 --- a/posthog/warehouse/models/external_data_schema.py +++ b/posthog/warehouse/models/external_data_schema.py @@ -49,7 +49,7 @@ class SyncFrequency(models.TextChoices): last_synced_at = models.DateTimeField(null=True, blank=True) sync_type = models.CharField(max_length=128, choices=SyncType.choices, null=True, blank=True) - # { "incremental_field": string, "incremental_field_type": string, "incremental_field_last_value": any } + # { "incremental_field": string, "incremental_field_type": string, "incremental_field_last_value": any, "incremental_field_last_value_v2": any } sync_type_config = models.JSONField( default=dict, blank=True, @@ -80,7 +80,12 @@ def update_incremental_field_last_value(self, last_value: Any) -> None: else: last_value_json = str(last_value) - self.sync_type_config["incremental_field_last_value"] = last_value_json + if settings.TEMPORAL_V2: + key = "incremental_field_last_value_v2" + else: + key = "incremental_field_last_value" + + self.sync_type_config[key] = last_value_json self.save() def soft_delete(self): From 7e8cbdca27026f161a8bcb44163b059674ab26cc Mon Sep 17 00:00:00 2001 From: Tom Owers Date: Mon, 25 Nov 2024 11:36:43 +0000 Subject: [PATCH 05/22] Update migration to backfill --- .../migrations/0520_externaldatajob_pipeline_version.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/posthog/migrations/0520_externaldatajob_pipeline_version.py b/posthog/migrations/0520_externaldatajob_pipeline_version.py index c0475ae24aece..bd1d9347c3182 100644 --- a/posthog/migrations/0520_externaldatajob_pipeline_version.py +++ b/posthog/migrations/0520_externaldatajob_pipeline_version.py @@ -19,4 +19,11 @@ class Migration(migrations.Migration): null=True, ), ), + migrations.RunSQL( + """ + UPDATE posthog_externaldatajob + SET pipeline_version = 'v1-dlt-sync' + WHERE pipeline_version is null + """ + ), ] From 0141923d43babec1af6a39eba07d64d32cae3451 Mon Sep 17 00:00:00 2001 From: Tom Owers Date: Mon, 25 Nov 2024 12:05:47 +0000 Subject: [PATCH 06/22] Use the correct queue var name --- 
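The external_data_schema change earlier in this series writes the V2 cursor under a separate incremental_field_last_value_v2 key, so the V1 and V2 pipelines keep independent incremental state, while the read side (see the import_data_sync changes above) falls back to the V1 value until a V2 cursor exists. A minimal sketch of that read logic, assuming sync_type_config is the plain JSON dict documented on the model; the helper name is illustrative, not part of the patch:

from typing import Any, Optional


def read_incremental_last_value(sync_type_config: dict[str, Any], use_v2: bool) -> Optional[Any]:
    # V2 stores its cursor under its own key; until the first V2 run has written one,
    # fall back to the V1 value so the shadow pipeline resumes from the same point.
    if use_v2:
        v2_value = sync_type_config.get("incremental_field_last_value_v2")
        if v2_value is not None:
            return v2_value
    return sync_type_config.get("incremental_field_last_value")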
posthog/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/posthog/constants.py b/posthog/constants.py index 6b2b8fc223661..26b35a4a397d0 100644 --- a/posthog/constants.py +++ b/posthog/constants.py @@ -303,7 +303,7 @@ class FlagRequestType(StrEnum): ENRICHED_DASHBOARD_INSIGHT_IDENTIFIER = "Feature Viewed" DATA_WAREHOUSE_TASK_QUEUE = "data-warehouse-task-queue" -DATA_WAREHOUSE_TASK_QUEUE_V2 = "data-warehouse-task-queue-v2" +DATA_WAREHOUSE_TASK_QUEUE_V2 = "v2-data-warehouse-task-queue" BATCH_EXPORTS_TASK_QUEUE = "no-sandbox-python-django" GENERAL_PURPOSE_TASK_QUEUE = "general-purpose-task-queue" From 406ef32dab03967eeca08755a50c04978fc2d72f Mon Sep 17 00:00:00 2001 From: Tom Owers Date: Mon, 25 Nov 2024 12:10:54 +0000 Subject: [PATCH 07/22] Use the queue to dictate logic --- posthog/settings/data_warehouse.py | 4 ---- posthog/temporal/data_imports/external_data_job.py | 2 +- .../data_imports/workflow_activities/create_job_model.py | 3 ++- .../data_imports/workflow_activities/import_data_sync.py | 5 +++-- posthog/warehouse/models/external_data_schema.py | 3 ++- 5 files changed, 8 insertions(+), 9 deletions(-) diff --git a/posthog/settings/data_warehouse.py b/posthog/settings/data_warehouse.py index 747bed9ceb2ed..a0a78a9621e3a 100644 --- a/posthog/settings/data_warehouse.py +++ b/posthog/settings/data_warehouse.py @@ -1,7 +1,5 @@ import os -from posthog.settings.utils import get_from_env, str_to_bool - AIRBYTE_API_KEY = os.getenv("AIRBYTE_API_KEY", None) AIRBYTE_BUCKET_REGION = os.getenv("AIRBYTE_BUCKET_REGION", None) AIRBYTE_BUCKET_KEY = os.getenv("AIRBYTE_BUCKET_KEY", None) @@ -11,5 +9,3 @@ BUCKET_URL = os.getenv("BUCKET_URL", None) AIRBYTE_BUCKET_NAME = os.getenv("AIRBYTE_BUCKET_NAME", None) BUCKET = "test-pipeline" - -TEMPORAL_V2: bool = get_from_env("TEMPORAL_V2", False, type_cast=str_to_bool) diff --git a/posthog/temporal/data_imports/external_data_job.py b/posthog/temporal/data_imports/external_data_job.py index 35e8685fd610b..01e48e4a039f8 100644 --- a/posthog/temporal/data_imports/external_data_job.py +++ b/posthog/temporal/data_imports/external_data_job.py @@ -176,7 +176,7 @@ def parse_inputs(inputs: list[str]) -> ExternalDataWorkflowInputs: async def run(self, inputs: ExternalDataWorkflowInputs): assert inputs.external_data_schema_id is not None - if not settings.TEMPORAL_V2: + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE_V2: await workflow.execute_activity( trigger_pipeline_v2, inputs, diff --git a/posthog/temporal/data_imports/workflow_activities/create_job_model.py b/posthog/temporal/data_imports/workflow_activities/create_job_model.py index 905e2b8f7316c..b404c610c1cad 100644 --- a/posthog/temporal/data_imports/workflow_activities/create_job_model.py +++ b/posthog/temporal/data_imports/workflow_activities/create_job_model.py @@ -7,6 +7,7 @@ # TODO: remove dependency +from posthog.constants import DATA_WAREHOUSE_TASK_QUEUE_V2 from posthog.warehouse.models import ExternalDataJob, ExternalDataSource from posthog.warehouse.models.external_data_schema import ( ExternalDataSchema, @@ -22,7 +23,7 @@ class CreateExternalDataJobModelActivityInputs: def get_pipeline_version() -> str: - if settings.TEMPORAL_V2: + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE_V2: return ExternalDataJob.PipelineVersion.V2 return ExternalDataJob.PipelineVersion.V1 diff --git a/posthog/temporal/data_imports/workflow_activities/import_data_sync.py b/posthog/temporal/data_imports/workflow_activities/import_data_sync.py index 
7293e1cb75fc6..6983b07d2d2f0 100644 --- a/posthog/temporal/data_imports/workflow_activities/import_data_sync.py +++ b/posthog/temporal/data_imports/workflow_activities/import_data_sync.py @@ -10,6 +10,7 @@ from temporalio import activity +from posthog.constants import DATA_WAREHOUSE_TASK_QUEUE_V2 from posthog.models.integration import Integration from posthog.temporal.common.heartbeat_sync import HeartbeaterSync from posthog.temporal.data_imports.pipelines.bigquery import delete_table @@ -82,7 +83,7 @@ def import_data_activity_sync(inputs: ImportDataActivityInputs): endpoints = [schema.name] - if settings.TEMPORAL_V2: + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE_V2: # Get the V2 last value, if it's not set yet (e.g. the first run), then fallback to the V1 value processed_incremental_last_value = process_incremental_last_value( schema.sync_type_config.get("incremental_field_last_value_v2"), @@ -472,7 +473,7 @@ def _run( schema: ExternalDataSchema, reset_pipeline: bool, ): - if settings.TEMPORAL_V2: + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE_V2: PipelineNonDLT(source, logger, job_inputs.run_id, schema.is_incremental).run() else: table_row_counts = DataImportPipelineSync( diff --git a/posthog/warehouse/models/external_data_schema.py b/posthog/warehouse/models/external_data_schema.py index 93fb939378993..1e5f27e42c636 100644 --- a/posthog/warehouse/models/external_data_schema.py +++ b/posthog/warehouse/models/external_data_schema.py @@ -5,6 +5,7 @@ from django_deprecate_fields import deprecate_field import snowflake.connector from django.conf import settings +from posthog.constants import DATA_WAREHOUSE_TASK_QUEUE_V2 from posthog.models.team import Team from posthog.models.utils import CreatedMetaFields, DeletedMetaFields, UUIDModel, UpdatedMetaFields, sane_repr import uuid @@ -80,7 +81,7 @@ def update_incremental_field_last_value(self, last_value: Any) -> None: else: last_value_json = str(last_value) - if settings.TEMPORAL_V2: + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE_V2: key = "incremental_field_last_value_v2" else: key = "incremental_field_last_value" From 30205ace37ee638c79c6fff8e1402b7683226a40 Mon Sep 17 00:00:00 2001 From: Tom Owers Date: Tue, 26 Nov 2024 09:48:23 +0000 Subject: [PATCH 08/22] mypy --- mypy-baseline.txt | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mypy-baseline.txt b/mypy-baseline.txt index 3846c41a764ad..4340acbe76ceb 100644 --- a/mypy-baseline.txt +++ b/mypy-baseline.txt @@ -469,7 +469,7 @@ posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0, _T1, _T2, _T3, _T4, _T5, _T6] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], TypedColumnsClauseRole[_T1] | SQLCoreOperations[_T1] | type[_T1], TypedColumnsClauseRole[_T2] | SQLCoreOperations[_T2] | type[_T2], TypedColumnsClauseRole[_T3] | SQLCoreOperations[_T3] | type[_T3], TypedColumnsClauseRole[_T4] | SQLCoreOperations[_T4] | type[_T4], TypedColumnsClauseRole[_T5] | SQLCoreOperations[_T5] | type[_T5], TypedColumnsClauseRole[_T6] | SQLCoreOperations[_T6] | type[_T6], /) -> Select[tuple[_T0, _T1, _T2, _T3, _T4, _T5, _T6]] posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0, _T1, _T2, _T3, _T4, _T5, _T6, _T7] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], TypedColumnsClauseRole[_T1] | SQLCoreOperations[_T1] | 
type[_T1], TypedColumnsClauseRole[_T2] | SQLCoreOperations[_T2] | type[_T2], TypedColumnsClauseRole[_T3] | SQLCoreOperations[_T3] | type[_T3], TypedColumnsClauseRole[_T4] | SQLCoreOperations[_T4] | type[_T4], TypedColumnsClauseRole[_T5] | SQLCoreOperations[_T5] | type[_T5], TypedColumnsClauseRole[_T6] | SQLCoreOperations[_T6] | type[_T6], TypedColumnsClauseRole[_T7] | SQLCoreOperations[_T7] | type[_T7], /) -> Select[tuple[_T0, _T1, _T2, _T3, _T4, _T5, _T6, _T7]] posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def with_only_columns(self, *entities: TypedColumnsClauseRole[Any] | ColumnsClauseRole | SQLCoreOperations[Any] | Literal['*', 1] | type[Any] | Inspectable[_HasClauseElement[Any]] | _HasClauseElement[Any], maintain_column_froms: bool = ..., **Any) -> Select[Any] -posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: error: No overload variant of "resource" matches argument types "Callable[[Engine, Table, int, Literal['sqlalchemy', 'pyarrow', 'pandas', 'connectorx'], Incremental[Any] | None, bool, Callable[[Table], None] | None, Literal['minimal', 'full', 'full_with_precision'], dict[str, Any] | None, Callable[[TypeEngine[Any]], TypeEngine[Any] | type[TypeEngine[Any]] | None] | None, list[str] | None, Callable[[Select[Any], Table], Select[Any]] | None, list[str] | None], Iterator[Any]]", "str", "list[str] | None", "list[str] | None", "dict[str, TColumnSchema]", "Collection[str]", "str" [call-overload] +posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: error: No overload variant of "resource" matches argument types "Callable[[Engine, Table, int, Literal['sqlalchemy', 'pyarrow', 'pandas', 'connectorx'], Incremental[Any] | None, Any | None, bool, Callable[[Table], None] | None, Literal['minimal', 'full', 'full_with_precision'], dict[str, Any] | None, Callable[[TypeEngine[Any]], TypeEngine[Any] | type[TypeEngine[Any]] | None] | None, list[str] | None, Callable[[Select[Any], Table], Select[Any]] | None, list[str] | None], Iterator[Any]]", "str", "list[str] | None", "list[str] | None", "dict[str, TColumnSchema]", "Collection[str]", "str" [call-overload] posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: Possible overload variants: posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [TResourceFunParams`-1, TDltResourceImpl: DltResource] resource(Callable[TResourceFunParams, Any], /, name: str = ..., table_name: str | Callable[[Any], str] = ..., max_table_nesting: int = ..., write_disposition: Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict | Callable[[Any], Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict] = ..., columns: dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel] | Callable[[Any], dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel]] = ..., primary_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., merge_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., schema_contract: Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict | Callable[[Any], Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict] = ..., table_format: Literal['iceberg', 'delta', 'hive'] | Callable[[Any], Literal['iceberg', 'delta', 'hive']] = ..., file_format: Literal['preferred', 'jsonl', 'typed-jsonl', 
'insert_values', 'parquet', 'csv', 'reference'] | Callable[[Any], Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference']] = ..., references: Sequence[TTableReference] | Callable[[Any], Sequence[TTableReference]] = ..., selected: bool = ..., spec: type[BaseConfiguration] = ..., parallelized: bool = ..., _impl_cls: type[TDltResourceImpl] = ...) -> TDltResourceImpl posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [TDltResourceImpl: DltResource] resource(None = ..., /, name: str = ..., table_name: str | Callable[[Any], str] = ..., max_table_nesting: int = ..., write_disposition: Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict | Callable[[Any], Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict] = ..., columns: dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel] | Callable[[Any], dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel]] = ..., primary_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., merge_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., schema_contract: Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict | Callable[[Any], Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict] = ..., table_format: Literal['iceberg', 'delta', 'hive'] | Callable[[Any], Literal['iceberg', 'delta', 'hive']] = ..., file_format: Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference'] | Callable[[Any], Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference']] = ..., references: Sequence[TTableReference] | Callable[[Any], Sequence[TTableReference]] = ..., selected: bool = ..., spec: type[BaseConfiguration] = ..., parallelized: bool = ..., _impl_cls: type[TDltResourceImpl] = ...) 
-> Callable[[Callable[TResourceFunParams, Any]], TDltResourceImpl] @@ -806,6 +806,7 @@ posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: "type[Filesys posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: Incompatible types in assignment (expression has type "object", variable has type "DataWarehouseCredential | Combinable | None") [assignment] posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: Incompatible types in assignment (expression has type "object", variable has type "str | int | Combinable") [assignment] posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: Incompatible types in assignment (expression has type "dict[str, dict[str, str | bool]] | dict[str, str]", variable has type "dict[str, dict[str, str]]") [assignment] +posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: Name "raw_db_columns" already defined on line 0 [no-redef] posthog/queries/app_metrics/test/test_app_metrics.py:0: error: Argument 3 to "AppMetricsErrorDetailsQuery" has incompatible type "AppMetricsRequestSerializer"; expected "AppMetricsErrorsRequestSerializer" [arg-type] posthog/queries/app_metrics/test/test_app_metrics.py:0: error: Argument 3 to "AppMetricsErrorDetailsQuery" has incompatible type "AppMetricsRequestSerializer"; expected "AppMetricsErrorsRequestSerializer" [arg-type] posthog/queries/app_metrics/test/test_app_metrics.py:0: error: Argument 3 to "AppMetricsErrorDetailsQuery" has incompatible type "AppMetricsRequestSerializer"; expected "AppMetricsErrorsRequestSerializer" [arg-type] @@ -833,6 +834,7 @@ posthog/temporal/tests/batch_exports/test_snowflake_batch_export_workflow.py:0: posthog/temporal/tests/batch_exports/test_snowflake_batch_export_workflow.py:0: error: List item 0 has incompatible type "tuple[str, str, int, int, int, int, str, int]"; expected "tuple[str, str, int, int, str, str, str, str]" [list-item] posthog/temporal/tests/batch_exports/test_s3_batch_export_workflow.py:0: error: "tuple[Any, ...]" has no attribute "last_uploaded_part_timestamp" [attr-defined] posthog/temporal/tests/batch_exports/test_s3_batch_export_workflow.py:0: error: "tuple[Any, ...]" has no attribute "upload_state" [attr-defined] +posthog/temporal/data_imports/pipelines/test/test_pipeline_sync.py:0: error: Missing positional argument "db_incremental_field_last_value" in call to "__call__" of "SourceFactory" [call-arg] posthog/migrations/0237_remove_timezone_from_teams.py:0: error: Argument 2 to "RunPython" has incompatible type "Callable[[Migration, Any], None]"; expected "_CodeCallable | None" [arg-type] posthog/migrations/0228_fix_tile_layouts.py:0: error: Argument 2 to "RunPython" has incompatible type "Callable[[Migration, Any], None]"; expected "_CodeCallable | None" [arg-type] posthog/api/plugin_log_entry.py:0: error: Name "timezone.datetime" is not defined [name-defined] @@ -840,22 +842,22 @@ posthog/api/plugin_log_entry.py:0: error: Module "django.utils.timezone" does no posthog/api/plugin_log_entry.py:0: error: Name "timezone.datetime" is not defined [name-defined] posthog/api/plugin_log_entry.py:0: error: Module "django.utils.timezone" does not explicitly export attribute "datetime" [attr-defined] posthog/temporal/tests/batch_exports/test_redshift_batch_export_workflow.py:0: error: Incompatible types in assignment (expression has type "str | int", variable has type "int") [assignment] -posthog/temporal/data_imports/external_data_job.py:0: error: Argument "status" to "update_external_job_status" has incompatible type "str"; 
expected "Status" [arg-type] posthog/api/sharing.py:0: error: Item "None" of "list[Any] | None" has no attribute "__iter__" (not iterable) [union-attr] +posthog/temporal/data_imports/external_data_job.py:0: error: Argument "status" to "update_external_job_status" has incompatible type "str"; expected "Status" [arg-type] posthog/api/test/batch_exports/conftest.py:0: error: Signature of "run" incompatible with supertype "Worker" [override] posthog/api/test/batch_exports/conftest.py:0: note: Superclass: posthog/api/test/batch_exports/conftest.py:0: note: def run(self) -> Coroutine[Any, Any, None] posthog/api/test/batch_exports/conftest.py:0: note: Subclass: posthog/api/test/batch_exports/conftest.py:0: note: def run(self, loop: Any) -> Any posthog/api/test/batch_exports/conftest.py:0: error: Argument "activities" to "ThreadedWorker" has incompatible type "list[function]"; expected "Sequence[Callable[..., Any]]" [arg-type] +posthog/api/test/test_team.py:0: error: "HttpResponse" has no attribute "json" [attr-defined] +posthog/api/test/test_team.py:0: error: "HttpResponse" has no attribute "json" [attr-defined] +posthog/test/test_middleware.py:0: error: Incompatible types in assignment (expression has type "_MonkeyPatchedWSGIResponse", variable has type "_MonkeyPatchedResponse") [assignment] posthog/temporal/tests/external_data/test_external_data_job.py:0: error: Invalid index type "str" for "dict[Type, Sequence[str]]"; expected type "Type" [index] posthog/temporal/tests/external_data/test_external_data_job.py:0: error: Invalid index type "str" for "dict[Type, Sequence[str]]"; expected type "Type" [index] posthog/temporal/tests/external_data/test_external_data_job.py:0: error: Invalid index type "str" for "dict[Type, Sequence[str]]"; expected type "Type" [index] posthog/temporal/tests/external_data/test_external_data_job.py:0: error: Invalid index type "str" for "dict[Type, Sequence[str]]"; expected type "Type" [index] posthog/temporal/tests/data_imports/test_end_to_end.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/api/test/test_team.py:0: error: "HttpResponse" has no attribute "json" [attr-defined] -posthog/api/test/test_team.py:0: error: "HttpResponse" has no attribute "json" [attr-defined] -posthog/test/test_middleware.py:0: error: Incompatible types in assignment (expression has type "_MonkeyPatchedWSGIResponse", variable has type "_MonkeyPatchedResponse") [assignment] posthog/management/commands/test/test_create_batch_export_from_app.py:0: error: Incompatible return value type (got "dict[str, Collection[str]]", expected "dict[str, str]") [return-value] posthog/management/commands/test/test_create_batch_export_from_app.py:0: error: Incompatible types in assignment (expression has type "dict[str, Collection[str]]", variable has type "dict[str, str]") [assignment] posthog/management/commands/test/test_create_batch_export_from_app.py:0: error: Unpacked dict entry 1 has incompatible type "str"; expected "SupportsKeysAndGetItem[str, str]" [dict-item] From 7d7d274fe723d7afce993ec544d7783a8ee5525b Mon Sep 17 00:00:00 2001 From: Tom Owers Date: Wed, 27 Nov 2024 19:05:42 +0000 Subject: [PATCH 09/22] mypy --- mypy-baseline.txt | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/mypy-baseline.txt b/mypy-baseline.txt index a63448c16b8a7..9654cdb251bef 100644 --- a/mypy-baseline.txt +++ b/mypy-baseline.txt @@ -270,7 +270,6 @@ posthog/tasks/update_survey_iteration.py:0: error: Incompatible types in assignm 
posthog/tasks/update_survey_iteration.py:0: error: Item "None" of "FeatureFlag | None" has no attribute "filters" [union-attr] posthog/tasks/update_survey_iteration.py:0: error: Item "None" of "FeatureFlag | None" has no attribute "filters" [union-attr] posthog/tasks/update_survey_iteration.py:0: error: Item "None" of "FeatureFlag | None" has no attribute "save" [union-attr] -posthog/permissions.py:0: error: Argument 2 to "feature_enabled" has incompatible type "str | None"; expected "str" [arg-type] posthog/models/event/util.py:0: error: Incompatible types in assignment (expression has type "str", variable has type "datetime") [assignment] posthog/models/event/util.py:0: error: Module has no attribute "utc" [attr-defined] posthog/event_usage.py:0: error: Argument 1 to "capture" has incompatible type "str | None"; expected "str" [arg-type] @@ -314,11 +313,7 @@ posthog/tasks/email.py:0: error: Module "django.utils.timezone" does not explici posthog/tasks/email.py:0: error: Argument "email" to "add_recipient" of "EmailMessage" has incompatible type "str | None"; expected "str" [arg-type] posthog/tasks/email.py:0: error: Argument 1 to "capture" has incompatible type "str | None"; expected "str" [arg-type] posthog/tasks/email.py:0: error: Incompatible types in assignment (expression has type "Team | None", variable has type "Team") [assignment] -posthog/api/documentation.py:0: error: Signature of "run_validation" incompatible with supertype "Field" [override] -posthog/api/documentation.py:0: note: Superclass: -posthog/api/documentation.py:0: note: def run_validation(self, data: Any = ...) -> Any -posthog/api/documentation.py:0: note: Subclass: -posthog/api/documentation.py:0: note: def run_validation(self, data: Any) -> Any +posthog/permissions.py:0: error: Argument 2 to "feature_enabled" has incompatible type "str | None"; expected "str" [arg-type] ee/tasks/subscriptions/email_subscriptions.py:0: error: Item "None" of "User | None" has no attribute "email" [union-attr] ee/tasks/subscriptions/email_subscriptions.py:0: error: Item "None" of "datetime | None" has no attribute "isoformat" [union-attr] ee/tasks/subscriptions/email_subscriptions.py:0: error: Item "None" of "datetime | None" has no attribute "strftime" [union-attr] @@ -332,21 +327,26 @@ posthog/models/property/util.py:0: error: Invalid index type "tuple[str, str]" f posthog/models/property/util.py:0: error: Argument 1 to "append" of "list" has incompatible type "str | int"; expected "str" [arg-type] posthog/models/property/util.py:0: error: Argument 1 to "append" of "list" has incompatible type "str | int"; expected "str" [arg-type] posthog/models/property/util.py:0: error: Argument 1 to "append" of "list" has incompatible type "str | int"; expected "str" [arg-type] -posthog/api/utils.py:0: error: Incompatible types in assignment (expression has type "type[EventDefinition]", variable has type "type[EnterpriseEventDefinition]") [assignment] -posthog/api/utils.py:0: error: Argument 1 to "UUID" has incompatible type "int | str"; expected "str | None" [arg-type] +posthog/api/documentation.py:0: error: Signature of "run_validation" incompatible with supertype "Field" [override] +posthog/api/documentation.py:0: note: Superclass: +posthog/api/documentation.py:0: note: def run_validation(self, data: Any = ...) 
-> Any +posthog/api/documentation.py:0: note: Subclass: +posthog/api/documentation.py:0: note: def run_validation(self, data: Any) -> Any posthog/queries/trends/util.py:0: error: Argument 1 to "translate_hogql" has incompatible type "str | None"; expected "str" [arg-type] posthog/queries/column_optimizer/foss_column_optimizer.py:0: error: Argument 1 to "get" of "dict" has incompatible type "tuple[str, str]"; expected "tuple[str, Literal['properties', 'group_properties', 'person_properties']]" [arg-type] posthog/hogql/property.py:0: error: Incompatible type for lookup 'id': (got "str | int | list[str]", expected "str | int") [misc] posthog/hogql/property.py:0: error: Incompatible type for lookup 'pk': (got "str | float", expected "str | int") [misc] -posthog/api/capture.py:0: error: Module has no attribute "utc" [attr-defined] +posthog/api/utils.py:0: error: Incompatible types in assignment (expression has type "type[EventDefinition]", variable has type "type[EnterpriseEventDefinition]") [assignment] +posthog/api/utils.py:0: error: Argument 1 to "UUID" has incompatible type "int | str"; expected "str | None" [arg-type] posthog/hogql/filters.py:0: error: Incompatible default for argument "team" (default has type "None", argument has type "Team") [assignment] posthog/hogql/filters.py:0: note: PEP 484 prohibits implicit Optional. Accordingly, mypy has changed its default to no_implicit_optional=True posthog/hogql/filters.py:0: note: Use https://github.com/hauntsaninja/no_implicit_optional to automatically upgrade your codebase -posthog/api/organization.py:0: error: Incompatible return value type (got "int | None", expected "Level | None") [return-value] +posthog/api/capture.py:0: error: Module has no attribute "utc" [attr-defined] posthog/hogql/query.py:0: error: Incompatible types in assignment (expression has type "None", variable has type "str | SelectQuery | SelectSetQuery") [assignment] posthog/hogql/query.py:0: error: Incompatible types in assignment (expression has type "Expr", variable has type "SelectQuery | SelectSetQuery") [assignment] posthog/hogql/query.py:0: error: Argument 1 to "get_default_limit_for_context" has incompatible type "LimitContext | None"; expected "LimitContext" [arg-type] posthog/hogql/query.py:0: error: Subclass of "SelectQuery" and "SelectSetQuery" cannot exist: would have incompatible method signatures [unreachable] +posthog/api/organization.py:0: error: Incompatible return value type (got "int | None", expected "Level | None") [return-value] posthog/queries/person_query.py:0: error: Incompatible type for lookup 'pk': (got "str | int | list[str]", expected "str | int") [misc] posthog/api/action.py:0: error: Argument 1 to has incompatible type "*tuple[str, ...]"; expected "type[BaseRenderer]" [arg-type] posthog/queries/event_query/event_query.py:0: error: Incompatible type for lookup 'pk': (got "str | int | list[str]", expected "str | int") [misc] @@ -827,12 +827,7 @@ posthog/temporal/tests/batch_exports/test_snowflake_batch_export_workflow.py:0: posthog/temporal/tests/batch_exports/test_snowflake_batch_export_workflow.py:0: error: Need type annotation for "_execute_async_calls" (hint: "_execute_async_calls: list[] = ...") [var-annotated] posthog/temporal/tests/batch_exports/test_snowflake_batch_export_workflow.py:0: error: Need type annotation for "_cursors" (hint: "_cursors: list[] = ...") [var-annotated] posthog/temporal/tests/batch_exports/test_snowflake_batch_export_workflow.py:0: error: List item 0 has incompatible type "tuple[str, str, int, int, int, 
int, str, int]"; expected "tuple[str, str, int, int, str, str, str, str]" [list-item] -<<<<<<< HEAD -posthog/temporal/tests/batch_exports/test_s3_batch_export_workflow.py:0: error: "tuple[Any, ...]" has no attribute "last_uploaded_part_timestamp" [attr-defined] -posthog/temporal/tests/batch_exports/test_s3_batch_export_workflow.py:0: error: "tuple[Any, ...]" has no attribute "upload_state" [attr-defined] posthog/temporal/data_imports/pipelines/test/test_pipeline_sync.py:0: error: Missing positional argument "db_incremental_field_last_value" in call to "__call__" of "SourceFactory" [call-arg] -======= ->>>>>>> master posthog/migrations/0237_remove_timezone_from_teams.py:0: error: Argument 2 to "RunPython" has incompatible type "Callable[[Migration, Any], None]"; expected "_CodeCallable | None" [arg-type] posthog/migrations/0228_fix_tile_layouts.py:0: error: Argument 2 to "RunPython" has incompatible type "Callable[[Migration, Any], None]"; expected "_CodeCallable | None" [arg-type] posthog/api/plugin_log_entry.py:0: error: Name "timezone.datetime" is not defined [name-defined] From 2e61ad7d1e5587c1e6add40a5b02f04fc2af465d Mon Sep 17 00:00:00 2001 From: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 27 Nov 2024 19:08:43 +0000 Subject: [PATCH 10/22] Update query snapshots --- .../test_session_recordings.ambr | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/posthog/session_recordings/test/__snapshots__/test_session_recordings.ambr b/posthog/session_recordings/test/__snapshots__/test_session_recordings.ambr index ca06cf910d628..ff0aea2563613 100644 --- a/posthog/session_recordings/test/__snapshots__/test_session_recordings.ambr +++ b/posthog/session_recordings/test/__snapshots__/test_session_recordings.ambr @@ -640,12 +640,12 @@ LEFT OUTER JOIN "posthog_organizationmembership" ON ("ee_accesscontrol"."organization_member_id" = "posthog_organizationmembership"."id") WHERE (("ee_accesscontrol"."organization_member_id" IS NULL AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '447' + AND "ee_accesscontrol"."resource_id" = '446' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("posthog_organizationmembership"."user_id" = 99999 AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '447' + AND "ee_accesscontrol"."resource_id" = '446' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("ee_accesscontrol"."organization_member_id" IS NULL @@ -1688,12 +1688,12 @@ LEFT OUTER JOIN "posthog_organizationmembership" ON ("ee_accesscontrol"."organization_member_id" = "posthog_organizationmembership"."id") WHERE (("ee_accesscontrol"."organization_member_id" IS NULL AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '454' + AND "ee_accesscontrol"."resource_id" = '453' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("posthog_organizationmembership"."user_id" = 99999 AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '454' + AND "ee_accesscontrol"."resource_id" = '453' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("ee_accesscontrol"."organization_member_id" IS NULL @@ -2441,12 +2441,12 @@ LEFT OUTER JOIN "posthog_organizationmembership" ON ("ee_accesscontrol"."organization_member_id" = "posthog_organizationmembership"."id") WHERE 
(("ee_accesscontrol"."organization_member_id" IS NULL AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '454' + AND "ee_accesscontrol"."resource_id" = '453' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("posthog_organizationmembership"."user_id" = 99999 AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '454' + AND "ee_accesscontrol"."resource_id" = '453' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("ee_accesscontrol"."organization_member_id" IS NULL @@ -3129,12 +3129,12 @@ LEFT OUTER JOIN "posthog_organizationmembership" ON ("ee_accesscontrol"."organization_member_id" = "posthog_organizationmembership"."id") WHERE (("ee_accesscontrol"."organization_member_id" IS NULL AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '454' + AND "ee_accesscontrol"."resource_id" = '453' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("posthog_organizationmembership"."user_id" = 99999 AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '454' + AND "ee_accesscontrol"."resource_id" = '453' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("ee_accesscontrol"."organization_member_id" IS NULL @@ -3881,12 +3881,12 @@ LEFT OUTER JOIN "posthog_organizationmembership" ON ("ee_accesscontrol"."organization_member_id" = "posthog_organizationmembership"."id") WHERE (("ee_accesscontrol"."organization_member_id" IS NULL AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '454' + AND "ee_accesscontrol"."resource_id" = '453' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("posthog_organizationmembership"."user_id" = 99999 AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '454' + AND "ee_accesscontrol"."resource_id" = '453' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("ee_accesscontrol"."organization_member_id" IS NULL @@ -4597,12 +4597,12 @@ LEFT OUTER JOIN "posthog_organizationmembership" ON ("ee_accesscontrol"."organization_member_id" = "posthog_organizationmembership"."id") WHERE (("ee_accesscontrol"."organization_member_id" IS NULL AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '454' + AND "ee_accesscontrol"."resource_id" = '453' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("posthog_organizationmembership"."user_id" = 99999 AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '454' + AND "ee_accesscontrol"."resource_id" = '453' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("ee_accesscontrol"."organization_member_id" IS NULL @@ -5395,12 +5395,12 @@ LEFT OUTER JOIN "posthog_organizationmembership" ON ("ee_accesscontrol"."organization_member_id" = "posthog_organizationmembership"."id") WHERE (("ee_accesscontrol"."organization_member_id" IS NULL AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '454' + AND "ee_accesscontrol"."resource_id" = '453' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("posthog_organizationmembership"."user_id" = 99999 AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '454' + AND 
"ee_accesscontrol"."resource_id" = '453' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("ee_accesscontrol"."organization_member_id" IS NULL @@ -5659,12 +5659,12 @@ LEFT OUTER JOIN "posthog_organizationmembership" ON ("ee_accesscontrol"."organization_member_id" = "posthog_organizationmembership"."id") WHERE (("ee_accesscontrol"."organization_member_id" IS NULL AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '454' + AND "ee_accesscontrol"."resource_id" = '453' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("posthog_organizationmembership"."user_id" = 99999 AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '454' + AND "ee_accesscontrol"."resource_id" = '453' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("ee_accesscontrol"."organization_member_id" IS NULL @@ -6091,12 +6091,12 @@ LEFT OUTER JOIN "posthog_organizationmembership" ON ("ee_accesscontrol"."organization_member_id" = "posthog_organizationmembership"."id") WHERE (("ee_accesscontrol"."organization_member_id" IS NULL AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '454' + AND "ee_accesscontrol"."resource_id" = '453' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("posthog_organizationmembership"."user_id" = 99999 AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '454' + AND "ee_accesscontrol"."resource_id" = '453' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("ee_accesscontrol"."organization_member_id" IS NULL @@ -6556,12 +6556,12 @@ LEFT OUTER JOIN "posthog_organizationmembership" ON ("ee_accesscontrol"."organization_member_id" = "posthog_organizationmembership"."id") WHERE (("ee_accesscontrol"."organization_member_id" IS NULL AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '454' + AND "ee_accesscontrol"."resource_id" = '453' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("posthog_organizationmembership"."user_id" = 99999 AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '454' + AND "ee_accesscontrol"."resource_id" = '453' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("ee_accesscontrol"."organization_member_id" IS NULL @@ -7248,12 +7248,12 @@ LEFT OUTER JOIN "posthog_organizationmembership" ON ("ee_accesscontrol"."organization_member_id" = "posthog_organizationmembership"."id") WHERE (("ee_accesscontrol"."organization_member_id" IS NULL AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '454' + AND "ee_accesscontrol"."resource_id" = '453' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("posthog_organizationmembership"."user_id" = 99999 AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '454' + AND "ee_accesscontrol"."resource_id" = '453' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("ee_accesscontrol"."organization_member_id" IS NULL @@ -7997,12 +7997,12 @@ LEFT OUTER JOIN "posthog_organizationmembership" ON ("ee_accesscontrol"."organization_member_id" = "posthog_organizationmembership"."id") WHERE (("ee_accesscontrol"."organization_member_id" IS NULL AND "ee_accesscontrol"."resource" = 
'project' - AND "ee_accesscontrol"."resource_id" = '454' + AND "ee_accesscontrol"."resource_id" = '453' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("posthog_organizationmembership"."user_id" = 99999 AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '454' + AND "ee_accesscontrol"."resource_id" = '453' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("ee_accesscontrol"."organization_member_id" IS NULL From 2e7f44e05261747191649eea6cf45a49bcd931e2 Mon Sep 17 00:00:00 2001 From: Tom Owers Date: Wed, 27 Nov 2024 21:03:57 +0000 Subject: [PATCH 11/22] Use the correct env var equality test --- posthog/temporal/data_imports/external_data_job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/posthog/temporal/data_imports/external_data_job.py b/posthog/temporal/data_imports/external_data_job.py index 2df039a8e3088..357a4ad8489d5 100644 --- a/posthog/temporal/data_imports/external_data_job.py +++ b/posthog/temporal/data_imports/external_data_job.py @@ -179,7 +179,7 @@ def parse_inputs(inputs: list[str]) -> ExternalDataWorkflowInputs: async def run(self, inputs: ExternalDataWorkflowInputs): assert inputs.external_data_schema_id is not None - if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE_V2: + if settings.TEMPORAL_TASK_QUEUE != DATA_WAREHOUSE_TASK_QUEUE_V2: await workflow.execute_activity( trigger_pipeline_v2, inputs, From 449662f965e28ab99e4b53821c0ae66b693589fe Mon Sep 17 00:00:00 2001 From: Tom Owers Date: Thu, 28 Nov 2024 10:57:25 +0000 Subject: [PATCH 12/22] Added new queue to management commands --- posthog/management/commands/start_temporal_worker.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/posthog/management/commands/start_temporal_worker.py b/posthog/management/commands/start_temporal_worker.py index 706516f3e5643..17c44d2a5b77c 100644 --- a/posthog/management/commands/start_temporal_worker.py +++ b/posthog/management/commands/start_temporal_worker.py @@ -11,6 +11,7 @@ from posthog.constants import ( BATCH_EXPORTS_TASK_QUEUE, DATA_WAREHOUSE_TASK_QUEUE, + DATA_WAREHOUSE_TASK_QUEUE_V2, GENERAL_PURPOSE_TASK_QUEUE, SYNC_BATCH_EXPORTS_TASK_QUEUE, ) @@ -33,6 +34,7 @@ SYNC_BATCH_EXPORTS_TASK_QUEUE: BATCH_EXPORTS_ACTIVITIES, BATCH_EXPORTS_TASK_QUEUE: BATCH_EXPORTS_ACTIVITIES, DATA_WAREHOUSE_TASK_QUEUE: DATA_SYNC_ACTIVITIES + DATA_MODELING_ACTIVITIES, + DATA_WAREHOUSE_TASK_QUEUE_V2: DATA_SYNC_ACTIVITIES + DATA_MODELING_ACTIVITIES, GENERAL_PURPOSE_TASK_QUEUE: PROXY_SERVICE_ACTIVITIES, } From 61f4bd39a0014d049455232046d01efe7a5804c7 Mon Sep 17 00:00:00 2001 From: Tom Owers Date: Thu, 28 Nov 2024 15:09:28 +0000 Subject: [PATCH 13/22] Added missing reference --- posthog/management/commands/start_temporal_worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/posthog/management/commands/start_temporal_worker.py b/posthog/management/commands/start_temporal_worker.py index 17c44d2a5b77c..77701478f2ded 100644 --- a/posthog/management/commands/start_temporal_worker.py +++ b/posthog/management/commands/start_temporal_worker.py @@ -28,6 +28,7 @@ SYNC_BATCH_EXPORTS_TASK_QUEUE: BATCH_EXPORTS_WORKFLOWS, BATCH_EXPORTS_TASK_QUEUE: BATCH_EXPORTS_WORKFLOWS, DATA_WAREHOUSE_TASK_QUEUE: DATA_SYNC_WORKFLOWS + DATA_MODELING_WORKFLOWS, + DATA_WAREHOUSE_TASK_QUEUE_V2: DATA_SYNC_WORKFLOWS + DATA_MODELING_WORKFLOWS, GENERAL_PURPOSE_TASK_QUEUE: PROXY_SERVICE_WORKFLOWS, } ACTIVITIES_DICT = { From c96f888da65355d65e4001e3c62a7db81fe1be96 Mon Sep 17 00:00:00 2001 From: 
Tom Owers Date: Mon, 2 Dec 2024 13:17:06 +0000 Subject: [PATCH 14/22] Use the correct paths for V2 jobs --- posthog/hogql/database/s3_table.py | 14 ++++++-- posthog/temporal/data_imports/__init__.py | 2 ++ .../pipelines/pipeline/delta_table_helper.py | 4 +-- .../pipelines/pipeline/pipeline.py | 4 ++- .../data_imports/pipelines/pipeline_sync.py | 34 +++++++++++-------- posthog/temporal/data_imports/util.py | 21 ++++++++++-- posthog/warehouse/models/table.py | 12 +++++-- 7 files changed, 66 insertions(+), 25 deletions(-) diff --git a/posthog/hogql/database/s3_table.py b/posthog/hogql/database/s3_table.py index e5136bc2348cf..4a8ecc0a47f06 100644 --- a/posthog/hogql/database/s3_table.py +++ b/posthog/hogql/database/s3_table.py @@ -1,5 +1,5 @@ import re -from typing import Optional +from typing import TYPE_CHECKING, Optional from posthog.clickhouse.client.escape import substitute_params from posthog.hogql.context import HogQLContext @@ -7,6 +7,9 @@ from posthog.hogql.errors import ExposedHogQLError from posthog.hogql.escape_sql import escape_hogql_identifier +if TYPE_CHECKING: + from posthog.warehouse.models import ExternalDataJob + def build_function_call( url: str, @@ -15,7 +18,10 @@ def build_function_call( access_secret: Optional[str] = None, structure: Optional[str] = None, context: Optional[HogQLContext] = None, + pipeline_version: Optional["ExternalDataJob.PipelineVersion"] = None, ) -> str: + from posthog.warehouse.models import ExternalDataJob + raw_params: dict[str, str] = {} def add_param(value: str, is_sensitive: bool = True) -> str: @@ -36,10 +42,12 @@ def return_expr(expr: str) -> str: # DeltaS3Wrapper format if format == "DeltaS3Wrapper": + query_folder = "__query_v2" if pipeline_version == ExternalDataJob.PipelineVersion.V2 else "__query" + if url.endswith("/"): - escaped_url = add_param(f"{url[:len(url) - 1]}__query/*.parquet") + escaped_url = add_param(f"{url[:len(url) - 1]}{query_folder}/*.parquet") else: - escaped_url = add_param(f"{url}__query/*.parquet") + escaped_url = add_param(f"{url}{query_folder}/*.parquet") if structure: escaped_structure = add_param(structure, False) diff --git a/posthog/temporal/data_imports/__init__.py b/posthog/temporal/data_imports/__init__.py index c59f20b05d8cf..aab0a74ac554c 100644 --- a/posthog/temporal/data_imports/__init__.py +++ b/posthog/temporal/data_imports/__init__.py @@ -6,6 +6,7 @@ update_external_data_job_model, check_billing_limits_activity, sync_new_schemas_activity, + trigger_pipeline_v2, ) WORKFLOWS = [ExternalDataJobWorkflow] @@ -17,4 +18,5 @@ create_source_templates, check_billing_limits_activity, sync_new_schemas_activity, + trigger_pipeline_v2, ] diff --git a/posthog/temporal/data_imports/pipelines/pipeline/delta_table_helper.py b/posthog/temporal/data_imports/pipelines/pipeline/delta_table_helper.py index af49e4b2799c9..31c116879c05d 100644 --- a/posthog/temporal/data_imports/pipelines/pipeline/delta_table_helper.py +++ b/posthog/temporal/data_imports/pipelines/pipeline/delta_table_helper.py @@ -39,8 +39,8 @@ def _get_credentials(self): def _get_delta_table_uri(self) -> str: normalized_resource_name = NamingConvention().normalize_identifier(self._resource_name) - # Appended __V2 on to the end of the url so that data of the V2 pipeline isn't the same as V1 - return f"{settings.BUCKET_URL}/{self._job.folder_path()}/{normalized_resource_name}__V2" + # Appended __v2 on to the end of the url so that data of the V2 pipeline isn't the same as V1 + return 
f"{settings.BUCKET_URL}/{self._job.folder_path()}/{normalized_resource_name}__v2" def _evolve_delta_schema(self, schema: pa.Schema) -> deltalake.DeltaTable: delta_table = self.get_delta_table() diff --git a/posthog/temporal/data_imports/pipelines/pipeline/pipeline.py b/posthog/temporal/data_imports/pipelines/pipeline/pipeline.py index 96f938a32e55f..a69d60501601b 100644 --- a/posthog/temporal/data_imports/pipelines/pipeline/pipeline.py +++ b/posthog/temporal/data_imports/pipelines/pipeline/pipeline.py @@ -122,7 +122,9 @@ def _post_run_operations(self, row_count: int): file_uris = delta_table.file_uris() self._logger.info(f"Preparing S3 files - total parquet files: {len(file_uris)}") - prepare_s3_files_for_querying(self._job.folder_path(), self._resource_name, file_uris) + prepare_s3_files_for_querying( + self._job.folder_path(), self._resource_name, file_uris, ExternalDataJob.PipelineVersion.V2 + ) self._logger.debug("Validating schema and updating table") diff --git a/posthog/temporal/data_imports/pipelines/pipeline_sync.py b/posthog/temporal/data_imports/pipelines/pipeline_sync.py index 649cec7db89c3..46a2537bb571c 100644 --- a/posthog/temporal/data_imports/pipelines/pipeline_sync.py +++ b/posthog/temporal/data_imports/pipelines/pipeline_sync.py @@ -457,6 +457,9 @@ def validate_schema_and_update_table_sync( "pipeline", Prefetch("schema", queryset=ExternalDataSchema.objects.prefetch_related("source")) ).get(pk=run_id) + using_v2_pipeline = job.pipeline_version == ExternalDataJob.PipelineVersion.V2 + pipeline_version = ExternalDataJob.PipelineVersion(job.pipeline_version) + credential = get_or_create_datawarehouse_credential( team_id=team_id, access_key=settings.AIRBYTE_BUCKET_KEY, @@ -506,23 +509,26 @@ def validate_schema_and_update_table_sync( assert isinstance(table_created, DataWarehouseTable) and table_created is not None # Temp fix #2 for Delta tables without table_format - try: - table_created.get_columns() - except Exception as e: - if table_format == DataWarehouseTable.TableFormat.DeltaS3Wrapper: - logger.exception("get_columns exception with DeltaS3Wrapper format - trying Delta format", exc_info=e) - - table_created.format = DataWarehouseTable.TableFormat.Delta + if not using_v2_pipeline: + try: table_created.get_columns() - table_created.save() + except Exception as e: + if table_format == DataWarehouseTable.TableFormat.DeltaS3Wrapper: + logger.exception( + "get_columns exception with DeltaS3Wrapper format - trying Delta format", exc_info=e + ) - logger.info("Delta format worked - updating table to use Delta") - else: - raise + table_created.format = DataWarehouseTable.TableFormat.Delta + table_created.get_columns() + table_created.save() + + logger.info("Delta format worked - updating table to use Delta") + else: + raise # If using new non-DLT pipeline - if table_schema_dict is not None: - raw_db_columns: dict[str, dict[str, str]] = table_created.get_columns() + if using_v2_pipeline and table_schema_dict is not None: + raw_db_columns: dict[str, dict[str, str]] = table_created.get_columns(pipeline_version=pipeline_version) db_columns = {key: column.get("clickhouse", "") for key, column in raw_db_columns.items()} columns = {} @@ -570,7 +576,7 @@ def validate_schema_and_update_table_sync( .get(id=_schema_id, team_id=team_id) ) - if schema_model: + if not using_v2_pipeline and schema_model: schema_model.table = table_created schema_model.save() diff --git a/posthog/temporal/data_imports/util.py b/posthog/temporal/data_imports/util.py index cc8a4892b0aaa..4a133ef336b42 100644 --- 
a/posthog/temporal/data_imports/util.py +++ b/posthog/temporal/data_imports/util.py @@ -1,18 +1,33 @@ +from typing import Optional from posthog.settings.utils import get_from_env from posthog.utils import str_to_bool +from posthog.warehouse.models import ExternalDataJob from posthog.warehouse.s3 import get_s3_client from django.conf import settings from dlt.common.normalizers.naming.snake_case import NamingConvention -def prepare_s3_files_for_querying(folder_path: str, table_name: str, file_uris: list[str]): +def prepare_s3_files_for_querying( + folder_path: str, + table_name: str, + file_uris: list[str], + pipeline_version: Optional[ExternalDataJob.PipelineVersion] = None, +): s3 = get_s3_client() normalized_table_name = NamingConvention().normalize_identifier(table_name) s3_folder_for_job = f"{settings.BUCKET_URL}/{folder_path}" - s3_folder_for_schema = f"{s3_folder_for_job}/{normalized_table_name}" - s3_folder_for_querying = f"{s3_folder_for_job}/{normalized_table_name}__query" + + if pipeline_version == ExternalDataJob.PipelineVersion.V2: + s3_folder_for_schema = f"{s3_folder_for_job}/{normalized_table_name}__v2" + else: + s3_folder_for_schema = f"{s3_folder_for_job}/{normalized_table_name}" + + if pipeline_version == ExternalDataJob.PipelineVersion.V2: + s3_folder_for_querying = f"{s3_folder_for_job}/{normalized_table_name}__query_v2" + else: + s3_folder_for_querying = f"{s3_folder_for_job}/{normalized_table_name}__query" if s3.exists(s3_folder_for_querying): s3.delete(s3_folder_for_querying, recursive=True) diff --git a/posthog/warehouse/models/table.py b/posthog/warehouse/models/table.py index f5bdb94b246eb..0f960d2648c8d 100644 --- a/posthog/warehouse/models/table.py +++ b/posthog/warehouse/models/table.py @@ -1,5 +1,5 @@ from datetime import datetime -from typing import Optional, TypeAlias +from typing import TYPE_CHECKING, Optional, TypeAlias from django.db import models from posthog.client import sync_execute @@ -29,6 +29,9 @@ from .external_table_definitions import external_tables from posthog.hogql.context import HogQLContext +if TYPE_CHECKING: + from posthog.warehouse.models import ExternalDataJob + SERIALIZED_FIELD_TO_CLICKHOUSE_MAPPING: dict[DatabaseSerializedFieldType, str] = { DatabaseSerializedFieldType.INTEGER: "Int64", DatabaseSerializedFieldType.FLOAT: "Float64", @@ -138,7 +141,11 @@ def validate_column_type(self, column_key) -> bool: except: return False - def get_columns(self, safe_expose_ch_error=True) -> DataWarehouseTableColumns: + def get_columns( + self, + pipeline_version: Optional["ExternalDataJob.PipelineVersion"] = None, + safe_expose_ch_error: bool = True, + ) -> DataWarehouseTableColumns: try: placeholder_context = HogQLContext(team_id=self.team.pk) s3_table_func = build_function_call( @@ -147,6 +154,7 @@ def get_columns(self, safe_expose_ch_error=True) -> DataWarehouseTableColumns: access_key=self.credential.access_key, access_secret=self.credential.access_secret, context=placeholder_context, + pipeline_version=pipeline_version, ) result = sync_execute( From 27b47e7ec6102626f154d8019f980c5334faaa21 Mon Sep 17 00:00:00 2001 From: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 2 Dec 2024 13:32:55 +0000 Subject: [PATCH 15/22] Update query snapshots --- .../test_session_recordings.ambr | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/posthog/session_recordings/test/__snapshots__/test_session_recordings.ambr b/posthog/session_recordings/test/__snapshots__/test_session_recordings.ambr 
index ff0aea2563613..ccba484c51a23 100644 --- a/posthog/session_recordings/test/__snapshots__/test_session_recordings.ambr +++ b/posthog/session_recordings/test/__snapshots__/test_session_recordings.ambr @@ -640,12 +640,12 @@ LEFT OUTER JOIN "posthog_organizationmembership" ON ("ee_accesscontrol"."organization_member_id" = "posthog_organizationmembership"."id") WHERE (("ee_accesscontrol"."organization_member_id" IS NULL AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '446' + AND "ee_accesscontrol"."resource_id" = '421' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("posthog_organizationmembership"."user_id" = 99999 AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '446' + AND "ee_accesscontrol"."resource_id" = '421' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("ee_accesscontrol"."organization_member_id" IS NULL @@ -1688,12 +1688,12 @@ LEFT OUTER JOIN "posthog_organizationmembership" ON ("ee_accesscontrol"."organization_member_id" = "posthog_organizationmembership"."id") WHERE (("ee_accesscontrol"."organization_member_id" IS NULL AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '453' + AND "ee_accesscontrol"."resource_id" = '428' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("posthog_organizationmembership"."user_id" = 99999 AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '453' + AND "ee_accesscontrol"."resource_id" = '428' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("ee_accesscontrol"."organization_member_id" IS NULL @@ -2441,12 +2441,12 @@ LEFT OUTER JOIN "posthog_organizationmembership" ON ("ee_accesscontrol"."organization_member_id" = "posthog_organizationmembership"."id") WHERE (("ee_accesscontrol"."organization_member_id" IS NULL AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '453' + AND "ee_accesscontrol"."resource_id" = '428' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("posthog_organizationmembership"."user_id" = 99999 AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '453' + AND "ee_accesscontrol"."resource_id" = '428' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("ee_accesscontrol"."organization_member_id" IS NULL @@ -3129,12 +3129,12 @@ LEFT OUTER JOIN "posthog_organizationmembership" ON ("ee_accesscontrol"."organization_member_id" = "posthog_organizationmembership"."id") WHERE (("ee_accesscontrol"."organization_member_id" IS NULL AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '453' + AND "ee_accesscontrol"."resource_id" = '428' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("posthog_organizationmembership"."user_id" = 99999 AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '453' + AND "ee_accesscontrol"."resource_id" = '428' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("ee_accesscontrol"."organization_member_id" IS NULL @@ -3881,12 +3881,12 @@ LEFT OUTER JOIN "posthog_organizationmembership" ON ("ee_accesscontrol"."organization_member_id" = "posthog_organizationmembership"."id") WHERE (("ee_accesscontrol"."organization_member_id" IS NULL AND 
"ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '453' + AND "ee_accesscontrol"."resource_id" = '428' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("posthog_organizationmembership"."user_id" = 99999 AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '453' + AND "ee_accesscontrol"."resource_id" = '428' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("ee_accesscontrol"."organization_member_id" IS NULL @@ -4597,12 +4597,12 @@ LEFT OUTER JOIN "posthog_organizationmembership" ON ("ee_accesscontrol"."organization_member_id" = "posthog_organizationmembership"."id") WHERE (("ee_accesscontrol"."organization_member_id" IS NULL AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '453' + AND "ee_accesscontrol"."resource_id" = '428' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("posthog_organizationmembership"."user_id" = 99999 AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '453' + AND "ee_accesscontrol"."resource_id" = '428' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("ee_accesscontrol"."organization_member_id" IS NULL @@ -5395,12 +5395,12 @@ LEFT OUTER JOIN "posthog_organizationmembership" ON ("ee_accesscontrol"."organization_member_id" = "posthog_organizationmembership"."id") WHERE (("ee_accesscontrol"."organization_member_id" IS NULL AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '453' + AND "ee_accesscontrol"."resource_id" = '428' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("posthog_organizationmembership"."user_id" = 99999 AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '453' + AND "ee_accesscontrol"."resource_id" = '428' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("ee_accesscontrol"."organization_member_id" IS NULL @@ -5659,12 +5659,12 @@ LEFT OUTER JOIN "posthog_organizationmembership" ON ("ee_accesscontrol"."organization_member_id" = "posthog_organizationmembership"."id") WHERE (("ee_accesscontrol"."organization_member_id" IS NULL AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '453' + AND "ee_accesscontrol"."resource_id" = '428' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("posthog_organizationmembership"."user_id" = 99999 AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '453' + AND "ee_accesscontrol"."resource_id" = '428' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("ee_accesscontrol"."organization_member_id" IS NULL @@ -6091,12 +6091,12 @@ LEFT OUTER JOIN "posthog_organizationmembership" ON ("ee_accesscontrol"."organization_member_id" = "posthog_organizationmembership"."id") WHERE (("ee_accesscontrol"."organization_member_id" IS NULL AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '453' + AND "ee_accesscontrol"."resource_id" = '428' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("posthog_organizationmembership"."user_id" = 99999 AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '453' + AND "ee_accesscontrol"."resource_id" = '428' AND 
"ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("ee_accesscontrol"."organization_member_id" IS NULL @@ -6556,12 +6556,12 @@ LEFT OUTER JOIN "posthog_organizationmembership" ON ("ee_accesscontrol"."organization_member_id" = "posthog_organizationmembership"."id") WHERE (("ee_accesscontrol"."organization_member_id" IS NULL AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '453' + AND "ee_accesscontrol"."resource_id" = '428' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("posthog_organizationmembership"."user_id" = 99999 AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '453' + AND "ee_accesscontrol"."resource_id" = '428' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("ee_accesscontrol"."organization_member_id" IS NULL @@ -7248,12 +7248,12 @@ LEFT OUTER JOIN "posthog_organizationmembership" ON ("ee_accesscontrol"."organization_member_id" = "posthog_organizationmembership"."id") WHERE (("ee_accesscontrol"."organization_member_id" IS NULL AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '453' + AND "ee_accesscontrol"."resource_id" = '428' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("posthog_organizationmembership"."user_id" = 99999 AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '453' + AND "ee_accesscontrol"."resource_id" = '428' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("ee_accesscontrol"."organization_member_id" IS NULL @@ -7997,12 +7997,12 @@ LEFT OUTER JOIN "posthog_organizationmembership" ON ("ee_accesscontrol"."organization_member_id" = "posthog_organizationmembership"."id") WHERE (("ee_accesscontrol"."organization_member_id" IS NULL AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '453' + AND "ee_accesscontrol"."resource_id" = '428' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("posthog_organizationmembership"."user_id" = 99999 AND "ee_accesscontrol"."resource" = 'project' - AND "ee_accesscontrol"."resource_id" = '453' + AND "ee_accesscontrol"."resource_id" = '428' AND "ee_accesscontrol"."role_id" IS NULL AND "ee_accesscontrol"."team_id" = 99999) OR ("ee_accesscontrol"."organization_member_id" IS NULL From 8e6ab69ad2208c451a388e18505c0b8e0420e81a Mon Sep 17 00:00:00 2001 From: Tom Owers Date: Tue, 3 Dec 2024 18:36:37 +0000 Subject: [PATCH 16/22] Fixes for 100% pass rate in end_to_end tests --- posthog/hogql/database/s3_table.py | 10 +- .../data_imports/external_data_job.py | 3 +- .../pipelines/pipeline/delta_table_helper.py | 6 +- .../data_imports/pipelines/pipeline_sync.py | 5 + .../tests/data_imports/test_end_to_end.py | 162 +++++++++++------- posthog/warehouse/models/external_data_job.py | 12 +- .../warehouse/models/external_data_schema.py | 7 +- .../models/external_table_definitions.py | 1 + 8 files changed, 133 insertions(+), 73 deletions(-) diff --git a/posthog/hogql/database/s3_table.py b/posthog/hogql/database/s3_table.py index 4a8ecc0a47f06..479969ae93bd1 100644 --- a/posthog/hogql/database/s3_table.py +++ b/posthog/hogql/database/s3_table.py @@ -45,9 +45,15 @@ def return_expr(expr: str) -> str: query_folder = "__query_v2" if pipeline_version == ExternalDataJob.PipelineVersion.V2 else "__query" if url.endswith("/"): - escaped_url = 
add_param(f"{url[:len(url) - 1]}{query_folder}/*.parquet") + if pipeline_version == ExternalDataJob.PipelineVersion.V2: + escaped_url = add_param(f"{url[:-5]}{query_folder}/*.parquet") + else: + escaped_url = add_param(f"{url[:-1]}{query_folder}/*.parquet") else: - escaped_url = add_param(f"{url}{query_folder}/*.parquet") + if pipeline_version == ExternalDataJob.PipelineVersion.V2: + escaped_url = add_param(f"{url[:-4]}{query_folder}/*.parquet") + else: + escaped_url = add_param(f"{url}{query_folder}/*.parquet") if structure: escaped_structure = add_param(structure, False) diff --git a/posthog/temporal/data_imports/external_data_job.py b/posthog/temporal/data_imports/external_data_job.py index 535c5965cc8c1..afdff32d658f7 100644 --- a/posthog/temporal/data_imports/external_data_job.py +++ b/posthog/temporal/data_imports/external_data_job.py @@ -13,6 +13,7 @@ from posthog.constants import DATA_WAREHOUSE_TASK_QUEUE_V2 # TODO: remove dependency +from posthog.settings.base_variables import TEST from posthog.temporal.batch_exports.base import PostHogWorkflow from posthog.temporal.common.client import sync_connect from posthog.temporal.data_imports.workflow_activities.check_billing_limits import ( @@ -183,7 +184,7 @@ def parse_inputs(inputs: list[str]) -> ExternalDataWorkflowInputs: async def run(self, inputs: ExternalDataWorkflowInputs): assert inputs.external_data_schema_id is not None - if settings.TEMPORAL_TASK_QUEUE != DATA_WAREHOUSE_TASK_QUEUE_V2: + if settings.TEMPORAL_TASK_QUEUE != DATA_WAREHOUSE_TASK_QUEUE_V2 and not TEST: await workflow.execute_activity( trigger_pipeline_v2, inputs, diff --git a/posthog/temporal/data_imports/pipelines/pipeline/delta_table_helper.py b/posthog/temporal/data_imports/pipelines/pipeline/delta_table_helper.py index 31c116879c05d..64cbbda922863 100644 --- a/posthog/temporal/data_imports/pipelines/pipeline/delta_table_helper.py +++ b/posthog/temporal/data_imports/pipelines/pipeline/delta_table_helper.py @@ -25,6 +25,7 @@ def _get_credentials(self): "aws_secret_access_key": settings.AIRBYTE_BUCKET_SECRET, "endpoint_url": settings.OBJECT_STORAGE_ENDPOINT, "region_name": settings.AIRBYTE_BUCKET_REGION, + "AWS_DEFAULT_REGION": settings.AIRBYTE_BUCKET_REGION, "AWS_ALLOW_HTTP": "true", "AWS_S3_ALLOW_UNSAFE_RENAME": "true", } @@ -95,7 +96,10 @@ def write_to_deltalake( schema_mode = "overwrite" if delta_table is None: - delta_table = deltalake.DeltaTable.create(table_uri=self._get_delta_table_uri(), schema=data.schema) + storage_options = self._get_credentials() + delta_table = deltalake.DeltaTable.create( + table_uri=self._get_delta_table_uri(), schema=data.schema, storage_options=storage_options + ) deltalake.write_deltalake( table_or_uri=delta_table, diff --git a/posthog/temporal/data_imports/pipelines/pipeline_sync.py b/posthog/temporal/data_imports/pipelines/pipeline_sync.py index 46a2537bb571c..fa69f91419c31 100644 --- a/posthog/temporal/data_imports/pipelines/pipeline_sync.py +++ b/posthog/temporal/data_imports/pipelines/pipeline_sync.py @@ -460,6 +460,11 @@ def validate_schema_and_update_table_sync( using_v2_pipeline = job.pipeline_version == ExternalDataJob.PipelineVersion.V2 pipeline_version = ExternalDataJob.PipelineVersion(job.pipeline_version) + # Temp so we dont create a bunch of orphaned Table objects + if using_v2_pipeline: + logger.debug("Using V2 pipeline - dont create table object or get columns") + return + credential = get_or_create_datawarehouse_credential( team_id=team_id, access_key=settings.AIRBYTE_BUCKET_KEY, diff --git 
a/posthog/temporal/tests/data_imports/test_end_to_end.py b/posthog/temporal/tests/data_imports/test_end_to_end.py index fce2047cd1c28..06c198ec5b2d5 100644 --- a/posthog/temporal/tests/data_imports/test_end_to_end.py +++ b/posthog/temporal/tests/data_imports/test_end_to_end.py @@ -19,7 +19,7 @@ from temporalio.testing import WorkflowEnvironment from temporalio.worker import UnsandboxedWorkflowRunner, Worker -from posthog.constants import DATA_WAREHOUSE_TASK_QUEUE +from posthog.constants import DATA_WAREHOUSE_TASK_QUEUE, DATA_WAREHOUSE_TASK_QUEUE_V2 from posthog.hogql.modifiers import create_default_modifiers_for_team from posthog.hogql.query import execute_hogql_query from posthog.hogql_queries.insights.funnels.funnel import Funnel @@ -99,6 +99,19 @@ async def minio_client(): yield minio_client +def pytest_generate_tests(metafunc): + if "task_queue" in metafunc.fixturenames: + metafunc.parametrize("task_queue", [DATA_WAREHOUSE_TASK_QUEUE, DATA_WAREHOUSE_TASK_QUEUE_V2], indirect=True) + + +@pytest.fixture(autouse=True) +def task_queue(request): + queue = getattr(request, "param", None) + + with override_settings(TEMPORAL_TASK_QUEUE=queue): + yield + + async def _run( team: Team, schema_name: str, @@ -142,18 +155,23 @@ async def _run( assert run.status == ExternalDataJob.Status.COMPLETED await sync_to_async(schema.refresh_from_db)() - assert schema.last_synced_at == run.created_at - res = await sync_to_async(execute_hogql_query)(f"SELECT * FROM {table_name}", team) - assert len(res.results) == 1 + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE: + assert schema.last_synced_at == run.created_at + else: + assert schema.last_synced_at is None - for name, field in external_tables.get(table_name, {}).items(): - if field.hidden: - continue - assert name in (res.columns or []) + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE: + res = await sync_to_async(execute_hogql_query)(f"SELECT * FROM {table_name}", team) + assert len(res.results) == 1 - await sync_to_async(source.refresh_from_db)() - assert source.job_inputs.get("reset_pipeline", None) is None + for name, field in external_tables.get(table_name, {}).items(): + if field.hidden: + continue + assert name in (res.columns or []) + + await sync_to_async(source.refresh_from_db)() + assert source.job_inputs.get("reset_pipeline", None) is None return workflow_id, inputs @@ -203,11 +221,12 @@ def mock_to_object_store_rs_credentials(class_self): ), mock.patch.object(AwsCredentials, "to_session_credentials", mock_to_session_credentials), mock.patch.object(AwsCredentials, "to_object_store_rs_credentials", mock_to_object_store_rs_credentials), + mock.patch("posthog.temporal.data_imports.external_data_job.trigger_pipeline_v2"), ): async with await WorkflowEnvironment.start_time_skipping() as activity_environment: async with Worker( activity_environment.client, - task_queue=DATA_WAREHOUSE_TASK_QUEUE, + task_queue=settings.TEMPORAL_TASK_QUEUE, workflows=[ExternalDataJobWorkflow], activities=ACTIVITIES, # type: ignore workflow_runner=UnsandboxedWorkflowRunner(), @@ -218,7 +237,7 @@ def mock_to_object_store_rs_credentials(class_self): ExternalDataJobWorkflow.run, inputs, id=workflow_id, - task_queue=DATA_WAREHOUSE_TASK_QUEUE, + task_queue=settings.TEMPORAL_TASK_QUEUE, retry_policy=RetryPolicy(maximum_attempts=1), ) @@ -525,12 +544,13 @@ async def test_postgres_binary_columns(team, postgres_config, postgres_connectio mock_data_response=[], ) - res = await sync_to_async(execute_hogql_query)(f"SELECT * FROM postgres_binary_col_test", 
team) - columns = res.columns + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE: + res = await sync_to_async(execute_hogql_query)(f"SELECT * FROM postgres_binary_col_test", team) + columns = res.columns - assert columns is not None - assert len(columns) == 1 - assert columns[0] == "id" + assert columns is not None + assert len(columns) == 1 + assert columns[0] == "id" @pytest.mark.django_db(transaction=True) @@ -558,9 +578,14 @@ def get_jobs(): latest_job = jobs[0] folder_path = await sync_to_async(latest_job.folder_path)() - s3_objects = await minio_client.list_objects_v2( - Bucket=BUCKET_NAME, Prefix=f"{folder_path}/balance_transaction__query/" - ) + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE: + s3_objects = await minio_client.list_objects_v2( + Bucket=BUCKET_NAME, Prefix=f"{folder_path}/balance_transaction__query/" + ) + else: + s3_objects = await minio_client.list_objects_v2( + Bucket=BUCKET_NAME, Prefix=f"{folder_path}/balance_transaction__query_v2/" + ) assert len(s3_objects["Contents"]) != 0 @@ -587,23 +612,24 @@ async def test_funnels_lazy_joins_ordering(team, stripe_customer): field_name="stripe_customer", ) - query = FunnelsQuery( - series=[EventsNode(), EventsNode()], - breakdownFilter=BreakdownFilter( - breakdown_type=BreakdownType.DATA_WAREHOUSE_PERSON_PROPERTY, breakdown="stripe_customer.email" - ), - ) - funnel_class = Funnel(context=FunnelQueryContext(query=query, team=team)) - - query_ast = funnel_class.get_query() - await sync_to_async(execute_hogql_query)( - query_type="FunnelsQuery", - query=query_ast, - team=team, - modifiers=create_default_modifiers_for_team( - team, HogQLQueryModifiers(personsOnEventsMode=PersonsOnEventsMode.PERSON_ID_OVERRIDE_PROPERTIES_JOINED) - ), - ) + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE: + query = FunnelsQuery( + series=[EventsNode(), EventsNode()], + breakdownFilter=BreakdownFilter( + breakdown_type=BreakdownType.DATA_WAREHOUSE_PERSON_PROPERTY, breakdown="stripe_customer.email" + ), + ) + funnel_class = Funnel(context=FunnelQueryContext(query=query, team=team)) + + query_ast = funnel_class.get_query() + await sync_to_async(execute_hogql_query)( + query_type="FunnelsQuery", + query=query_ast, + team=team, + modifiers=create_default_modifiers_for_team( + team, HogQLQueryModifiers(personsOnEventsMode=PersonsOnEventsMode.PERSON_ID_OVERRIDE_PROPERTIES_JOINED) + ), + ) @pytest.mark.django_db(transaction=True) @@ -636,12 +662,13 @@ async def test_postgres_schema_evolution(team, postgres_config, postgres_connect sync_type_config={"incremental_field": "id", "incremental_field_type": "integer"}, ) - res = await sync_to_async(execute_hogql_query)("SELECT * FROM postgres_test_table", team) - columns = res.columns + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE: + res = await sync_to_async(execute_hogql_query)("SELECT * FROM postgres_test_table", team) + columns = res.columns - assert columns is not None - assert len(columns) == 1 - assert any(x == "id" for x in columns) + assert columns is not None + assert len(columns) == 1 + assert any(x == "id" for x in columns) # Evole schema await postgres_connection.execute( @@ -655,18 +682,20 @@ async def test_postgres_schema_evolution(team, postgres_config, postgres_connect # Execute the same schema again - load await _execute_run(str(uuid.uuid4()), inputs, []) - res = await sync_to_async(execute_hogql_query)("SELECT * FROM postgres_test_table", team) - columns = res.columns + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE: + res = 
await sync_to_async(execute_hogql_query)("SELECT * FROM postgres_test_table", team) + columns = res.columns - assert columns is not None - assert len(columns) == 2 - assert any(x == "id" for x in columns) - assert any(x == "new_col" for x in columns) + assert columns is not None + assert len(columns) == 2 + assert any(x == "id" for x in columns) + assert any(x == "new_col" for x in columns) @pytest.mark.django_db(transaction=True) @pytest.mark.asyncio async def test_sql_database_missing_incremental_values(team, postgres_config, postgres_connection): + await postgres_connection.execute("CREATE SCHEMA IF NOT EXISTS {schema}".format(schema=postgres_config["schema"])) await postgres_connection.execute( "CREATE TABLE IF NOT EXISTS {schema}.test_table (id integer)".format(schema=postgres_config["schema"]) ) @@ -697,15 +726,16 @@ async def test_sql_database_missing_incremental_values(team, postgres_config, po sync_type_config={"incremental_field": "id", "incremental_field_type": "integer"}, ) - res = await sync_to_async(execute_hogql_query)("SELECT * FROM postgres_test_table", team) - columns = res.columns + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE: + res = await sync_to_async(execute_hogql_query)("SELECT * FROM postgres_test_table", team) + columns = res.columns - assert columns is not None - assert len(columns) == 1 - assert any(x == "id" for x in columns) + assert columns is not None + assert len(columns) == 1 + assert any(x == "id" for x in columns) - # Exclude rows that don't have the incremental cursor key set - assert len(res.results) == 1 + # Exclude rows that don't have the incremental cursor key set + assert len(res.results) == 1 @pytest.mark.django_db(transaction=True) @@ -739,15 +769,16 @@ async def test_sql_database_incremental_initial_value(team, postgres_config, pos sync_type_config={"incremental_field": "id", "incremental_field_type": "integer"}, ) - res = await sync_to_async(execute_hogql_query)("SELECT * FROM postgres_test_table", team) - columns = res.columns + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE: + res = await sync_to_async(execute_hogql_query)("SELECT * FROM postgres_test_table", team) + columns = res.columns - assert columns is not None - assert len(columns) == 1 - assert any(x == "id" for x in columns) + assert columns is not None + assert len(columns) == 1 + assert any(x == "id" for x in columns) - # Include rows that have the same incremental value as the `initial_value` - assert len(res.results) == 1 + # Include rows that have the same incremental value as the `initial_value` + assert len(res.results) == 1 @pytest.mark.django_db(transaction=True) @@ -1007,7 +1038,8 @@ async def test_delta_table_deleted(team, stripe_balance_transaction): sync_type=ExternalDataSchema.SyncType.FULL_REFRESH, ) - with mock.patch.object(DeltaTable, "delete") as mock_delta_table_delete: - await _execute_run(str(uuid.uuid4()), inputs, stripe_balance_transaction["data"]) + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE: + with mock.patch.object(DeltaTable, "delete") as mock_delta_table_delete: + await _execute_run(str(uuid.uuid4()), inputs, stripe_balance_transaction["data"]) - mock_delta_table_delete.assert_called_once() + mock_delta_table_delete.assert_called_once() diff --git a/posthog/warehouse/models/external_data_job.py b/posthog/warehouse/models/external_data_job.py index 409b0277a153f..d9949e00d4423 100644 --- a/posthog/warehouse/models/external_data_job.py +++ b/posthog/warehouse/models/external_data_job.py @@ -41,9 +41,17 @@ def 
folder_path(self) -> str: def url_pattern_by_schema(self, schema: str) -> str: if TEST: - return f"http://{settings.AIRBYTE_BUCKET_DOMAIN}/{settings.BUCKET}/{self.folder_path()}/{schema.lower()}/" + if self.pipeline_version == ExternalDataJob.PipelineVersion.V1: + return ( + f"http://{settings.AIRBYTE_BUCKET_DOMAIN}/{settings.BUCKET}/{self.folder_path()}/{schema.lower()}/" + ) + else: + return f"http://{settings.AIRBYTE_BUCKET_DOMAIN}/{settings.BUCKET}/{self.folder_path()}/{schema.lower()}__v2/" - return f"https://{settings.AIRBYTE_BUCKET_DOMAIN}/dlt/{self.folder_path()}/{schema.lower()}/" + if self.pipeline_version == ExternalDataJob.PipelineVersion.V1: + return f"https://{settings.AIRBYTE_BUCKET_DOMAIN}/dlt/{self.folder_path()}/{schema.lower()}/" + + return f"https://{settings.AIRBYTE_BUCKET_DOMAIN}/dlt/{self.folder_path()}/{schema.lower()}__v2/" @database_sync_to_async diff --git a/posthog/warehouse/models/external_data_schema.py b/posthog/warehouse/models/external_data_schema.py index 449142b55522b..fae744be0795c 100644 --- a/posthog/warehouse/models/external_data_schema.py +++ b/posthog/warehouse/models/external_data_schema.py @@ -3,6 +3,7 @@ from typing import Any, Optional from django.db import models from django_deprecate_fields import deprecate_field +import numpy import snowflake.connector from django.conf import settings from posthog.constants import DATA_WAREHOUSE_TASK_QUEUE_V2 @@ -73,13 +74,15 @@ def is_incremental(self): def update_incremental_field_last_value(self, last_value: Any) -> None: incremental_field_type = self.sync_type_config.get("incremental_field_type") + last_value_py = last_value.item() if isinstance(last_value, numpy.generic) else last_value + if ( incremental_field_type == IncrementalFieldType.Integer or incremental_field_type == IncrementalFieldType.Numeric ): - last_value_json = last_value + last_value_json = last_value_py else: - last_value_json = str(last_value) + last_value_json = str(last_value_py) if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE_V2: key = "incremental_field_last_value_v2" diff --git a/posthog/warehouse/models/external_table_definitions.py b/posthog/warehouse/models/external_table_definitions.py index 00704ec6c3994..4294cc6003836 100644 --- a/posthog/warehouse/models/external_table_definitions.py +++ b/posthog/warehouse/models/external_table_definitions.py @@ -16,6 +16,7 @@ "*": { "__dlt_id": StringDatabaseField(name="_dlt_id", hidden=True), "__dlt_load_id": StringDatabaseField(name="_dlt_load_id", hidden=True), + "__ph_debug": StringJSONDatabaseField(name="_ph_debug", hidden=True), }, "stripe_account": { "id": StringDatabaseField(name="id"), From 1b05a2137e88ddd700b9d009e4310b6db16b129a Mon Sep 17 00:00:00 2001 From: Tom Owers Date: Thu, 5 Dec 2024 14:20:01 +0000 Subject: [PATCH 17/22] Fix mypy issues and limit V2 to just team 2 --- mypy-baseline.txt | 3 ++- posthog/temporal/data_imports/external_data_job.py | 7 ++++++- posthog/temporal/data_imports/pipelines/pipeline_sync.py | 6 +++++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/mypy-baseline.txt b/mypy-baseline.txt index 23e23e7e67ebe..09445b9a9eb87 100644 --- a/mypy-baseline.txt +++ b/mypy-baseline.txt @@ -796,7 +796,8 @@ posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: "FilesystemDe posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: "type[FilesystemDestinationClientConfiguration]" has no attribute "delta_jobs_per_write" [attr-defined] posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: Incompatible 
types in assignment (expression has type "object", variable has type "DataWarehouseCredential | Combinable | None") [assignment] posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: Incompatible types in assignment (expression has type "object", variable has type "str | int | Combinable") [assignment] -posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: Incompatible types in assignment (expression has type "dict[str, dict[str, str | bool]] | dict[str, str]", variable has type "dict[str, dict[str, str]]") [assignment] +posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: Right operand of "and" is never evaluated [unreachable] +posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: Statement is unreachable [unreachable] posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: Name "raw_db_columns" already defined on line 0 [no-redef] posthog/queries/app_metrics/test/test_app_metrics.py:0: error: Argument 3 to "AppMetricsErrorDetailsQuery" has incompatible type "AppMetricsRequestSerializer"; expected "AppMetricsErrorsRequestSerializer" [arg-type] posthog/queries/app_metrics/test/test_app_metrics.py:0: error: Argument 3 to "AppMetricsErrorDetailsQuery" has incompatible type "AppMetricsRequestSerializer"; expected "AppMetricsErrorsRequestSerializer" [arg-type] diff --git a/posthog/temporal/data_imports/external_data_job.py b/posthog/temporal/data_imports/external_data_job.py index afdff32d658f7..62a1e1bc834ed 100644 --- a/posthog/temporal/data_imports/external_data_job.py +++ b/posthog/temporal/data_imports/external_data_job.py @@ -16,6 +16,7 @@ from posthog.settings.base_variables import TEST from posthog.temporal.batch_exports.base import PostHogWorkflow from posthog.temporal.common.client import sync_connect +from posthog.temporal.data_imports.util import is_posthog_team from posthog.temporal.data_imports.workflow_activities.check_billing_limits import ( CheckBillingLimitsActivityInputs, check_billing_limits_activity, @@ -184,7 +185,11 @@ def parse_inputs(inputs: list[str]) -> ExternalDataWorkflowInputs: async def run(self, inputs: ExternalDataWorkflowInputs): assert inputs.external_data_schema_id is not None - if settings.TEMPORAL_TASK_QUEUE != DATA_WAREHOUSE_TASK_QUEUE_V2 and not TEST: + if ( + settings.TEMPORAL_TASK_QUEUE != DATA_WAREHOUSE_TASK_QUEUE_V2 + and not TEST + and is_posthog_team(inputs.team_id) + ): await workflow.execute_activity( trigger_pipeline_v2, inputs, diff --git a/posthog/temporal/data_imports/pipelines/pipeline_sync.py b/posthog/temporal/data_imports/pipelines/pipeline_sync.py index fa69f91419c31..5a7e8dc2f743e 100644 --- a/posthog/temporal/data_imports/pipelines/pipeline_sync.py +++ b/posthog/temporal/data_imports/pipelines/pipeline_sync.py @@ -458,7 +458,11 @@ def validate_schema_and_update_table_sync( ).get(pk=run_id) using_v2_pipeline = job.pipeline_version == ExternalDataJob.PipelineVersion.V2 - pipeline_version = ExternalDataJob.PipelineVersion(job.pipeline_version) + pipeline_version = ( + ExternalDataJob.PipelineVersion.V1 + if job.pipeline_version is None + else ExternalDataJob.PipelineVersion(job.pipeline_version) + ) # Temp so we dont create a bunch of orphaned Table objects if using_v2_pipeline: From 7875672e20dfd7319795d937d592cb7e85afa1f8 Mon Sep 17 00:00:00 2001 From: Tom Owers Date: Thu, 5 Dec 2024 15:34:19 +0000 Subject: [PATCH 18/22] Fixed tests --- .../temporal/data_imports/pipelines/test/test_pipeline_sync.py | 2 ++ posthog/temporal/tests/batch_exports/test_import_data.py | 3 +++ 2 
files changed, 5 insertions(+) diff --git a/posthog/temporal/data_imports/pipelines/test/test_pipeline_sync.py b/posthog/temporal/data_imports/pipelines/test/test_pipeline_sync.py index c9060df15e2d0..5b765e35cea14 100644 --- a/posthog/temporal/data_imports/pipelines/test/test_pipeline_sync.py +++ b/posthog/temporal/data_imports/pipelines/test/test_pipeline_sync.py @@ -85,6 +85,7 @@ def _create_pipeline(self, schema_name: str, incremental: bool): is_incremental=incremental, team_id=self.team.pk, job_id=str(job.pk), + db_incremental_field_last_value=0, ), logger=structlog.get_logger(), incremental=incremental, @@ -135,6 +136,7 @@ def mock_create_pipeline(local_self: Any): ) as mock_validate_schema_and_update_table, patch("posthog.temporal.data_imports.pipelines.pipeline_sync.get_delta_tables"), patch("posthog.temporal.data_imports.pipelines.pipeline_sync.update_last_synced_at_sync"), + patch("posthog.temporal.data_imports.pipelines.pipeline_sync.save_last_incremental_value"), override_settings( BUCKET_URL=f"s3://{BUCKET_NAME}", AIRBYTE_BUCKET_KEY=settings.OBJECT_STORAGE_ACCESS_KEY_ID, diff --git a/posthog/temporal/tests/batch_exports/test_import_data.py b/posthog/temporal/tests/batch_exports/test_import_data.py index e5ec26e3e89e9..abf9bb56b094e 100644 --- a/posthog/temporal/tests/batch_exports/test_import_data.py +++ b/posthog/temporal/tests/batch_exports/test_import_data.py @@ -87,6 +87,7 @@ def test_postgres_source_without_ssh_tunnel(activity_environment, team, **kwargs table_names=["table_1"], incremental_field=None, incremental_field_type=None, + db_incremental_field_last_value=None, team_id=team.id, using_ssl=True, ) @@ -128,6 +129,7 @@ def test_postgres_source_with_ssh_tunnel_disabled(activity_environment, team, ** table_names=["table_1"], incremental_field=None, incremental_field_type=None, + db_incremental_field_last_value=None, team_id=team.id, using_ssl=True, ) @@ -187,6 +189,7 @@ def __exit__(self, exc_type, exc_value, exc_traceback): table_names=["table_1"], incremental_field=None, incremental_field_type=None, + db_incremental_field_last_value=None, team_id=team.id, using_ssl=True, ) From 1b983b51e1b6a183e7bc4a968e4a3c94228252fa Mon Sep 17 00:00:00 2001 From: Tom Owers Date: Thu, 5 Dec 2024 16:52:09 +0000 Subject: [PATCH 19/22] Remove rogue file --- .../errorTrackingGroupSceneLogic.ts | 178 ------------------ 1 file changed, 178 deletions(-) delete mode 100644 frontend/src/scenes/error-tracking/errorTrackingGroupSceneLogic.ts diff --git a/frontend/src/scenes/error-tracking/errorTrackingGroupSceneLogic.ts b/frontend/src/scenes/error-tracking/errorTrackingGroupSceneLogic.ts deleted file mode 100644 index 506fcdec0e1e0..0000000000000 --- a/frontend/src/scenes/error-tracking/errorTrackingGroupSceneLogic.ts +++ /dev/null @@ -1,178 +0,0 @@ -import { actions, connect, kea, key, listeners, path, props, reducers, selectors } from 'kea' -import { loaders } from 'kea-loaders' -import { actionToUrl, router, urlToAction } from 'kea-router' -import api from 'lib/api' -import { Dayjs, dayjs } from 'lib/dayjs' -import { Scene } from 'scenes/sceneTypes' -import { urls } from 'scenes/urls' - -import { ErrorTrackingGroup } from '~/queries/schema' -import { Breadcrumb } from '~/types' - -import type { errorTrackingGroupSceneLogicType } from './errorTrackingGroupSceneLogicType' -import { errorTrackingLogic } from './errorTrackingLogic' -import { errorTrackingGroupEventsQuery, errorTrackingGroupQuery } from './queries' - -export interface ErrorTrackingEvent { - uuid: string - timestamp: Dayjs - 
properties: Record - person: { - distinct_id: string - uuid?: string - created_at?: string - properties?: Record - } -} - -export interface ErrorTrackingGroupSceneLogicProps { - fingerprint: ErrorTrackingGroup['fingerprint'] -} - -export enum ErrorGroupTab { - Overview = 'overview', - Breakdowns = 'breakdowns', -} - -export const errorTrackingGroupSceneLogic = kea([ - path((key) => ['scenes', 'error-tracking', 'errorTrackingGroupSceneLogic', key]), - props({} as ErrorTrackingGroupSceneLogicProps), - key((props) => JSON.stringify(props.fingerprint)), - - connect({ - values: [errorTrackingLogic, ['dateRange', 'filterTestAccounts', 'filterGroup', 'hasGroupActions']], - }), - - actions({ - setErrorGroupTab: (tab: ErrorGroupTab) => ({ tab }), - setActiveEventUUID: (uuid: ErrorTrackingEvent['uuid']) => ({ uuid }), - updateGroup: (group: Partial>) => ({ group }), - }), - - reducers(() => ({ - errorGroupTab: [ - ErrorGroupTab.Overview as ErrorGroupTab, - { - setErrorGroupTab: (_, { tab }) => tab, - }, - ], - activeEventUUID: [ - undefined as ErrorTrackingEvent['uuid'] | undefined, - { - setActiveEventUUID: (_, { uuid }) => uuid, - }, - ], - })), - - loaders(({ props, values }) => ({ - group: [ - null as ErrorTrackingGroup | null, - { - loadGroup: async () => { - const response = await api.query( - errorTrackingGroupQuery({ - fingerprint: props.fingerprint, - dateRange: values.dateRange, - filterTestAccounts: values.filterTestAccounts, - filterGroup: values.filterGroup, - }), - {}, - undefined, - true - ) - - // ErrorTrackingQuery returns a list of groups - // when a fingerprint is supplied there will only be a single group - return response.results[0] - }, - updateGroup: async ({ group }) => { - const response = await api.errorTracking.updateIssue(props.fingerprint, group) - return { ...values.group, ...response } - }, - }, - ], - events: [ - [] as ErrorTrackingEvent[], - { - loadEvents: async () => { - const response = await api.query( - errorTrackingGroupEventsQuery({ - select: ['uuid', 'properties', 'timestamp', 'person'], - fingerprints: values.combinedFingerprints, - dateRange: values.dateRange, - filterTestAccounts: values.filterTestAccounts, - filterGroup: values.filterGroup, - offset: values.events.length, - }) - ) - - const newResults = response.results.map((r) => ({ - uuid: r[0], - properties: JSON.parse(r[1]), - timestamp: dayjs(r[2]), - person: r[3], - })) - - return [...values.events, ...newResults] - }, - }, - ], - })), - - listeners(({ values, actions }) => ({ - loadGroupSuccess: () => { - actions.loadEvents() - }, - loadEventsSuccess: () => { - if (!values.activeEventUUID) { - actions.setActiveEventUUID(values.events[0]?.uuid) - } - }, - })), - - selectors({ - breadcrumbs: [ - (s) => [s.group], - (group): Breadcrumb[] => { - const exceptionType = group?.exception_type || 'Unknown Type' - return [ - { - key: Scene.ErrorTracking, - name: 'Error tracking', - path: urls.errorTracking(), - }, - { - key: [Scene.ErrorTrackingGroup, exceptionType], - name: exceptionType, - }, - ] - }, - ], - - combinedFingerprints: [ - (s) => [s.group], - (group): ErrorTrackingGroup['fingerprint'][] => - group ? 
[group.fingerprint, ...group.merged_fingerprints] : [], - ], - }), - - actionToUrl(({ values }) => ({ - setErrorGroupTab: () => { - const searchParams = router.values.searchParams - - if (values.errorGroupTab != ErrorGroupTab.Overview) { - searchParams['tab'] = values.errorGroupTab - } - - return [router.values.location.pathname, searchParams] - }, - })), - - urlToAction(({ actions }) => ({ - [urls.errorTrackingGroup('*')]: (_, searchParams) => { - if (searchParams.tab) { - actions.setErrorGroupTab(searchParams.tab) - } - }, - })), -]) From e7f62070279d587b9a7aa340c40caf3e07c8bbd4 Mon Sep 17 00:00:00 2001 From: Tom Owers Date: Fri, 6 Dec 2024 12:40:15 +0000 Subject: [PATCH 20/22] Fixes --- mypy-baseline.txt | 1 - posthog/migrations/0528_externaldatajob_pipeline_version.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mypy-baseline.txt b/mypy-baseline.txt index 09445b9a9eb87..8b815fbdb5ec5 100644 --- a/mypy-baseline.txt +++ b/mypy-baseline.txt @@ -826,7 +826,6 @@ posthog/temporal/tests/batch_exports/test_snowflake_batch_export_workflow.py:0: posthog/temporal/tests/batch_exports/test_snowflake_batch_export_workflow.py:0: error: Need type annotation for "_execute_async_calls" (hint: "_execute_async_calls: list[] = ...") [var-annotated] posthog/temporal/tests/batch_exports/test_snowflake_batch_export_workflow.py:0: error: Need type annotation for "_cursors" (hint: "_cursors: list[] = ...") [var-annotated] posthog/temporal/tests/batch_exports/test_snowflake_batch_export_workflow.py:0: error: List item 0 has incompatible type "tuple[str, str, int, int, int, int, str, int]"; expected "tuple[str, str, int, int, str, str, str, str]" [list-item] -posthog/temporal/data_imports/pipelines/test/test_pipeline_sync.py:0: error: Missing positional argument "db_incremental_field_last_value" in call to "__call__" of "SourceFactory" [call-arg] posthog/migrations/0237_remove_timezone_from_teams.py:0: error: Argument 2 to "RunPython" has incompatible type "Callable[[Migration, Any], None]"; expected "_CodeCallable | None" [arg-type] posthog/migrations/0228_fix_tile_layouts.py:0: error: Argument 2 to "RunPython" has incompatible type "Callable[[Migration, Any], None]"; expected "_CodeCallable | None" [arg-type] posthog/api/plugin_log_entry.py:0: error: Name "timezone.datetime" is not defined [name-defined] diff --git a/posthog/migrations/0528_externaldatajob_pipeline_version.py b/posthog/migrations/0528_externaldatajob_pipeline_version.py index 0a8cb0606d8ca..c5b8f1630df2d 100644 --- a/posthog/migrations/0528_externaldatajob_pipeline_version.py +++ b/posthog/migrations/0528_externaldatajob_pipeline_version.py @@ -24,6 +24,7 @@ class Migration(migrations.Migration): UPDATE posthog_externaldatajob SET pipeline_version = 'v1-dlt-sync' WHERE pipeline_version is null - """ + """, + reverse_sql=migrations.RunSQL.noop, ), ] From 31f83c356985c62a3ab3adfd4af96f4e21377c85 Mon Sep 17 00:00:00 2001 From: Tom Owers Date: Fri, 6 Dec 2024 13:55:26 +0000 Subject: [PATCH 21/22] Updated mypy --- mypy-baseline.txt | 216 +++++++++++++++++++--------------------------- 1 file changed, 87 insertions(+), 129 deletions(-) diff --git a/mypy-baseline.txt b/mypy-baseline.txt index 78020a95ec1b4..24624c0594402 100644 --- a/mypy-baseline.txt +++ b/mypy-baseline.txt @@ -1,4 +1,67 @@ posthog/warehouse/models/ssh_tunnel.py:0: error: Incompatible types in assignment (expression has type "NoEncryption", variable has type "BestAvailableEncryption") [assignment] 
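The migration hunk above adds reverse_sql=migrations.RunSQL.noop so the pipeline_version backfill no longer makes the migration irreversible. A minimal standalone sketch of that pattern, assuming a hypothetical dependency name; the UPDATE statement mirrors the one in 0528_externaldatajob_pipeline_version.py:

from django.db import migrations


class Migration(migrations.Migration):
    # Hypothetical dependency for illustration; the real migration depends on
    # the preceding posthog migration in the series.
    dependencies = [("posthog", "0527_previous_migration")]

    operations = [
        migrations.RunSQL(
            """
            UPDATE posthog_externaldatajob
            SET pipeline_version = 'v1-dlt-sync'
            WHERE pipeline_version is null
            """,
            # Without reverse_sql, migrating backwards raises IrreversibleError
            # at this operation; noop lets a rollback pass through unchanged.
            reverse_sql=migrations.RunSQL.noop,
        ),
    ]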
+posthog/temporal/data_imports/pipelines/sql_database_v2/schema_types.py:0: error: Statement is unreachable [unreachable] +posthog/temporal/data_imports/pipelines/sql_database_v2/schema_types.py:0: error: Non-overlapping equality check (left operand type: "Literal['text', 'double', 'bool', 'timestamp', 'bigint', 'json', 'decimal', 'wei', 'date', 'time'] | None", right operand type: "Literal['interval']") [comparison-overlap] +posthog/temporal/data_imports/pipelines/sql_database_v2/arrow_helpers.py:0: error: Unused "type: ignore" comment [unused-ignore] +posthog/temporal/data_imports/pipelines/sql_database_v2/arrow_helpers.py:0: error: Invalid index type "str | None" for "dict[str, ndarray[Any, dtype[Any]]]"; expected type "str" [index] +posthog/temporal/data_imports/pipelines/sql_database_v2/arrow_helpers.py:0: error: Invalid index type "str | None" for "dict[str, ndarray[Any, dtype[Any]]]"; expected type "str" [index] +posthog/temporal/data_imports/pipelines/sql_database_v2/arrow_helpers.py:0: error: Invalid index type "str | None" for "dict[str, TColumnSchema]"; expected type "str" [index] +posthog/temporal/data_imports/pipelines/sql_database/helpers.py:0: error: Unused "type: ignore" comment [unused-ignore] +posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Item "None" of "Incremental[Any] | None" has no attribute "row_order" [union-attr] +posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Incompatible types in assignment (expression has type "Literal['asc', 'desc'] | Any | None", variable has type "Literal['asc', 'desc']") [assignment] +posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Incompatible types in assignment (expression has type "None", variable has type "Column[Any]") [assignment] +posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Incompatible types in assignment (expression has type "None", variable has type "Literal['asc', 'desc']") [assignment] +posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Item "None" of "dict[str, Any] | None" has no attribute "get" [union-attr] +posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Unused "type: ignore" comment [unused-ignore] +posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Argument "primary_key" to "make_hints" has incompatible type "list[str] | None"; expected "str | Sequence[str] | Callable[[Any], str | Sequence[str]]" [arg-type] +posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Unused "type: ignore" comment [unused-ignore] +posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Unused "type: ignore" comment [unused-ignore] +posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Unused "type: ignore" comment [unused-ignore] +posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Dict entry 2 has incompatible type "Literal['auto']": "None"; expected "Literal['json_response', 'header_link', 'auto', 'single_page', 'cursor', 'offset', 'page_number']": "type[BasePaginator]" [dict-item] +posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Incompatible types in assignment (expression has type "None", variable has type "AuthConfigBase") [assignment] +posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Argument 1 to "get_auth_class" has incompatible type "Literal['bearer', 'api_key', 'http_basic'] | None"; 
expected "Literal['bearer', 'api_key', 'http_basic']" [arg-type] +posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Need type annotation for "dependency_graph" [var-annotated] +posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Incompatible types in assignment (expression has type "None", target has type "ResolvedParam") [assignment] +posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Incompatible return value type (got "tuple[TopologicalSorter[Any], dict[str, EndpointResource], dict[str, ResolvedParam]]", expected "tuple[Any, dict[str, EndpointResource], dict[str, ResolvedParam | None]]") [return-value] +posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unsupported right operand type for in ("str | Endpoint | None") [operator] +posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Value of type variable "StrOrLiteralStr" of "parse" of "Formatter" cannot be "str | None" [type-var] +posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unsupported right operand type for in ("dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None") [operator] +posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unsupported right operand type for in ("dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None") [operator] +posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Value of type "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None" is not indexable [index] +posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Item "None" of "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None" has no attribute "pop" [union-attr] +posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Value of type "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None" is not indexable [index] +posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Item "None" of "str | None" has no attribute "format" [union-attr] +posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Argument 1 to "single_entity_path" has incompatible type "str | None"; expected "str" [arg-type] +posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Item "None" of "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None" has no attribute "items" [union-attr] +posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Incompatible types in assignment (expression has type "str | None", variable has type "str") [assignment] +posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Incompatible types in assignment (expression has type "str | None", variable has type "str") [assignment] +posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Statement is unreachable [unreachable] +posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unpacked dict entry 0 has incompatible type "dict[str, Any] | None"; expected "SupportsKeysAndGetItem[str, Any]" [dict-item] +posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unpacked dict entry 1 has incompatible type "dict[str, Any] | None"; expected "SupportsKeysAndGetItem[str, Any]" [dict-item] +posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unpacked dict entry 0 has incompatible type 
"dict[str, Any] | None"; expected "SupportsKeysAndGetItem[str, ResolveParamConfig | IncrementalParamConfig | Any]" [dict-item] +posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unpacked dict entry 1 has incompatible type "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None"; expected "SupportsKeysAndGetItem[str, ResolveParamConfig | IncrementalParamConfig | Any]" [dict-item] +posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Not all union combinations were tried because there are too many unions [misc] +posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 2 to "source" has incompatible type "str | None"; expected "str" [arg-type] +posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 3 to "source" has incompatible type "str | None"; expected "str" [arg-type] +posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 4 to "source" has incompatible type "int | None"; expected "int" [arg-type] +posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 6 to "source" has incompatible type "Schema | None"; expected "Schema" [arg-type] +posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 7 to "source" has incompatible type "Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict | None"; expected "Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict" [arg-type] +posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 8 to "source" has incompatible type "type[BaseConfiguration] | None"; expected "type[BaseConfiguration]" [arg-type] +posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 1 to "build_resource_dependency_graph" has incompatible type "EndpointResourceBase | None"; expected "EndpointResourceBase" [arg-type] +posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Incompatible types in assignment (expression has type "list[str] | None", variable has type "list[str]") [assignment] +posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 1 to "setup_incremental_object" has incompatible type "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None"; expected "dict[str, Any]" [arg-type] +posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument "base_url" to "RESTClient" has incompatible type "str | None"; expected "str" [arg-type] +posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 1 to "exclude_keys" has incompatible type "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None"; expected "Mapping[str, Any]" [arg-type] +posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Incompatible default for argument "resolved_param" (default has type "ResolvedParam | None", argument has type "ResolvedParam") [assignment] +posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] +posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] +posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] +posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] 
+posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] +posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] +posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] +posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] +posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] +posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] posthog/utils.py:0: error: No overload variant of "asdict" matches argument type "type[DataclassInstance]" [call-overload] posthog/utils.py:0: note: Possible overload variants: posthog/utils.py:0: note: def asdict(obj: DataclassInstance) -> dict[str, Any] @@ -88,6 +151,7 @@ posthog/hogql_queries/legacy_compatibility/filter_to_query.py:0: error: Argument posthog/hogql_queries/legacy_compatibility/filter_to_query.py:0: error: Dict entry 0 has incompatible type "str": "PathsFilter"; expected "str": "TrendsFilter" [dict-item] posthog/hogql_queries/legacy_compatibility/filter_to_query.py:0: error: Dict entry 0 has incompatible type "str": "LifecycleFilter"; expected "str": "TrendsFilter" [dict-item] posthog/hogql_queries/legacy_compatibility/filter_to_query.py:0: error: Dict entry 0 has incompatible type "str": "StickinessFilter"; expected "str": "TrendsFilter" [dict-item] +posthog/warehouse/models/external_data_schema.py:0: error: Name "update_incremental_field_last_value" already defined on line 0 [no-redef] posthog/session_recordings/models/session_recording.py:0: error: Argument "distinct_id" to "MissingPerson" has incompatible type "str | None"; expected "str" [arg-type] posthog/session_recordings/models/session_recording.py:0: error: Incompatible type for lookup 'persondistinctid__team_id': (got "Team", expected "str | int") [misc] posthog/models/hog_functions/hog_function.py:0: error: Argument 1 to "get" of "dict" has incompatible type "str | None"; expected "str" [arg-type] @@ -225,10 +289,6 @@ posthog/demo/matrix/matrix.py:0: error: Name "timezone.datetime" is not defined posthog/demo/matrix/matrix.py:0: error: Name "timezone.datetime" is not defined [name-defined] posthog/demo/matrix/matrix.py:0: error: Name "timezone.datetime" is not defined [name-defined] posthog/api/shared.py:0: error: Incompatible return value type (got "int | None", expected "Level | None") [return-value] -ee/billing/quota_limiting.py:0: error: Unsupported target for indexed assignment ("object") [index] -ee/billing/quota_limiting.py:0: error: "object" has no attribute "get" [attr-defined] -ee/billing/quota_limiting.py:0: error: Unsupported target for indexed assignment ("object") [index] -ee/billing/quota_limiting.py:0: error: Unsupported target for indexed assignment ("object") [index] posthog/test/base.py:0: error: Module has no attribute "configure" [attr-defined] posthog/test/base.py:0: error: Incompatible types in assignment (expression has type "None", variable has type "Organization") [assignment] posthog/test/base.py:0: error: Incompatible types in assignment (expression has type "None", variable has type "Project") [assignment] @@ -254,8 +314,6 @@ ee/tasks/subscriptions/email_subscriptions.py:0: error: Item "None" of "User | N ee/tasks/subscriptions/email_subscriptions.py:0: error: Item "None" of "datetime | None" 
has no attribute "isoformat" [union-attr] ee/tasks/subscriptions/email_subscriptions.py:0: error: Item "None" of "datetime | None" has no attribute "strftime" [union-attr] ee/tasks/subscriptions/email_subscriptions.py:0: error: Item "None" of "User | None" has no attribute "first_name" [union-attr] -ee/billing/billing_manager.py:0: error: Module has no attribute "utc" [attr-defined] -ee/billing/billing_manager.py:0: error: Incompatible types in assignment (expression has type "object", variable has type "bool | Combinable | None") [assignment] posthog/models/property/util.py:0: error: Incompatible type for lookup 'pk': (got "str | int | list[str]", expected "str | int") [misc] posthog/models/property/util.py:0: error: Argument 3 to "format_filter_query" has incompatible type "HogQLContext | None"; expected "HogQLContext" [arg-type] posthog/models/property/util.py:0: error: Argument 3 to "format_cohort_subquery" has incompatible type "HogQLContext | None"; expected "HogQLContext" [arg-type] @@ -274,15 +332,21 @@ posthog/hogql/property.py:0: error: Incompatible type for lookup 'id': (got "str posthog/hogql/property.py:0: error: Incompatible type for lookup 'pk': (got "str | float", expected "str | int") [misc] posthog/api/utils.py:0: error: Incompatible types in assignment (expression has type "type[EventDefinition]", variable has type "type[EnterpriseEventDefinition]") [assignment] posthog/api/utils.py:0: error: Argument 1 to "UUID" has incompatible type "int | str"; expected "str | None" [arg-type] +ee/billing/quota_limiting.py:0: error: Unsupported target for indexed assignment ("object") [index] +ee/billing/quota_limiting.py:0: error: "object" has no attribute "get" [attr-defined] +ee/billing/quota_limiting.py:0: error: Unsupported target for indexed assignment ("object") [index] +ee/billing/quota_limiting.py:0: error: Unsupported target for indexed assignment ("object") [index] posthog/hogql/filters.py:0: error: Incompatible default for argument "team" (default has type "None", argument has type "Team") [assignment] posthog/hogql/filters.py:0: note: PEP 484 prohibits implicit Optional. 
Accordingly, mypy has changed its default to no_implicit_optional=True posthog/hogql/filters.py:0: note: Use https://github.com/hauntsaninja/no_implicit_optional to automatically upgrade your codebase -posthog/api/capture.py:0: error: Module has no attribute "utc" [attr-defined] +ee/billing/billing_manager.py:0: error: Module has no attribute "utc" [attr-defined] +ee/billing/billing_manager.py:0: error: Incompatible types in assignment (expression has type "object", variable has type "bool | Combinable | None") [assignment] posthog/hogql/query.py:0: error: Incompatible types in assignment (expression has type "None", variable has type "str | SelectQuery | SelectSetQuery") [assignment] posthog/hogql/query.py:0: error: Incompatible types in assignment (expression has type "Expr", variable has type "SelectQuery | SelectSetQuery") [assignment] posthog/hogql/query.py:0: error: Argument 1 to "get_default_limit_for_context" has incompatible type "LimitContext | None"; expected "LimitContext" [arg-type] posthog/hogql/query.py:0: error: Subclass of "SelectQuery" and "SelectSetQuery" cannot exist: would have incompatible method signatures [unreachable] posthog/api/organization.py:0: error: Incompatible return value type (got "int | None", expected "Level | None") [return-value] +posthog/api/capture.py:0: error: Module has no attribute "utc" [attr-defined] posthog/queries/person_query.py:0: error: Incompatible type for lookup 'pk': (got "str | int | list[str]", expected "str | int") [misc] posthog/queries/event_query/event_query.py:0: error: Incompatible type for lookup 'pk': (got "str | int | list[str]", expected "str | int") [misc] posthog/hogql_queries/sessions_timeline_query_runner.py:0: error: Statement is unreachable [unreachable] @@ -407,9 +471,6 @@ posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [TDltResourceImpl: DltResource] resource(None = ..., /, name: str = ..., table_name: str | Callable[[Any], str] = ..., max_table_nesting: int = ..., write_disposition: Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict | Callable[[Any], Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict] = ..., columns: dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel] | Callable[[Any], dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel]] = ..., primary_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., merge_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., schema_contract: Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict | Callable[[Any], Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict] = ..., table_format: Literal['iceberg', 'delta', 'hive'] | Callable[[Any], Literal['iceberg', 'delta', 'hive']] = ..., file_format: Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference'] | Callable[[Any], Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference']] = ..., references: Sequence[TTableReference] | Callable[[Any], Sequence[TTableReference]] = ..., selected: bool = ..., spec: type[BaseConfiguration] = ..., parallelized: bool = ..., _impl_cls: type[TDltResourceImpl] = ...) 
-> Callable[[Callable[TResourceFunParams, Any]], TDltResourceImpl] posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [TDltResourceImpl: DltResource] resource(None = ..., /, name: str | Callable[[Any], str] = ..., table_name: str | Callable[[Any], str] = ..., max_table_nesting: int = ..., write_disposition: Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict | Callable[[Any], Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict] = ..., columns: dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel] | Callable[[Any], dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel]] = ..., primary_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., merge_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., schema_contract: Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict | Callable[[Any], Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict] = ..., table_format: Literal['iceberg', 'delta', 'hive'] | Callable[[Any], Literal['iceberg', 'delta', 'hive']] = ..., file_format: Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference'] | Callable[[Any], Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference']] = ..., references: Sequence[TTableReference] | Callable[[Any], Sequence[TTableReference]] = ..., selected: bool = ..., spec: type[BaseConfiguration] = ..., parallelized: bool = ..., _impl_cls: type[TDltResourceImpl] = ..., standalone: Literal[True] = ...) -> Callable[[Callable[TResourceFunParams, Any]], Callable[TResourceFunParams, TDltResourceImpl]] posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [TDltResourceImpl: DltResource] resource(list[Any] | tuple[Any] | Iterator[Any], /, name: str = ..., table_name: str | Callable[[Any], str] = ..., max_table_nesting: int = ..., write_disposition: Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict | Callable[[Any], Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict] = ..., columns: dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel] | Callable[[Any], dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel]] = ..., primary_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., merge_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., schema_contract: Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict | Callable[[Any], Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict] = ..., table_format: Literal['iceberg', 'delta', 'hive'] | Callable[[Any], Literal['iceberg', 'delta', 'hive']] = ..., file_format: Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference'] | Callable[[Any], Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference']] = ..., references: Sequence[TTableReference] | Callable[[Any], Sequence[TTableReference]] = ..., selected: bool = ..., spec: type[BaseConfiguration] = ..., parallelized: bool = ..., _impl_cls: type[TDltResourceImpl] = ...) 
-> TDltResourceImpl -posthog/temporal/data_imports/pipelines/sql_database_v2/schema_types.py:0: error: Statement is unreachable [unreachable] -posthog/temporal/data_imports/pipelines/sql_database_v2/schema_types.py:0: error: Non-overlapping equality check (left operand type: "Literal['text', 'double', 'bool', 'timestamp', 'bigint', 'json', 'decimal', 'wei', 'date', 'time'] | None", right operand type: "Literal['interval']") [comparison-overlap] -posthog/temporal/data_imports/pipelines/sql_database/helpers.py:0: error: Unused "type: ignore" comment [unused-ignore] posthog/tasks/test/test_update_survey_iteration.py:0: error: Item "None" of "FeatureFlag | None" has no attribute "filters" [union-attr] posthog/tasks/test/test_stop_surveys_reached_target.py:0: error: No overload variant of "__sub__" of "datetime" matches argument type "None" [operator] posthog/tasks/test/test_stop_surveys_reached_target.py:0: note: Possible overload variants: @@ -543,10 +604,22 @@ posthog/warehouse/data_load/validate_schema.py:0: error: Incompatible types in a posthog/warehouse/data_load/validate_schema.py:0: error: Incompatible types in assignment (expression has type "object", variable has type "str | int | Combinable") [assignment] posthog/warehouse/data_load/validate_schema.py:0: error: Incompatible types in assignment (expression has type "dict[str, dict[str, str | bool]] | dict[str, str]", variable has type "dict[str, dict[str, str]]") [assignment] posthog/warehouse/data_load/source_templates.py:0: error: Incompatible types in assignment (expression has type "str", variable has type "Type") [assignment] -posthog/temporal/data_imports/pipelines/sql_database_v2/arrow_helpers.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/sql_database_v2/arrow_helpers.py:0: error: Invalid index type "str | None" for "dict[str, ndarray[Any, dtype[Any]]]"; expected type "str" [index] -posthog/temporal/data_imports/pipelines/sql_database_v2/arrow_helpers.py:0: error: Invalid index type "str | None" for "dict[str, ndarray[Any, dtype[Any]]]"; expected type "str" [index] -posthog/temporal/data_imports/pipelines/sql_database_v2/arrow_helpers.py:0: error: Invalid index type "str | None" for "dict[str, TColumnSchema]"; expected type "str" [index] +posthog/warehouse/api/external_data_schema.py:0: error: Incompatible return value type (got "str | None", expected "SyncType | None") [return-value] +posthog/warehouse/api/external_data_schema.py:0: error: Argument 1 to "get_sql_schemas_for_source_type" has incompatible type "str"; expected "Type" [arg-type] +posthog/warehouse/api/external_data_schema.py:0: error: No overload variant of "get" of "dict" matches argument type "str" [call-overload] +posthog/warehouse/api/external_data_schema.py:0: note: Possible overload variants: +posthog/warehouse/api/external_data_schema.py:0: note: def get(self, Type, /) -> dict[str, list[IncrementalField]] | None +posthog/warehouse/api/external_data_schema.py:0: note: def get(self, Type, dict[str, list[IncrementalField]], /) -> dict[str, list[IncrementalField]] +posthog/warehouse/api/external_data_schema.py:0: note: def [_T] get(self, Type, _T, /) -> dict[str, list[IncrementalField]] | _T +posthog/warehouse/api/table.py:0: error: Unused "type: ignore" comment [unused-ignore] +posthog/warehouse/api/table.py:0: error: Unused "type: ignore" comment [unused-ignore] +posthog/warehouse/api/table.py:0: error: Unused "type: ignore" comment [unused-ignore] 
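Stepping back from the baseline churn for a moment: earlier in this series, test_end_to_end.py starts running every workflow test against both task queues by combining pytest_generate_tests with an indirect, autouse task_queue fixture. A minimal self-contained sketch of that pattern, using illustrative queue names rather than the real constants and yielding the queue instead of wrapping override_settings:

import pytest

QUEUE_V1 = "queue-v1"  # illustrative stand-ins for the real task-queue constants
QUEUE_V2 = "queue-v2"


def pytest_generate_tests(metafunc):
    # Any test that uses the task_queue fixture is generated once per queue;
    # indirect=True routes each value into the fixture via request.param
    # instead of passing it straight to the test function.
    if "task_queue" in metafunc.fixturenames:
        metafunc.parametrize("task_queue", [QUEUE_V1, QUEUE_V2], indirect=True)


@pytest.fixture(autouse=True)
def task_queue(request):
    # request.param is only set when the fixture was parametrized indirectly.
    yield getattr(request, "param", None)


def test_runs_once_per_queue(task_queue):
    assert task_queue in (QUEUE_V1, QUEUE_V2)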
+posthog/temporal/data_imports/workflow_activities/sync_new_schemas.py:0: error: No overload variant of "get" of "dict" matches argument types "str", "tuple[()]" [call-overload]
+posthog/temporal/data_imports/workflow_activities/sync_new_schemas.py:0: note: Possible overload variants:
+posthog/temporal/data_imports/workflow_activities/sync_new_schemas.py:0: note: def get(self, Type, /) -> Sequence[str] | None
+posthog/temporal/data_imports/workflow_activities/sync_new_schemas.py:0: note: def get(self, Type, Sequence[str], /) -> Sequence[str]
+posthog/temporal/data_imports/workflow_activities/sync_new_schemas.py:0: note: def [_T] get(self, Type, _T, /) -> Sequence[str] | _T
+posthog/temporal/data_imports/workflow_activities/sync_new_schemas.py:0: error: Argument "source_id" to "sync_old_schemas_with_new_schemas" has incompatible type "str"; expected "UUID" [arg-type]
 posthog/tasks/exports/test/test_csv_exporter.py:0: error: Function is missing a return type annotation [no-untyped-def]
 posthog/tasks/exports/test/test_csv_exporter.py:0: error: Function is missing a type annotation [no-untyped-def]
 posthog/tasks/exports/test/test_csv_exporter.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
@@ -719,22 +792,6 @@ posthog/temporal/tests/batch_exports/test_batch_exports.py:0: error: TypedDict k
 posthog/temporal/data_modeling/run_workflow.py:0: error: Dict entry 20 has incompatible type "str": "Literal['complex']"; expected "str": "Literal['text', 'double', 'bool', 'timestamp', 'bigint', 'binary', 'json', 'decimal', 'wei', 'date', 'time']" [dict-item]
 posthog/temporal/data_modeling/run_workflow.py:0: error: Dict entry 21 has incompatible type "str": "Literal['complex']"; expected "str": "Literal['text', 'double', 'bool', 'timestamp', 'bigint', 'binary', 'json', 'decimal', 'wei', 'date', 'time']" [dict-item]
 posthog/temporal/data_modeling/run_workflow.py:0: error: Dict entry 22 has incompatible type "str": "Literal['complex']"; expected "str": "Literal['text', 'double', 'bool', 'timestamp', 'bigint', 'binary', 'json', 'decimal', 'wei', 'date', 'time']" [dict-item]
-posthog/temporal/data_imports/workflow_activities/sync_new_schemas.py:0: error: No overload variant of "get" of "dict" matches argument types "str", "tuple[()]" [call-overload]
-posthog/temporal/data_imports/workflow_activities/sync_new_schemas.py:0: note: Possible overload variants:
-posthog/temporal/data_imports/workflow_activities/sync_new_schemas.py:0: note: def get(self, Type, /) -> Sequence[str] | None
-posthog/temporal/data_imports/workflow_activities/sync_new_schemas.py:0: note: def get(self, Type, Sequence[str], /) -> Sequence[str]
-posthog/temporal/data_imports/workflow_activities/sync_new_schemas.py:0: note: def [_T] get(self, Type, _T, /) -> Sequence[str] | _T
-posthog/temporal/data_imports/workflow_activities/sync_new_schemas.py:0: error: Argument "source_id" to "sync_old_schemas_with_new_schemas" has incompatible type "str"; expected "UUID" [arg-type]
-posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Item "None" of "Incremental[Any] | None" has no attribute "row_order" [union-attr]
-posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Incompatible types in assignment (expression has type "Literal['asc', 'desc'] | Any | None", variable has type "Literal['asc', 'desc']") [assignment]
-posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Incompatible types in assignment (expression has type "None", variable has type "Column[Any]") [assignment]
-posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Incompatible types in assignment (expression has type "None", variable has type "Literal['asc', 'desc']") [assignment]
-posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Item "None" of "dict[str, Any] | None" has no attribute "get" [union-attr]
-posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Argument "primary_key" to "make_hints" has incompatible type "list[str] | None"; expected "str | Sequence[str] | Callable[[Any], str | Sequence[str]]" [arg-type]
-posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Unused "type: ignore" comment [unused-ignore]
 posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: "FilesystemDestinationClientConfiguration" has no attribute "delta_jobs_per_write" [attr-defined]
 posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: "type[FilesystemDestinationClientConfiguration]" has no attribute "delta_jobs_per_write" [attr-defined]
 posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: Incompatible types in assignment (expression has type "object", variable has type "DataWarehouseCredential | Combinable | None") [assignment]
@@ -769,23 +826,6 @@ posthog/temporal/tests/batch_exports/test_snowflake_batch_export_workflow.py:0:
 posthog/temporal/tests/batch_exports/test_snowflake_batch_export_workflow.py:0: error: Need type annotation for "_execute_async_calls" (hint: "_execute_async_calls: list[<type>] = ...") [var-annotated]
 posthog/temporal/tests/batch_exports/test_snowflake_batch_export_workflow.py:0: error: Need type annotation for "_cursors" (hint: "_cursors: list[<type>] = ...") [var-annotated]
 posthog/temporal/tests/batch_exports/test_snowflake_batch_export_workflow.py:0: error: List item 0 has incompatible type "tuple[str, str, int, int, int, int, str, int]"; expected "tuple[str, str, int, int, str, str, str, str]" [list-item]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: error: No overload variant of "with_only_columns" of "Select" matches argument type "ReadOnlyColumnCollection[str, Column[Any]]" [call-overload]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: Possible overload variants:
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], /) -> Select[tuple[_T0]]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0, _T1] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], TypedColumnsClauseRole[_T1] | SQLCoreOperations[_T1] | type[_T1], /) -> Select[tuple[_T0, _T1]]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0, _T1, _T2] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], TypedColumnsClauseRole[_T1] | SQLCoreOperations[_T1] | type[_T1], TypedColumnsClauseRole[_T2] | SQLCoreOperations[_T2] | type[_T2], /) -> Select[tuple[_T0, _T1, _T2]]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0, _T1, _T2, _T3] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], TypedColumnsClauseRole[_T1] | SQLCoreOperations[_T1] | type[_T1], TypedColumnsClauseRole[_T2] | SQLCoreOperations[_T2] | type[_T2], TypedColumnsClauseRole[_T3] | SQLCoreOperations[_T3] | type[_T3], /) -> Select[tuple[_T0, _T1, _T2, _T3]]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0, _T1, _T2, _T3, _T4] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], TypedColumnsClauseRole[_T1] | SQLCoreOperations[_T1] | type[_T1], TypedColumnsClauseRole[_T2] | SQLCoreOperations[_T2] | type[_T2], TypedColumnsClauseRole[_T3] | SQLCoreOperations[_T3] | type[_T3], TypedColumnsClauseRole[_T4] | SQLCoreOperations[_T4] | type[_T4], /) -> Select[tuple[_T0, _T1, _T2, _T3, _T4]]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0, _T1, _T2, _T3, _T4, _T5] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], TypedColumnsClauseRole[_T1] | SQLCoreOperations[_T1] | type[_T1], TypedColumnsClauseRole[_T2] | SQLCoreOperations[_T2] | type[_T2], TypedColumnsClauseRole[_T3] | SQLCoreOperations[_T3] | type[_T3], TypedColumnsClauseRole[_T4] | SQLCoreOperations[_T4] | type[_T4], TypedColumnsClauseRole[_T5] | SQLCoreOperations[_T5] | type[_T5], /) -> Select[tuple[_T0, _T1, _T2, _T3, _T4, _T5]]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0, _T1, _T2, _T3, _T4, _T5, _T6] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], TypedColumnsClauseRole[_T1] | SQLCoreOperations[_T1] | type[_T1], TypedColumnsClauseRole[_T2] | SQLCoreOperations[_T2] | type[_T2], TypedColumnsClauseRole[_T3] | SQLCoreOperations[_T3] | type[_T3], TypedColumnsClauseRole[_T4] | SQLCoreOperations[_T4] | type[_T4], TypedColumnsClauseRole[_T5] | SQLCoreOperations[_T5] | type[_T5], TypedColumnsClauseRole[_T6] | SQLCoreOperations[_T6] | type[_T6], /) -> Select[tuple[_T0, _T1, _T2, _T3, _T4, _T5, _T6]]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0, _T1, _T2, _T3, _T4, _T5, _T6, _T7] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], TypedColumnsClauseRole[_T1] | SQLCoreOperations[_T1] | type[_T1], TypedColumnsClauseRole[_T2] | SQLCoreOperations[_T2] | type[_T2], TypedColumnsClauseRole[_T3] | SQLCoreOperations[_T3] | type[_T3], TypedColumnsClauseRole[_T4] | SQLCoreOperations[_T4] | type[_T4], TypedColumnsClauseRole[_T5] | SQLCoreOperations[_T5] | type[_T5], TypedColumnsClauseRole[_T6] | SQLCoreOperations[_T6] | type[_T6], TypedColumnsClauseRole[_T7] | SQLCoreOperations[_T7] | type[_T7], /) -> Select[tuple[_T0, _T1, _T2, _T3, _T4, _T5, _T6, _T7]]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def with_only_columns(self, *entities: TypedColumnsClauseRole[Any] | ColumnsClauseRole | SQLCoreOperations[Any] | Literal['*', 1] | type[Any] | Inspectable[_HasClauseElement[Any]] | _HasClauseElement[Any], maintain_column_froms: bool = ..., **Any) -> Select[Any]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: error: No overload variant of "resource" matches argument types "Callable[[Engine, Table, int, Literal['sqlalchemy', 'pyarrow', 'pandas', 'connectorx'], Incremental[Any] | None, bool, Callable[[Table], None] | None, Literal['minimal', 'full', 'full_with_precision'], dict[str, Any] | None, Callable[[TypeEngine[Any]], TypeEngine[Any] | type[TypeEngine[Any]] | None] | None, list[str] | None, Callable[[Select[Any], Table], Select[Any]] | None, list[str] | None], Iterator[Any]]", "str", "list[str] | None", "list[str] | None", "dict[str, TColumnSchema]", "Collection[str]", "str" [call-overload]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: Possible overload variants:
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [TResourceFunParams`-1, TDltResourceImpl: DltResource] resource(Callable[TResourceFunParams, Any], /, name: str = ..., table_name: str | Callable[[Any], str] = ..., max_table_nesting: int = ..., write_disposition: Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict | Callable[[Any], Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict] = ..., columns: dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel] | Callable[[Any], dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel]] = ..., primary_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., merge_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., schema_contract: Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict | Callable[[Any], Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict] = ..., table_format: Literal['iceberg', 'delta', 'hive'] | Callable[[Any], Literal['iceberg', 'delta', 'hive']] = ..., file_format: Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference'] | Callable[[Any], Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference']] = ..., references: Sequence[TTableReference] | Callable[[Any], Sequence[TTableReference]] = ..., selected: bool = ..., spec: type[BaseConfiguration] = ..., parallelized: bool = ..., _impl_cls: type[TDltResourceImpl] = ...) -> TDltResourceImpl
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [TDltResourceImpl: DltResource] resource(None = ..., /, name: str = ..., table_name: str | Callable[[Any], str] = ..., max_table_nesting: int = ..., write_disposition: Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict | Callable[[Any], Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict] = ..., columns: dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel] | Callable[[Any], dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel]] = ..., primary_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., merge_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., schema_contract: Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict | Callable[[Any], Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict] = ..., table_format: Literal['iceberg', 'delta', 'hive'] | Callable[[Any], Literal['iceberg', 'delta', 'hive']] = ..., file_format: Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference'] | Callable[[Any], Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference']] = ..., references: Sequence[TTableReference] | Callable[[Any], Sequence[TTableReference]] = ..., selected: bool = ..., spec: type[BaseConfiguration] = ..., parallelized: bool = ..., _impl_cls: type[TDltResourceImpl] = ...) -> Callable[[Callable[TResourceFunParams, Any]], TDltResourceImpl]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [TDltResourceImpl: DltResource] resource(None = ..., /, name: str | Callable[[Any], str] = ..., table_name: str | Callable[[Any], str] = ..., max_table_nesting: int = ..., write_disposition: Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict | Callable[[Any], Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict] = ..., columns: dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel] | Callable[[Any], dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel]] = ..., primary_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., merge_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., schema_contract: Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict | Callable[[Any], Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict] = ..., table_format: Literal['iceberg', 'delta', 'hive'] | Callable[[Any], Literal['iceberg', 'delta', 'hive']] = ..., file_format: Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference'] | Callable[[Any], Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference']] = ..., references: Sequence[TTableReference] | Callable[[Any], Sequence[TTableReference]] = ..., selected: bool = ..., spec: type[BaseConfiguration] = ..., parallelized: bool = ..., _impl_cls: type[TDltResourceImpl] = ..., standalone: Literal[True] = ...) -> Callable[[Callable[TResourceFunParams, Any]], Callable[TResourceFunParams, TDltResourceImpl]]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [TDltResourceImpl: DltResource] resource(list[Any] | tuple[Any] | Iterator[Any], /, name: str = ..., table_name: str | Callable[[Any], str] = ..., max_table_nesting: int = ..., write_disposition: Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict | Callable[[Any], Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict] = ..., columns: dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel] | Callable[[Any], dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel]] = ..., primary_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., merge_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., schema_contract: Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict | Callable[[Any], Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict] = ..., table_format: Literal['iceberg', 'delta', 'hive'] | Callable[[Any], Literal['iceberg', 'delta', 'hive']] = ..., file_format: Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference'] | Callable[[Any], Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference']] = ..., references: Sequence[TTableReference] | Callable[[Any], Sequence[TTableReference]] = ..., selected: bool = ..., spec: type[BaseConfiguration] = ..., parallelized: bool = ..., _impl_cls: type[TDltResourceImpl] = ...) -> TDltResourceImpl
 posthog/migrations/0237_remove_timezone_from_teams.py:0: error: Argument 2 to "RunPython" has incompatible type "Callable[[Migration, Any], None]"; expected "_CodeCallable | None" [arg-type]
 posthog/migrations/0228_fix_tile_layouts.py:0: error: Argument 2 to "RunPython" has incompatible type "Callable[[Migration, Any], None]"; expected "_CodeCallable | None" [arg-type]
 posthog/api/plugin_log_entry.py:0: error: Name "timezone.datetime" is not defined [name-defined]
@@ -795,29 +835,6 @@ posthog/api/plugin_log_entry.py:0: error: Module "django.utils.timezone" does no
 posthog/temporal/tests/batch_exports/test_redshift_batch_export_workflow.py:0: error: Incompatible types in assignment (expression has type "str | int", variable has type "int") [assignment]
 posthog/api/sharing.py:0: error: Item "None" of "list[Any] | None" has no attribute "__iter__" (not iterable) [union-attr]
 posthog/temporal/data_imports/external_data_job.py:0: error: Argument "status" to "update_external_job_status" has incompatible type "str"; expected "Status" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Dict entry 2 has incompatible type "Literal['auto']": "None"; expected "Literal['json_response', 'header_link', 'auto', 'single_page', 'cursor', 'offset', 'page_number']": "type[BasePaginator]" [dict-item]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Incompatible types in assignment (expression has type "None", variable has type "AuthConfigBase") [assignment]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Argument 1 to "get_auth_class" has incompatible type "Literal['bearer', 'api_key', 'http_basic'] | None"; expected "Literal['bearer', 'api_key', 'http_basic']" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Need type annotation for "dependency_graph" [var-annotated]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Incompatible types in assignment (expression has type "None", target has type "ResolvedParam") [assignment]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Incompatible return value type (got "tuple[TopologicalSorter[Any], dict[str, EndpointResource], dict[str, ResolvedParam]]", expected "tuple[Any, dict[str, EndpointResource], dict[str, ResolvedParam | None]]") [return-value]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unsupported right operand type for in ("str | Endpoint | None") [operator]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Value of type variable "StrOrLiteralStr" of "parse" of "Formatter" cannot be "str | None" [type-var]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unsupported right operand type for in ("dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None") [operator]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unsupported right operand type for in ("dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None") [operator]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Value of type "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None" is not indexable [index]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Item "None" of "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None" has no attribute "pop" [union-attr]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Value of type "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None" is not indexable [index]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Item "None" of "str | None" has no attribute "format" [union-attr]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Argument 1 to "single_entity_path" has incompatible type "str | None"; expected "str" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Item "None" of "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None" has no attribute "items" [union-attr]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Incompatible types in assignment (expression has type "str | None", variable has type "str") [assignment]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Incompatible types in assignment (expression has type "str | None", variable has type "str") [assignment]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Statement is unreachable [unreachable]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unpacked dict entry 0 has incompatible type "dict[str, Any] | None"; expected "SupportsKeysAndGetItem[str, Any]" [dict-item]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unpacked dict entry 1 has incompatible type "dict[str, Any] | None"; expected "SupportsKeysAndGetItem[str, Any]" [dict-item]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unpacked dict entry 0 has incompatible type "dict[str, Any] | None"; expected "SupportsKeysAndGetItem[str, ResolveParamConfig | IncrementalParamConfig | Any]" [dict-item]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unpacked dict entry 1 has incompatible type "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None"; expected "SupportsKeysAndGetItem[str, ResolveParamConfig | IncrementalParamConfig | Any]" [dict-item]
 posthog/api/test/batch_exports/conftest.py:0: error: Signature of "run" incompatible with supertype "Worker" [override]
 posthog/api/test/batch_exports/conftest.py:0: note: Superclass:
 posthog/api/test/batch_exports/conftest.py:0: note: def run(self) -> Coroutine[Any, Any, None]
@@ -832,49 +849,6 @@ posthog/temporal/tests/external_data/test_external_data_job.py:0: error: Invalid
 posthog/temporal/tests/external_data/test_external_data_job.py:0: error: Invalid index type "str" for "dict[Type, Sequence[str]]"; expected type "Type" [index]
 posthog/temporal/tests/external_data/test_external_data_job.py:0: error: Invalid index type "str" for "dict[Type, Sequence[str]]"; expected type "Type" [index]
 posthog/temporal/tests/data_imports/test_end_to_end.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Not all union combinations were tried because there are too many unions [misc]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 2 to "source" has incompatible type "str | None"; expected "str" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 3 to "source" has incompatible type "str | None"; expected "str" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 4 to "source" has incompatible type "int | None"; expected "int" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 6 to "source" has incompatible type "Schema | None"; expected "Schema" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 7 to "source" has incompatible type "Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict | None"; expected "Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 8 to "source" has incompatible type "type[BaseConfiguration] | None"; expected "type[BaseConfiguration]" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 1 to "build_resource_dependency_graph" has incompatible type "EndpointResourceBase | None"; expected "EndpointResourceBase" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Incompatible types in assignment (expression has type "list[str] | None", variable has type "list[str]") [assignment]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 1 to "setup_incremental_object" has incompatible type "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None"; expected "dict[str, Any]" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument "base_url" to "RESTClient" has incompatible type "str | None"; expected "str" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 1 to "exclude_keys" has incompatible type "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None"; expected "Mapping[str, Any]" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Incompatible default for argument "resolved_param" (default has type "ResolvedParam | None", argument has type "ResolvedParam") [assignment]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/api/test/test_team.py:0: error: "HttpResponse" has no attribute "json" [attr-defined]
-posthog/api/test/test_team.py:0: error: "HttpResponse" has no attribute "json" [attr-defined]
-posthog/test/test_middleware.py:0: error: Incompatible types in assignment (expression has type "_MonkeyPatchedWSGIResponse", variable has type "_MonkeyPatchedResponse") [assignment]
-posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
 posthog/management/commands/test/test_create_batch_export_from_app.py:0: error: Incompatible return value type (got "dict[str, Collection[str]]", expected "dict[str, str]") [return-value]
 posthog/management/commands/test/test_create_batch_export_from_app.py:0: error: Incompatible types in assignment (expression has type "dict[str, Collection[str]]", variable has type "dict[str, str]") [assignment]
 posthog/management/commands/test/test_create_batch_export_from_app.py:0: error: Unpacked dict entry 1 has incompatible type "str"; expected "SupportsKeysAndGetItem[str, str]" [dict-item]
@@ -916,22 +890,6 @@ posthog/api/test/batch_exports/test_update.py:0: error: Value of type "BatchExpo
 posthog/api/test/batch_exports/test_update.py:0: error: Value of type "BatchExport" is not indexable [index]
 posthog/api/test/batch_exports/test_update.py:0: error: Value of type "BatchExport" is not indexable [index]
 posthog/api/test/batch_exports/test_pause.py:0: error: "batch_export_delete_schedule" does not return a value (it only ever returns None) [func-returns-value]
-posthog/warehouse/api/external_data_schema.py:0: error: Incompatible return value type (got "str | None", expected "SyncType | None") [return-value]
-posthog/warehouse/api/external_data_schema.py:0: error: Argument 1 to "get_sql_schemas_for_source_type" has incompatible type "str"; expected "Type" [arg-type]
-posthog/warehouse/api/external_data_schema.py:0: error: No overload variant of "get" of "dict" matches argument type "str" [call-overload]
-posthog/warehouse/api/external_data_schema.py:0: note: Possible overload variants:
-posthog/warehouse/api/external_data_schema.py:0: note: def get(self, Type, /) -> dict[str, list[IncrementalField]] | None
-posthog/warehouse/api/external_data_schema.py:0: note: def get(self, Type, dict[str, list[IncrementalField]], /) -> dict[str, list[IncrementalField]]
-posthog/warehouse/api/external_data_schema.py:0: note: def [_T] get(self, Type, _T, /) -> dict[str, list[IncrementalField]] | _T
-posthog/warehouse/api/table.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/warehouse/api/table.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/warehouse/api/table.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/external_data_job.py:0: error: Argument "status" to "update_external_job_status" has incompatible type "str"; expected "Status" [arg-type]
-posthog/temporal/tests/external_data/test_external_data_job.py:0: error: Invalid index type "str" for "dict[Type, Sequence[str]]"; expected type "Type" [index]
-posthog/temporal/tests/external_data/test_external_data_job.py:0: error: Invalid index type "str" for "dict[Type, Sequence[str]]"; expected type "Type" [index]
-posthog/temporal/tests/external_data/test_external_data_job.py:0: error: Invalid index type "str" for "dict[Type, Sequence[str]]"; expected type "Type" [index]
-posthog/temporal/tests/external_data/test_external_data_job.py:0: error: Invalid index type "str" for "dict[Type, Sequence[str]]"; expected type "Type" [index]
-posthog/temporal/tests/data_imports/test_end_to_end.py:0: error: Unused "type: ignore" comment [unused-ignore]
 posthog/api/query.py:0: error: Statement is unreachable [unreachable]
 posthog/api/test/test_capture.py:0: error: Statement is unreachable [unreachable]
 posthog/api/test/test_capture.py:0: error: Incompatible return value type (got "_MonkeyPatchedWSGIResponse", expected "HttpResponse") [return-value]

From ab4bb03c63f9d1f0422ff16da6f5d7684ccba7e0 Mon Sep 17 00:00:00 2001
From: Tom Owers
Date: Tue, 10 Dec 2024 12:58:42 +0000
Subject: [PATCH 22/22] Remove double defined func

---
 posthog/warehouse/models/external_data_schema.py | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/posthog/warehouse/models/external_data_schema.py b/posthog/warehouse/models/external_data_schema.py
index b9629c6410672..361d3c8413066 100644
--- a/posthog/warehouse/models/external_data_schema.py
+++ b/posthog/warehouse/models/external_data_schema.py
@@ -99,22 +99,6 @@ def soft_delete(self):
         self.deleted_at = datetime.now()
         self.save()
 
-    def update_incremental_field_last_value(self, last_value: Any) -> None:
-        incremental_field_type = self.sync_type_config.get("incremental_field_type")
-
-        last_value_py = last_value.item() if isinstance(last_value, numpy.generic) else last_value
-
-        if (
-            incremental_field_type == IncrementalFieldType.Integer
-            or incremental_field_type == IncrementalFieldType.Numeric
-        ):
-            last_value_json = last_value_py
-        else:
-            last_value_json = str(last_value_py)
-
-        self.sync_type_config["incremental_field_last_value"] = last_value_json
-        self.save()
-
 
 @database_sync_to_async
 def asave_external_data_schema(schema: ExternalDataSchema) -> None: