Merge dev branch: a number of improvements to the UI and algorithm
dyang415 committed Sep 19, 2023
1 parent 35a9376 commit 083e1f5
Showing 33 changed files with 1,300 additions and 1,012 deletions.
2 changes: 2 additions & 0 deletions backend/app/common/errors.py
@@ -0,0 +1,2 @@
class EmptyDataFrameError(Exception):
    pass
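
Note: the new exception carries no payload. Presumably DFBasedInsightBuilder raises it when a filtered dataset comes back empty; api.py (further down in this commit) maps it to an EMPTY_DATASET 400 response. A minimal sketch of that contract, using a hypothetical guard helper that is not part of this diff:

import polars as pl

from app.common.errors import EmptyDataFrameError

def ensure_non_empty(df: pl.DataFrame) -> pl.DataFrame:
    # Hypothetical guard (assumption, not in this commit): raise the new error
    # when a filter/date-range combination leaves no rows to analyze.
    if df.height == 0:
        raise EmptyDataFrameError()
    return df
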
1 change: 1 addition & 0 deletions backend/app/data_source/bigquery/bigquery_source.py
@@ -42,6 +42,7 @@ def get_schema(self, full_name: str) -> BigquerySchema:
                description=field.description,
                type=field.field_type,
                mode=field.mode,
                values=[],
                numDistinctValues=num_distinct_value_by_field[field.name]
                if (field.mode != "REPEATED" and field.field_type != "RECORD")
                else 0
12 changes: 10 additions & 2 deletions backend/app/data_source/file/file_source.py
@@ -37,6 +37,12 @@ def load_schema(self) -> FileSchema:
            [pl.col(column).n_unique() for column in df.columns]
        ).row(0, named=True)

        column_values_df = pl.concat(
            [df.lazy().select(pl.col(column).unique().limit(500).cast(pl.Utf8).alias("values")).with_columns(pl.lit(column).alias("column"))
             for column, num_distinct_values in column_to_num_distinct_values.items()]) \
            .groupby("column").agg(pl.col("values").explode()).collect()
        column_to_values = {row['column']: row['values'] for row in column_values_df.rows(named=True)}

logger.info("Calculating total rows")
count = df.select(pl.col(df.columns[0]).count()).row(0)[0]

@@ -52,7 +58,8 @@ def load_schema(self) -> FileSchema:
description="",
type=data_type,
mode="NULLABLE",
numDistinctValues=num_distinct_values
numDistinctValues=num_distinct_values,
values=column_to_values[column]
))
else:
df = df.with_columns(pl.col(column).cast(pl.Date))
@@ -70,7 +77,8 @@ def load_schema(self) -> FileSchema:
                    numDistinctValues=num_distinct_values,
                    minDate=min_date,
                    maxDate=max_date,
                    numRowsByDate={row[column]: row["count"] for row in num_rows_by_date_df.rows(named=True)},
                    values=column_to_values[column]
                ))
        return FileSchema(
            name=self.file_name,
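The pl.concat(...).groupby(...) chain added above is dense; here is a standalone sketch of the same idea on a toy frame (column names hypothetical), collecting up to 500 distinct values per column in one pass:

import polars as pl

# Toy stand-in for the uploaded CSV.
df = pl.DataFrame({
    "country": ["US", "US", "DE", "FR"],
    "plan": ["free", "pro", "pro", "free"],
})

# One lazy frame per column: its distinct values (capped at 500) cast to Utf8,
# tagged with the column name so the frames can be stacked.
frames = [
    df.lazy()
    .select(pl.col(c).unique().limit(500).cast(pl.Utf8).alias("values"))
    .with_columns(pl.lit(c).alias("column"))
    for c in df.columns
]

# Stack, then collapse back to one list of values per column.
# (polars renamed groupby to group_by in 0.19; the diff uses the old spelling.)
values_df = pl.concat(frames).groupby("column").agg(pl.col("values")).collect()
column_to_values = {row["column"]: row["values"] for row in values_df.rows(named=True)}
print(column_to_values)  # {'country': [...], 'plan': [...]}, value order unspecified
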
2 changes: 2 additions & 0 deletions backend/app/data_source/models.py
@@ -1,5 +1,6 @@
import datetime
from dataclasses import dataclass
from datetime import date
from typing import Union, Optional

FieldType = Union['DATE', 'TIMESTAMP', 'VARCHAR', 'FLOAT', 'INTEGER', 'BOOLEAN']
@@ -13,6 +14,7 @@ class Field:
    type: FieldType
    mode: FieldMode
    numDistinctValues: int
    values: list[Union[int, float, str, date]]


@dataclass(frozen=True)
7 changes: 6 additions & 1 deletion backend/app/index_view.py
@@ -32,7 +32,12 @@ def render_index():
"settings": SettingsService(current_app.config).build_settings()
}

return render_template("index.html", bundle_imports=bundle_imports, server_data=orjson.dumps(server_data).decode("utf-8"))
return render_template(
"index.html",
enable_telemtry=server_data['settings'].enableTelemetry,
bundle_imports=bundle_imports,
server_data=orjson.dumps(server_data).decode("utf-8")
)

    @expose('/')
    @expose('/dashboard')
67 changes: 41 additions & 26 deletions backend/app/insight/api.py
@@ -7,9 +7,11 @@
from loguru import logger
from orjson import orjson

from app.common.errors import EmptyDataFrameError
from app.common.request_utils import build_error_response
from app.insight.datasource.bqMetrics import BqMetrics
from app.insight.services.insight_builders import DFBasedInsightBuilder
from app.insight.services.metrics import AggregateMethod, SingleColumnMetric, DualColumnMetric, CombineMethod, DimensionValuePair
from app.insight.services.metrics import AggregateMethod, SingleColumnMetric, DualColumnMetric, CombineMethod, DimensionValuePair, Filter
from app.insight.services.segment_insight_builder import get_related_segments, get_segment_insight


@@ -34,14 +36,18 @@ def parse_date_info(data):

        return baseline_start, baseline_end, comparison_start, comparison_end, date_column, date_column_type

    @staticmethod
    def parse_filters(data):
        return [Filter(**filter) for filter in data['filters']]

    @staticmethod
    def parse_data(data):
        baseline_start, baseline_end, comparison_start, comparison_end, date_column, date_column_type = InsightApi.parse_date_info(data)
        expected_value = 0
        group_by_columns = data['groupByColumns']
        filters = InsightApi.parse_filters(data)

        return (
            baseline_start, baseline_end, comparison_start, comparison_end, date_column, date_column_type, group_by_columns, expected_value
            baseline_start, baseline_end, comparison_start, comparison_end, date_column, date_column_type, group_by_columns, filters
        )

    @staticmethod
@@ -103,8 +109,11 @@ def parse_metrics(metric_column):
    def get_bq_insight(self):
        data = request.get_json()
        table_name = data['tableName']
        expected_value = data['expectedValue']

        (baselineStart, baselineEnd, comparisonStart, comparisonEnd, date_column, date_column_type, group_by_columns, expected_value) = self.parse_data(data)
        (
            baselineStart, baselineEnd, comparisonStart, comparisonEnd, date_column, date_column_type, group_by_columns, filters
        ) = self.parse_data(data)

        metric = self.parse_metrics(data['metricColumn'])

@@ -122,10 +131,8 @@ def get_bq_insight(self):
    @expose('file/segment', methods=['POST'])
    def get_segment_insight(self):
        data = request.get_json()

        file_id = data['fileId']

        (baselineStart, baselineEnd, comparisonStart, comparisonEnd, date_column, date_column_type, group_by_columns, expected_value) = self.parse_data(data)
        (baselineStart, baselineEnd, comparisonStart, comparisonEnd, date_column, date_column_type, group_by_columns, filters) = self.parse_data(data)

        metric = self.parse_metrics(data['metricColumn'])
        segment_key = data['segmentKey']
@@ -144,7 +151,8 @@ def get_segment_insight(self):
                date_column,
                (baselineStart, baselineEnd),
                (comparisonStart, comparisonEnd),
                [metric]
                [metric],
                filters
            )
        )

@@ -155,6 +163,7 @@ def get_related_segments(self):
        (baseline_start, baseline_end, comparison_start, comparison_end, date_column, date_column_type) = self.parse_date_info(data)
        metric_column = data['metricColumn']
        metric = self.parse_metrics(metric_column)
        filters = self.parse_filters(data)

        file_id = data['fileId']
        logger.info('Reading file')
@@ -167,33 +176,39 @@ def get_related_segments(self):
                (baseline_start, baseline_end),
                (comparison_start, comparison_end),
                [DimensionValuePair(key_component['dimension'], key_component['value']) for key_component in data['segmentKey']],
                metric
                metric,
                filters
            )
        )

    @expose('file/metric', methods=['POST'])
    def get_insight(self):
        data = request.get_json()
        file_id = data['fileId']

        (baselineStart, baselineEnd, comparisonStart, comparisonEnd, date_column, date_column_type, group_by_columns, expected_value) = self.parse_data(data)
        expected_value = data['expectedValue']
        (baselineStart, baselineEnd, comparisonStart, comparisonEnd, date_column, date_column_type, group_by_columns, filters) = self.parse_data(data)

        metric_column = data['metricColumn']
        metric = self.parse_metrics(metric_column)

        logger.info('Reading file')
        df = pl.read_csv(f'/tmp/dsensei/{file_id}') \
            .with_columns(pl.col(date_column).str.slice(0, 10).str.to_date().alias("date"))
        try:
            logger.info('Reading file')
            df = pl.read_csv(f'/tmp/dsensei/{file_id}') \
                .with_columns(pl.col(date_column).str.slice(0, 10).str.to_date().alias("date"))

            logger.info('File loaded')

            insight_builder = DFBasedInsightBuilder(
                df,
                (baselineStart, baselineEnd),
                (comparisonStart, comparisonEnd),
                group_by_columns,
                [metric],
                expected_value
            )

            return insight_builder.build()
            logger.info('File loaded')
            insight_builder = DFBasedInsightBuilder(
                df,
                (baselineStart, baselineEnd),
                (comparisonStart, comparisonEnd),
                group_by_columns,
                [metric],
                expected_value,
                filters
            )
            return insight_builder.build()
        except EmptyDataFrameError:
            return build_error_response("EMPTY_DATASET"), 400
        except Exception as e:
            logger.exception(e)
            return build_error_response(str(e)), 500
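
For reference, a hypothetical call against the reworked file/metric endpoint. Only the keys visible in this diff (fileId, metricColumn, groupByColumns, expectedValue, filters) are confirmed; the URL prefix and port, the date-range keys consumed by parse_date_info, the metricColumn shape, and the Filter field names are all assumptions:

import requests

payload = {
    "fileId": "abc123",  # CSV previously uploaded to /tmp/dsensei/abc123
    "metricColumn": {"columnName": "revenue", "aggregationOption": "sum"},  # shape assumed
    "groupByColumns": ["country", "plan"],
    "expectedValue": 0.05,
    # Each dict is expanded via Filter(**filter); these field names are guesses.
    "filters": [{"column": "country", "values": ["US", "DE"]}],
    # ...plus the baseline/comparison date-range keys read by parse_date_info...
}
resp = requests.post("http://localhost:5001/api/insight/file/metric", json=payload)
print(resp.status_code)  # 400 with EMPTY_DATASET when the filters leave no rows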