Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/fix enrichment new catalogue #1083

Merged
merged 27 commits into from
Oct 23, 2019
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
81a8491
Get geographies from metadata
Oct 9, 2019
9c822d7
Merge branch 'feature/1038-nested-filters' of github.com:CartoDB/cart…
Oct 9, 2019
825bba1
Merge branch 'feature/1038-nested-filters' of github.com:CartoDB/cart…
Oct 9, 2019
acaea5a
Add external table to query
Oct 9, 2019
8bb6da9
Add alias in variables to enrich
Oct 9, 2019
9e6491a
Revert "Add alias in variables to enrich"
Oct 9, 2019
2137acb
Multiple fixes
Oct 9, 2019
1a30f24
Merge branch 'develop' of github.com:CartoDB/cartoframes into feature…
Oct 9, 2019
55f392d
Import catalog class well
Oct 9, 2019
782ea31
Merge branch 'develop' of github.com:CartoDB/cartoframes into feature…
Oct 15, 2019
c127691
Fix test
Oct 15, 2019
ab9c05e
Fix test
Oct 15, 2019
17f79a5
Fix test
Oct 15, 2019
3796850
Fix test
Oct 15, 2019
f5c71bc
Fix test
Oct 15, 2019
ab02cdd
Fix test
Oct 15, 2019
58df0b5
Remove unused module
Oct 15, 2019
f7d54d0
Rename catalog dataset
Oct 15, 2019
b7da8ba
Merge branch 'develop' of github.com:CartoDB/cartoframes into feature…
Oct 21, 2019
32b1545
Use variable get_list feature instead of making one request for every va…
Oct 21, 2019
23e50ea
Fix bugs regarding polygons enrichment and enhancements in points enri…
Oct 22, 2019
fec43d9
Extract imports in decode_geometry
Jesus89 Oct 23, 2019
b645461
Add information about DO 2.0
elenatorro Oct 23, 2019
752847d
Rearrange args
elenatorro Oct 23, 2019
57aee4b
Use CustomJSONDecoder to convert a DataFrame
Jesus89 Oct 23, 2019
41372bf
Improve legend exceptions
Jesus89 Oct 23, 2019
3155cf8
Merge branch 'feature/fix_enrichment_new_catalogue' of github.com:Car…
Jesus89 Oct 23, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 25 additions & 33 deletions cartoframes/data/enrichment/enrichment_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
from ...exceptions import EnrichmentException
from ...auth import get_default_credentials
from ...utils.geom_utils import _compute_geometry_from_geom
from ..observatory.variable import Variable
from ..observatory import Variable
from ..observatory import CatalogDataset


_ENRICHMENT_ID = 'enrichment_id'
Expand All @@ -27,7 +28,11 @@ def enrich(query_function, **kwargs):

queries = _enrichment_queries(user_dataset, tablename, query_function, **kwargs)

return _execute_enrichment(bq_client, queries, data_copy, kwargs['data_geom_column'])
data_enriched = _execute_enrichment(bq_client, queries, data_copy, kwargs['data_geom_column'])

data_enriched[kwargs['data_geom_column']] = _compute_geometry_from_geom(data_enriched[kwargs['data_geom_column']])

return data_enriched


def _get_credentials(credentials=None):
Expand Down Expand Up @@ -58,11 +63,11 @@ def _upload_dataframe(bq_client, user_dataset, data_copy, data_geom_column):


def _enrichment_queries(user_dataset, tablename, query_function, **kwargs):
is_polygon_enrichment = 'agg_operators' in kwargs
variables = __process_variables(kwargs['variables'], is_polygon_enrichment)

variables = __process_variables(kwargs['variables'])

table_to_geotable, table_to_variables, table_to_project, table_to_dataset =\
__process_enrichment_variables(variables, user_dataset)
table_to_geotable, table_to_variables,\
table_to_project, table_to_dataset = __process_enrichment_variables(variables, user_dataset)

filters_str = __process_filters(kwargs['filters'])

Expand Down Expand Up @@ -117,7 +122,7 @@ def __copy_data_and_generate_enrichment_id(data, enrichment_id_column, geometry_
return data_copy


def __process_variables(variables):
def __process_variables(variables, is_polygon_enrichment):

variables_result = list()
if isinstance(variables, Variable):
Expand All @@ -134,6 +139,9 @@ def __process_variables(variables):
else:
raise EnrichmentException('Variable(s) to enrich should be an instance of Variable / CatalogList / str / list')

if is_polygon_enrichment:
variables_result = [variable for variable in variables_result if variable.agg_method is not None]

return variables_result


Expand Down Expand Up @@ -162,14 +170,6 @@ def __process_agg_operators(agg_operators, variables):
return agg_operators_result
Copy link
Contributor

@elenatorro elenatorro Oct 18, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The __process_agg_operators method should also handle the case where this argument is a string, as we already do in the enrich_polygons method. When it is a string, it currently fails with: 'str' object has no attribute 'copy'

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed!



def __get_tables_and_variables(variables, user_dataset):

table_to_geotable, table_to_variables, table_to_project, table_to_dataset =\
__process_enrichment_variables(variables, user_dataset)

return table_to_geotable, table_to_variables, table_to_project, table_to_dataset


def __process_enrichment_variables(variables, user_dataset):
table_to_geotable = dict()
table_to_variables = defaultdict(list)
Expand All @@ -181,11 +181,12 @@ def __process_enrichment_variables(variables, user_dataset):
dataset_name = variable.schema_name
table_name = variable.dataset_name
variable_name = variable.column_name
dataset_geotable, geotable = __get_properties_geotable(variable)

if project_name != _PUBLIC_PROJECT:
table_name = '{dataset}_{table}'.format(dataset=dataset_name,
table=table_name,
user_dataset=user_dataset)
table_name = 'view_{dataset}_{table}'.format(dataset=dataset_name,
table=table_name,
user_dataset=user_dataset)

if table_name not in table_to_dataset:
if project_name != _PUBLIC_PROJECT:
Expand All @@ -194,13 +195,9 @@ def __process_enrichment_variables(variables, user_dataset):
table_to_dataset[table_name] = _PUBLIC_DATASET

if table_name not in table_to_geotable:
geotable = __get_name_geotable_from_datatable(table_name)

if project_name != _PUBLIC_PROJECT:
geotable = '{dataset}_{geotable}'.format(dataset=dataset_name,
geotable=geotable,
user_dataset=user_dataset)

geotable = 'view_{dataset}_{geotable}'.format(dataset=dataset_geotable,
geotable=geotable)
table_to_geotable[table_name] = geotable

if table_name not in table_to_project:
Expand All @@ -214,15 +211,10 @@ def __process_enrichment_variables(variables, user_dataset):
return table_to_geotable, table_to_variables, table_to_project, table_to_dataset


def __get_name_geotable_from_datatable(datatable):

datatable_split = datatable.split('_')
def __get_properties_geotable(variable):

if len(datatable_split) == 8:
geo_information = datatable_split[3:6]
elif len(datatable_split) == 7:
geo_information = datatable_split[2:5]
geography_id = CatalogDataset.get(variable.dataset).geography

geotable = 'geography_{geo_information_joined}'.format(geo_information_joined='_'.join(geo_information))
_, geo_dataset, geo_table = geography_id.split('.')

return geotable
return geo_dataset, geo_table
74 changes: 0 additions & 74 deletions cartoframes/data/enrichment/enrichment_utils.py

This file was deleted.

16 changes: 8 additions & 8 deletions cartoframes/data/enrichment/points_enrichment.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@ def enrich_points(data, variables, data_geom_column='geometry', filters=dict(),


def _prepare_sql(enrichment_id, filters_processed, table_to_geotable, table_to_variables,
table_to_project, table_to_dataset, user_dataset, working_project, data_table, **kwargs):
table_to_project, table_to_dataset, user_dataset, working_project,
data_table, **kwargs):

sqls = list()

Expand All @@ -95,20 +96,19 @@ def _prepare_sql(enrichment_id, filters_processed, table_to_geotable, table_to_v
sql = '''
SELECT data_table.{enrichment_id},
{variables},
ST_Area(enrichment_geo_table.geom) AS {variables_underscored}_area,
NULL AS {variables_underscored}_population
ST_Area(enrichment_geo_table.geom) AS {enrichment_table}_area
FROM `{project}.{dataset}.{enrichment_table}` enrichment_table
JOIN `{project}.{dataset}.{enrichment_geo_table}` enrichment_geo_table
ON enrichment_table.geoid = enrichment_geo_table.geoid
JOIN `{working_project}.{user_dataset}.{data_table}` data_table
ON ST_Within(data_table.{data_geom_column}, enrichment_geo_table.geom)
{filters};
'''.format(enrichment_id=enrichment_id, variables=', '.join(variables),
variables_underscored='_'.join(variables), enrichment_table=table,
enrichment_geo_table=table_to_geotable[table], user_dataset=user_dataset,
working_project=working_project, data_table=data_table,
'''.format(enrichment_id=enrichment_id, variables_underscored='_'.join(variables),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

variables_underscored is not being used

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed unused variable from format method!

enrichment_table=table, enrichment_geo_table=table_to_geotable[table],
user_dataset=user_dataset, working_project=working_project, data_table=data_table,
data_geom_column=kwargs['data_geom_column'], filters=filters_processed,
project=table_to_project[table], dataset=table_to_dataset[table])
project=table_to_project[table], dataset=table_to_dataset[table],
variables=', '.join(['enrichment_table.{}'.format(variable) for variable in variables]))

sqls.append(sql)

Expand Down
22 changes: 12 additions & 10 deletions cartoframes/data/enrichment/polygons_enrichment.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,31 +133,33 @@ def enrich_polygons(data, variables, agg_operators, data_geom_column='geometry',


def _prepare_sql(enrichment_id, filters_processed, table_to_geotable, table_to_variables,
table_to_project, table_to_dataset, user_dataset, working_project, data_table, **kwargs):
table_to_project, table_to_dataset, user_dataset, working_project,
data_table, **kwargs):

grouper = 'group by data_table.{enrichment_id}'.format(enrichment_id=enrichment_id)

sqls = list()

for table, variables in table_to_variables.items():
agg_operators = kwargs.get('agg_operators')

if 'agg_operators' in kwargs:
if agg_operators is not None:
elenatorro marked this conversation as resolved.
Show resolved Hide resolved

if isinstance(kwargs['agg_operators'], str):
agg_operators = {variable: kwargs['agg_operators'] for variable in variables}
else:
agg_operators = kwargs['agg_operators']
if isinstance(agg_operators, str):
agg_operators = {variable: agg_operators for variable in variables}

variables_sql = ['{operator}({variable} * \
variables_sql = ['{operator}(enrichment_table.{variable} * \
(ST_Area(ST_Intersection(enrichment_geo_table.geom, data_table.{data_geom_column}))\
/ ST_area(data_table.{data_geom_column}))) as {variable}'.format(variable=variable,
data_geom_column=kwargs['data_geom_column'],
operator=agg_operators[variable]) for variable in variables]

else:
variables_sql = variables + ['ST_Area(ST_Intersection(geo_table.geom, data_table.{data_geom_column}))\
/ ST_area(data_table.{data_geom_column}) AS measures_proportion'.format(
data_geom_column=kwargs['data_geom_column'])]
variables_sql = ['enrichment_table.{}'.format(variable) for variable in variables] +\
['ST_Area(ST_Intersection(enrichment_geo_table.geom, data_table.{data_geom_column}))\
/ ST_area(data_table.{data_geom_column}) AS measures_proportion'.format(
data_geom_column=kwargs['data_geom_column'])]

grouper = ''

sql = '''
Expand Down
45 changes: 21 additions & 24 deletions test/data/enrichment/test_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,13 +76,12 @@ def test_enrichment_query_by_points_one_variable(self):
queries = _enrichment_queries(user_dataset, tablename, query_function, **kwargs)

expected_queries = ['''SELECT data_table.enrichment_id,
CRMCYBURG,
ST_Area(enrichment_geo_table.geom) AS CRMCYBURG_area,
NULL AS CRMCYBURG_population
enrichment_table.CRMCYBURG,
ST_Area(enrichment_geo_table.geom) AS view_ags_demographics_crimerisk_usa_blockgroup_2015_yearly_2018_area
FROM `carto-do-customers.{user_dataset}\
.ags_demographics_crimerisk_usa_blockgroup_2015_yearly_2018` enrichment_table
.view_ags_demographics_crimerisk_usa_blockgroup_2015_yearly_2018` enrichment_table
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we are trying to avoid using real names

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need real names because the functions use the real catalog, so we need real examples of table names. Also, I don't see any problem with this, because we offer this dataset publicly through the website and the catalog.

JOIN `carto-do-customers.{user_dataset}\
.ags_geography_usa_blockgroup_2015` enrichment_geo_table
.view_ags_geography_usa_blockgroup_2015` enrichment_geo_table
ON enrichment_table.geoid = enrichment_geo_table.geoid
JOIN `carto-do-customers.{user_dataset}.{tablename}` data_table
ON ST_Within(data_table.{geometry_column}, enrichment_geo_table.geom)
Expand Down Expand Up @@ -111,25 +110,23 @@ def test_enrichment_query_by_points_two_variables(self):
queries = _enrichment_queries(user_dataset, tablename, query_function, **kwargs)

expected_queries = ['''SELECT data_table.enrichment_id,
CRMCYBURG,
ST_Area(enrichment_geo_table.geom) AS CRMCYBURG_area,
NULL AS CRMCYBURG_population
enrichment_table.CRMCYBURG,
ST_Area(enrichment_geo_table.geom) AS view_ags_demographics_crimerisk_usa_blockgroup_2015_yearly_2018_area
FROM `carto-do-customers.{user_dataset}\
.ags_demographics_crimerisk_usa_blockgroup_2015_yearly_2018` enrichment_table
.view_ags_demographics_crimerisk_usa_blockgroup_2015_yearly_2018` enrichment_table
JOIN `carto-do-customers.{user_dataset}\
.ags_geography_usa_blockgroup_2015` enrichment_geo_table
.view_ags_geography_usa_blockgroup_2015` enrichment_geo_table
ON enrichment_table.geoid = enrichment_geo_table.geoid
JOIN `carto-do-customers.{user_dataset}.{tablename}` data_table
ON ST_Within(data_table.{geometry_column}, enrichment_geo_table.geom)
WHERE a='b';''', '''
SELECT data_table.enrichment_id,
ticket_size_score,
ST_Area(enrichment_geo_table.geom) AS ticket_size_score_area,
NULL AS ticket_size_score_population
enrichment_table.ticket_size_score,
ST_Area(enrichment_geo_table.geom) AS view_mastercard_financial_mrli_usa_blockgroup_2019_monthly_2019_area
FROM `carto-do-customers.{user_dataset}\
.mastercard_financial_mrli_usa_blockgroup_2019_monthly_2019` enrichment_table
.view_mastercard_financial_mrli_usa_blockgroup_2019_monthly_2019` enrichment_table
JOIN `carto-do-customers.{user_dataset}\
.mastercard_geography_usa_blockgroup_2019` enrichment_geo_table
.view_mastercard_geography_usa_blockgroup_2019` enrichment_geo_table
ON enrichment_table.geoid = enrichment_geo_table.geoid
JOIN `carto-do-customers.{user_dataset}.{tablename}` data_table
ON ST_Within(data_table.{geometry_column}, enrichment_geo_table.geom)
Expand Down Expand Up @@ -158,12 +155,12 @@ def test_enrichment_query_by_polygons_one_variable(self):

queries = _enrichment_queries(user_dataset, tablename, query_function, **kwargs)

expected_queries = ['''SELECT data_table.enrichment_id, avg(CRMCYBURG *\
expected_queries = ['''SELECT data_table.enrichment_id, avg(enrichment_table.CRMCYBURG *\
(ST_Area(ST_Intersection(enrichment_geo_table.geom, data_table.{geometry_column}))\
/ ST_area(data_table.{geometry_column}))) as CRMCYBURG
FROM `carto-do-customers.{user_dataset}.ags_demographics_crimerisk_usa_blockgroup_2015_yearly_2018`\
FROM `carto-do-customers.{user_dataset}.view_ags_demographics_crimerisk_usa_blockgroup_2015_yearly_2018`\
enrichment_table
JOIN `carto-do-customers.{user_dataset}.ags_geography_usa_blockgroup_2015` enrichment_geo_table
JOIN `carto-do-customers.{user_dataset}.view_ags_geography_usa_blockgroup_2015` enrichment_geo_table
ON enrichment_table.geoid = enrichment_geo_table.geoid
JOIN `carto-do-customers.{user_dataset}.{tablename}` data_table
ON ST_Intersects(data_table.{geometry_column}, enrichment_geo_table.geom)
Expand Down Expand Up @@ -194,23 +191,23 @@ def test_enrichment_query_by_polygons_two_variables(self):

queries = _enrichment_queries(user_dataset, tablename, query_function, **kwargs)

expected_queries = ['''SELECT data_table.enrichment_id, avg(CRMCYBURG *\
expected_queries = ['''SELECT data_table.enrichment_id, avg(enrichment_table.CRMCYBURG *\
(ST_Area(ST_Intersection(enrichment_geo_table.geom, data_table.{geometry_column}))\
/ ST_area(data_table.{geometry_column}))) as CRMCYBURG
FROM `carto-do-customers.{user_dataset}.ags_demographics_crimerisk_usa_blockgroup_2015_yearly_2018`\
FROM `carto-do-customers.{user_dataset}.view_ags_demographics_crimerisk_usa_blockgroup_2015_yearly_2018`\
enrichment_table
JOIN `carto-do-customers.{user_dataset}.ags_geography_usa_blockgroup_2015` enrichment_geo_table
JOIN `carto-do-customers.{user_dataset}.view_ags_geography_usa_blockgroup_2015` enrichment_geo_table
ON enrichment_table.geoid = enrichment_geo_table.geoid
JOIN `carto-do-customers.{user_dataset}.{tablename}` data_table
ON ST_Intersects(data_table.{geometry_column}, enrichment_geo_table.geom)
WHERE a='b'
group by data_table.enrichment_id;''', '''
SELECT data_table.enrichment_id, avg(ticket_size_score *\
SELECT data_table.enrichment_id, avg(enrichment_table.ticket_size_score *\
(ST_Area(ST_Intersection(enrichment_geo_table.geom, data_table.{geometry_column}))\
/ ST_area(data_table.{geometry_column}))) as ticket_size_score
FROM `carto-do-customers.{user_dataset}.mastercard_financial_mrli_usa_blockgroup_2019_monthly_2019`\
FROM `carto-do-customers.{user_dataset}.view_mastercard_financial_mrli_usa_blockgroup_2019_monthly_2019`\
enrichment_table
JOIN `carto-do-customers.{user_dataset}.mastercard_geography_usa_blockgroup_2019` enrichment_geo_table
JOIN `carto-do-customers.{user_dataset}.view_mastercard_geography_usa_blockgroup_2019` enrichment_geo_table
ON enrichment_table.geoid = enrichment_geo_table.geoid
JOIN `carto-do-customers.{user_dataset}.{tablename}` data_table
ON ST_Intersects(data_table.{geometry_column}, enrichment_geo_table.geom)
Expand Down