Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fixes case where dataframe columns are a subset of util cols #523

Merged
merged 6 commits into from
Dec 18, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ wheels/
*.egg-info/
.installed.cfg
*.egg
Pipfile
Pipfile.lock

# Swap files
.*.sw[nop]
Expand All @@ -40,3 +42,4 @@ wheels/
CARTOCREDS.json
SITEKEY.txt
test/secret.json
examples/scratch/*
44 changes: 3 additions & 41 deletions cartoframes/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -520,7 +520,7 @@ def _send_batches(self, df, table_name, temp_dir, geom_col, pgcolnames,
# combine chunks into final table
try:
select_base = 'SELECT {schema} FROM "{{table}}"'.format(
schema=_df2pg_schema(df, pgcolnames))
schema=utils.df2pg_schema(df, pgcolnames))
unioned_tables = '\nUNION ALL\n'.join([select_base.format(table=t)
for t in subtables])
self._debug_print(unioned=unioned_tables)
Expand Down Expand Up @@ -651,7 +651,7 @@ def _set_schema(self, dataframe, table_name, pgcolnames):
'NULLIF("{col}", \'\')::{ctype}')
# alter non-util columns that are not type text
alter_cols = ', '.join(alter_temp.format(col=c,
ctype=_dtypes2pg(t))
ctype=utils.dtypes2pg(t))
for c, t in zip(pgcolnames,
dataframe.dtypes)
if c not in util_cols and t != 'object')
Expand Down Expand Up @@ -1926,6 +1926,7 @@ def _debug_print(self, **kwargs):
value=str_value))


# TODO: move all of the below to the utils module
def _add_encoded_geom(df, geom_col):
"""Add encoded geometry to DataFrame"""
# None if not a GeoDataFrame
Expand Down Expand Up @@ -1985,42 +1986,3 @@ def _decode_geom(ewkb):
if ewkb:
return wkb.loads(ba.unhexlify(ewkb))
return None


def _dtypes2pg(dtype):
"""Returns equivalent PostgreSQL type for input `dtype`"""
mapping = {
'float64': 'numeric',
'int64': 'numeric',
'float32': 'numeric',
'int32': 'numeric',
'object': 'text',
'bool': 'boolean',
'datetime64[ns]': 'timestamp',
}
return mapping.get(str(dtype), 'text')


def _pg2dtypes(pgtype):
"""Returns equivalent dtype for input `pgtype`."""
mapping = {
'date': 'datetime64[ns]',
'number': 'float64',
'string': 'object',
'boolean': 'bool',
'geometry': 'object',
}
return mapping.get(str(pgtype), 'object')


def _df2pg_schema(dataframe, pgcolnames):
    """Build the column list for the SELECT statement of a SQL query.

    Each column that is not a utility column (``the_geom``,
    ``the_geom_webmercator``, ``cartodb_id``) is rendered as
    ``NULLIF("<col>", '')::<pgtype> AS <col>``. If ``the_geom`` is among
    `pgcolnames` it is placed first, quoted and uncast.

    Args:
        dataframe: DataFrame whose ``dtypes`` are paired positionally with
            `pgcolnames`.
        pgcolnames (list of str): Column names as they exist in the table.

    Returns:
        str: Comma-separated column expressions.
    """
    util_cols = ('the_geom', 'the_geom_webmercator', 'cartodb_id')
    schema = ', '.join([
        'NULLIF("{col}", \'\')::{t} AS {col}'.format(col=c,
                                                     t=_dtypes2pg(t))
        for c, t in zip(pgcolnames, dataframe.dtypes)
        if c not in util_cols])
    if 'the_geom' not in pgcolnames:
        return schema
    if not schema:
        # Only utility columns are present: returning '"the_geom", ' would
        # leave a dangling comma and produce an invalid SELECT list.
        return '"the_geom"'
    return '"the_geom", ' + schema
67 changes: 59 additions & 8 deletions cartoframes/utils.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,24 @@
"""general utility functions"""
import sys
from tqdm import tqdm
from functools import wraps
from warnings import filterwarnings, catch_warnings

from tqdm import tqdm


def dict_items(indict):
"""function for iterating through dict items compatible with py2 and 3

def dict_items(d):
Args:
indict (dict): Dictionary that will be turned into items iterator
"""
if sys.version_info >= (3, 0):
return d.items()
else:
return d.iteritems()
return indict.items()
return indict.iteritems()


def cssify(css_dict):
"""Function to get CartoCSS from Python dicts"""
css = ''
for key, value in dict_items(css_dict):
css += '{key} {{ '.format(key=key)
Expand Down Expand Up @@ -61,9 +68,9 @@ def norm_colname(colname):
"""
last_char_special = False
char_list = []
for e in str(colname):
if e.isalnum():
char_list.append(e.lower())
for colchar in str(colname):
if colchar.isalnum():
char_list.append(colchar.lower())
last_char_special = False
else:
if not last_char_special:
Expand Down Expand Up @@ -128,3 +135,47 @@ def wrapper(*args, **kwargs):
evaled_func = func(*args, **kwargs)
return evaled_func
return wrapper


# schema definition functions
def dtypes2pg(dtype):
    """Return the PostgreSQL type equivalent to the input `dtype`.

    Args:
        dtype: A dtype object or dtype name. Only its ``str()`` form is
            used for the lookup.

    Returns:
        str: ``'numeric'``, ``'text'``, ``'boolean'`` or ``'timestamp'``.
        Any dtype that is not listed in the mapping falls back to
        ``'text'``.
    """
    mapping = {
        'float64': 'numeric',
        'int64': 'numeric',
        'float32': 'numeric',
        'int32': 'numeric',
        'object': 'text',
        'bool': 'boolean',
        'datetime64[ns]': 'timestamp',
    }
    return mapping.get(str(dtype), 'text')


# NOTE: this is not currently used anywhere
def pg2dtypes(pgtype):
    """Return the dtype name equivalent to the input `pgtype`.

    Args:
        pgtype: Type name to translate. Only its ``str()`` form is used
            for the lookup.

    Returns:
        str: ``'datetime64[ns]'``, ``'float64'``, ``'object'`` or
        ``'bool'``. Any type name that is not listed in the mapping falls
        back to ``'object'``.
    """
    mapping = {
        'date': 'datetime64[ns]',
        'number': 'float64',
        'string': 'object',
        'boolean': 'bool',
        'geometry': 'object',
    }
    return mapping.get(str(pgtype), 'object')


def df2pg_schema(dataframe, pgcolnames):
    """Build the column list for the SELECT statement of a SQL query.

    If every DataFrame column is a utility column (``the_geom``,
    ``the_geom_webmercator``, ``cartodb_id``) the column labels are
    returned as-is, comma separated. Otherwise each non-utility column is
    rendered as ``NULLIF("<col>", '')::<pgtype> AS <col>`` and, when
    ``the_geom`` is among `pgcolnames`, it is placed first, quoted and
    uncast.

    Args:
        dataframe: DataFrame whose ``columns`` are checked against the
            utility column names and whose ``dtypes`` are paired
            positionally with `pgcolnames`.
        pgcolnames (list of str): Column names as they exist in the table.

    Returns:
        str: Comma-separated column expressions.
    """
    util_cols = {'the_geom', 'the_geom_webmercator', 'cartodb_id'}
    if set(dataframe.columns).issubset(util_cols):
        return ', '.join(dataframe.columns)
    schema = ', '.join([
        'NULLIF("{col}", \'\')::{t} AS {col}'.format(col=c,
                                                     t=dtypes2pg(t))
        for c, t in zip(pgcolnames, dataframe.dtypes)
        if c not in util_cols])
    if 'the_geom' not in pgcolnames:
        return schema
    if not schema:
        # The guard above inspects the DataFrame labels while the filter
        # uses `pgcolnames`; if the two disagree and every pg column is a
        # utility column, avoid returning '"the_geom", ' with a dangling
        # comma, which is not a valid SELECT list.
        return '"the_geom"'
    return '"the_geom", ' + schema
56 changes: 0 additions & 56 deletions test/test_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -789,31 +789,6 @@ def test_cartocontext_check_query(self):
with self.assertRaises(ValueError):
cc._check_query(success_query, style_cols=fail_cols)

def test_df2pg_schema(self):
    """context._df2pg_schema"""
    from cartoframes.context import _df2pg_schema
    data = [{'id': 'a', 'val': 1.1, 'truth': True, 'idnum': 1},
            {'id': 'b', 'val': 2.2, 'truth': True, 'idnum': 2},
            {'id': 'c', 'val': 3.3, 'truth': False, 'idnum': 3}]
    df = pd.DataFrame(data).astype({'id': 'object',
                                    'val': float,
                                    'truth': bool,
                                    'idnum': int})
    # specify order of columns
    df = df[['id', 'val', 'truth', 'idnum']]
    pgcols = ['id', 'val', 'truth', 'idnum']
    ans = ('NULLIF("id", \'\')::text AS id, '
           'NULLIF("val", \'\')::numeric AS val, '
           'NULLIF("truth", \'\')::boolean AS truth, '
           'NULLIF("idnum", \'\')::numeric AS idnum')

    self.assertEqual(ans, _df2pg_schema(df, pgcols))

    # add the_geom: it must be emitted first, quoted and without a cast
    df['the_geom'] = 'Point(0 0)'
    ans = '\"the_geom\", ' + ans
    pgcols.append('the_geom')
    self.assertEqual(ans, _df2pg_schema(df, pgcols))

@unittest.skipIf(WILL_SKIP, 'no carto credentials, skipping this test')
def test_add_encoded_geom(self):
Expand Down Expand Up @@ -875,37 +850,6 @@ def test_encode_geom(self):
self.assertEqual(ewkb_resp, ewkb)
self.assertIsNone(_encode_geom(None))

def test_dtypes2pg(self):
    """context._dtypes2pg"""
    from cartoframes.context import _dtypes2pg
    # dtype name -> expected PostgreSQL type; the last entry exercises the
    # fallback for dtypes that are not in the mapping
    results = {
        'float64': 'numeric',
        'int64': 'numeric',
        'float32': 'numeric',
        'int32': 'numeric',
        'object': 'text',
        'bool': 'boolean',
        'datetime64[ns]': 'timestamp',
        'unknown_dtype': 'text'
    }
    for dtype, expected in results.items():
        self.assertEqual(_dtypes2pg(dtype), expected)

def test_pg2dtypes(self):
    """context._pg2dtypes"""
    from cartoframes.context import _pg2dtypes
    # type name -> expected dtype; the last entry exercises the fallback
    # for type names that are not in the mapping
    results = {
        'date': 'datetime64[ns]',
        'number': 'float64',
        'string': 'object',
        'boolean': 'bool',
        'geometry': 'object',
        'unknown_pgdata': 'object'
    }
    for pgtype, expected in results.items():
        self.assertEqual(_pg2dtypes(pgtype), expected)

def test_debug_print(self):
"""context._debug_print"""
cc = cartoframes.CartoContext(base_url=self.baseurl,
Expand Down
62 changes: 61 additions & 1 deletion test/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
"""Unit tests for cartoframes.utils"""
import unittest
from collections import OrderedDict

import pandas as pd

from cartoframes.utils import (dict_items, cssify, norm_colname,
normalize_colnames, importify_params)
from collections import OrderedDict


class TestUtils(unittest.TestCase):
Expand Down Expand Up @@ -139,3 +142,60 @@ def test_importify_params(self):
ans = ('true', 'false', 'true', 'gulab jamon', )
for idx, p in enumerate(params):
self.assertTrue(importify_params(p), ans[idx])

def test_dtypes2pg(self):
    """utils.dtypes2pg"""
    from cartoframes.utils import dtypes2pg
    # dtype name -> expected PostgreSQL type; the last entry exercises the
    # fallback for dtypes that are not in the mapping
    results = {
        'float64': 'numeric',
        'int64': 'numeric',
        'float32': 'numeric',
        'int32': 'numeric',
        'object': 'text',
        'bool': 'boolean',
        'datetime64[ns]': 'timestamp',
        'unknown_dtype': 'text'
    }
    for dtype, expected in results.items():
        self.assertEqual(dtypes2pg(dtype), expected)

def test_pg2dtypes(self):
    """utils.pg2dtypes"""
    from cartoframes.utils import pg2dtypes
    # type name -> expected dtype; the last entry exercises the fallback
    # for type names that are not in the mapping
    results = {
        'date': 'datetime64[ns]',
        'number': 'float64',
        'string': 'object',
        'boolean': 'bool',
        'geometry': 'object',
        'unknown_pgdata': 'object'
    }
    for pgtype, expected in results.items():
        self.assertEqual(pg2dtypes(pgtype), expected)

def test_df2pg_schema(self):
    """utils.df2pg_schema"""
    from cartoframes.utils import df2pg_schema
    data = [{'id': 'a', 'val': 1.1, 'truth': True, 'idnum': 1},
            {'id': 'b', 'val': 2.2, 'truth': True, 'idnum': 2},
            {'id': 'c', 'val': 3.3, 'truth': False, 'idnum': 3}]
    df = pd.DataFrame(data).astype({'id': 'object',
                                    'val': float,
                                    'truth': bool,
                                    'idnum': int})
    # specify order of columns
    df = df[['id', 'val', 'truth', 'idnum']]
    pgcols = ['id', 'val', 'truth', 'idnum']
    ans = ('NULLIF("id", \'\')::text AS id, '
           'NULLIF("val", \'\')::numeric AS val, '
           'NULLIF("truth", \'\')::boolean AS truth, '
           'NULLIF("idnum", \'\')::numeric AS idnum')

    self.assertEqual(ans, df2pg_schema(df, pgcols))

    # add the_geom: it must be emitted first, quoted and without a cast
    df['the_geom'] = 'Point(0 0)'
    ans = '\"the_geom\", ' + ans
    pgcols.append('the_geom')
    self.assertEqual(ans, df2pg_schema(df, pgcols))

    # only utility columns: the column labels are passed through as-is
    self.assertEqual('the_geom',
                     df2pg_schema(df[['the_geom']], ['the_geom']))