Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Support uploading DataFrames with non-ascii texts in Python 2 #1001

Merged
merged 6 commits into from
Sep 25, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 18 additions & 10 deletions cartoframes/data/dataset/registry/dataframe_dataset.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import pandas as pd
from warnings import warn
from tqdm import tqdm

from carto.exceptions import CartoException, CartoRateLimitException
Expand Down Expand Up @@ -131,24 +130,33 @@ def _rows(df, dataframe_columns_info, with_lnglat):
if dataframe_columns_info.geom_column and col == dataframe_columns_info.geom_column:
geom = decode_geometry(val, dataframe_columns_info.enc_type)
if geom:
row_data.append('SRID=4326;{}'.format(geom.wkt))
val = 'SRID=4326;{}'.format(geom.wkt)
else:
row_data.append('')
else:
row_data.append('{}'.format(val))
val = ''
row_data.append(_encoded(val))

if with_lnglat:
lng_val = row[with_lnglat[0]]
lat_val = row[with_lnglat[1]]
if lng_val and lat_val:
row_data.append('SRID=4326;POINT ({lng} {lat})'.format(lng=lng_val, lat=lat_val))
val = 'SRID=4326;POINT ({lng} {lat})'.format(lng=lng_val, lat=lat_val)
else:
row_data.append('')
val = ''
row_data.append(_encoded(val))

csv_row = _encoded('|').join(row_data)
csv_row += _encoded('\n')

csv_row = '|'.join(row_data)
csv_row += '\n'
yield csv_row

yield csv_row.encode()

def _encoded(val):
jgoizueta marked this conversation as resolved.
Show resolved Hide resolved
if isinstance(val, type(u'')):
return val.encode('utf-8')
elif isinstance(val, type(b'')):
return val
else:
return u'{}'.format(val).encode('utf-8')


def _is_null(val):
Expand Down
13 changes: 12 additions & 1 deletion test/data/dataset/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from cartoframes.auth import Credentials
from cartoframes.data.clients import SQLClient
from cartoframes.utils.geom_utils import setting_value_exception
from cartoframes.utils.columns import normalize_name
from cartoframes.utils.columns import normalize_name, DataframeColumnsInfo
from cartoframes.utils.utils import load_geojson
from cartoframes.data import StrategiesRegistry
from cartoframes.data.dataset.registry.dataframe_dataset import DataFrameDataset, _rows
Expand Down Expand Up @@ -906,3 +906,14 @@ def test_rows_null_geom(self):
rows = _rows(df, dataframe_columns_info, with_lnglat)

self.assertEqual(list(rows), [b'|\n', b'|\n'])

def test_rows_non_ascii(self):
    """Check that ``_rows`` yields UTF-8 bytes for non-ASCII cell values.

    The same accented text is supplied three ways: as a plain string
    literal, as an explicit unicode literal, and as already-encoded UTF-8
    bytes. All three must produce the same encoded row. A plain ASCII
    value is included as a fourth row.
    """
    text = u'áéí'
    utf8_bytes = text.encode('utf-8')
    expected_row = utf8_bytes + '\n'.encode()

    # One column, four rows: plain literal, unicode, bytes, ASCII.
    values = ['áéí', text, utf8_bytes, 'xyz']
    frame = pd.DataFrame.from_dict({'test': values})

    # No lng/lat columns are passed, so each row holds only the 'test' value.
    produced = list(_rows(frame, DataframeColumnsInfo(frame), None))

    self.assertEqual(produced, [expected_row] * 3 + [b'xyz\n'])