Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Download #1050

Merged
merged 38 commits into from
Oct 9, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
e577715
first PoC
simon-contreras-deel Oct 2, 2019
22f0e9f
download by query
simon-contreras-deel Oct 3, 2019
e6e4a83
download as a dataframe
simon-contreras-deel Oct 3, 2019
5e118c0
storage api version
simon-contreras-deel Oct 3, 2019
bc142c4
download file
simon-contreras-deel Oct 3, 2019
afa0a43
download default file
simon-contreras-deel Oct 3, 2019
0885cc6
using csv module
simon-contreras-deel Oct 7, 2019
c87d8f8
progress_bar
simon-contreras-deel Oct 7, 2019
6aab3bc
fail if file exists
simon-contreras-deel Oct 7, 2019
cd4b85b
fail if exists param
simon-contreras-deel Oct 7, 2019
52f6eed
removing dataframe and storage api methods
simon-contreras-deel Oct 7, 2019
a99ab80
using self.query
simon-contreras-deel Oct 7, 2019
14a6b21
testing download
simon-contreras-deel Oct 7, 2019
1b5b274
updating notebook
simon-contreras-deel Oct 7, 2019
7fbcc00
download_to_file
simon-contreras-deel Oct 7, 2019
5a9385b
Merge branch 'develop' into feature/do-bq-download
simon-contreras-deel Oct 7, 2019
22052b1
Merge branch 'develop' into feature/do-bq-download
simon-contreras-deel Oct 7, 2019
347a770
progress bar param
simon-contreras-deel Oct 7, 2019
0bb9c35
return path instead of warn
simon-contreras-deel Oct 8, 2019
3f7f575
dataset download
simon-contreras-deel Oct 8, 2019
4d7e705
geography download
simon-contreras-deel Oct 8, 2019
cc7f513
updating notebook
simon-contreras-deel Oct 8, 2019
25f3c5d
basic dataset test
simon-contreras-deel Oct 8, 2019
8b60a43
basic geography test
simon-contreras-deel Oct 8, 2019
5be6f74
detail in test fixtures
simon-contreras-deel Oct 8, 2019
4832fa2
add column names in csv file
simon-contreras-deel Oct 8, 2019
09d64f4
ensuring csv header test
simon-contreras-deel Oct 8, 2019
de769d6
updating notebook
simon-contreras-deel Oct 8, 2019
5106dd1
refactoring download code into entity
simon-contreras-deel Oct 8, 2019
a561a8e
download docs
simon-contreras-deel Oct 8, 2019
a6bdbf0
typo
simon-contreras-deel Oct 8, 2019
2dbe2b3
Improve message
simon-contreras-deel Oct 8, 2019
01f1907
remove BQ client from notebook
simon-contreras-deel Oct 8, 2019
b8562bf
get_do_dataset method
simon-contreras-deel Oct 8, 2019
0d82706
get_do_dataset def and avoid warnings from carto-python
simon-contreras-deel Oct 8, 2019
c43c434
unify mocks
simon-contreras-deel Oct 8, 2019
1256e45
remove ds references
simon-contreras-deel Oct 9, 2019
daeb73c
changelog
simon-contreras-deel Oct 9, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
## Changed
- Remove pandas extension in catalog classes (#1038)
- Download dataset and geographies (#1050)

## [1.0b3] - 2019-08-27
### Added
Expand Down Expand Up @@ -282,4 +283,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Adds a compression option for write operations

### Fixed
- Fixes file system path creation to be generic to OS
- Fixes file system path creation to be generic to OS
6 changes: 6 additions & 0 deletions cartoframes/auth/credentials.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@

from ..__version__ import __version__

from warnings import filterwarnings
# Silence FutureWarnings raised by the carto-python client library so that
# cartoframes users are not spammed with upstream deprecation noise.
# NOTE(review): per PR commit "avoid warnings from carto-python".
filterwarnings("ignore", category=FutureWarning, module="carto")

if sys.version_info >= (3, 0):
from urllib.parse import urlparse
else:
Expand Down Expand Up @@ -208,6 +211,9 @@ def get_do_token(self):

return token.access_token

def get_do_dataset(self):
    """Return the user's Data Observatory dataset identifier.

    BigQuery dataset names cannot contain hyphens, so any hyphen in the
    CARTO username is replaced with an underscore.
    """
    username = self._username
    return username.replace('-', '_')

def get_api_key_auth_client(self):
if not self._api_key_auth_client:
self._api_key_auth_client = APIKeyAuthClient(
Expand Down
59 changes: 57 additions & 2 deletions cartoframes/data/clients/bigquery_client.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
from __future__ import absolute_import

import os
import appdirs
import csv
import tqdm

from google.cloud import bigquery
from google.oauth2.credentials import Credentials as GoogleCredentials
from google.auth.exceptions import RefreshError
Expand All @@ -8,6 +13,8 @@

from ...auth import get_default_credentials

_USER_CONFIG_DIR = appdirs.user_config_dir('cartoframes')


def refresh_client(func):
def wrapper(self, *args, **kwargs):
Expand Down Expand Up @@ -52,6 +59,54 @@ def upload_dataframe(self, dataframe, schema, tablename, project, dataset):

@refresh_client
def query(self, query, **kwargs):
    """Run *query* on BigQuery and return the resulting query job.

    Args:
        query (str): SQL query to execute.
        **kwargs: extra keyword arguments forwarded to
            ``google.cloud.bigquery.Client.query``.

    Returns:
        The query job object returned by the underlying client.
    """
    # Single call to the client: the previous version kept a dead
    # ``response`` local alongside a second, returned call.
    return self.client.query(query, **kwargs)

@refresh_client
def get_table(self, project, dataset, table):
    """Fetch the metadata of ``project.dataset.table`` from BigQuery."""
    parts = (project, dataset, table)
    full_table_name = '{}.{}.{}'.format(*parts)
    return self.client.get_table(full_table_name)

def get_table_column_names(self, project, dataset, table):
    """Return the column names of a BigQuery table, in schema order."""
    table_info = self.get_table(project, dataset, table)
    names = []
    for field in table_info.schema:
        names.append(field.name)
    return names

def download_to_file(self, project, dataset, table, limit=None, offset=None,
                     file_path=None, fail_if_exists=False, progress_bar=True):
    """Download a BigQuery table into a local CSV file.

    Args:
        project (str): BigQuery project of the source table.
        dataset (str): BigQuery dataset of the source table.
        table (str): name of the source table.
        limit (int, optional): maximum number of rows to download.
        offset (int, optional): number of rows to skip.
        file_path (str, optional): destination path. Defaults to
            ``<user config dir>/<project>.<dataset>.<table>.csv``.
        fail_if_exists (bool, optional): raise instead of overwriting an
            existing file. Defaults to False.
        progress_bar (bool, optional): display a tqdm notebook progress
            bar while downloading. Defaults to True.

    Returns:
        str: path of the written CSV file.

    Raises:
        CartoException: if ``fail_if_exists`` is True and the destination
            file already exists.
    """
    if not file_path:
        file_name = '{}.{}.{}.csv'.format(project, dataset, table)
        # Fix: the config directory may not exist on first use; create it
        # before trying to write into it.
        if not os.path.exists(_USER_CONFIG_DIR):
            os.makedirs(_USER_CONFIG_DIR)
        file_path = os.path.join(_USER_CONFIG_DIR, file_name)

    if fail_if_exists and os.path.isfile(file_path):
        raise CartoException('The file `{}` already exists.'.format(file_path))

    column_names = self.get_table_column_names(project, dataset, table)

    query = _download_query(project, dataset, table, limit, offset)
    rows_iter = self.query(query).result()

    pb = None
    if progress_bar:
        pb = tqdm.tqdm_notebook(total=rows_iter.total_rows)

    with open(file_path, 'w') as csvfile:
        csvwriter = csv.writer(csvfile)

        # First row of the CSV carries the column names.
        csvwriter.writerow(column_names)

        for row in rows_iter:
            csvwriter.writerow(row.values())
            if pb is not None:
                pb.update(1)

    # Fix: close the progress-bar widget instead of leaking it.
    if pb is not None:
        pb.close()

    return file_path


def _download_query(project, dataset, table, limit=None, offset=None):
    """Build the ``SELECT *`` statement used to download a table.

    Args:
        project (str): BigQuery project of the table.
        dataset (str): BigQuery dataset of the table.
        table (str): table name.
        limit (int, optional): appended as a ``LIMIT`` clause when truthy.
        offset (int, optional): appended as an ``OFFSET`` clause when truthy.

    Returns:
        str: the SQL query, with the table name backtick-quoted.
    """
    full_table_name = '`{}.{}.{}`'.format(project, dataset, table)
    query = 'SELECT * FROM {}'.format(full_table_name)

    if limit:
        query += ' LIMIT {}'.format(limit)
    if offset:
        query += ' OFFSET {}'.format(offset)

    return query
2 changes: 1 addition & 1 deletion cartoframes/data/enrichment/enrichment_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

def enrich(query_function, **kwargs):
credentials = _get_credentials(kwargs['credentials'])
user_dataset = credentials.username.replace('-', '_')
user_dataset = credentials.get_do_dataset()
bq_client = _get_bigquery_client(_WORKING_PROJECT, credentials)

data_copy = _prepare_data(kwargs['data'], kwargs['data_geom_column'])
Expand Down
13 changes: 13 additions & 0 deletions cartoframes/data/observatory/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,16 @@ def is_public_data(self):
@property
def summary(self):
return self.data['summary_jsonb']

def download(self, credentials=None):
    """Download the data of this Dataset into a local CSV file.

    Args:
        credentials (:py:class:`Credentials <cartoframes.auth.Credentials>`, optional):
            CARTO user account credentials. When omitted, the default
            credentials (registered via :py:meth:`set_default_credentials
            <cartoframes.auth.set_default_credentials>`) are used instead.
    """

    return self._download(credentials)
36 changes: 36 additions & 0 deletions cartoframes/data/observatory/entity.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,21 @@
import pandas as pd
from warnings import warn

from google.api_core.exceptions import NotFound

from carto.exceptions import CartoException

from ..clients.bigquery_client import BigQueryClient
from ...auth import get_default_credentials

# ``abc.ABC`` exists only on Python 3; on Python 2 build an equivalent
# abstract base class from ``ABCMeta`` so the same ``ABC`` name works
# in both interpreter versions.
try:
    from abc import ABC, abstractmethod
except ImportError:
    from abc import ABCMeta, abstractmethod
    ABC = ABCMeta('ABC', (object,), {'__slots__': ()})

# BigQuery project that hosts the per-user views of purchased DO data
# (see ``CatalogEntity._download`` below).
_WORKING_PROJECT = 'carto-do-customers'


class CatalogEntity(ABC):

Expand Down Expand Up @@ -45,6 +55,32 @@ def __str__(self):
def __repr__(self):
    """Unambiguous representation: ``ClassName(entity_id)``."""
    classname = self.__class__.__name__
    return '{classname}({entity_id})'.format(classname=classname,
                                             entity_id=self.id)

def _download(self, credentials=None):
    """Download this entity's table to a local CSV file.

    Purchased datasets are exposed in the user's own BigQuery dataset
    inside the working project as views named ``view_<dataset>_<table>``.
    Raises ``CartoException`` when no such view exists (i.e. the dataset
    has not been purchased).
    """
    creds = _get_credentials(credentials)
    bq_client = _get_bigquery_client(_WORKING_PROJECT, creds)
    user_dataset = creds.get_do_dataset()

    project, dataset, table = self.id.split('.')
    view = 'view_{}_{}'.format(dataset.replace('-', '_'), table)

    try:
        file_path = bq_client.download_to_file(_WORKING_PROJECT, user_dataset, view)
    except NotFound:
        raise CartoException('You have not purchased the dataset `{}` yet'.format(self.id))

    warn('Data saved: {}.'.format(file_path))
    warn("To read it you can do: `pandas.read_csv('{}')`.".format(file_path))

    return file_path


def _get_credentials(credentials=None):
    """Return *credentials* when truthy, otherwise the default credentials."""
    if credentials:
        return credentials
    return get_default_credentials()


def _get_bigquery_client(project, credentials):
    """Build a :class:`BigQueryClient` for *project* with *credentials*."""
    client = BigQueryClient(project, credentials)
    return client


class CatalogList(list):

Expand Down
13 changes: 13 additions & 0 deletions cartoframes/data/observatory/geography.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,16 @@ def is_public_data(self):
@property
def summary(self):
return self.data['summary_jsonb']

def download(self, credentials=None):
    """Download Geography data.

    Args:
        credentials (:py:class:`Credentials <cartoframes.auth.Credentials>`, optional):
            credentials of CARTO user account. If not provided,
            a default credentials (if set with :py:meth:`set_default_credentials
            <cartoframes.auth.set_default_credentials>`) will be attempted to be
            used.
    """

    return self._download(credentials)
137 changes: 137 additions & 0 deletions examples/08_data_observatory/download.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from cartoframes.auth import Credentials\n",
"credentials = Credentials.from_file()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Catalog Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from cartoframes.data.observatory.catalog import Catalog\n",
"dataset = Catalog().categories.get('financial').datasets.get('{dataset_id}')\n",
"dataset.to_series()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset.download(credentials)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Catalog Geography "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from cartoframes.data.observatory.geography import Geography\n",
"geography = Geography.get(dataset.geography)\n",
"geography.to_series()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"file_path = geography.download(credentials)\n",
"file_path"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Upload downloaded csv file to CARTO "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv(file_path)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from cartoframes.data import Dataset\n",
"\n",
"Dataset(df).upload(table_name='test_do_geography', credentials=credentials, if_exists='replace')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Visualize it"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from cartoframes.viz import Map, Layer\n",
"Map(Layer('test_do_geography', credentials=credentials))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading