diff --git a/CHANGELOG.md b/CHANGELOG.md index 8e9df6bc6d..66306d3ce5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ This project adheres to [Semantic Versioning](http://semver.org/). ## UNRELEASED + +### Added + - Extra flags were added to the `gapminder` and `stocks` datasets to facilitate testing, documentation and demos ([#3305](https://github.com/plotly/plotly.py/issues/3305)) + ### Fixed - Fixed regression introduced in version 5.0.0 where pandas/numpy arrays with `dtype` of Object were being converted to `list` values when added to a Figure ([#3292](https://github.com/plotly/plotly.py/issues/3292), [#3293](https://github.com/plotly/plotly.py/pull/3293)) diff --git a/packages/python/plotly/plotly/data/__init__.py b/packages/python/plotly/plotly/data/__init__.py index f64e326c9d..e2b1fa4465 100644 --- a/packages/python/plotly/plotly/data/__init__.py +++ b/packages/python/plotly/plotly/data/__init__.py @@ -3,78 +3,83 @@ """ -def gapminder(): +def gapminder(datetimes=False, centroids=False, year=None): """ -Each row represents a country on a given year. + Each row represents a country on a given year. -https://www.gapminder.org/data/ + https://www.gapminder.org/data/ -Returns: - A `pandas.DataFrame` with 1704 rows and the following columns: - `['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap', - 'iso_alpha', 'iso_num']`. -""" - return _get_dataset("gapminder") + Returns: + A `pandas.DataFrame` with 1704 rows and the following columns: + `['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap', + 'iso_alpha', 'iso_num']`. 
+ If `datetimes` is True, the 'year' column will be a datetime column + If `centroids` is True, two new columns are added: ['centroid_lat', 'centroid_lon'] + If `year` is an integer, the dataset will be filtered for that year + """ + df = _get_dataset("gapminder") + if datetimes: + df["year"] = (df["year"].astype(str) + "-01-01").astype("datetime64[ns]") + if not centroids: + df.drop(["centroid_lat", "centroid_lon"], axis=1, inplace=True) + if year: + df = df.query("year == %d" % year) + return df def tips(): """ -Each row represents a restaurant bill. + Each row represents a restaurant bill. -https://vincentarelbundock.github.io/Rdatasets/doc/reshape2/tips.html + https://vincentarelbundock.github.io/Rdatasets/doc/reshape2/tips.html -Returns: - A `pandas.DataFrame` with 244 rows and the following columns: - `['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']`. -""" + Returns: + A `pandas.DataFrame` with 244 rows and the following columns: + `['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']`.""" return _get_dataset("tips") def iris(): """ -Each row represents a flower. + Each row represents a flower. -https://en.wikipedia.org/wiki/Iris_flower_data_set + https://en.wikipedia.org/wiki/Iris_flower_data_set -Returns: - A `pandas.DataFrame` with 150 rows and the following columns: - `['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species', 'species_id']`. -""" + Returns: + A `pandas.DataFrame` with 150 rows and the following columns: + `['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species', 'species_id']`.""" return _get_dataset("iris") def wind(): """ -Each row represents a level of wind intensity in a cardinal direction, and its frequency. + Each row represents a level of wind intensity in a cardinal direction, and its frequency. -Returns: - A `pandas.DataFrame` with 128 rows and the following columns: - `['direction', 'strength', 'frequency']`. 
-""" + Returns: + A `pandas.DataFrame` with 128 rows and the following columns: + `['direction', 'strength', 'frequency']`.""" return _get_dataset("wind") def election(): """ -Each row represents voting results for an electoral district in the 2013 Montreal -mayoral election. + Each row represents voting results for an electoral district in the 2013 Montreal + mayoral election. -Returns: - A `pandas.DataFrame` with 58 rows and the following columns: - `['district', 'Coderre', 'Bergeron', 'Joly', 'total', 'winner', 'result', 'district_id']`. -""" + Returns: + A `pandas.DataFrame` with 58 rows and the following columns: + `['district', 'Coderre', 'Bergeron', 'Joly', 'total', 'winner', 'result', 'district_id']`.""" return _get_dataset("election") def election_geojson(): """ -Each feature represents an electoral district in the 2013 Montreal mayoral election. + Each feature represents an electoral district in the 2013 Montreal mayoral election. -Returns: - A GeoJSON-formatted `dict` with 58 polygon or multi-polygon features whose `id` - is an electoral district numerical ID and whose `district` property is the ID and - district name. -""" + Returns: + A GeoJSON-formatted `dict` with 58 polygon or multi-polygon features whose `id` + is an electoral district numerical ID and whose `district` property is the ID and + district name.""" import gzip import json import os @@ -92,27 +97,28 @@ def election_geojson(): def carshare(): """ -Each row represents the availability of car-sharing services near the centroid of a zone -in Montreal over a month-long period. + Each row represents the availability of car-sharing services near the centroid of a zone + in Montreal over a month-long period. -Returns: - A `pandas.DataFrame` with 249 rows and the following columns: - `['centroid_lat', 'centroid_lon', 'car_hours', 'peak_hour']`. 
-""" + Returns: + A `pandas.DataFrame` with 249 rows and the following columns: + `['centroid_lat', 'centroid_lon', 'car_hours', 'peak_hour']`.""" return _get_dataset("carshare") -def stocks(indexed=False): +def stocks(indexed=False, datetimes=False): """ -Each row in this wide dataset represents closing prices from 6 tech stocks in 2018/2019. - -Returns: - A `pandas.DataFrame` with 100 rows and the following columns: - `['date', 'GOOG', 'AAPL', 'AMZN', 'FB', 'NFLX', 'MSFT']`. - If `indexed` is True, the 'date' column is used as the index and the column index - is named 'company' -""" + Each row in this wide dataset represents closing prices from 6 tech stocks in 2018/2019. + + Returns: + A `pandas.DataFrame` with 100 rows and the following columns: + `['date', 'GOOG', 'AAPL', 'AMZN', 'FB', 'NFLX', 'MSFT']`. + If `indexed` is True, the 'date' column is used as the index and the column index + is named 'company' + If `datetimes` is True, the 'date' column will be a datetime column""" df = _get_dataset("stocks") + if datetimes: + df["date"] = df["date"].astype("datetime64[ns]") if indexed: df = df.set_index("date") df.columns.name = "company" @@ -121,15 +127,14 @@ def stocks(indexed=False): def experiment(indexed=False): """ -Each row in this wide dataset represents the results of 100 simulated participants -on three hypothetical experiments, along with their gender and control/treatment group. + Each row in this wide dataset represents the results of 100 simulated participants + on three hypothetical experiments, along with their gender and control/treatment group. -Returns: - A `pandas.DataFrame` with 100 rows and the following columns: - `['experiment_1', 'experiment_2', 'experiment_3', 'gender', 'group']`. - If `indexed` is True, the data frame index is named "participant" -""" + Returns: + A `pandas.DataFrame` with 100 rows and the following columns: + `['experiment_1', 'experiment_2', 'experiment_3', 'gender', 'group']`. 
+ If `indexed` is True, the data frame index is named "participant" """ df = _get_dataset("experiment") if indexed: df.index.name = "participant" @@ -138,15 +143,14 @@ def experiment(indexed=False): def medals_wide(indexed=False): """ -This dataset represents the medal table for Olympic Short Track Speed Skating for the -top three nations as of 2020. - -Returns: - A `pandas.DataFrame` with 3 rows and the following columns: - `['nation', 'gold', 'silver', 'bronze']`. - If `indexed` is True, the 'nation' column is used as the index and the column index - is named 'medal' -""" + This dataset represents the medal table for Olympic Short Track Speed Skating for the + top three nations as of 2020. + + Returns: + A `pandas.DataFrame` with 3 rows and the following columns: + `['nation', 'gold', 'silver', 'bronze']`. + If `indexed` is True, the 'nation' column is used as the index and the column index + is named 'medal'""" df = _get_dataset("medals") if indexed: df = df.set_index("nation") @@ -156,14 +160,13 @@ def medals_wide(indexed=False): def medals_long(indexed=False): """ -This dataset represents the medal table for Olympic Short Track Speed Skating for the -top three nations as of 2020. + This dataset represents the medal table for Olympic Short Track Speed Skating for the + top three nations as of 2020. -Returns: - A `pandas.DataFrame` with 9 rows and the following columns: - `['nation', 'medal', 'count']`. - If `indexed` is True, the 'nation' column is used as the index. -""" + Returns: + A `pandas.DataFrame` with 9 rows and the following columns: + `['nation', 'medal', 'count']`. 
+ If `indexed` is True, the 'nation' column is used as the index.""" df = _get_dataset("medals").melt( id_vars=["nation"], value_name="count", var_name="medal" ) diff --git a/packages/python/plotly/plotly/package_data/datasets/gapminder.csv.gz b/packages/python/plotly/plotly/package_data/datasets/gapminder.csv.gz index a0797e5249..f673f5b03b 100644 Binary files a/packages/python/plotly/plotly/package_data/datasets/gapminder.csv.gz and b/packages/python/plotly/plotly/package_data/datasets/gapminder.csv.gz differ