Skip to content

Commit

Permalink
Merge pull request #3305 from plotly/extended_data
Browse files Browse the repository at this point in the history
add some extra options to various demo datasets
  • Loading branch information
nicolaskruchten authored Jul 17, 2021
2 parents 09bbd75 + ee03bcc commit 612d3f9
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 76 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ This project adheres to [Semantic Versioning](http://semver.org/).

## UNRELEASED


### Added
- Extra flags were added to the `gapminder` and `stocks` datasets to facilitate testing, documentation and demos [#3305](https://github.com/plotly/plotly.py/issues/3305)

### Fixed
- Fixed regression introduced in version 5.0.0 where pandas/numpy arrays with `dtype` of Object were being converted to `list` values when added to a Figure ([#3292](https://github.com/plotly/plotly.py/issues/3292), [#3293](https://github.com/plotly/plotly.py/pull/3293))

Expand Down
155 changes: 79 additions & 76 deletions packages/python/plotly/plotly/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,78 +3,83 @@
"""


def gapminder(datetimes=False, centroids=False, year=None):
    """
    Each row represents a country on a given year.

    https://www.gapminder.org/data/

    Args:
        datetimes: if True, the 'year' column is converted to a datetime
            column (each year becomes January 1st of that year).
        centroids: if True, the 'centroid_lat' and 'centroid_lon' columns
            are kept; otherwise they are dropped from the result.
        year: if not None, only the rows whose 'year' equals this integer
            are returned.

    Returns:
        A `pandas.DataFrame` with 1704 rows (fewer when `year` is given)
        and the following columns:
        `['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap',
        'iso_alpha', 'iso_num']`.
        If `datetimes` is True, the 'year' column will be a datetime column
        If `centroids` is True, two new columns are added: ['centroid_lat', 'centroid_lon']
        If `year` is an integer, the dataset will be filtered for that year
    """
    df = _get_dataset("gapminder")

    # Filter BEFORE the optional datetime conversion: once 'year' has been
    # turned into datetimes, comparing it against an integer year no longer
    # selects the requested rows.
    if year is not None:
        df = df[df["year"] == year]

    # The steps below build new frames instead of mutating `df` in place, so
    # they are safe on the filtered selection above and never modify the
    # object handed back by `_get_dataset`.
    if datetimes:
        as_dates = (df["year"].astype(str) + "-01-01").astype("datetime64[ns]")
        df = df.assign(year=as_dates)
    if not centroids:
        df = df.drop(["centroid_lat", "centroid_lon"], axis=1)
    return df


def tips():
    """
    Return the restaurant tipping dataset; each row represents a restaurant bill.

    https://vincentarelbundock.github.io/Rdatasets/doc/reshape2/tips.html

    Returns:
        A `pandas.DataFrame` with 244 rows and the following columns:
        `['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']`.
    """
    bills = _get_dataset("tips")
    return bills


def iris():
    """
    Return the iris flower dataset; each row represents a flower.

    https://en.wikipedia.org/wiki/Iris_flower_data_set

    Returns:
        A `pandas.DataFrame` with 150 rows and the following columns:
        `['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species', 'species_id']`.
    """
    flowers = _get_dataset("iris")
    return flowers


def wind():
    """
    Return the wind dataset; each row represents a level of wind intensity
    in a cardinal direction, and its frequency.

    Returns:
        A `pandas.DataFrame` with 128 rows and the following columns:
        `['direction', 'strength', 'frequency']`.
    """
    readings = _get_dataset("wind")
    return readings


def election():
    """
    Return the election dataset; each row represents voting results for an
    electoral district in the 2013 Montreal mayoral election.

    Returns:
        A `pandas.DataFrame` with 58 rows and the following columns:
        `['district', 'Coderre', 'Bergeron', 'Joly', 'total', 'winner', 'result', 'district_id']`.
    """
    results = _get_dataset("election")
    return results


def election_geojson():
"""
Each feature represents an electoral district in the 2013 Montreal mayoral election.
Each feature represents an electoral district in the 2013 Montreal mayoral election.
Returns:
A GeoJSON-formatted `dict` with 58 polygon or multi-polygon features whose `id`
is an electoral district numerical ID and whose `district` property is the ID and
district name.
"""
Returns:
A GeoJSON-formatted `dict` with 58 polygon or multi-polygon features whose `id`
is an electoral district numerical ID and whose `district` property is the ID and
district name."""
import gzip
import json
import os
Expand All @@ -92,27 +97,28 @@ def election_geojson():

def carshare():
    """
    Return the car-sharing dataset; each row represents the availability of
    car-sharing services near the centroid of a zone in Montreal over a
    month-long period.

    Returns:
        A `pandas.DataFrame` with 249 rows and the following columns:
        `['centroid_lat', 'centroid_lon', 'car_hours', 'peak_hour']`.
    """
    zones = _get_dataset("carshare")
    return zones


def stocks(indexed=False):
def stocks(indexed=False, datetimes=False):
"""
Each row in this wide dataset represents closing prices from 6 tech stocks in 2018/2019.
Returns:
A `pandas.DataFrame` with 100 rows and the following columns:
`['date', 'GOOG', 'AAPL', 'AMZN', 'FB', 'NFLX', 'MSFT']`.
If `indexed` is True, the 'date' column is used as the index and the column index
is named 'company'
"""
Each row in this wide dataset represents closing prices from 6 tech stocks in 2018/2019.
Returns:
A `pandas.DataFrame` with 100 rows and the following columns:
`['date', 'GOOG', 'AAPL', 'AMZN', 'FB', 'NFLX', 'MSFT']`.
If `indexed` is True, the 'date' column is used as the index and the column index
is named 'company'
If `datetimes` is True, the 'date' column will be a datetime column"""
df = _get_dataset("stocks")
if datetimes:
df["date"] = df["date"].astype("datetime64[ns]")
if indexed:
df = df.set_index("date")
df.columns.name = "company"
Expand All @@ -121,15 +127,14 @@ def stocks(indexed=False):

def experiment(indexed=False):
"""
Each row in this wide dataset represents the results of 100 simulated participants
on three hypothetical experiments, along with their gender and control/treatment group.
Each row in this wide dataset represents the results of 100 simulated participants
on three hypothetical experiments, along with their gender and control/treatment group.
Returns:
A `pandas.DataFrame` with 100 rows and the following columns:
`['experiment_1', 'experiment_2', 'experiment_3', 'gender', 'group']`.
If `indexed` is True, the data frame index is named "participant"
"""
Returns:
A `pandas.DataFrame` with 100 rows and the following columns:
`['experiment_1', 'experiment_2', 'experiment_3', 'gender', 'group']`.
If `indexed` is True, the data frame index is named "participant" """
df = _get_dataset("experiment")
if indexed:
df.index.name = "participant"
Expand All @@ -138,15 +143,14 @@ def experiment(indexed=False):

def medals_wide(indexed=False):
"""
This dataset represents the medal table for Olympic Short Track Speed Skating for the
top three nations as of 2020.
Returns:
A `pandas.DataFrame` with 3 rows and the following columns:
`['nation', 'gold', 'silver', 'bronze']`.
If `indexed` is True, the 'nation' column is used as the index and the column index
is named 'medal'
"""
This dataset represents the medal table for Olympic Short Track Speed Skating for the
top three nations as of 2020.
Returns:
A `pandas.DataFrame` with 3 rows and the following columns:
`['nation', 'gold', 'silver', 'bronze']`.
If `indexed` is True, the 'nation' column is used as the index and the column index
is named 'medal'"""
df = _get_dataset("medals")
if indexed:
df = df.set_index("nation")
Expand All @@ -156,14 +160,13 @@ def medals_wide(indexed=False):

def medals_long(indexed=False):
"""
This dataset represents the medal table for Olympic Short Track Speed Skating for the
top three nations as of 2020.
This dataset represents the medal table for Olympic Short Track Speed Skating for the
top three nations as of 2020.
Returns:
A `pandas.DataFrame` with 9 rows and the following columns:
`['nation', 'medal', 'count']`.
If `indexed` is True, the 'nation' column is used as the index.
"""
Returns:
A `pandas.DataFrame` with 9 rows and the following columns:
`['nation', 'medal', 'count']`.
If `indexed` is True, the 'nation' column is used as the index."""
df = _get_dataset("medals").melt(
id_vars=["nation"], value_name="count", var_name="medal"
)
Expand Down
Binary file not shown.

0 comments on commit 612d3f9

Please sign in to comment.