Skip to content

Commit

Permalink
feat(eda): add get_db_names
Browse files Browse the repository at this point in the history
  • Loading branch information
jinglinpeng committed Apr 21, 2022
1 parent ec857e7 commit a7bf820
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 5 deletions.
4 changes: 2 additions & 2 deletions dataprep/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""This module implements load dataset related functions"""

from ._base import load_dataset, _load_dataset_as_dask, load_db
from ._base import get_dataset_names
from ._base import get_dataset_names, get_db_names

__all__ = ["load_dataset", "get_dataset_names", "_load_dataset_as_dask", "load_db"]
__all__ = ["load_dataset", "get_dataset_names", "_load_dataset_as_dask", "load_db", "get_db_names"]
28 changes: 26 additions & 2 deletions dataprep/datasets/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,26 @@ def get_dataset_names() -> List[str]:
return datasets


def get_db_names() -> List[str]:
    """
    Get all available database names.

    The names are the file names of the '.db' files found in the 'database'
    folder located next to this module, with the '.db' extension removed.

    Returns
    -------
    db_names: list
        A sorted list of all available database names.
    """
    module_path = dirname(__file__)
    # Use os.path.join rather than string formatting: with an empty
    # module_path an f-string would yield the absolute path "/database".
    files = os.listdir(os.path.join(module_path, "database"))

    # Keep only the '.db' files and strip the extension to get the names.
    # os.listdir returns entries in arbitrary order, so sort the result to
    # keep it stable across platforms and runs.
    return sorted(os.path.splitext(f)[0] for f in files if f.endswith(".db"))


def _get_dataset_path(name: str) -> str:
"""
Given a dataset name, output the file path.
Expand Down Expand Up @@ -80,20 +100,24 @@ def load_dataset(name: str) -> pd.DataFrame:
return df


def load_db(file_name: str) -> Engine:
def load_db(name: str) -> Engine:
"""
Load a database file
Parameters
----------
file_name: str
name: str
Name of the database file
Returns
-------
db_url : str
SQLite url
"""
file_name = name.lower()
if not file_name.endswith(".db"):
file_name += ".db"

db_file_path = str(
os.path.join(os.path.dirname(os.path.abspath(__file__)), "database", file_name)
)
Expand Down
13 changes: 12 additions & 1 deletion dataprep/tests/datasets/test_datasets.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,27 @@
"""
module for testing the functions inside datasets
"""
from ...datasets import get_dataset_names, load_dataset
from ...datasets import get_dataset_names, get_db_names, load_dataset, load_db


def test_get_dataset_names() -> None:
    """Check that at least one dataset name is reported."""
    available = get_dataset_names()
    count = len(available)
    assert count > 0


def test_get_db_names() -> None:
    """Check that at least one database name is reported."""
    available = get_db_names()
    count = len(available)
    assert count > 0


def test_load_dataset() -> None:
    """Check that every listed dataset loads into a non-empty result."""
    for dataset_name in get_dataset_names():
        frame = load_dataset(dataset_name)
        assert len(frame) > 0


def test_load_db() -> None:
    """Check that every listed database can be loaded by name."""
    db_names = get_db_names()
    for name in db_names:
        db = load_db(name)
        # load_db is annotated to return an Engine; make sure the call
        # actually produced an object instead of discarding the result.
        assert db is not None

1 comment on commit a7bf820

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DataPrep.EDA Benchmarks

Benchmark suite Current: a7bf820 Previous: ec857e7 Ratio
dataprep/tests/benchmarks/eda.py::test_create_report 0.2320414745737631 iter/sec (stddev: 0.10895920969644982) 0.17477829486814497 iter/sec (stddev: 0.10564637467677622) 0.75

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.