Skip to content

Commit

Permalink
feat(clean): add multiple clean functions for number types
Browse files Browse the repository at this point in the history
  • Loading branch information
NoirTree authored and qidanrui committed Sep 20, 2021
1 parent 13032fe commit 3c05be5
Show file tree
Hide file tree
Showing 132 changed files with 22,019 additions and 578 deletions.
527 changes: 526 additions & 1 deletion dataprep/clean/__init__.py

Large diffs are not rendered by default.

161 changes: 161 additions & 0 deletions dataprep/clean/clean_au_abn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
"""
Clean and validate a DataFrame column containing Australian Business Numbers (ABNs).
"""
# pylint: disable=too-many-lines, too-many-arguments, too-many-branches
from typing import Any, Union
from operator import itemgetter

import dask.dataframe as dd
import numpy as np
import pandas as pd

from stdnum.au import abn
from ..progress_bar import ProgressBar
from .utils import NULL_VALUES, to_dask


def clean_au_abn(
    df: Union[pd.DataFrame, dd.DataFrame],
    column: str,
    output_format: str = "standard",
    inplace: bool = False,
    errors: str = "coerce",
    progress: bool = True,
) -> pd.DataFrame:
    """
    Standardize the Australian Business Numbers (ABNs) stored in a DataFrame column.

    Parameters
    ----------
    df
        A pandas or Dask DataFrame containing the data to be cleaned.
    column
        The name of the column containing data of ABN type.
    output_format
        The output format of the standardized number string.
            - 'compact': string without any separators or whitespace.
            - 'standard': string with proper separators and whitespace.

        (default: "standard")
    inplace
        If True, the original column is overwritten with the cleaned values
        and renamed to ``<column>_clean``.
        Otherwise, the original column is kept and ``<column>_clean`` is added.

        (default: False)
    errors
        How to handle values that fail validation.
            - 'coerce': invalid values are set to NaN.
            - 'ignore': invalid values are returned as given (in string form).
            - 'raise': invalid values raise an exception.

        (default: 'coerce')
    progress
        If True, display a progress bar.

        (default: True)

    Returns
    -------
    The computed DataFrame holding the cleaned column.

    Raises
    ------
    ValueError
        If ``output_format`` is not one of the supported formats, or if
        ``errors`` is 'raise' and a value cannot be validated.

    Examples
    --------
    Clean a column of ABN data.

    >>> df = pd.DataFrame({
            "abn": [
                "51824753556",
                "99999999999",]
            })
    >>> clean_au_abn(df, 'abn')
                abn       abn_clean
    0   51824753556  51 824 753 556
    1   99999999999             NaN
    """
    if output_format not in {"compact", "standard"}:
        raise ValueError(
            f"output_format {output_format} is invalid. " 'It needs to be "compact" or "standard".'
        )

    clean_name = f"{column}_clean"

    def _clean_partition(srs: Any) -> Any:
        # _format wraps every result in a one-element list.
        return [_format(value, output_format, errors) for value in srs]

    # All work happens lazily on a Dask DataFrame until compute() below.
    df = to_dask(df)

    # Stage the wrapped results in a scratch column, unwrap them into the
    # "<column>_clean" column, then discard the scratch column.
    df["clean_code_tup"] = df[column].map_partitions(_clean_partition, meta=object)
    df = (
        df.assign(_temp_=df["clean_code_tup"].map(itemgetter(0)))
        .rename(columns={"_temp_": clean_name})
        .drop(columns=["clean_code_tup"])
    )

    if inplace:
        # Overwrite the source column with the cleaned values and let it take
        # over the "<column>_clean" name.
        df[column] = df[clean_name]
        df = df.drop(columns=clean_name).rename(columns={column: clean_name})

    with ProgressBar(minimum=1, disable=not progress):
        computed = df.compute()

    return computed


def validate_au_abn(
    df: Union[str, pd.Series, dd.Series, pd.DataFrame, dd.DataFrame],
    column: str = "",
) -> Union[bool, pd.Series, pd.DataFrame]:
    """
    Check whether values are valid Australian Business Numbers (ABNs).

    Parameters
    ----------
    df
        The data to validate: a single value, a pandas or Dask Series,
        or a pandas or Dask DataFrame.
    column
        For DataFrame input, the name of the column to validate.
        When left empty, every cell of the DataFrame is validated.

    Returns
    -------
    A single boolean for a single value, otherwise a Series or DataFrame
    with True or False for each validated cell.
    """
    check = abn.is_valid

    if isinstance(df, (pd.DataFrame, dd.DataFrame)):
        if column == "":
            # No column selected: validate the whole frame cell by cell.
            return df.applymap(check)
        return df[column].apply(check)

    if isinstance(df, (pd.Series, dd.Series)):
        return df.apply(check)

    # Anything else is treated as a single value.
    return check(df)


def _format(val: Any, output_format: str = "standard", errors: str = "coerce") -> Any:
    """
    Reformat a number string with proper separators and whitespace.

    Parameters
    ----------
    val
        The value of number string. It is converted with ``str`` before
        any check is applied.
    output_format
        If output_format = 'compact', return string without any separators or whitespace.
        If output_format = 'standard', return string with proper separators and whitespace.
    errors
        How to handle a value that fails ABN validation.
            - 'raise': raise a ValueError.
            - 'ignore': return the value in its string form.
            - any other value (e.g. 'coerce'): return NaN.

    Returns
    -------
    A one-element list holding the formatted number, NaN, or the string
    form of the input. The caller unwraps it with ``itemgetter(0)``.

    Raises
    ------
    ValueError
        If the value is invalid and ``errors`` is 'raise', or if a valid
        value is requested in an unsupported ``output_format``.
    """
    val = str(val)

    # Null-like values are never reported as errors, whatever `errors` is.
    if val in NULL_VALUES:
        return [np.nan]

    if not validate_au_abn(val):
        if errors == "raise":
            raise ValueError(f"Unable to parse value {val}")
        return [val if errors == "ignore" else np.nan]

    if output_format == "compact":
        return [abn.compact(val)]
    if output_format == "standard":
        return [abn.format(val)]

    # Fail loudly instead of returning an empty list, which would only
    # surface later as an IndexError when the caller unwraps the result.
    raise ValueError(
        f"output_format {output_format} is invalid. " 'It needs to be "compact" or "standard".'
    )
165 changes: 165 additions & 0 deletions dataprep/clean/clean_au_acn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
"""
Clean and validate a DataFrame column containing Australian Company Numbers (ACNs).
"""
# pylint: disable=too-many-lines, too-many-arguments, too-many-branches
from typing import Any, Union
from operator import itemgetter

import dask.dataframe as dd
import numpy as np
import pandas as pd

from stdnum.au import acn
from ..progress_bar import ProgressBar
from .utils import NULL_VALUES, to_dask


def clean_au_acn(
    df: Union[pd.DataFrame, dd.DataFrame],
    column: str,
    output_format: str = "standard",
    inplace: bool = False,
    errors: str = "coerce",
    progress: bool = True,
) -> pd.DataFrame:
    """
    Standardize the Australian Company Numbers (ACNs) stored in a DataFrame column.

    Parameters
    ----------
    df
        A pandas or Dask DataFrame containing the data to be cleaned.
    column
        The name of the column containing data of ACN type.
    output_format
        The output format of the standardized number string.
            - 'compact': string without any separators or whitespace.
            - 'standard': string with proper separators and whitespace.
            - 'abn': convert the number to an Australian Business Number (ABN).

        (default: "standard")
    inplace
        If True, the original column is overwritten with the cleaned values
        and renamed to ``<column>_clean``.
        Otherwise, the original column is kept and ``<column>_clean`` is added.

        (default: False)
    errors
        How to handle values that fail validation.
            - 'coerce': invalid values are set to NaN.
            - 'ignore': invalid values are returned as given (in string form).
            - 'raise': invalid values raise an exception.

        (default: 'coerce')
    progress
        If True, display a progress bar.

        (default: True)

    Returns
    -------
    The computed DataFrame holding the cleaned column.

    Raises
    ------
    ValueError
        If ``output_format`` is not one of the supported formats, or if
        ``errors`` is 'raise' and a value cannot be validated.

    Examples
    --------
    Clean a column of ACN data.

    >>> df = pd.DataFrame({
            "acn": [
                "004085616",
                "999 999 999"]
            })
    >>> clean_au_acn(df, 'acn')
                acn    acn_clean
    0     004085616  004 085 616
    1   999 999 999          NaN
    """
    if output_format not in {"compact", "standard", "abn"}:
        raise ValueError(
            f"output_format {output_format} is invalid. "
            'It needs to be "compact", "standard" or "abn".'
        )

    clean_name = f"{column}_clean"

    def _clean_partition(srs: Any) -> Any:
        # _format wraps every result in a one-element list.
        return [_format(value, output_format, errors) for value in srs]

    # All work happens lazily on a Dask DataFrame until compute() below.
    df = to_dask(df)

    # Stage the wrapped results in a scratch column, unwrap them into the
    # "<column>_clean" column, then discard the scratch column.
    df["clean_code_tup"] = df[column].map_partitions(_clean_partition, meta=object)
    df = (
        df.assign(_temp_=df["clean_code_tup"].map(itemgetter(0)))
        .rename(columns={"_temp_": clean_name})
        .drop(columns=["clean_code_tup"])
    )

    if inplace:
        # Overwrite the source column with the cleaned values and let it take
        # over the "<column>_clean" name.
        df[column] = df[clean_name]
        df = df.drop(columns=clean_name).rename(columns={column: clean_name})

    with ProgressBar(minimum=1, disable=not progress):
        computed = df.compute()

    return computed


def validate_au_acn(
    df: Union[str, pd.Series, dd.Series, pd.DataFrame, dd.DataFrame],
    column: str = "",
) -> Union[bool, pd.Series, pd.DataFrame]:
    """
    Check whether values are valid Australian Company Numbers (ACNs).

    Parameters
    ----------
    df
        The data to validate: a single value, a pandas or Dask Series,
        or a pandas or Dask DataFrame.
    column
        For DataFrame input, the name of the column to validate.
        When left empty, every cell of the DataFrame is validated.

    Returns
    -------
    A single boolean for a single value, otherwise a Series or DataFrame
    with True or False for each validated cell.
    """
    check = acn.is_valid

    if isinstance(df, (pd.DataFrame, dd.DataFrame)):
        if column == "":
            # No column selected: validate the whole frame cell by cell.
            return df.applymap(check)
        return df[column].apply(check)

    if isinstance(df, (pd.Series, dd.Series)):
        return df.apply(check)

    # Anything else is treated as a single value.
    return check(df)


def _format(val: Any, output_format: str = "standard", errors: str = "coerce") -> Any:
    """
    Reformat a number string with proper separators and whitespace.

    Parameters
    ----------
    val
        The value of number string. It is converted with ``str`` before
        any check is applied.
    output_format
        If output_format = 'compact', return string without any separators or whitespace.
        If output_format = 'standard', return string with proper separators and whitespace.
        If output_format = 'abn', convert the number to an Australian Business Number (ABN).
    errors
        How to handle a value that fails ACN validation.
            - 'raise': raise a ValueError.
            - 'ignore': return the value in its string form.
            - any other value (e.g. 'coerce'): return NaN.

    Returns
    -------
    A one-element list holding the formatted number, NaN, or the string
    form of the input. The caller unwraps it with ``itemgetter(0)``.

    Raises
    ------
    ValueError
        If the value is invalid and ``errors`` is 'raise', or if a valid
        value is requested in an unsupported ``output_format``.
    """
    val = str(val)

    # Null-like values are never reported as errors, whatever `errors` is.
    if val in NULL_VALUES:
        return [np.nan]

    if not validate_au_acn(val):
        if errors == "raise":
            raise ValueError(f"Unable to parse value {val}")
        return [val if errors == "ignore" else np.nan]

    if output_format == "compact":
        return [acn.compact(val)]
    if output_format == "standard":
        return [acn.format(val)]
    if output_format == "abn":
        return [acn.to_abn(val)]

    # Fail with a meaningful error instead of the UnboundLocalError that an
    # unassigned result variable would produce.
    raise ValueError(
        f"output_format {output_format} is invalid. "
        'It needs to be "compact", "standard" or "abn".'
    )
Loading

0 comments on commit 3c05be5

Please sign in to comment.