-
Notifications
You must be signed in to change notification settings - Fork 206
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(clean): add multiple clean functions for number types
- Loading branch information
Showing
132 changed files
with
22,019 additions
and
578 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,161 @@ | ||
""" | ||
Clean and validate a DataFrame column containing Australian Business Numbers (ABNs). | ||
""" | ||
# pylint: disable=too-many-lines, too-many-arguments, too-many-branches | ||
from typing import Any, Union | ||
from operator import itemgetter | ||
|
||
import dask.dataframe as dd | ||
import numpy as np | ||
import pandas as pd | ||
|
||
from stdnum.au import abn | ||
from ..progress_bar import ProgressBar | ||
from .utils import NULL_VALUES, to_dask | ||
|
||
|
||
def clean_au_abn(
    df: Union[pd.DataFrame, dd.DataFrame],
    column: str,
    output_format: str = "standard",
    inplace: bool = False,
    errors: str = "coerce",
    progress: bool = True,
) -> pd.DataFrame:
    """
    Clean Australian Business Numbers (ABNs) type data in a DataFrame column.

    Parameters
    ----------
    df
        A pandas or Dask DataFrame containing the data to be cleaned.
    column
        The name of the column containing data of ABN type.
    output_format
        The output format of standardized number string.
            - 'compact': return string without any separators or whitespace.
            - 'standard': return string with proper separators and whitespace.

        (default: "standard")
    inplace
        If True, delete the column containing the data that was cleaned, so that
        only the ``<column>_clean`` column remains (in the original column's place).
        Otherwise, keep the original column.

        (default: False)
    errors
        How to handle parsing errors.
            - 'coerce': invalid parsing will be set to NaN.
            - 'ignore': invalid parsing will return the input (converted to a string).
            - 'raise': invalid parsing will raise an exception.

        (default: 'coerce')
    progress
        If True, display a progress bar.

        (default: True)

    Returns
    -------
    pd.DataFrame
        The computed DataFrame with the cleaned values stored in ``<column>_clean``.

    Raises
    ------
    ValueError
        If ``output_format`` or ``errors`` is not one of the supported options, or,
        with ``errors='raise'``, if a value fails ABN validation.

    Examples
    --------
    Clean a column of ABN data.

    >>> df = pd.DataFrame({
            "abn": [
            "51824753556",
            "99999999999",]
            })
    >>> clean_au_abn(df, 'abn')
        abn             abn_clean
    0   51824753556     51 824 753 556
    1   99999999999     NaN
    """

    if output_format not in {"compact", "standard"}:
        raise ValueError(
            f"output_format {output_format} is invalid. " 'It needs to be "compact" or "standard".'
        )

    # Fail fast on a misspelled option instead of silently treating it as "coerce".
    if errors not in {"coerce", "ignore", "raise"}:
        raise ValueError(
            f"errors {errors} is invalid. " 'It needs to be "coerce", "ignore" or "raise".'
        )

    # convert to dask
    df = to_dask(df)

    clean_column = f"{column}_clean"

    # _format wraps every cleaned value in a one-element list; store those lists in
    # a temporary "clean_code_tup" column, then unwrap the first item of each list
    # into the output column and discard the temporary column.
    df["clean_code_tup"] = df[column].map_partitions(
        lambda srs: [_format(x, output_format, errors) for x in srs],
        meta=object,
    )

    df = df.assign(
        _temp_=df["clean_code_tup"].map(itemgetter(0)),
    )

    df = df.rename(columns={"_temp_": clean_column})

    df = df.drop(columns=["clean_code_tup"])

    if inplace:
        # Overwrite the original column with the cleaned values and rename it, so
        # the cleaned column ends up where the original column was.
        df[column] = df[clean_column]
        df = df.drop(columns=clean_column)
        df = df.rename(columns={column: clean_column})

    with ProgressBar(minimum=1, disable=not progress):
        df = df.compute()

    return df
|
||
|
||
def validate_au_abn(
    df: Union[str, pd.Series, dd.Series, pd.DataFrame, dd.DataFrame],
    column: str = "",
) -> Union[bool, pd.Series, pd.DataFrame]:
    """
    Check whether values are valid Australian Business Numbers (ABNs).

    Parameters
    ----------
    df
        The data to validate: a single value, a pandas or Dask Series,
        or a pandas or Dask DataFrame.
    column
        Only used when ``df`` is a DataFrame. If given, only that column is
        validated; if left empty, every cell of the DataFrame is validated.

    Returns
    -------
    bool, Series or DataFrame
        A single boolean for a single value, otherwise the element-wise
        results of the check in a Series or DataFrame.
    """
    check = abn.is_valid

    if isinstance(df, (pd.DataFrame, dd.DataFrame)):
        if column == "":
            # No column selected: validate the whole frame cell by cell.
            return df.applymap(check)
        return df[column].apply(check)

    if isinstance(df, (pd.Series, dd.Series)):
        return df.apply(check)

    # Anything else is treated as a single value.
    return check(df)
|
||
|
||
def _format(val: Any, output_format: str = "standard", errors: str = "coerce") -> Any:
    """
    Reformat a number string with proper separators and whitespace.

    Parameters
    ----------
    val
        The value of number string.
    output_format
        If output_format = 'compact', return string without any separators or whitespace.
        If output_format = 'standard', return string with proper separators and whitespace.
    errors
        How to handle a value that fails ABN validation: 'raise' raises a
        ValueError, 'ignore' returns the input as a string, and any other
        value (normally 'coerce') yields NaN.

    Returns
    -------
    list
        A one-element list holding the formatted number, the stringified input
        (errors='ignore'), or NaN (null input or coerced invalid input).

    Raises
    ------
    ValueError
        If the value is invalid and errors='raise', or if ``output_format`` is
        not supported.
    """
    val = str(val)

    if val in NULL_VALUES:
        return [np.nan]

    if not validate_au_abn(val):
        if errors == "raise":
            raise ValueError(f"Unable to parse value {val}")
        return [val if errors == "ignore" else np.nan]

    if output_format == "compact":
        return [abn.compact(val)]
    if output_format == "standard":
        return [abn.format(val)]

    # An empty list here would only fail later when the first item is extracted,
    # so report the unsupported format directly.
    raise ValueError(
        f"output_format {output_format} is invalid. " 'It needs to be "compact" or "standard".'
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,165 @@ | ||
""" | ||
Clean and validate a DataFrame column containing Australian Company Numbers (ACNs). | ||
""" | ||
# pylint: disable=too-many-lines, too-many-arguments, too-many-branches | ||
from typing import Any, Union | ||
from operator import itemgetter | ||
|
||
import dask.dataframe as dd | ||
import numpy as np | ||
import pandas as pd | ||
|
||
from stdnum.au import acn | ||
from ..progress_bar import ProgressBar | ||
from .utils import NULL_VALUES, to_dask | ||
|
||
|
||
def clean_au_acn(
    df: Union[pd.DataFrame, dd.DataFrame],
    column: str,
    output_format: str = "standard",
    inplace: bool = False,
    errors: str = "coerce",
    progress: bool = True,
) -> pd.DataFrame:
    """
    Clean Australian Company Numbers (ACNs) type data in a DataFrame column.

    Parameters
    ----------
    df
        A pandas or Dask DataFrame containing the data to be cleaned.
    column
        The name of the column containing data of ACN type.
    output_format
        The output format of standardized number string.
            - 'compact': return string without any separators or whitespace.
            - 'standard': return string with proper separators and whitespace.
            - 'abn': convert the number to an Australian Business Number (ABN).

        (default: "standard")
    inplace
        If True, delete the column containing the data that was cleaned, so that
        only the ``<column>_clean`` column remains (in the original column's place).
        Otherwise, keep the original column.

        (default: False)
    errors
        How to handle parsing errors.
            - 'coerce': invalid parsing will be set to NaN.
            - 'ignore': invalid parsing will return the input (converted to a string).
            - 'raise': invalid parsing will raise an exception.

        (default: 'coerce')
    progress
        If True, display a progress bar.

        (default: True)

    Returns
    -------
    pd.DataFrame
        The computed DataFrame with the cleaned values stored in ``<column>_clean``.

    Raises
    ------
    ValueError
        If ``output_format`` or ``errors`` is not one of the supported options, or,
        with ``errors='raise'``, if a value fails ACN validation.

    Examples
    --------
    Clean a column of ACN data.

    >>> df = pd.DataFrame({
            "acn": [
            "004085616",
            "999 999 999"]
            })
    >>> clean_au_acn(df, 'acn')
        acn             acn_clean
    0   004085616       004 085 616
    1   999 999 999     NaN
    """

    if output_format not in {"compact", "standard", "abn"}:
        raise ValueError(
            f"output_format {output_format} is invalid. "
            'It needs to be "compact", "standard" or "abn".'
        )

    # Fail fast on a misspelled option instead of silently treating it as "coerce".
    if errors not in {"coerce", "ignore", "raise"}:
        raise ValueError(
            f"errors {errors} is invalid. " 'It needs to be "coerce", "ignore" or "raise".'
        )

    # convert to dask
    df = to_dask(df)

    clean_column = f"{column}_clean"

    # _format wraps every cleaned value in a one-element list; store those lists in
    # a temporary "clean_code_tup" column, then unwrap the first item of each list
    # into the output column and discard the temporary column.
    df["clean_code_tup"] = df[column].map_partitions(
        lambda srs: [_format(x, output_format, errors) for x in srs],
        meta=object,
    )

    df = df.assign(
        _temp_=df["clean_code_tup"].map(itemgetter(0)),
    )

    df = df.rename(columns={"_temp_": clean_column})

    df = df.drop(columns=["clean_code_tup"])

    if inplace:
        # Overwrite the original column with the cleaned values and rename it, so
        # the cleaned column ends up where the original column was.
        df[column] = df[clean_column]
        df = df.drop(columns=clean_column)
        df = df.rename(columns={column: clean_column})

    with ProgressBar(minimum=1, disable=not progress):
        df = df.compute()

    return df
|
||
|
||
def validate_au_acn(
    df: Union[str, pd.Series, dd.Series, pd.DataFrame, dd.DataFrame],
    column: str = "",
) -> Union[bool, pd.Series, pd.DataFrame]:
    """
    Check whether values are valid Australian Company Numbers (ACNs).

    Parameters
    ----------
    df
        The data to validate: a single value, a pandas or Dask Series,
        or a pandas or Dask DataFrame.
    column
        Only used when ``df`` is a DataFrame. If given, only that column is
        validated; if left empty, every cell of the DataFrame is validated.

    Returns
    -------
    bool, Series or DataFrame
        A single boolean for a single value, otherwise the element-wise
        results of the check in a Series or DataFrame.
    """
    check = acn.is_valid

    if isinstance(df, (pd.DataFrame, dd.DataFrame)):
        if column == "":
            # No column selected: validate the whole frame cell by cell.
            return df.applymap(check)
        return df[column].apply(check)

    if isinstance(df, (pd.Series, dd.Series)):
        return df.apply(check)

    # Anything else is treated as a single value.
    return check(df)
|
||
|
||
def _format(val: Any, output_format: str = "standard", errors: str = "coerce") -> Any:
    """
    Reformat a number string with proper separators and whitespace.

    Parameters
    ----------
    val
        The value of number string.
    output_format
        If output_format = 'compact', return string without any separators or whitespace.
        If output_format = 'standard', return string with proper separators and whitespace.
        If output_format = 'abn', convert the number to an Australian Business Number (ABN).
    errors
        How to handle a value that fails ACN validation: 'raise' raises a
        ValueError, 'ignore' returns the input as a string, and any other
        value (normally 'coerce') yields NaN.

    Returns
    -------
    list
        A one-element list holding the formatted number, the stringified input
        (errors='ignore'), or NaN (null input or coerced invalid input).

    Raises
    ------
    ValueError
        If the value is invalid and errors='raise', or if ``output_format`` is
        not supported.
    """
    val = str(val)

    if val in NULL_VALUES:
        return [np.nan]

    if not validate_au_acn(val):
        if errors == "raise":
            raise ValueError(f"Unable to parse value {val}")
        return [val if errors == "ignore" else np.nan]

    if output_format == "compact":
        return [acn.compact(val)]
    if output_format == "standard":
        return [acn.format(val)]
    if output_format == "abn":
        return [acn.to_abn(val)]

    # Without this, an unsupported format would fall through with no result
    # bound and fail with an UnboundLocalError.
    raise ValueError(
        f"output_format {output_format} is invalid. "
        'It needs to be "compact", "standard" or "abn".'
    )
Oops, something went wrong.