docs(clean): add documentation for multiple clean functions for number types
sfu-db · Sep 23, 2021 · 732480f · 732480f
1 parent 9f6f5b2
commit 732480f
Show file tree

Hide file tree

Showing 146 changed files with 42,474 additions and 34 deletions.
diff --git a/dataprep/clean/clean_cy_vat.py b/dataprep/clean/clean_cy_vat.py
@@ -61,7 +61,7 @@ def clean_cy_vat(
 
     >>> df = pd.DataFrame({{
             "vat": [
-            '12302 6635',
+            'CY-10259033P',
             'CY-10259033Z',]
             })
     >>> clean_cy_vat(df, 'vat')

diff --git a/dataprep/clean/clean_de_vat.py b/dataprep/clean/clean_de_vat.py
@@ -1,5 +1,5 @@
 """
-Clean and validate a DataFrame column containing German VAT numberss (VATs).
+Clean and validate a DataFrame column containing German VAT numbers (VATs).
 """
 # pylint: disable=too-many-lines, too-many-arguments, too-many-branches
 from typing import Any, Union

diff --git a/dataprep/clean/clean_de_wkn.py b/dataprep/clean/clean_de_wkn.py
@@ -1,5 +1,6 @@
 """
-Clean and validate a DataFrame column containing Wertpapierkennnummer (WKNs).
+Clean and validate a DataFrame column containing
+German Securities Identification Codes (WKNs).
 """
 # pylint: disable=too-many-lines, too-many-arguments, too-many-branches
 from typing import Any, Union

diff --git a/dataprep/clean/clean_fi_associationid.py b/dataprep/clean/clean_fi_associationid.py
@@ -58,7 +58,7 @@ def clean_fi_associationid(
     --------
     Clean a column of Finnish association registry id data.
 
-    >>> df = pd.DataFrame({{
+    >>> df = pd.DataFrame({
             "associationid": [
             "1234",
             "12df",]

diff --git a/dataprep/clean/clean_fr_siret.py b/dataprep/clean/clean_fr_siret.py
@@ -24,7 +24,7 @@ def clean_fr_siret(
     progress: bool = True,
 ) -> pd.DataFrame:
     """
-    Clean French company establishment identification numbers (SIRETs) data in a DataFrame column.
+    Clean French Company Establishment Identification Numbers (SIRETs) in a DataFrame column.
 
     Parameters
     ----------

diff --git a/dataprep/clean/clean_in_aadhaar.py b/dataprep/clean/clean_in_aadhaar.py
@@ -36,7 +36,7 @@ def clean_in_aadhaar(
             The output format of standardized number string.
             If output_format = 'compact', return string without any separators or whitespace.
             If output_format = 'standard', return string with proper separators and whitespace.
-            If output_format = 'mask', Masks the first 8 digits as per MeitY guidelines for
+            If output_format = 'mask', mask the first 8 digits as per MeitY guidelines for
                 securing identity information and Sensitive personal data.
 
             (default: "standard")

diff --git a/dataprep/clean/clean_lv_pvn.py b/dataprep/clean/clean_lv_pvn.py
@@ -1,5 +1,5 @@
 """
-Clean and validate a DataFrame column containing Latvian PVN (VAT) numbers (PVTs).
+Clean and validate a DataFrame column containing Latvian PVN (VAT) numbers (PVNs).
 """
 # pylint: disable=too-many-lines, too-many-arguments, too-many-branches
 from typing import Any, Union
@@ -23,21 +23,21 @@ def clean_lv_pvn(
     progress: bool = True,
 ) -> pd.DataFrame:
     """
-    Clean Latvian PVN (VAT) numbers (PVTs) type data in a DataFrame column.
+    Clean Latvian PVN (VAT) numbers (PVNs) type data in a DataFrame column.
 
     Parameters
     ----------
         df
             A pandas or Dask DataFrame containing the data to be cleaned.
         col
-            The name of the column containing data of PVT type.
+            The name of the column containing data of PVN type.
         output_format
             The output format of standardized number string.
             If output_format = 'compact', return string without any separators or whitespace.
             If output_format = 'standard', return string with proper separators and whitespace.
             If output_format = 'birthdate', return the birthdate of the person. Note only when
                 PVN refers to a person (but not a legal entity) this format will be available.
-            Note: in the case of PVT, the compact format is the same as the standard one.
+            Note: in the case of PVN, the compact format is the same as the standard one.
 
             (default: "standard")
         inplace
@@ -59,7 +59,7 @@ def clean_lv_pvn(
 
     Examples
     --------
-    Clean a column of PVT data.
+    Clean a column of PVN data.
 
     >>> df = pd.DataFrame({{
             "pvn": [
@@ -114,7 +114,7 @@ def validate_lv_pvn(
     column: str = "",
 ) -> Union[bool, pd.Series, pd.DataFrame]:
     """
-    Validate if a data cell is PVT in a DataFrame column. For each cell, return True or False.
+    Validate if a data cell is PVN in a DataFrame column. For each cell, return True or False.
 
     Parameters
     ----------
@@ -146,7 +146,7 @@ def _format(val: Any, output_format: str = "standard", errors: str = "coarse") -
            If output_format = 'standard', return string with proper separators and whitespace.
            If output_format = 'birthdate', return the birthdate of the person. Note only when
                 PVN refers to a person (but not a legal entity) this format will be available.
-           Note: in the case of PVT, the compact format is the same as the standard one.
+           Note: in the case of PVN, the compact format is the same as the standard one.
     """
     # pylint: disable=bare-except
 

diff --git a/dataprep/clean/clean_no_fodselsnummer.py b/dataprep/clean/clean_no_fodselsnummer.py
@@ -23,7 +23,7 @@ def clean_no_fodselsnummer(
     progress: bool = True,
 ) -> pd.DataFrame:
     """
-    Clean Estonian Personcal ID number type data in a DataFrame column.
+    Clean Norwegian birth number data in a DataFrame column.
 
     Parameters
     ----------

diff --git a/dataprep/clean/clean_no_iban.py b/dataprep/clean/clean_no_iban.py
@@ -59,7 +59,7 @@ def clean_no_iban(
     --------
     Clean a column of IBAN data.
 
-    >>> df = pd.DataFrame({{
+    >>> df = pd.DataFrame({
             "iban": [
             'NO9386011117947',
             'NO92 8601 1117 947',]

diff --git a/dataprep/clean/clean_no_kontonr.py b/dataprep/clean/clean_no_kontonr.py
@@ -59,7 +59,7 @@ def clean_no_kontonr(
     --------
     Clean a column of kontonr data.
 
-    >>> df = pd.DataFrame({{
+    >>> df = pd.DataFrame({
             "kontonr": [
             "8601 11 17947",
             "8601 11 17949",]

diff --git a/dataprep/clean/clean_no_orgnr.py b/dataprep/clean/clean_no_orgnr.py
@@ -58,7 +58,7 @@ def clean_no_orgnr(
     --------
     Clean a column of Orgnr data.
 
-    >>> df = pd.DataFrame({{
+    >>> df = pd.DataFrame({
             "orgnr": [
             "988077917",
             "988 077 918",]

diff --git a/dataprep/clean/clean_nz_bankaccount.py b/dataprep/clean/clean_nz_bankaccount.py
@@ -60,10 +60,10 @@ def clean_nz_bankaccount(
     --------
     Clean a column of bankaccount data.
 
-    >>> df = pd.DataFrame({{
+    >>> df = pd.DataFrame({
             "bankaccount": [
-            "51824753556",
-            "99999999999",]
+            "0102420100194000",
+            "01-0242-0100195-00",]
             })
     >>> clean_nz_bankaccount(df, 'bankaccount')
             bankaccount              bankaccount_clean

diff --git a/dataprep/clean/clean_nz_ird.py b/dataprep/clean/clean_nz_ird.py
@@ -58,7 +58,7 @@ def clean_nz_ird(
     --------
     Clean a column of IRD data.
 
-    >>> df = pd.DataFrame({{
+    >>> df = pd.DataFrame({
             "ird": [
             "49091850",
             "136410133",]

diff --git a/dataprep/clean/clean_pl_regon.py b/dataprep/clean/clean_pl_regon.py
@@ -59,7 +59,7 @@ def clean_pl_regon(
     --------
     Clean a column of REGON data.
 
-    >>> df = pd.DataFrame({{
+    >>> df = pd.DataFrame({
             "regon": [
             '192598184',
             '192598183',]

diff --git a/dataprep/clean/clean_pt_nif.py b/dataprep/clean/clean_pt_nif.py
@@ -59,14 +59,14 @@ def clean_pt_nif(
     --------
     Clean a column of NIF data.
 
-    >>> df = pd.DataFrame({{
+    >>> df = pd.DataFrame({
             "nif": [
             'PT 501 964 843',
             'PT 501 964 842',]
             })
     >>> clean_pt_nif(df, 'nif')
             nif                 nif_clean
-    0       PT 501 964 843      PT 501 964 843
+    0       PT 501 964 843      501964843
     1       PT 501 964 842      NaN
     """
 

diff --git a/dataprep/clean/clean_py_ruc.py b/dataprep/clean/clean_py_ruc.py
@@ -58,7 +58,7 @@ def clean_py_ruc(
     --------
     Clean a column of RUC data.
 
-    >>> df = pd.DataFrame({{
+    >>> df = pd.DataFrame({
             "ruc": [
             "800000358",
             "80123456789",]

diff --git a/dataprep/clean/clean_ro_cf.py b/dataprep/clean/clean_ro_cf.py
@@ -1,5 +1,5 @@
 """
-Clean and validate a DataFrame column containing Romanian CF (CF) numbers (CFs).
+Clean and validate a DataFrame column containing Romanian CF (VAT) numbers (CFs).
 """
 # pylint: disable=too-many-lines, too-many-arguments, too-many-branches
 from typing import Any, Union
@@ -23,7 +23,7 @@ def clean_ro_cf(
     progress: bool = True,
 ) -> pd.DataFrame:
     """
-    Clean Romanian CF (CF) numbers (CFs) type data in a DataFrame column.
+    Clean Romanian CF (VAT) numbers (CFs) type data in a DataFrame column.
 
     Parameters
     ----------

diff --git a/dataprep/clean/clean_ro_onrc.py b/dataprep/clean/clean_ro_onrc.py
@@ -59,7 +59,7 @@ def clean_ro_onrc(
     --------
     Clean a column of ONRC data.
 
-    >>> df = pd.DataFrame({{
+    >>> df = pd.DataFrame({
             "onrc": [
             "J52/750/2012",
             "X52/750/2012",]