vincentarelbundock · vincentarelbundock · Nov 2, 2023 · Oct 22, 2023 · Oct 22, 2023 · Oct 22, 2023
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -35,3 +35,11 @@ jobs:
       - name: Run pytest
         run: |
           pytest
+
+      - name: Install polars
+        run: |
+            poetry install --all-extras
+
+      - name: Repeat tests with polars support
+        run: |
+            pytest
diff --git a/.gitignore b/.gitignore
@@ -12,4 +12,4 @@ __pypackages__/
 
 # Testing
 .hypothesis/
-.pytest_cache/
+.pytest_cache/
diff --git a/countrycode/countrycode.py b/countrycode/countrycode.py
@@ -1,6 +1,6 @@
+import csv
 import os
 import re
-import csv
 
 try:
     import polars as pl
@@ -158,7 +158,7 @@ def replace_exact(sourcevar, origin, destination):
     for string in sourcevar:
         match_found = False
         for position, origin_i in enumerate(codelist[origin]):
-            if origin_i == '' or codelist[destination][position] == '':
+            if origin_i == "" or codelist[destination][position] == "":
                 continue
             if string == origin_i:
                 if codelist[destination][position].isdigit():
@@ -172,13 +172,12 @@ def replace_exact(sourcevar, origin, destination):
     return out
 
 
-
 def replace_regex(sourcevar, origin, destination):
     sourcevar_unique = list(set(sourcevar))
     o = []
     d = []
     for i, (val_origin, val_destination) in enumerate(zip(codelist[origin], codelist[destination])):
-        if val_origin != '' and val_destination != '':
+        if val_origin != "" and val_destination != "":
             o.append(re.compile(val_origin, flags=re.IGNORECASE))
             d.append(val_destination)
 
@@ -194,4 +193,4 @@ def replace_regex(sourcevar, origin, destination):
             result.append(None)
     mapping = dict(zip(sourcevar_unique, result))
     out = [int(mapping[i]) if mapping[i] and mapping[i].isdigit() else mapping[i] for i in sourcevar]
-    return out
+    return out
diff --git a/tests/custom_strategies.py b/tests/custom_strategies.py
@@ -1,20 +1,45 @@
+import csv
 import string
 import os
 from typing import Optional, Union
 
 from hypothesis import strategies as st
 from hypothesis.strategies import SearchStrategy
 
-import polars as pl
-
 pkg_dir, pkg_filename = os.path.split(__file__)
 pkg_dir = os.path.dirname(pkg_dir)
 data_path = os.path.join(pkg_dir, "countrycode", "data", "codelist.csv")
-codelist = pl.read_csv(data_path)
+with open(data_path) as f:
+    rows = csv.reader(f)
+    codelist = {col[0]: list(col[1:]) for col in zip(*rows)}
+
+
+def empty_string_to_null(s: str) -> Optional[str]:
+    """
+    Helper function to convert empty strings to `None`. Direct extraction from
+    the `codelist` dictionary stores empty values as `""` while
+    `countrycode` represents those values as `None`.
+    Args:
+        s: A string
 
+    Returns: `None` if the string is empty, otherwise the function will return
+        the input string `s`.
 
-def _select_codes(code="iso3c") -> list:
-    return codelist.get_column(code).drop_nulls().to_list()
+    """
+    if s == "":
+        return None
+    return s
+
+def _select_codes(code: str = "iso3c") -> list:
+    """
+    Select all non-empty values for a given column `code` from `codelist`.
+    Args:
+        code: Name of a column in `codelist`, i.e. the field whose
+        non-empty values you wish to access
+
+    Returns: A list of the non-empty values of the `code` column
+    """
+    return list(filter(lambda z: z != "", codelist.get(code)))
 
 
 def build_valid_code(code: str = "iso3c") -> SearchStrategy[str]:
@@ -26,19 +51,21 @@ def build_valid_code(code: str = "iso3c") -> SearchStrategy[str]:
     )
 
 
-def select_filtered_row(column: str, column_value: str, target_col="country.name.en") -> Union[
+def select_filtered_row(input_column: str, column_value: str, target_col="country.name.en") -> Union[
     Optional[int], Optional[str]]:
     """
-    Function to return the following operation:
+    Return the `target_col` value of the first row whose `input_column` equals `column_value`.
+    Equivalent to the following if `codelist` were a `polars` DataFrame:
     codelist.filter(pl.col(column) == column_value).item(0, target_col)
     Args:
-        column: Column from codelist to filter
+        input_column: Column from codelist to filter
         column_value: The value with which to filter the specified column
         target_col: THe column to be selected
     Returns:
         The first cell of target_column after filtering column as equals to column_value
     """
-    return codelist.filter(pl.col(column) == column_value).item(0, target_col)
+    input_value_idx = codelist.get(input_column).index(column_value)
+    return codelist.get(target_col)[input_value_idx]
 
 
 def build_invalid_code(code="iso3c") -> SearchStrategy[str]:

diff --git a/tests/custom_strategies_polars.py b/tests/custom_strategies_polars.py
@@ -0,0 +1,49 @@
+import string
+import os
+from typing import Optional, Union
+
+from hypothesis import strategies as st
+from hypothesis.strategies import SearchStrategy
+
+try:
+    import polars as pl
+    pkg_dir, pkg_filename = os.path.split(__file__)
+    pkg_dir = os.path.dirname(pkg_dir)
+    data_path = os.path.join(pkg_dir, "countrycode", "data", "codelist.csv")
+    codelist = pl.read_csv(data_path)
+except ImportError:
+    pass
+
+def _select_codes(code="iso3c") -> list:
+    return codelist.get_column(code).drop_nulls().to_list()
+
+
+def build_valid_code(code: str = "iso3c") -> SearchStrategy[str]:
+    """
+    Builder function that returns a strategy to pick one of a valid 'code'.
+    """
+    return st.sampled_from(
+        _select_codes(code)
+    )
+
+
+def select_filtered_row(column: str, column_value: str, target_col="country.name.en") -> Union[
+    Optional[int], Optional[str]]:
+    """
+    Function to return the following operation:
+    codelist.filter(pl.col(column) == column_value).item(0, target_col)
+    Args:
+        column: Column from codelist to filter
+        column_value: The value with which to filter the specified column
+        target_col: The column to be selected
+    Returns:
+        The first cell of target_column after filtering column as equals to column_value
+    """
+    return codelist.filter(pl.col(column) == column_value).item(0, target_col)
+
+
+def build_invalid_code(code="iso3c") -> SearchStrategy[str]:
+    """
+    Returns a string that is not represented in code within codelist
+    """
+    return st.text(alphabet=string.printable, min_size=1, max_size=10).filter(lambda z: z not in _select_codes(code))
diff --git a/tests/test_basic.py b/tests/test_basic.py
@@ -1,20 +1,9 @@
-from hypothesis import given, example
-
 from countrycode import countrycode
-from custom_strategies import build_valid_code, select_filtered_row
-
 
 """
 Test to check that finding the iso3n representation of an iso3c row is
 equivalent to finding the corresponding cell in the countrycode dataframe.
 """
-@given(code_param=build_valid_code("iso3c"))
-@example(code_param="CAN")
-def test_numeric(code_param):
-    expected = select_filtered_row("iso3c", code_param, "iso3n")
-    assert countrycode(code_param, "iso3c", "iso3n") == expected
-
-
 def test_basic_conversions():
     def name_of(iso3c_code):
         return countrycode(iso3c_code, origin='iso3c', destination='country.name')

diff --git a/tests/test_codelist.py b/tests/test_codelist.py
@@ -1,14 +1,27 @@
 import os
-import polars as pl
 
-pkg_dir, pkg_filename = os.path.split(__file__)
-pkg_dir = os.path.dirname(pkg_dir)
-data_path = os.path.join(pkg_dir, "countrycode", "data", "codelist.csv")
-codelist = pl.read_csv(data_path)
+import pytest
 
+try:
+    import polars as pl
+    _has_polars = True
 
-def test_codelist_dimensions():
+    pkg_dir, pkg_filename = os.path.split(__file__)
+    pkg_dir = os.path.dirname(pkg_dir)
+    data_path = os.path.join(pkg_dir, "countrycode", "data", "codelist.csv")
+    codelist = pl.read_csv(data_path)
+except ImportError:
+    _has_polars = False
+    from custom_strategies import codelist
+
+@pytest.mark.skipif(not _has_polars, reason=".Shape method assumes polars installation")
+def test_codelist_dimensions_polars():
     """
     Unit test to validate the dimensions of the data.
     """
     assert codelist.shape == (291, 624)
+
+@pytest.mark.skipif(_has_polars, reason="Test assumed dictionary representation of codelist")
+def test_codelist():
+    assert len(codelist.keys()) == 624
+    assert all(len(codelist.get(key)) == 291 for key in codelist.keys())
diff --git a/tests/test_conversion.py → tests/test_conversion_polars.py b/tests/test_conversion.py → tests/test_conversion_polars.py
@@ -1,7 +1,28 @@
+import pytest
 from hypothesis import given, example
 
 from countrycode import countrycode
-from custom_strategies import build_invalid_code, build_valid_code, select_filtered_row
+
+try:
+    import polars as pl
+
+    _has_polars = True
+except ImportError:
+    _has_polars = False
+
+_regex_internal_skip_reason = "Test requires polars installation"
+
+if not _has_polars:
+    pytest.skip(_regex_internal_skip_reason, allow_module_level=True)
+
+from custom_strategies_polars import build_invalid_code, build_valid_code, select_filtered_row
+
+@given(code_param=build_valid_code("iso3c"))
+@example(code_param="CAN")
+def test_numeric(code_param):
+    expected = select_filtered_row("iso3c", code_param, "iso3n")
+    assert countrycode(code_param, "iso3c", "iso3n") == expected
+
 
 """
 Test to check that finding the country.name representation of an iso3c row is

diff --git a/tests/test_corner_cases.py b/tests/test_corner_cases.py
@@ -1,4 +1,3 @@
-import pytest
 from countrycode import countrycode
 
 

diff --git a/tests/test_regex_external.py b/tests/test_regex_external.py
@@ -1,14 +1,14 @@
-import pytest
 from countrycode import countrycode
 
-def iso3c_of(name): 
-    out = countrycode(sourcevar = name, origin = 'country.name', destination = 'iso3c')
+
+def iso3c_of(name):
+    out = countrycode(sourcevar=name, origin='country.name', destination='iso3c')
     if out is None:
         out = ""
     return out
 
-def test_known_variants():
 
+def test_known_variants():
     assert iso3c_of('Aruba') == 'ABW'
     assert iso3c_of('Afghanistan') == 'AFG'
     assert iso3c_of('Angola') == 'AGO'

diff --git a/tests/test_regex_internal.py → tests/test_regex_internal_polars.py b/tests/test_regex_internal.py → tests/test_regex_internal_polars.py
@@ -1,50 +1,71 @@
 import os
 import pytest
-import polars as pl
-from countrycode import *
-codelist = pl.read_csv("countrycode/data/codelist.csv")
+from countrycode import countrycode
+
+try:
+    import polars as pl
+
+    _has_polars = True
+
+    pkg_dir, pkg_filename = os.path.split(__file__)
+    pkg_dir = os.path.dirname(pkg_dir)
+    data_path = os.path.join(pkg_dir, "countrycode", "data", "codelist.csv")
+    codelist = pl.read_csv(data_path)
+except ImportError:
+    _has_polars = False
+
+_regex_internal_skip_reason = "Test requires polars installation"
+
+if not _has_polars:
+    pytest.skip(_regex_internal_skip_reason, allow_module_level=True)
+
 
 # Test all country names with iso3c codes are matched exactly once
 def test_iso3c_match():
     name = codelist.filter(pl.col("iso3c").is_not_null())
-    iso3c_from_name = countrycode(name["country.name.en"], origin='country.name', destination = "iso3c")
+    iso3c_from_name = countrycode(name["country.name.en"], origin='country.name', destination="iso3c")
     assert len(iso3c_from_name) == len(set(iso3c_from_name))
 
+
 # Test iso3c-to-country.name-to-iso3c is internally consistent
 def test_iso3c_consistency():
     tmp = codelist.filter(pl.col("iso3c").is_not_null())
-    a = countrycode(tmp["iso3c"], origin='iso3c', destination = "country.name")
-    b = countrycode(a, origin='country.name', destination = "iso3c")
+    a = countrycode(tmp["iso3c"], origin='iso3c', destination="country.name")
+    b = countrycode(a, origin='country.name', destination="iso3c")
     assert (b == tmp["iso3c"]).all()
 
+
 # Test English regex vs. cldr.short.
 def test_english_regex():
     tmp = codelist.filter(pl.col("country.name.en").is_not_null())
     tmp = tmp.with_columns(
-        test = countrycode(tmp["country.name.en"], origin="country.name.en", destination="cldr.short.en")
+        test=countrycode(tmp["country.name.en"], origin="country.name.en", destination="cldr.short.en")
     )
     assert (tmp["test"] != tmp["cldr.short.en"]).any() == False
 
+
 # Test Italian regex vs. cldr.short.it
 def test_italian_regex():
     tmp = codelist.filter(pl.col("country.name.it").is_not_null())
     tmp = tmp.with_columns(
-        test = countrycode(tmp["country.name.it"], origin="country.name.it", destination="cldr.short.it")
+        test=countrycode(tmp["country.name.it"], origin="country.name.it", destination="cldr.short.it")
     )
     assert (tmp["test"] != tmp["cldr.short.it"]).any() == False
 
+
 # Test German regex vs. cldr.short.de
 def test_german_regex():
     tmp = codelist.filter(pl.col("country.name.de").is_not_null())
     tmp = tmp.with_columns(
-        test = countrycode(tmp["country.name.de"], origin="country.name.de", destination="cldr.short.de")
+        test=countrycode(tmp["country.name.de"], origin="country.name.de", destination="cldr.short.de")
     )
     assert (tmp["test"] != tmp["cldr.short.de"]).any() == False
 
+
 # Test French regex vs. cldr.short.fr
 def test_french_regex():
     tmp = codelist.filter(pl.col("country.name.fr").is_not_null())
     tmp = tmp.with_columns(
-        test = countrycode(tmp["country.name.fr"], origin="country.name.fr", destination="cldr.short.fr")
+        test=countrycode(tmp["country.name.fr"], origin="country.name.fr", destination="cldr.short.fr")
     )
-    assert (tmp["test"] != tmp["cldr.short.fr"]).any() == False
+    assert (tmp["test"] != tmp["cldr.short.fr"]).any() == False
Original file line number	Diff line number	Diff line change
		@@ -1,4 +1,3 @@
		import pytest
		from countrycode import countrycode


Expand Down