Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 8/pass ci polars skip #11

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,11 @@ jobs:
- name: Run pytest
run: |
pytest

- name: Install polars
run: |
poetry install --all-extras

- name: Repeat tests with polars support
run: |
pytest
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@ __pypackages__/

# Testing
.hypothesis/
.pytest_cache/
.pytest_cache/
9 changes: 4 additions & 5 deletions countrycode/countrycode.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import csv
import os
import re
import csv

try:
import polars as pl
Expand Down Expand Up @@ -158,7 +158,7 @@ def replace_exact(sourcevar, origin, destination):
for string in sourcevar:
match_found = False
for position, origin_i in enumerate(codelist[origin]):
if origin_i == '' or codelist[destination][position] == '':
if origin_i == "" or codelist[destination][position] == "":
continue
if string == origin_i:
if codelist[destination][position].isdigit():
Expand All @@ -172,13 +172,12 @@ def replace_exact(sourcevar, origin, destination):
return out



def replace_regex(sourcevar, origin, destination):
sourcevar_unique = list(set(sourcevar))
o = []
d = []
for i, (val_origin, val_destination) in enumerate(zip(codelist[origin], codelist[destination])):
if val_origin != '' and val_destination != '':
if val_origin != "" and val_destination != "":
o.append(re.compile(val_origin, flags=re.IGNORECASE))
d.append(val_destination)

Expand All @@ -194,4 +193,4 @@ def replace_regex(sourcevar, origin, destination):
result.append(None)
mapping = dict(zip(sourcevar_unique, result))
out = [int(mapping[i]) if mapping[i] and mapping[i].isdigit() else mapping[i] for i in sourcevar]
return out
return out
45 changes: 36 additions & 9 deletions tests/custom_strategies.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,45 @@
import csv
import string
import os
from typing import Optional, Union

from hypothesis import strategies as st
from hypothesis.strategies import SearchStrategy

import polars as pl

pkg_dir, pkg_filename = os.path.split(__file__)
pkg_dir = os.path.dirname(pkg_dir)
data_path = os.path.join(pkg_dir, "countrycode", "data", "codelist.csv")
codelist = pl.read_csv(data_path)
with open(data_path) as f:
rows = csv.reader(f)
codelist = {col[0]: list(col[1:]) for col in zip(*rows)}


def empty_string_to_null(s: str) -> Optional[str]:
"""
Helper function to convert empty strings to `None`. Direct extraction from
the `codelist` dictionary stores empty values as `""` while
`countrycode` represents those values as `None`.
Args:
s: A string

Returns: `None` if the string is empty, otherwise the function will return
the input string `s`.

def _select_codes(code="iso3c") -> list:
return codelist.get_column(code).drop_nulls().to_list()
"""
if s == "":
return None
return s

def _select_codes(code: str = "iso3c") -> list:
"""
Select all distinct values for a given column `code` from codelist
Args:
code: String representation of a column in `codelist` representing the field
of distinct values you wish to access

Returns: An array of non-empty values of the `code` column
"""
return list(filter(lambda z: z != "", codelist.get(code)))


def build_valid_code(code: str = "iso3c") -> SearchStrategy[str]:
Expand All @@ -26,19 +51,21 @@ def build_valid_code(code: str = "iso3c") -> SearchStrategy[str]:
)


def select_filtered_row(column: str, column_value: str, target_col="country.name.en") -> Union[
def select_filtered_row(input_column: str, column_value: str, target_col="country.name.en") -> Union[
Optional[int], Optional[str]]:
"""
Function to return the following operation:
Function to return the `target_col` row that matches the `column_value` value of `input_column`
Assuming `codelist` is from the `polars` package:
codelist.filter(pl.col(column) == column_value).item(0, target_col)
Args:
column: Column from codelist to filter
input_column: Column from codelist to filter
column_value: The value with which to filter the specified column
target_col: The column to be selected
Returns:
The first cell of target_column after filtering column as equals to column_value
"""
return codelist.filter(pl.col(column) == column_value).item(0, target_col)
input_value_idx = codelist.get(input_column).index(column_value)
return codelist.get(target_col)[input_value_idx]


def build_invalid_code(code="iso3c") -> SearchStrategy[str]:
Expand Down
49 changes: 49 additions & 0 deletions tests/custom_strategies_polars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import string
import os
from typing import Optional, Union

from hypothesis import strategies as st
from hypothesis.strategies import SearchStrategy

# polars is an optional extra: the reference table is loaded only when it is
# installed. Only the import itself is guarded, so a failure while locating
# or reading the CSV surfaces instead of being silently swallowed.
try:
    import polars as pl
except ImportError:
    # Without polars, `pl` and `codelist` stay undefined and the helpers
    # below raise NameError if they are called.
    pass
else:
    pkg_dir, pkg_filename = os.path.split(__file__)
    pkg_dir = os.path.dirname(pkg_dir)
    data_path = os.path.join(pkg_dir, "countrycode", "data", "codelist.csv")
    codelist = pl.read_csv(data_path)

def _select_codes(code="iso3c") -> list:
    """
    Select the non-null values of the `code` column of `codelist`.

    Args:
        code: Name of the `codelist` column to read.

    Returns: The column's values as a list, with nulls dropped.
    """
    column = codelist.get_column(code)
    non_null = column.drop_nulls()
    return non_null.to_list()


def build_valid_code(code: str = "iso3c") -> SearchStrategy[str]:
    """
    Build a strategy that samples one of the valid values of `code`.

    Args:
        code: Name of the `codelist` column to sample from.

    Returns: A strategy drawing from the values returned by
    `_select_codes(code)`.
    """
    valid_codes = _select_codes(code)
    return st.sampled_from(valid_codes)


def select_filtered_row(column: str, column_value: str, target_col="country.name.en") -> Union[
        Optional[int], Optional[str]]:
    """
    Look up the `target_col` cell of the first `codelist` row whose `column`
    equals `column_value`.

    Args:
        column: Column from codelist to filter
        column_value: The value with which to filter the specified column
        target_col: The column to be selected
    Returns:
        The first cell of target_col after filtering column as equal to column_value
    """
    matching_rows = codelist.filter(pl.col(column) == column_value)
    return matching_rows.item(0, target_col)


def build_invalid_code(code: str = "iso3c") -> SearchStrategy[str]:
    """
    Build a strategy producing strings that are not valid values of `code`.

    Args:
        code: Name of the `codelist` column whose values must be avoided.

    Returns: A strategy yielding text of 1 to 10 characters drawn from
    `string.printable` that does not appear in the `code` column.
    """
    # Read the column once and use a set for membership; previously the
    # column was re-read and scanned linearly for every generated candidate.
    valid_codes = set(_select_codes(code))
    return st.text(alphabet=string.printable, min_size=1, max_size=10).filter(
        lambda candidate: candidate not in valid_codes
    )
11 changes: 0 additions & 11 deletions tests/test_basic.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,9 @@
from hypothesis import given, example

from countrycode import countrycode
from custom_strategies import build_valid_code, select_filtered_row


"""
Test to check that finding the iso3n representation of an iso3c row is
equivalent to finding the corresponding cell in the countrycode dataframe.
"""
@given(code_param=build_valid_code("iso3c"))
@example(code_param="CAN")
def test_numeric(code_param):
expected = select_filtered_row("iso3c", code_param, "iso3n")
assert countrycode(code_param, "iso3c", "iso3n") == expected


def test_basic_conversions():
def name_of(iso3c_code):
return countrycode(iso3c_code, origin='iso3c', destination='country.name')
Expand Down
25 changes: 19 additions & 6 deletions tests/test_codelist.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,27 @@
import os
import polars as pl

pkg_dir, pkg_filename = os.path.split(__file__)
pkg_dir = os.path.dirname(pkg_dir)
data_path = os.path.join(pkg_dir, "countrycode", "data", "codelist.csv")
codelist = pl.read_csv(data_path)
import pytest

try:
import polars as pl
_has_polars = True

def test_codelist_dimensions():
pkg_dir, pkg_filename = os.path.split(__file__)
pkg_dir = os.path.dirname(pkg_dir)
data_path = os.path.join(pkg_dir, "countrycode", "data", "codelist.csv")
codelist = pl.read_csv(data_path)
except ImportError:
_has_polars = False
from custom_strategies import codelist

@pytest.mark.skipif(not _has_polars, reason=".Shape method assumes polars installation")
def test_codelist_dimensions_polars():
"""
Unit test to validate the dimensions of the data.
"""
assert codelist.shape == (291, 624)

@pytest.mark.skipif(_has_polars, reason="Test assumed dictionary representation of codelist")
def test_codelist():
assert len(codelist.keys()) == 624
assert all(len(codelist.get(key)) == 291 for key in codelist.keys())
23 changes: 22 additions & 1 deletion tests/test_conversion.py → tests/test_conversion_polars.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,28 @@
import pytest
from hypothesis import given, example

from countrycode import countrycode
from custom_strategies import build_invalid_code, build_valid_code, select_filtered_row

try:
import polars as pl

_has_polars = True
except ImportError:
_has_polars = False

_regex_internal_skip_reason = "Test requires polars installation"

if not _has_polars:
pytest.skip(_regex_internal_skip_reason, allow_module_level=True)

from custom_strategies_polars import build_invalid_code, build_valid_code, select_filtered_row

@given(code_param=build_valid_code("iso3c"))
@example(code_param="CAN")
def test_numeric(code_param):
expected = select_filtered_row("iso3c", code_param, "iso3n")
assert countrycode(code_param, "iso3c", "iso3n") == expected


"""
Test to check that finding the country.name representation of an iso3c row is
Expand Down
1 change: 0 additions & 1 deletion tests/test_corner_cases.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import pytest
from countrycode import countrycode


Expand Down
8 changes: 4 additions & 4 deletions tests/test_regex_external.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import pytest
from countrycode import countrycode

def iso3c_of(name):
out = countrycode(sourcevar = name, origin = 'country.name', destination = 'iso3c')

def iso3c_of(name):
out = countrycode(sourcevar=name, origin='country.name', destination='iso3c')
if out is None:
out = ""
return out

def test_known_variants():

def test_known_variants():
assert iso3c_of('Aruba') == 'ABW'
assert iso3c_of('Afghanistan') == 'AFG'
assert iso3c_of('Angola') == 'AGO'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,50 +1,71 @@
import os
import pytest
import polars as pl
from countrycode import *
codelist = pl.read_csv("countrycode/data/codelist.csv")
from countrycode import countrycode

try:
import polars as pl

_has_polars = True

pkg_dir, pkg_filename = os.path.split(__file__)
pkg_dir = os.path.dirname(pkg_dir)
data_path = os.path.join(pkg_dir, "countrycode", "data", "codelist.csv")
codelist = pl.read_csv(data_path)
except ImportError:
_has_polars = False

_regex_internal_skip_reason = "Test requires polars installation"

if not _has_polars:
pytest.skip(_regex_internal_skip_reason, allow_module_level=True)


# Test all country names with iso3c codes are matched exactly once
def test_iso3c_match():
name = codelist.filter(pl.col("iso3c").is_not_null())
iso3c_from_name = countrycode(name["country.name.en"], origin='country.name', destination = "iso3c")
iso3c_from_name = countrycode(name["country.name.en"], origin='country.name', destination="iso3c")
assert len(iso3c_from_name) == len(set(iso3c_from_name))


# Test iso3c-to-country.name-to-iso3c is internally consistent
def test_iso3c_consistency():
tmp = codelist.filter(pl.col("iso3c").is_not_null())
a = countrycode(tmp["iso3c"], origin='iso3c', destination = "country.name")
b = countrycode(a, origin='country.name', destination = "iso3c")
a = countrycode(tmp["iso3c"], origin='iso3c', destination="country.name")
b = countrycode(a, origin='country.name', destination="iso3c")
assert (b == tmp["iso3c"]).all()


# Test English regex vs. cldr.short.
def test_english_regex():
tmp = codelist.filter(pl.col("country.name.en").is_not_null())
tmp = tmp.with_columns(
test = countrycode(tmp["country.name.en"], origin="country.name.en", destination="cldr.short.en")
test=countrycode(tmp["country.name.en"], origin="country.name.en", destination="cldr.short.en")
)
assert (tmp["test"] != tmp["cldr.short.en"]).any() == False


# Test Italian regex vs. cldr.short.it
def test_italian_regex():
tmp = codelist.filter(pl.col("country.name.it").is_not_null())
tmp = tmp.with_columns(
test = countrycode(tmp["country.name.it"], origin="country.name.it", destination="cldr.short.it")
test=countrycode(tmp["country.name.it"], origin="country.name.it", destination="cldr.short.it")
)
assert (tmp["test"] != tmp["cldr.short.it"]).any() == False


# Test German regex vs. cldr.short.de
def test_german_regex():
tmp = codelist.filter(pl.col("country.name.de").is_not_null())
tmp = tmp.with_columns(
test = countrycode(tmp["country.name.de"], origin="country.name.de", destination="cldr.short.de")
test=countrycode(tmp["country.name.de"], origin="country.name.de", destination="cldr.short.de")
)
assert (tmp["test"] != tmp["cldr.short.de"]).any() == False


# Test French regex vs. cldr.short.fr
def test_french_regex():
tmp = codelist.filter(pl.col("country.name.fr").is_not_null())
tmp = tmp.with_columns(
test = countrycode(tmp["country.name.fr"], origin="country.name.fr", destination="cldr.short.fr")
test=countrycode(tmp["country.name.fr"], origin="country.name.fr", destination="cldr.short.fr")
)
assert (tmp["test"] != tmp["cldr.short.fr"]).any() == False
assert (tmp["test"] != tmp["cldr.short.fr"]).any() == False