From e9f7d1167cd28cdfe4959b057bea7f41f81a5cc1 Mon Sep 17 00:00:00 2001 From: willidert Date: Mon, 13 Apr 2020 14:11:06 -0400 Subject: [PATCH 1/6] Check if a zip code is valid for Brazil in Geonames --- models.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 models.py diff --git a/models.py b/models.py new file mode 100644 index 0000000..0e996ad --- /dev/null +++ b/models.py @@ -0,0 +1,17 @@ +# A list of the util functions + + +def is_valid_zip_brazil(zip_code): + if isinstance(zip_code, int): + zip_code = str(zip_code) + + if zip_code.find('-'): + zip_code = ''.join(zip_code.split()) + + result = [] + + result.append(1) if zip_code[5:] == '000' else result.append(0) + + result.append(1) if len(zip_code) == 8 else result.append(0) + + return all(result) From e1a321e169dea773b440760bcdd9cf73cfa54c06 Mon Sep 17 00:00:00 2001 From: willidert Date: Mon, 13 Apr 2020 17:46:43 -0400 Subject: [PATCH 2/6] Adding notice for zip codes from Brazil --- models.py | 18 +++++++++++++----- pgeocode.py | 22 +++++++++++++++------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/models.py b/models.py index 0e996ad..75eca04 100644 --- a/models.py +++ b/models.py @@ -2,11 +2,10 @@ def is_valid_zip_brazil(zip_code): - if isinstance(zip_code, int): - zip_code = str(zip_code) - - if zip_code.find('-'): - zip_code = ''.join(zip_code.split()) + if isinstance(zip_code, list): + zip_code = ''.join(zip_code) + if zip_code[0].find('-'): + zip_code = ''.join(zip_code.split('-')) result = [] @@ -15,3 +14,12 @@ def is_valid_zip_brazil(zip_code): result.append(1) if len(zip_code) == 8 else result.append(0) return all(result) + + +def check_is_valid_zip_code(zip_code): + if isinstance(zip_code, list): + zip_code = ''.join(zip_code) + + msg = f'zip_code {zip_code} is invalid for Brazil' + if not is_valid_zip_brazil(zip_code): + return msg diff --git a/pgeocode.py b/pgeocode.py index 901d694..b077aed 100644 --- a/pgeocode.py +++ b/pgeocode.py @@ -10,11 +10,14 @@ import numpy as np import pandas as pd +from models import check_is_valid_zip_code + + __version__ = '0.2.1' STORAGE_DIR = os.environ.get( - "PGEOCODE_DATA_DIR", - os.path.join(os.path.expanduser('~'), 'pgeocode_data') + "PGEOCODE_DATA_DIR", + os.path.join(os.path.expanduser('~'), 'pgeocode_data') ) DOWNLOAD_URL = "https://download.geonames.org/export/zip/{country}.zip" @@ -43,7 +46,6 @@ def _get_url(url): return reader, res.headers - class Nominatim(object): """Query geographical location from a city name or a postal code @@ -112,9 +114,9 @@ def _index_postal_codes(self): df_unique_cp_group = self._data.groupby('postal_code') data_unique = df_unique_cp_group[['latitude', 'longitude']].mean() valid_keys = set(DATA_FIELDS).difference( - ['place_name', 'lattitude', 'longitude', 'postal_code']) + ['place_name', 'lattitude', 'longitude', 'postal_code']) data_unique['place_name'] = df_unique_cp_group['place_name'].apply( - lambda x: ', '.join([str(el) for el in x])) + lambda x: ', '.join([str(el) for el in x])) for key in valid_keys: data_unique[key] = df_unique_cp_group[key].first() data_unique = data_unique.reset_index()[DATA_FIELDS] @@ -155,8 +157,14 @@ def query_postal_code(self, codes): if isinstance(codes, str): codes = [codes] single_entry = True + if self.country == 'BR': + warnings.warn(check_is_valid_zip_code(codes)) else: single_entry = False + # warnings.warn(msg) + if self.country == 'BR': + [warnings.warn(msg) for msg in list( + map(check_is_valid_zip_code, codes))] if not isinstance(codes, pd.DataFrame): codes = pd.DataFrame(codes, columns=['postal_code']) @@ -281,8 +289,8 @@ def haversine_distance(x, y): x_lat = x_rad[:, 0] y_lat = y_rad[:, 0] - a = np.sin(dlat/2.0)**2 + \ - np.cos(x_lat) * np.cos(y_lat) * np.sin(dlon/2.0)**2 + a = np.sin(dlat / 2.0)**2 + \ + np.cos(x_lat) * np.cos(y_lat) * np.sin(dlon / 2.0)**2 c = 2 * np.arcsin(np.sqrt(a)) return EARTH_RADIUS * c From 75c91a2b1a482c324f9573235af5bd9fbcbc44b8 Mon Sep 17 00:00:00 2001 From: willidert Date: Mon, 13 Apr 2020 17:48:27 -0400 Subject: [PATCH 3/6] unnecessary file --- test_pgeocode.py | 182 ----------------------------------------------- 1 file changed, 182 deletions(-) delete mode 100644 test_pgeocode.py diff --git a/test_pgeocode.py b/test_pgeocode.py deleted file mode 100644 index 9af7477..0000000 --- a/test_pgeocode.py +++ /dev/null @@ -1,182 +0,0 @@ -# -*- coding: utf8 -*- -# License 3-clause BSD -# -# Authors: Roman Yurchak -import os -import shutil -import tempfile - -import numpy as np -import pandas as pd -from numpy.testing import assert_allclose, assert_array_equal - -import pytest - -import pgeocode -from pgeocode import haversine_distance, Nominatim, GeoDistance - - - -@pytest.fixture -def temp_dir(): - path_save = pgeocode.STORAGE_DIR - path = tempfile.mkdtemp() - pgeocode.STORAGE_DIR = path - yield path - pgeocode.STORAGE_DIR = path_save - shutil.rmtree(path) - - -def _normalize_str(x): - if x is np.nan: - return x - else: - return x.lower() - - -@pytest.mark.parametrize( - 'country, pc1, location1, pc2, location2, distance12', - [('FR', '91120', 'Palaiseau', '67000', 'Strasbourg', 400), - ('GB', 'WC2N 5DU', 'London', 'BT1 5GS', 'Belfast', 518), - # ('AR', 'c1002', 'Buenos-Aires', '62091', 'Rio-Negro', 965), known failure # noqa - ('AU', '6837', 'Perth', '3000', 'melbourne', 2722), - ('AU', '6837', 'Perth', '0221', 'Barton', 3089), - ('US', '60605', 'Chicago', '94103', 'San Francisco', 2984), - ('CA', 'M5R 1X8', 'Toronto', 'H2Z 1A7', 'Montreal', 503), - ('IE', 'D01 R2PO', 'Dublin', 'T12 RW26', 'Cork', 219), - ]) -def test_countries(country, pc1, location1, pc2, location2, - distance12): - if country == 'IE': - pytest.xfail('TODO: Investigate failure for IE') - nomi = Nominatim(country) - - res = nomi.query_postal_code(pc1) - assert isinstance(res, pd.Series) - assert _normalize_str(location1) in _normalize_str(res.place_name) - - res = nomi.query_postal_code(pc2) - assert isinstance(res, pd.Series) - assert _normalize_str(location2) in _normalize_str(res.place_name) - - gdist = GeoDistance(country) - dist = gdist.query_postal_code(pc1, pc2) - assert isinstance(dist, float) - assert dist == pytest.approx(distance12, abs=5) - - -def test_download_dataset(temp_dir): - assert not os.path.exists(os.path.join(temp_dir, 'FR.txt')) - nomi = Nominatim('fr') - # the data file was downloaded - assert os.path.exists(os.path.join(temp_dir, 'FR.txt')) - res = nomi.query_postal_code('77160') - - nomi2 = Nominatim('fr') - res2 = nomi.query_postal_code('77160') - - assert_array_equal(nomi._data.columns, - nomi2._data.columns) - assert_array_equal(nomi._data_frame.columns, - nomi2._data_frame.columns) - assert nomi._data.shape == nomi._data.shape - assert nomi._data_frame.shape == nomi._data_frame.shape - - assert len(res.place_name.split(',')) > 1 - assert len(res2.place_name.split(',')) > 1 - - -def test_nominatim_query_postal_code(): - nomi = Nominatim('fr') - - res = nomi.query_postal_code(['91120']) - assert isinstance(res, pd.DataFrame) - assert res.shape[0] == 1 - assert res.place_name.values[0] == 'Palaiseau' - - res = nomi.query_postal_code('91120') - assert isinstance(res, pd.Series) - assert res.place_name == 'Palaiseau' - - res = nomi.query_postal_code(['33625', '31000', '99999']) - assert res.shape[0] == 3 - assert not np.isfinite(res.iloc[2].latitude) - - -def test_nominatim_query_postal_code_multiple(): - nomi = Nominatim('de', unique=False) - expected_places = [ - 'Wellen', - 'Groß Rodensleben', - 'Irxleben', - 'Eichenbarleben', - 'Klein Rodensleben', - 'Niederndodeleben', - 'Hohendodeleben', - 'Ochtmersleben', - ] - - res = nomi.query_postal_code('39167') - assert isinstance(res, pd.DataFrame) - assert res.shape[0] == len(expected_places) - for place in res.place_name.values: - assert place in expected_places - -@pytest.mark.slow -@pytest.mark.parametrize('country', pgeocode.COUNTRIES_VALID) -def test_nominatim_all_countries(country): - nomi = Nominatim(country) - res = nomi.query_postal_code('00000') - assert isinstance(res, pd.Series) - - -def test_nominatim_distance_postal_code(): - - gdist = GeoDistance('fr') - - dist = gdist.query_postal_code('91120', '91120') - assert dist == 0 - - # distance between Palaiseau and Strasbourg - dist = gdist.query_postal_code('91120', '67000') - assert isinstance(dist, float) - assert dist == pytest.approx(400, abs=4.5) - assert np.isfinite(dist).all() - - dist = gdist.query_postal_code('91120', ['31000', '67000']) - assert isinstance(dist, np.ndarray) - assert dist.shape == (2,) - assert np.isfinite(dist).all() - - dist = gdist.query_postal_code(['31000', '67000'], '91120') - assert isinstance(dist, np.ndarray) - assert dist.shape == (2,) - assert np.isfinite(dist).all() - - dist = gdist.query_postal_code(['31000', '67000'], ['67000', '31000']) - assert isinstance(dist, np.ndarray) - assert dist.shape == (2,) - assert np.diff(dist)[0] == 0 - assert np.isfinite(dist).all() - - -def test_haversine_distance(): - try: - from geopy.distance import great_circle - except ImportError: - raise pytest.skip('scikit-learn not installed') - - rng = np.random.RandomState(42) - - N = 100 - - x = rng.rand(N, 2) * 80 - y = x * rng.rand(N, 2) - - d_ref = np.zeros(N) - for idx, (x_coord, y_coord) in enumerate(zip(x, y)): - d_ref[idx] = great_circle(x_coord, y_coord).km - - d_pred = haversine_distance(x, y) - # same distance +/- 3 km - assert_allclose(d_ref, d_pred, atol=3) From 3c9a70cb195e76f83ff9ba6d4e4aa91413d92202 Mon Sep 17 00:00:00 2001 From: willidert Date: Sun, 11 Oct 2020 14:38:06 -0400 Subject: [PATCH 4/6] Adding notice for zip codes and test file modification --- pgeocode.py | 52 +++++++++++++++++++++++++++++++++++++++++------- test_pgeocode.py | 19 +++++++++++++++++- 2 files changed, 63 insertions(+), 8 deletions(-) diff --git a/pgeocode.py b/pgeocode.py index a3318b1..94ff97f 100644 --- a/pgeocode.py +++ b/pgeocode.py @@ -10,7 +10,6 @@ import numpy as np import pandas as pd -from models import check_is_valid_zip_code __version__ = "0.2.1" @@ -252,14 +251,21 @@ def query_postal_code(self, codes): if isinstance(codes, str): codes = [codes] single_entry = True - if self.country == 'BR': - warnings.warn(check_is_valid_zip_code(codes)) + test = _CheckPostalCode(code) + try: + if not test(): + warnings.warn(f'{code} is a invalid postal code for {self.country}') + except NotImplementedError as e: + pass else: single_entry = False - # warnings.warn(msg) - if self.country == 'BR': - [warnings.warn(msg) for msg in list( - map(check_is_valid_zip_code, codes))] + for code in codes: + test = _CheckPostalCode(code) + try: + if not test(): + warnings.warn(f'{code} is a invalid postal code for {self.country}') + except NotImplementedError as e: + pass if not isinstance(codes, pd.DataFrame): codes = pd.DataFrame(codes, columns=["postal_code"]) @@ -393,3 +399,35 @@ def haversine_distance(x, y): c = 2 * np.arcsin(np.sqrt(a)) return EARTH_RADIUS * c + + +class _CheckPostalCode(): + def __init__(self, country): + self.country = country + self.implemented = ["BR"] + + def __call__(self, postal_code): + if self.country not in self.implemented: + raise NotImplementedError + else: + return self._check_is_valid_zip_code(postal_code) + + def _is_valid_zip_brazil(self, zip_code): + # in the Brazil de zip_codes valids in Geoname are terminated + # with 000 e have 8 digits + if isinstance(zip_code, list): + zip_code = ''.join(zip_code) + if isinstance(zip_code, str) and zip_code.find('-') != -1: + zip_code = ''.join(zip_code.split('-')) + + if not isinstance(zip_code, str): + zip_code = str(zip_code) + + return all([len(zip_code) == 8, zip_code[5:] == '000']) + + def _check_is_valid_zip_code(self, zip_code): + # each country have a deferent postal code format + if isinstance(zip_code, list): + zip_code = ''.join(zip_code) + + return self._is_valid_zip_brazil(zip_code) diff --git a/test_pgeocode.py b/test_pgeocode.py index d8c966a..f3c5f8c 100644 --- a/test_pgeocode.py +++ b/test_pgeocode.py @@ -11,7 +11,7 @@ from numpy.testing import assert_allclose, assert_array_equal import pgeocode -from pgeocode import GeoDistance, Nominatim, haversine_distance +from pgeocode import GeoDistance, Nominatim, haversine_distance, _CheckPostalCode @pytest.fixture @@ -179,3 +179,20 @@ def test_haversine_distance(): d_pred = haversine_distance(x, y) # same distance +/- 3 km assert_allclose(d_ref, d_pred, atol=3) + + +class TestCheckPostalCode(): + """ Test for class CheckPostalCode""" + a = _CheckPostalCode("BR") + + def test_is_valid_zip_brazil_false(self, zip_code='69000-010'): + assert False == self.a._is_valid_zip_brazil(zip_code) + + def test_check_is_valid_zip_code_false(self, zip_code="69000-010"): + assert self.a._check_is_valid_zip_code(zip_code) == False + + def test_is_valid_zip_brazil_true(self, zip_code='69000-000'): + assert True == self.a._is_valid_zip_brazil(zip_code) + + def test_check_is_valid_zip_code_true(self, zip_code="69000-000"): + assert self.a._check_is_valid_zip_code(zip_code) == True From fce68e7933710cdcdcaec16a978273035686df05 Mon Sep 17 00:00:00 2001 From: willidert Date: Sun, 11 Oct 2020 14:38:06 -0400 Subject: [PATCH 5/6] Adding notice for zip codes and test file modification --- pgeocode.py | 52 +++++++++++++++++++++++++++++++++++++++++------- test_pgeocode.py | 19 +++++++++++++++++- 2 files changed, 63 insertions(+), 8 deletions(-) diff --git a/pgeocode.py b/pgeocode.py index a3318b1..94ff97f 100644 --- a/pgeocode.py +++ b/pgeocode.py @@ -10,7 +10,6 @@ import numpy as np import pandas as pd -from models import check_is_valid_zip_code __version__ = "0.2.1" @@ -252,14 +251,21 @@ def query_postal_code(self, codes): if isinstance(codes, str): codes = [codes] single_entry = True - if self.country == 'BR': - warnings.warn(check_is_valid_zip_code(codes)) + test = _CheckPostalCode(code) + try: + if not test(): + warnings.warn(f'{code} is a invalid postal code for {self.country}') + except NotImplementedError as e: + pass else: single_entry = False - # warnings.warn(msg) - if self.country == 'BR': - [warnings.warn(msg) for msg in list( - map(check_is_valid_zip_code, codes))] + for code in codes: + test = _CheckPostalCode(code) + try: + if not test(): + warnings.warn(f'{code} is a invalid postal code for {self.country}') + except NotImplementedError as e: + pass if not isinstance(codes, pd.DataFrame): codes = pd.DataFrame(codes, columns=["postal_code"]) @@ -393,3 +399,35 @@ def haversine_distance(x, y): c = 2 * np.arcsin(np.sqrt(a)) return EARTH_RADIUS * c + + +class _CheckPostalCode(): + def __init__(self, country): + self.country = country + self.implemented = ["BR"] + + def __call__(self, postal_code): + if self.country not in self.implemented: + raise NotImplementedError + else: + return self._check_is_valid_zip_code(postal_code) + + def _is_valid_zip_brazil(self, zip_code): + # in the Brazil de zip_codes valids in Geoname are terminated + # with 000 e have 8 digits + if isinstance(zip_code, list): + zip_code = ''.join(zip_code) + if isinstance(zip_code, str) and zip_code.find('-') != -1: + zip_code = ''.join(zip_code.split('-')) + + if not isinstance(zip_code, str): + zip_code = str(zip_code) + + return all([len(zip_code) == 8, zip_code[5:] == '000']) + + def _check_is_valid_zip_code(self, zip_code): + # each country have a deferent postal code format + if isinstance(zip_code, list): + zip_code = ''.join(zip_code) + + return self._is_valid_zip_brazil(zip_code) diff --git a/test_pgeocode.py b/test_pgeocode.py index d8c966a..4e0d0d8 100644 --- a/test_pgeocode.py +++ b/test_pgeocode.py @@ -11,7 +11,7 @@ from numpy.testing import assert_allclose, assert_array_equal import pgeocode -from pgeocode import GeoDistance, Nominatim, haversine_distance +from pgeocode import GeoDistance, Nominatim, haversine_distance, _CheckPostalCode @pytest.fixture @@ -179,3 +179,20 @@ def test_haversine_distance(): d_pred = haversine_distance(x, y) # same distance +/- 3 km assert_allclose(d_ref, d_pred, atol=3) + + +class TestCheckPostalCode(): + """ Test for class CheckPostalCode""" + a = _CheckPostalCode('BR') + + def test_is_valid_zip_brazil_false(self, zip_code='69000-010'): + assert False == self.a._is_valid_zip_brazil(zip_code) + + def test_check_is_valid_zip_code_false(self, zip_code='69000-010'): + assert self.a._check_is_valid_zip_code(zip_code) == False + + def test_is_valid_zip_brazil_true(self, zip_code='69000-000'): + assert True == self.a._is_valid_zip_brazil(zip_code) + + def test_check_is_valid_zip_code_true(self, zip_code='69000-000'): + assert self.a._check_is_valid_zip_code(zip_code) == True From 8c1a2d9f62f95ce0dcd4d64a8c97ee0e413cbb52 Mon Sep 17 00:00:00 2001 From: willidert Date: Sun, 11 Oct 2020 19:12:20 -0400 Subject: [PATCH 6/6] Test bugs fixes caused by my changes, sorry --- pgeocode.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pgeocode.py b/pgeocode.py index 7f22f18..cee355e 100644 --- a/pgeocode.py +++ b/pgeocode.py @@ -249,22 +249,22 @@ def query_postal_code(self, codes): codes = str(codes) if isinstance(codes, str): - code = [codes] + codes = [codes] single_entry = True - test = _CheckPostalCode(code) + check_zip_code = _CheckPostalCode(codes) try: - if not test(): + if not check_zip_code(codes): warnings.warn( - f"{code} is a invalid postal code for {self.country}" + f"{codes} is a invalid postal code for {self.country}" ) except NotImplementedError: pass else: single_entry = False for code in codes: - test = _CheckPostalCode(code) + check_zip_code = _CheckPostalCode(code) try: - if not test(): + if not check_zip_code(code): warnings.warn( f"{code} is a invalid" + f"postal code for {self.country}"