From a8ea4f29cc1e2268e73b939bd2d922eb07cb4186 Mon Sep 17 00:00:00 2001 From: Dmytro Yurchenko Date: Fri, 1 Dec 2023 16:48:20 +0100 Subject: [PATCH] DAB-1528 feat: add shared code from recommendations DAGs (#18) * DAB-1528 feat: add dump_object_to_s3 and load_object_from_s3 to utils * DAB-1528 feat: update version to 0.3.0 * addded is_acceptable_recommendation to recommendations_utils --- ds_toolkit/recommendations_utils.py | 21 ++++++ ds_toolkit/utils.py | 42 +++++++++++ poetry.lock | 110 +++++++++++++++++++++++++++- pyproject.toml | 4 +- tests/test_recommendation_utils.py | 45 ++++++++++++ tests/test_utils.py | 49 ++++++++++++- 6 files changed, 268 insertions(+), 3 deletions(-) diff --git a/ds_toolkit/recommendations_utils.py b/ds_toolkit/recommendations_utils.py index 89ac092..98eac21 100644 --- a/ds_toolkit/recommendations_utils.py +++ b/ds_toolkit/recommendations_utils.py @@ -2,6 +2,27 @@ from typing import Any, Optional import numpy as np +from geopy.distance import distance + + +def is_acceptable_recommendation( + source_listing: dict, max_geo_distance: float, target_listing: dict +): + source_categories = set(source_listing["CATEGORIES"].split(",")) + target_categories = set(target_listing["CATEGORIES"].split(",")) + geo_dist = distance( + (source_listing["LATITUDE"], source_listing["LONGITUDE"]), + ( + target_listing["LATITUDE"], + target_listing["LONGITUDE"], + ), + ).km + + return ( + target_listing["IS_ACTIVE"] + and geo_dist <= max_geo_distance + and (len(target_categories.intersection(source_categories)) > 0) + ) def get_cosine_similarity(source_vector, item_representations): diff --git a/ds_toolkit/utils.py b/ds_toolkit/utils.py index aedac8b..17096a9 100644 --- a/ds_toolkit/utils.py +++ b/ds_toolkit/utils.py @@ -1,3 +1,8 @@ +import pickle +from io import BytesIO +from typing import Any + +from botocore.client import BaseClient from datadog import initialize from datadog.api.metrics import Metric @@ -5,3 +10,40 @@ def send_datadog_metric(options, *args, **kwargs): initialize(**options) Metric.send(*args, **kwargs) + + +def dump_object_to_s3(client: BaseClient, obj: Any, bucket: str, key: str): + """ + Dump an object to S3 + + :param client: S3 client + :param obj: Object to dump + :param bucket: S3 bucket + :param key: S3 key + :return: None + """ + buff = BytesIO() + buff.write(pickle.dumps(obj)) + buff.seek(0) + + client.put_object( + Body=buff.read(), + Bucket=bucket, + Key=key, + ) + + +def load_object_from_s3(client: BaseClient, bucket: str, key: str) -> Any: + """ + Load an object from S3 + + :param client: S3 client + :param bucket: S3 bucket + :param key: S3 key + :return: Loaded object + """ + buff = client.get_object( + Bucket=bucket, + Key=key, + )["Body"].read() + return pickle.loads(buff) diff --git a/poetry.lock b/poetry.lock index f54c67c..0e73edf 100644 --- a/poetry.lock +++ b/poetry.lock @@ -182,6 +182,28 @@ d = ["aiohttp (>=3.7.4)"] jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] uvloop = ["uvloop (>=0.15.2)"] +[[package]] +name = "botocore" +version = "1.33.4" +description = "Low-level, data-driven core of boto 3." +optional = false +python-versions = ">= 3.7" +files = [ + {file = "botocore-1.33.4-py3-none-any.whl", hash = "sha256:bbd96c211b670d17191d617423f3c4c277964eeb586a33b3759691c33904629d"}, + {file = "botocore-1.33.4.tar.gz", hash = "sha256:872decbc760c3b2942477cda905d4443bd8a97511dcee3e9ca09eeb9299ad5e2"}, +] + +[package.dependencies] +jmespath = ">=0.7.1,<2.0.0" +python-dateutil = ">=2.1,<3.0.0" +urllib3 = [ + {version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""}, + {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""}, +] + +[package.extras] +crt = ["awscrt (==0.19.17)"] + [[package]] name = "certifi" version = "2023.7.22" @@ -517,6 +539,40 @@ files = [ {file = "frozenlist-1.4.0.tar.gz", hash = "sha256:09163bdf0b2907454042edb19f887c6d33806adc71fbd54afc14908bfdc22251"}, ] +[[package]] +name = "geographiclib" +version = "2.0" +description = "The geodesic routines from GeographicLib" +optional = false +python-versions = ">=3.7" +files = [ + {file = "geographiclib-2.0-py3-none-any.whl", hash = "sha256:6b7225248e45ff7edcee32becc4e0a1504c606ac5ee163a5656d482e0cd38734"}, + {file = "geographiclib-2.0.tar.gz", hash = "sha256:f7f41c85dc3e1c2d3d935ec86660dc3b2c848c83e17f9a9e51ba9d5146a15859"}, +] + +[[package]] +name = "geopy" +version = "2.4.1" +description = "Python Geocoding Toolbox" +optional = false +python-versions = ">=3.7" +files = [ + {file = "geopy-2.4.1-py3-none-any.whl", hash = "sha256:ae8b4bc5c1131820f4d75fce9d4aaaca0c85189b3aa5d64c3dcaf5e3b7b882a7"}, + {file = "geopy-2.4.1.tar.gz", hash = "sha256:50283d8e7ad07d89be5cb027338c6365a32044df3ae2556ad3f52f4840b3d0d1"}, +] + +[package.dependencies] +geographiclib = ">=1.52,<3" + +[package.extras] +aiohttp = ["aiohttp"] +dev = ["coverage", "flake8 (>=5.0,<5.1)", "isort (>=5.10.0,<5.11.0)", "pytest (>=3.10)", "pytest-asyncio (>=0.17)", "readme-renderer", "sphinx (<=4.3.2)", "sphinx-issues", "sphinx-rtd-theme (>=0.5.0)"] +dev-docs = ["readme-renderer", "sphinx (<=4.3.2)", "sphinx-issues", "sphinx-rtd-theme (>=0.5.0)"] +dev-lint = ["flake8 (>=5.0,<5.1)", "isort (>=5.10.0,<5.11.0)"] +dev-test = ["coverage", "pytest (>=3.10)", "pytest-asyncio (>=0.17)", "sphinx (<=4.3.2)"] +requests = ["requests (>=2.16.2)", "urllib3 (>=1.24.2)"] +timezone = ["pytz"] + [[package]] name = "identify" version = "2.5.30" @@ -553,6 +609,17 @@ files = [ {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, ] +[[package]] +name = "jmespath" +version = "1.0.1" +description = "JSON Matching Expressions" +optional = false +python-versions = ">=3.7" +files = [ + {file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"}, + {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, +] + [[package]] name = "joblib" version = "1.3.2" @@ -910,6 +977,20 @@ pytest = ">=5.0" [package.extras] dev = ["pre-commit", "pytest-asyncio", "tox"] +[[package]] +name = "python-dateutil" +version = "2.8.2" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, + {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, +] + +[package.dependencies] +six = ">=1.5" + [[package]] name = "pyyaml" version = "6.0.1" @@ -1194,6 +1275,17 @@ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.1)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + [[package]] name = "threadpoolctl" version = "3.2.0" @@ -1238,6 +1330,22 @@ files = [ {file = "typing_extensions-4.8.0.tar.gz", hash = "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef"}, ] +[[package]] +name = "urllib3" +version = "1.26.18" +description = "HTTP library with thread-safe connection pooling, file post, and more." +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +files = [ + {file = "urllib3-1.26.18-py2.py3-none-any.whl", hash = "sha256:34b97092d7e0a3a8cf7cd10e386f401b3737364026c45e622aa02903dffe0f07"}, + {file = "urllib3-1.26.18.tar.gz", hash = "sha256:f8ecc1bba5667413457c529ab955bf8c67b45db799d159066261719e328580a0"}, +] + +[package.extras] +brotli = ["brotli (==1.0.9)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] +secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] +socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] + [[package]] name = "urllib3" version = "2.0.7" @@ -1365,4 +1473,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.11" -content-hash = "632be22c556d1c9230b38808ae3ac199462f77c460ba7b8c7f3fa15f7c6ccf61" +content-hash = "891c9bc875272a518a8f1c31508728b1b38dec696c7c3ca8a8e9540036660ccf" diff --git a/pyproject.toml b/pyproject.toml index c957e58..1f3fac7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "ds-toolkit" -version = "0.2.4" +version = "0.3.0" description = "Utility package for SMG Real Estate DS team" authors = ["Dmytro Yurchenko "] license = "PROPRIETARY" @@ -8,6 +8,8 @@ readme = "README.md" packages = [{include = "ds_toolkit"}] [tool.poetry.dependencies] +botocore = "^1.31.0" +geopy = "^2.4.0" python = ">=3.8,<3.11" numpy = "^1.24.0" datadog = "^0.47.0" diff --git a/tests/test_recommendation_utils.py b/tests/test_recommendation_utils.py index 3a229f4..9d9753b 100644 --- a/tests/test_recommendation_utils.py +++ b/tests/test_recommendation_utils.py @@ -1,4 +1,5 @@ import json +from unittest import mock import numpy as np @@ -11,11 +12,55 @@ flatten_hgrets, get_category_code, get_recommendations_ordered_by_distance, + is_acceptable_recommendation, isnull, normalise_price, ) +@mock.patch("ds_toolkit.recommendations_utils.distance") +def test_is_acceptable_recommendation(mock_distance): + mock_distance.return_value = lambda: None + mock_distance.return_value.km = 5.0 + + source_listing = { + "LISTING_ID": 1, + "LATITUDE": 47.3769, + "LONGITUDE": 8.5417, + "CATEGORIES": "HOUSE,SINGLE_HOUSE", + "IS_ACTIVE": True, + } + + target_listing = { + "LISTING_ID": 2, + "LATITUDE": 43.3769, + "LONGITUDE": 8.5417, + "CATEGORIES": "HOUSE", + "IS_ACTIVE": True, + } + + assert ( + is_acceptable_recommendation(source_listing, 10.0, target_listing) + is True + ) + assert ( + is_acceptable_recommendation( + source_listing, 10.0, {**target_listing, "IS_ACTIVE": False} + ) + is False + ) + assert ( + is_acceptable_recommendation( + source_listing, 10.0, {**target_listing, "CATEGORIES": "GARAGE"} + ) + is False + ) + assert ( + is_acceptable_recommendation(source_listing, 3.0, target_listing) + is False + ) + + def test_get_recommendations_ordered_by_distance(): recommended_listing_ids_map = { 1: [(20, 0.1), (30, 0.3)], diff --git a/tests/test_utils.py b/tests/test_utils.py index e855cb0..ee83071 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,10 @@ from unittest import mock -from ds_toolkit.utils import send_datadog_metric +from ds_toolkit.utils import ( + dump_object_to_s3, + load_object_from_s3, + send_datadog_metric, +) @mock.patch("ds_toolkit.utils.initialize") @@ -21,3 +25,46 @@ def test_send_datadog_metric(mock_send, mock_initialize): tags=["test:1"], metric="test.metric", points=[(1, 1)] ) mock_initialize.assert_called_once_with(**datadog_options) + + +@mock.patch("ds_toolkit.utils.pickle.dumps") +@mock.patch("ds_toolkit.utils.BytesIO") +def test_dump_object_to_s3(mock_bytesio, mock_dumps): + mock_client = mock.MagicMock() + mock_obj = mock.MagicMock() + mock_bucket = mock.MagicMock() + mock_key = mock.MagicMock() + mock_buff = mock.MagicMock() + mock_bytesio.return_value = mock_buff + mock_dumps.return_value = "pickled_obj" + mock_buff.read.return_value = "pickled_obj" + + dump_object_to_s3(mock_client, mock_obj, mock_bucket, mock_key) + mock_bytesio.assert_called_once_with() + mock_dumps.assert_called_once_with(mock_obj) + mock_buff.write.assert_called_once_with("pickled_obj") + mock_buff.seek.assert_called_once_with(0) + mock_client.put_object.assert_called_once_with( + Body="pickled_obj", Bucket=mock_bucket, Key=mock_key + ) + + +@mock.patch("ds_toolkit.utils.pickle.loads") +def test_load_object_from_s3(mock_loads): + mock_client = mock.MagicMock() + mock_bucket = mock.MagicMock() + mock_key = mock.MagicMock() + mock_obj = mock.MagicMock() + mock_client.get_object.return_value = {"Body": mock_obj} + mock_obj.read.return_value = "pickled_obj" + mock_loads.return_value = "unpickled_obj" + + assert ( + load_object_from_s3(mock_client, mock_bucket, mock_key) + == "unpickled_obj" + ) + mock_client.get_object.assert_called_once_with( + Bucket=mock_bucket, Key=mock_key + ) + mock_obj.read.assert_called_once_with() + mock_loads.assert_called_once_with("pickled_obj")