Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAB-1534: Fix LightFM transformation pipelines #15

Merged
merged 2 commits into from
Nov 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 41 additions & 14 deletions ds_toolkit/lightfm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,18 @@

from .recommendations_utils import isnull

__all__ = ["features_to_tags_pipeline", "features_to_list_with_tags_pipeline"]
__all__ = [

Check warning on line 7 in ds_toolkit/lightfm.py

View check run for this annotation

Codecov / codecov/patch

ds_toolkit/lightfm.py#L7

Added line #L7 was not covered by tests
"features_to_tags_pipeline_buy",
"features_to_tags_pipeline_rent",
"features_to_list_with_tags_pipeline",
]


class RentPriceTransformer(BaseEstimator, TransformerMixin):
"""
Sets price to nan if it is above 99th quantile for every category.
Sets price to None if it is above 99th quantile for almost all categories.
For APPT and HOUSE categories sets price to nan if it is above 30'000 and 60'000 respectively.
Those prices are much higher than the 99th quantile. This is done to avoid deleting very expensive listings from some expensive cantons and municipalities.
"""

def __init__(self):
Expand Down Expand Up @@ -47,7 +53,7 @@

class BuyPriceTransformer(BaseEstimator, TransformerMixin):
"""
Sets price to nan if it is above 99th quantile for every category.
Sets price to None if it is above 99th quantile for every category.
"""

def __init__(self):
Expand All @@ -71,7 +77,8 @@

class RentSpaceTransformer(BaseEstimator, TransformerMixin):
"""
Sets space to nan if it is suspicious high or low.
Sets space to None if it is suspiciously low (equal to or lower than 1 sqm).
Only PARK and INDUS DISPLAY_WINDOW categories are allowed to have such space.
"""

def __init__(self):
Expand Down Expand Up @@ -100,7 +107,10 @@

class BuySpaceTransformer(BaseEstimator, TransformerMixin):
"""
Sets space to nan if it is suspicious high or low.
Sets space to None for some categories if it is suspiciously high or low:
- APPT space is not allowed to be > 1000 or <= 1
- PARK space is not allowed to be > 100
- GASTRO, HOUSE spaces are not allowed to be <= 1
"""

def __init__(self):
Expand Down Expand Up @@ -137,7 +147,7 @@

class FloorTransformer(BaseEstimator, TransformerMixin):
"""
Sets floor to nan if it is suspicious high.
Sets floor to None if it is suspiciously high.
The highest building in Switzerland has 50 floors.
"""

Expand Down Expand Up @@ -198,8 +208,9 @@

class FeaturesIntoTagsTransformer(BaseEstimator, TransformerMixin):
"""
Transforms every features of every listing from pandas DataFrame
into a list of tuples (listing_id, features_list)
Transforms every feature of every listing from pandas DataFrame
into a list of tags - tuples (listing_id, features_list).
These tags are needed for the LightFM model and should be in the string format.
"""

def __init__(self):
Expand Down Expand Up @@ -228,8 +239,9 @@

class TagsListTransformer(BaseEstimator, TransformerMixin):
"""
Transforms all features from pandas DataFrame
into a list with unique features
Transforms all listing features from FeaturesIntoTagsTransformer
into a set of unique listing features.
This set is needed for the LightFM model.
"""

def __init__(self):
Expand All @@ -247,18 +259,33 @@
return feature_set


features_to_tags_pipeline = make_pipeline(
features_to_tags_pipeline_buy = make_pipeline(

Check warning on line 262 in ds_toolkit/lightfm.py

View check run for this annotation

Codecov / codecov/patch

ds_toolkit/lightfm.py#L262

Added line #L262 was not covered by tests
BuyPriceTransformer(),
BuySpaceTransformer(),
FloorTransformer(),
YearTransformer(),
FeaturesIntoTagsTransformer(),
)
"""
# Build transofrmation pipeline
# - clean some data (delete outliers, suspicious values)
# Build transformation pipeline for BUY listings:
# - clean some data (see Transformers' docstrings)
# - transform features into tags
# - create a list with all unique tags
# - create a set with all unique feature tags
"""


features_to_tags_pipeline_rent = make_pipeline(

Check warning on line 277 in ds_toolkit/lightfm.py

View check run for this annotation

Codecov / codecov/patch

ds_toolkit/lightfm.py#L277

Added line #L277 was not covered by tests
RentPriceTransformer(),
RentSpaceTransformer(),
FloorTransformer(),
YearTransformer(),
FeaturesIntoTagsTransformer(),
)
"""

Check warning on line 284 in ds_toolkit/lightfm.py

View check run for this annotation

Codecov / codecov/patch

ds_toolkit/lightfm.py#L284

Added line #L284 was not covered by tests
# Build transformation pipeline for RENT listings:
# - clean some data (see Transformers' docstrings)
# - transform features into tags
# - create a set with all unique feature tags
"""

features_to_list_with_tags_pipeline = make_pipeline(TagsListTransformer())
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "ds-toolkit"
version = "0.2.2"
version = "0.2.3"
description = "Utility package for SMG Real Estate DS team"
authors = ["Dmytro Yurchenko <dmytro.yurchenko@swissmarketplace.group>"]
license = "PROPRIETARY"
Expand Down