From 2edc247e040f7b2b548327ce7f94d4c4728ce201 Mon Sep 17 00:00:00 2001 From: "Rosato, Matteo (contracted)" Date: Mon, 26 Feb 2024 16:08:31 +0100 Subject: [PATCH 01/28] First commit. Template created from cookiecutter-data-science --- .gitignore | 105 ++++---------- Makefile | 144 +++++++++++++++++++ README.md | 59 +++++++- docs/Makefile | 153 +++++++++++++++++++++ docs/commands.rst | 10 ++ docs/conf.py | 244 +++++++++++++++++++++++++++++++++ docs/getting-started.rst | 6 + docs/index.rst | 24 ++++ docs/make.bat | 190 +++++++++++++++++++++++++ models/.gitkeep | 0 notebooks/.gitkeep | 0 references/.gitkeep | 0 reports/.gitkeep | 0 reports/figures/.gitkeep | 0 requirements.txt | 10 ++ setup.py | 10 ++ src/__init__.py | 0 src/data/.gitkeep | 0 src/data/__init__.py | 0 src/data/make_dataset.py | 30 ++++ src/features/.gitkeep | 0 src/features/__init__.py | 0 src/features/build_features.py | 0 src/models/.gitkeep | 0 src/models/__init__.py | 0 src/models/predict_model.py | 0 src/models/train_model.py | 0 src/visualization/.gitkeep | 0 src/visualization/__init__.py | 0 src/visualization/visualize.py | 0 test_environment.py | 25 ++++ tox.ini | 3 + 32 files changed, 931 insertions(+), 82 deletions(-) create mode 100644 Makefile create mode 100644 docs/Makefile create mode 100644 docs/commands.rst create mode 100644 docs/conf.py create mode 100644 docs/getting-started.rst create mode 100644 docs/index.rst create mode 100644 docs/make.bat create mode 100644 models/.gitkeep create mode 100644 notebooks/.gitkeep create mode 100644 references/.gitkeep create mode 100644 reports/.gitkeep create mode 100644 reports/figures/.gitkeep create mode 100644 requirements.txt create mode 100644 setup.py create mode 100644 src/__init__.py create mode 100644 src/data/.gitkeep create mode 100644 src/data/__init__.py create mode 100644 src/data/make_dataset.py create mode 100644 src/features/.gitkeep create mode 100644 src/features/__init__.py create mode 100644 src/features/build_features.py create mode 100644 src/models/.gitkeep create mode 100644 src/models/__init__.py create mode 100644 src/models/predict_model.py create mode 100644 src/models/train_model.py create mode 100644 src/visualization/.gitkeep create mode 100644 src/visualization/__init__.py create mode 100644 src/visualization/visualize.py create mode 100644 test_environment.py create mode 100644 tox.ini diff --git a/.gitignore b/.gitignore index 68bc17f..7ebb815 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,6 @@ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] -*$py.class # C extensions *.so @@ -19,12 +18,9 @@ lib64/ parts/ sdist/ var/ -wheels/ -share/python-wheels/ *.egg-info/ .installed.cfg *.egg -MANIFEST # PyInstaller # Usually these files are written by a python script from a template @@ -39,17 +35,12 @@ pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ -.nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ # Translations *.mo @@ -57,69 +48,14 @@ cover/ # Django stuff: *.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy # Sphinx documentation docs/_build/ # PyBuilder -.pybuilder/ target/ -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - 
-# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/#use-with-ide -.pdm.toml - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments +# DotEnv configuration .env .venv env/ @@ -128,17 +64,33 @@ ENV/ env.bak/ venv.bak/ -# Spyder project settings -.spyderproject -.spyproject +# Database +*.db +*.rdb + +# Pycharm +.idea + +# VS Code +.vscode/ -# Rope project settings -.ropeproject +# Spyder +.spyproject/ -# mkdocs documentation -/site +# Jupyter NB Checkpoints +.ipynb_checkpoints/ -# mypy +# exclude data from source control by default +/data/ + +# Mac OS-specific storage files +.DS_Store + +# vim +*.swp +*.swo + +# Mypy .mypy_cache/ .dmypy.json dmypy.json @@ -151,10 +103,3 @@ dmypy.json # Cython debug symbols cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..c6b2ae2 --- /dev/null +++ b/Makefile @@ -0,0 +1,144 @@ +.PHONY: clean data lint requirements sync_data_to_s3 sync_data_from_s3 + +################################################################################# +# GLOBALS # +################################################################################# + +PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) +BUCKET = [OPTIONAL] your-bucket-for-syncing-data (do not include 's3://') +PROFILE = default +PROJECT_NAME = property-finder +PYTHON_INTERPRETER = python3 + +ifeq (,$(shell which conda)) +HAS_CONDA=False +else +HAS_CONDA=True +endif + +################################################################################# +# COMMANDS # +################################################################################# + +## Install Python Dependencies +requirements: test_environment + $(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel + $(PYTHON_INTERPRETER) -m pip install -r requirements.txt + +## Make Dataset +data: requirements + $(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw data/processed + +## Delete all compiled Python files +clean: + find . -type f -name "*.py[co]" -delete + find . 
-type d -name "__pycache__" -delete + +## Lint using flake8 +lint: + flake8 src + +## Upload Data to S3 +sync_data_to_s3: +ifeq (default,$(PROFILE)) + aws s3 sync data/ s3://$(BUCKET)/data/ +else + aws s3 sync data/ s3://$(BUCKET)/data/ --profile $(PROFILE) +endif + +## Download Data from S3 +sync_data_from_s3: +ifeq (default,$(PROFILE)) + aws s3 sync s3://$(BUCKET)/data/ data/ +else + aws s3 sync s3://$(BUCKET)/data/ data/ --profile $(PROFILE) +endif + +## Set up python interpreter environment +create_environment: +ifeq (True,$(HAS_CONDA)) + @echo ">>> Detected conda, creating conda environment." +ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER))) + conda create --name $(PROJECT_NAME) python=3 +else + conda create --name $(PROJECT_NAME) python=2.7 +endif + @echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)" +else + $(PYTHON_INTERPRETER) -m pip install -q virtualenv virtualenvwrapper + @echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in shell startup file\n\ + export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n" + @bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)" + @echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)" +endif + +## Test python environment is setup correctly +test_environment: + $(PYTHON_INTERPRETER) test_environment.py + +################################################################################# +# PROJECT RULES # +################################################################################# + + + +################################################################################# +# Self Documenting Commands # +################################################################################# + +.DEFAULT_GOAL := help + +# Inspired by +# sed script explained: +# /^##/: +# * save line in hold space +# * purge line +# * Loop: +# * append newline + line to hold space +# * go to next line +# * if line starts with doc comment, strip comment character off and loop +# * remove target prerequisites +# * append hold space (+ newline) to line +# * replace newline plus comments by `---` +# * print line +# Separate expressions are necessary because labels cannot be delimited by +# semicolon; see +.PHONY: help +help: + @echo "$$(tput bold)Available rules:$$(tput sgr0)" + @echo + @sed -n -e "/^## / { \ + h; \ + s/.*//; \ + :doc" \ + -e "H; \ + n; \ + s/^## //; \ + t doc" \ + -e "s/:.*//; \ + G; \ + s/\\n## /---/; \ + s/\\n/ /g; \ + p; \ + }" ${MAKEFILE_LIST} \ + | LC_ALL='C' sort --ignore-case \ + | awk -F '---' \ + -v ncol=$$(tput cols) \ + -v indent=19 \ + -v col_on="$$(tput setaf 6)" \ + -v col_off="$$(tput sgr0)" \ + '{ \ + printf "%s%*s%s ", col_on, -indent, $$1, col_off; \ + n = split($$2, words, " "); \ + line_length = ncol - indent; \ + for (i = 1; i <= n; i++) { \ + line_length -= length(words[i]) + 1; \ + if (line_length <= 0) { \ + line_length = ncol - indent - length(words[i]) - 1; \ + printf "\n%*s ", -indent, " "; \ + } \ + printf "%s ", words[i]; \ + } \ + printf "\n"; \ + }' \ + | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars') diff --git a/README.md b/README.md index 2ab5246..8705000 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,57 @@ -# property-finder -This repository is dedicated to retrieving and analyzing property advertisements from real estate websites. 
+Property finder +============================== + +This project is dedicated to retrieving and analyzing property advertisements from real estate websites. + +Project Organization +------------ + + ├── LICENSE + ├── Makefile <- Makefile with commands like `make data` or `make train` + ├── README.md <- The top-level README for developers using this project. + ├── data + │   ├── external <- Data from third party sources. + │   ├── interim <- Intermediate data that has been transformed. + │   ├── processed <- The final, canonical data sets for modeling. + │   └── raw <- The original, immutable data dump. + │ + ├── docs <- A default Sphinx project; see sphinx-doc.org for details + │ + ├── models <- Trained and serialized models, model predictions, or model summaries + │ + ├── notebooks <- Jupyter notebooks. Naming convention is a number (for ordering), + │ the creator's initials, and a short `-` delimited description, e.g. + │ `1.0-jqp-initial-data-exploration`. + │ + ├── references <- Data dictionaries, manuals, and all other explanatory materials. + │ + ├── reports <- Generated analysis as HTML, PDF, LaTeX, etc. + │   └── figures <- Generated graphics and figures to be used in reporting + │ + ├── requirements.txt <- The requirements file for reproducing the analysis environment, e.g. + │ generated with `pip freeze > requirements.txt` + │ + ├── setup.py <- makes project pip installable (pip install -e .) so src can be imported + ├── src <- Source code for use in this project. + │   ├── __init__.py <- Makes src a Python module + │ │ + │   ├── data <- Scripts to download or generate data + │   │   └── make_dataset.py + │ │ + │   ├── features <- Scripts to turn raw data into features for modeling + │   │   └── build_features.py + │ │ + │   ├── models <- Scripts to train models and then use trained models to make + │ │ │ predictions + │   │   ├── predict_model.py + │   │   └── train_model.py + │ │ + │   └── visualization <- Scripts to create exploratory and results oriented visualizations + │   └── visualize.py + │ + └── tox.ini <- tox file with settings for running tox; see tox.readthedocs.io + + +-------- + +
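For orientation, the Makefile added earlier in this patch drives the typical workflow. A minimal sketch of the intended command sequence follows (note that `make train`, mentioned in the Makefile comment above, is not defined yet in this template):

```shell
# create a conda environment or virtualenv named "property-finder"
make create_environment

# verify the interpreter (test_environment.py) and install dependencies
# (requirements.txt also installs the src package itself via "-e .")
make requirements

# run src/data/make_dataset.py to turn data/raw into data/processed
make data

# lint the src package with flake8
make lint
```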

Project based on the [cookiecutter data science project template](https://drivendata.github.io/cookiecutter-data-science/). #cookiecutterdatascience

diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..cb5a700 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,153 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . + +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext + +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + -rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/property-finder.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/property-finder.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." 
+ @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/property-finder" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/property-finder" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." diff --git a/docs/commands.rst b/docs/commands.rst new file mode 100644 index 0000000..2d162f3 --- /dev/null +++ b/docs/commands.rst @@ -0,0 +1,10 @@ +Commands +======== + +The Makefile contains the central entry points for common tasks related to this project. + +Syncing data to S3 +^^^^^^^^^^^^^^^^^^ + +* `make sync_data_to_s3` will use `aws s3 sync` to recursively sync files in `data/` up to `s3://[OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')/data/`. +* `make sync_data_from_s3` will use `aws s3 sync` to recursively sync files from `s3://[OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')/data/` to `data/`. diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..c63afa5 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,244 @@ +# -*- coding: utf-8 -*- +# +# Property finder documentation build configuration file, created by +# sphinx-quickstart. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. 
+ +import os +import sys + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ----------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = [] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +# source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'Property finder' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0.1' +# The full version, including alpha/beta/rc tags. +release = '0.1' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +# today = '' +# Else, today_fmt is used as the format for a strftime call. +# today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all documents. +# default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +# add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +# add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +# show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +# modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +# html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +# html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +# html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +# html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. 
+# html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +# html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +# html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +# html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +# html_additional_pages = {} + +# If false, no module index is generated. +# html_domain_indices = True + +# If false, no index is generated. +# html_use_index = True + +# If true, the index is split into individual pages for each letter. +# html_split_index = False + +# If true, links to the reST sources are added to the pages. +# html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +# html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +# html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +# html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +# html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'property-finderdoc' + + +# -- Options for LaTeX output -------------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # 'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ('index', + 'property-finder.tex', + u'Property finder Documentation', + u"Matteo Rosato", 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +# latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +# latex_use_parts = False + +# If true, show page references after internal links. +# latex_show_pagerefs = False + +# If true, show URL addresses after external links. +# latex_show_urls = False + +# Documents to append as an appendix to all manuals. +# latex_appendices = [] + +# If false, no module index is generated. +# latex_domain_indices = True + + +# -- Options for manual page output -------------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'property-finder', u'Property finder Documentation', + [u"Matteo Rosato"], 1) +] + +# If true, show URL addresses after external links. +# man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------------ + +# Grouping the document tree into Texinfo files. 
List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'property-finder', u'Property finder Documentation', + u"Matteo Rosato", 'Property finder', + 'This project is dedicated to retrieving and analyzing property advertisements from real estate websites.', 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +# texinfo_appendices = [] + +# If false, no module index is generated. +# texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +# texinfo_show_urls = 'footnote' diff --git a/docs/getting-started.rst b/docs/getting-started.rst new file mode 100644 index 0000000..b4f71c3 --- /dev/null +++ b/docs/getting-started.rst @@ -0,0 +1,6 @@ +Getting started +=============== + +This is where you describe how to get set up on a clean install, including the +commands necessary to get the raw data (using the `sync_data_from_s3` command, +for example), and then how to make the cleaned, final data sets. diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..ef71dea --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,24 @@ +.. Property finder documentation master file, created by + sphinx-quickstart. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Property finder documentation! +============================================== + +Contents: + +.. toctree:: + :maxdepth: 2 + + getting-started + commands + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..584d7fd --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,190 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=_build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . +set I18NSPHINXOPTS=%SPHINXOPTS% . +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. 
+ goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\property-finder.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\property-finder.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. 
+ goto end +) + +:end diff --git a/models/.gitkeep b/models/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/notebooks/.gitkeep b/notebooks/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/references/.gitkeep b/references/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/reports/.gitkeep b/reports/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/reports/figures/.gitkeep b/reports/figures/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..d4f7d11 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +# local package +-e . + +# external requirements +click +Sphinx +coverage +awscli +flake8 +python-dotenv>=0.5.1 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..08eeec2 --- /dev/null +++ b/setup.py @@ -0,0 +1,10 @@ +from setuptools import find_packages, setup + +setup( + name='src', + packages=find_packages(), + version='0.1.0', + description='Project for retrieving and analyzing property advertisements from real estate websites.', + author='Matteo Rosato', + license='AGPL-3.0 license', +) diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/data/.gitkeep b/src/data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/data/__init__.py b/src/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py new file mode 100644 index 0000000..96b377a --- /dev/null +++ b/src/data/make_dataset.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +import click +import logging +from pathlib import Path +from dotenv import find_dotenv, load_dotenv + + +@click.command() +@click.argument('input_filepath', type=click.Path(exists=True)) +@click.argument('output_filepath', type=click.Path()) +def main(input_filepath, output_filepath): + """ Runs data processing scripts to turn raw data from (../raw) into + cleaned data ready to be analyzed (saved in ../processed). 
+ """ + logger = logging.getLogger(__name__) + logger.info('making final data set from raw data') + + +if __name__ == '__main__': + log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + logging.basicConfig(level=logging.INFO, format=log_fmt) + + # not used in this stub but often useful for finding various files + project_dir = Path(__file__).resolve().parents[2] + + # find .env automagically by walking up directories until it's found, then + # load up the .env entries as environment variables + load_dotenv(find_dotenv()) + + main() diff --git a/src/features/.gitkeep b/src/features/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/features/__init__.py b/src/features/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/features/build_features.py b/src/features/build_features.py new file mode 100644 index 0000000..e69de29 diff --git a/src/models/.gitkeep b/src/models/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/models/__init__.py b/src/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/models/predict_model.py b/src/models/predict_model.py new file mode 100644 index 0000000..e69de29 diff --git a/src/models/train_model.py b/src/models/train_model.py new file mode 100644 index 0000000..e69de29 diff --git a/src/visualization/.gitkeep b/src/visualization/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/visualization/__init__.py b/src/visualization/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/visualization/visualize.py b/src/visualization/visualize.py new file mode 100644 index 0000000..e69de29 diff --git a/test_environment.py b/test_environment.py new file mode 100644 index 0000000..d0ac4a7 --- /dev/null +++ b/test_environment.py @@ -0,0 +1,25 @@ +import sys + +REQUIRED_PYTHON = "python3" + + +def main(): + system_major = sys.version_info.major + if REQUIRED_PYTHON == "python": + required_major = 2 + elif REQUIRED_PYTHON == "python3": + required_major = 3 + else: + raise ValueError("Unrecognized python interpreter: {}".format( + REQUIRED_PYTHON)) + + if system_major != required_major: + raise TypeError( + "This project requires Python {}. 
Found: Python {}".format( + required_major, sys.version)) + else: + print(">>> Development environment passes all tests!") + + +if __name__ == '__main__': + main() diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..c32fbd8 --- /dev/null +++ b/tox.ini @@ -0,0 +1,3 @@ +[flake8] +max-line-length = 79 +max-complexity = 10 From 6a052d0c5e789143212ca5769dc2c39a6735ac07 Mon Sep 17 00:00:00 2001 From: matteorosato <101740643+matteorosato@users.noreply.github.com> Date: Wed, 6 Mar 2024 15:47:14 +0100 Subject: [PATCH 02/28] Defined a first flow in make_dataset.py (for downloading and exporting data) Added .env.example file Added config.toml Modified requirements.txt --- .env.example | 3 + config.toml | 47 ++++++++++++++++ requirements.txt | 2 + src/data/make_dataset.py | 117 ++++++++++++++++++++++++++++++++++++++- 4 files changed, 168 insertions(+), 1 deletion(-) create mode 100644 .env.example create mode 100644 config.toml diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..a92237a --- /dev/null +++ b/.env.example @@ -0,0 +1,3 @@ +#### IDEALISTA #### +IDEALISTA_API_KEY=myApiKey +IDEALISTA_SECRET=mySecret diff --git a/config.toml b/config.toml new file mode 100644 index 0000000..f2821e6 --- /dev/null +++ b/config.toml @@ -0,0 +1,47 @@ +# API Parameters Configuration File +# Refer to the official documentation for more details + +[BASE_FILTERS] +country = "it" # Country code for the website (required) [string] +operation = "sale" # Type of operation (required) [string] +propertyType = "homes" # Type of property (required) [string] +center = "40.353,18.174" # Geographic coordinates for search center (WGS84) (latitude, longitude) [string] +locale = "it" # Search language for summary [string] +distance = 3000 # Distance to center, in meters (ratio) [integer] +locationId = "" # Idealista location code [string] +maxItems = 50 # Items per page (maximum allowed: 50) [integer] +numPage = "" # Page number for pagination (1, 2, 3..n) [integer] +maxPrice = 200000 # Maximum price in response [double] +minPrice = 10000 # Minimum price in response [double] +sinceDate = "" # Property age (W:last week, M: last month, T:last day (for rent except rooms), Y: last 2 days (sale and rooms)) [string] +order = "publicationDate" # Allowed sorting values by property type [string] +sort = "desc" # Sort order (asc or desc) [string] +adIds = "" # Filter by adid (multivalued field) [array] +hasMultimedia = true # Retrieve properties with pictures or video or virtual tour [boolean] + +[HOME_FILTERS] +minSize = "" # min size (from 60 m2 to 1000m2) [double] +maxSize = "" # max size (from 60 m2 to 1000m2) [double] +virtualTour = "" # virtual tour [boolean] +flat = "" # property is a flat [boolean] +penthouse = "" # [boolean] +duplex = "" # [boolean] +studio = "" # [boolean] +chalet = "" # [boolean] +countryHouse = "" # [boolean] +bedrooms = "" # bedroom number (multivalued field: 0,1,2,3,4) [string] +bathrooms = "" # bathroom number (multivalued field: 0,1,2,3) [string] +preservation = "" # property preservation (good, renew) [string] +newDevelopment = "" # if true, return only new development properties [boolean] +furnished = "" # (furnished, furnishedKitchen) [string] +bankOffer = "" # owner is a bank [boolean] +garage = "" # has garage [boolean] +terrance = "" # has terrance [boolean] +exterior = "" # [boolean] +elevator = "" # [boolean] +swimmingPool = "" # [boolean] +airConditioning = "" # [boolean] +storeRoom = "" # [boolean] +clotheslineSpace = "" # [boolean] +builtinWardrobes = "" # 
[boolean] +subTypology = "" # chalet subtypology (for propertyType = homes and chalet = true) [string] \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index d4f7d11..f47dafa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,5 @@ coverage awscli flake8 python-dotenv>=0.5.1 +pandas>=2.2.0 +toml \ No newline at end of file diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py index 96b377a..eaba361 100644 --- a/src/data/make_dataset.py +++ b/src/data/make_dataset.py @@ -1,10 +1,105 @@ # -*- coding: utf-8 -*- +import base64 +import json +import os +import pathlib +import time + import click import logging from pathlib import Path + +import pandas as pd +import requests +import toml from dotenv import find_dotenv, load_dotenv +def create_dataset(source_dir) -> pd.DataFrame: + json_files = [f for f in pathlib.Path(source_dir).glob("*.json")] + json_files.reverse() # put files in descending order + dfs = [] + for file in json_files: + with open(file, 'r') as f: + elements_dict = json.load(f)['elementList'] + dfs.append(pd.DataFrame.from_dict(elements_dict)) + merged_df = pd.concat(dfs) + merged_df = merged_df.drop_duplicates(subset=['propertyCode'], keep='first') + return merged_df + + +def export_dataset(df, output_dir): + # export df for backup purposes + output_filename = os.path.join(output_dir, f'df_total_{int(time.time())}.csv') + df.to_csv(output_filename) + + +def get_oauth_token(api_key: str, secret: str) -> str: + message = f"{api_key}:{secret}" + + # deal with bytes-like object + message_bytes = message.encode('ascii') + base64_bytes = base64.b64encode(message_bytes) + base64_message = base64_bytes.decode('ascii') + + auth_header = f"Basic {base64_message}" + + headers_dict = {"Authorization": auth_header, + "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8"} + + params_dict = {"grant_type": "client_credentials", + "scope": "read"} + + try: + r = requests.post("https://api.idealista.com/oauth/token", + headers=headers_dict, + params=params_dict) + r.raise_for_status() + token = r.json()["access_token"] + except requests.exceptions.RequestException as e: + print(f"Connection error: '{str(e)}'") + raise + + return token + + +def read_toml_config(file_path: str) -> dict: + with open(file_path, 'r') as file: + config_dict = toml.load(file) + return config_dict + + +def parse_filter_params(params_dict: dict) -> dict: + filtered_params = dict() + for dictionary in params_dict.values(): + for k, v in dictionary.items(): + if str(v): # keep non-empty values only + filtered_params[k] = v + return filtered_params + + +def define_search_url(country: str) -> str: + search_url = f'https://api.idealista.com/3.5/{country}/search' + return search_url + + +def get_results(url, params): + token = get_oauth_token(IDEALISTA_API_KEY, IDEALISTA_SECRET) + headers_dict = {"Authorization": 'Bearer ' + token, + "Content-Type": "application/x-www-form-urlencoded" + } + try: + r = requests.post(url, headers=headers_dict, params=params) + r.raise_for_status() + + result = r.json() + except requests.exceptions.RequestException as e: + print(f"Connection error: '{str(e)}'") + raise + + return result + + @click.command() @click.argument('input_filepath', type=click.Path(exists=True)) @click.argument('output_filepath', type=click.Path()) @@ -13,7 +108,24 @@ def main(input_filepath, output_filepath): cleaned data ready to be analyzed (saved in ../processed). 
""" logger = logging.getLogger(__name__) - logger.info('making final data set from raw data') + + config_filepath = 'config.toml' + params = read_toml_config(config_filepath) + filtered_params = parse_filter_params(params) + url = define_search_url(country=filtered_params['country']) + logger.info(f'Getting results from {url}') + result = get_results(url, filtered_params) + + output_filename = f'data/raw/dump_{int(time.time())}.json' + logger.info(f'Exporting data to {output_filename}') + with open(output_filename, 'w') as f: + f.write(json.dumps(result, indent=4)) + + logger.info('Creating dataset...') + df = create_dataset(input_filepath) + + logger.info(f'Exporting dataset to {output_filepath}') + export_dataset(df, output_filepath) if __name__ == '__main__': @@ -27,4 +139,7 @@ def main(input_filepath, output_filepath): # load up the .env entries as environment variables load_dotenv(find_dotenv()) + IDEALISTA_API_KEY: str = os.environ['IDEALISTA_API_KEY'] + IDEALISTA_SECRET: str = os.environ['IDEALISTA_SECRET'] + main() From ecdee3299b57183c8a2860ddf1cc683607200108 Mon Sep 17 00:00:00 2001 From: matteorosato Date: Thu, 7 Mar 2024 11:45:44 +0100 Subject: [PATCH 03/28] Added clean_dataset function Minor changes in config.toml --- config.toml | 14 +++++++------- src/data/make_dataset.py | 26 +++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/config.toml b/config.toml index f2821e6..7cb8bc2 100644 --- a/config.toml +++ b/config.toml @@ -23,12 +23,12 @@ hasMultimedia = true # Retrieve properties with pictures or video or virtual to minSize = "" # min size (from 60 m2 to 1000m2) [double] maxSize = "" # max size (from 60 m2 to 1000m2) [double] virtualTour = "" # virtual tour [boolean] -flat = "" # property is a flat [boolean] -penthouse = "" # [boolean] -duplex = "" # [boolean] -studio = "" # [boolean] -chalet = "" # [boolean] -countryHouse = "" # [boolean] +flat = true # property is a flat [boolean] +penthouse = false # [boolean] +duplex = true # [boolean] +studio = true # [boolean] +chalet = true # [boolean] +countryHouse = false # [boolean] bedrooms = "" # bedroom number (multivalued field: 0,1,2,3,4) [string] bathrooms = "" # bathroom number (multivalued field: 0,1,2,3) [string] preservation = "" # property preservation (good, renew) [string] @@ -39,7 +39,7 @@ garage = "" # has garage [boolean] terrance = "" # has terrance [boolean] exterior = "" # [boolean] elevator = "" # [boolean] -swimmingPool = "" # [boolean] +swimmingPool = false # [boolean] airConditioning = "" # [boolean] storeRoom = "" # [boolean] clotheslineSpace = "" # [boolean] diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py index eaba361..c927c37 100644 --- a/src/data/make_dataset.py +++ b/src/data/make_dataset.py @@ -28,6 +28,27 @@ def create_dataset(source_dir) -> pd.DataFrame: return merged_df +def clean_dataset(df: pd.DataFrame) -> pd.DataFrame: + columns = ['propertyCode', 'floor', 'price', 'size', 'rooms', 'bathrooms', 'address', 'province', 'municipality', + 'district', 'latitude', 'longitude', 'showAddress', 'url', 'distance', 'description', 'status', + 'newDevelopment', 'hasLift', 'priceByArea', 'detailedType', 'hasPlan', 'hasStaging', 'topNewDevelopment', + 'topPlus', 'externalReference', 'isAuction', 'parkingSpace', 'labels', 'highlight', + 'newDevelopmentFinished'] + df = df[columns] # keep only specified columns + + # cast to int + df = df.astype({'price': 'int', 'size': 'int', 'priceByArea': 'int'}) + + # remove auction ads + df = df[df['isAuction'].isna()] 
+ df = df.drop(columns=['isAuction']) + + # convert floors to numbers + df['floor'] = df['floor'].replace('ss', -1).replace('bj', 0).replace('en', 0.5) + + return df + + def export_dataset(df, output_dir): # export df for backup purposes output_filename = os.path.join(output_dir, f'df_total_{int(time.time())}.csv') @@ -83,7 +104,7 @@ def define_search_url(country: str) -> str: return search_url -def get_results(url, params): +def get_results(url, params) -> dict: token = get_oauth_token(IDEALISTA_API_KEY, IDEALISTA_SECRET) headers_dict = {"Authorization": 'Bearer ' + token, "Content-Type": "application/x-www-form-urlencoded" @@ -124,6 +145,9 @@ def main(input_filepath, output_filepath): logger.info('Creating dataset...') df = create_dataset(input_filepath) + logger.info('Cleaning dataset...') + df = clean_dataset(df) + logger.info(f'Exporting dataset to {output_filepath}') export_dataset(df, output_filepath) From 14887034ade3da3d7ca3876e90cfa3c296bb802a Mon Sep 17 00:00:00 2001 From: matteorosato Date: Fri, 8 Mar 2024 17:51:51 +0100 Subject: [PATCH 04/28] Added Datasource and Idealista classes to handle the flow in an OOP way Defined several methods for getting results and process them Modified main function --- src/data/make_dataset.py | 307 +++++++++++++++++++++------------------ 1 file changed, 167 insertions(+), 140 deletions(-) diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py index c927c37..0993e61 100644 --- a/src/data/make_dataset.py +++ b/src/data/make_dataset.py @@ -1,169 +1,196 @@ # -*- coding: utf-8 -*- import base64 import json +import logging import os import pathlib import time - -import click -import logging -from pathlib import Path - import pandas as pd import requests import toml from dotenv import find_dotenv, load_dotenv +RAW = 'raw' # name of the folder for raw data +PROCESSED = 'processed' # name of the folder for processed data -def create_dataset(source_dir) -> pd.DataFrame: - json_files = [f for f in pathlib.Path(source_dir).glob("*.json")] - json_files.reverse() # put files in descending order - dfs = [] - for file in json_files: - with open(file, 'r') as f: - elements_dict = json.load(f)['elementList'] - dfs.append(pd.DataFrame.from_dict(elements_dict)) - merged_df = pd.concat(dfs) - merged_df = merged_df.drop_duplicates(subset=['propertyCode'], keep='first') - return merged_df - - -def clean_dataset(df: pd.DataFrame) -> pd.DataFrame: - columns = ['propertyCode', 'floor', 'price', 'size', 'rooms', 'bathrooms', 'address', 'province', 'municipality', - 'district', 'latitude', 'longitude', 'showAddress', 'url', 'distance', 'description', 'status', - 'newDevelopment', 'hasLift', 'priceByArea', 'detailedType', 'hasPlan', 'hasStaging', 'topNewDevelopment', - 'topPlus', 'externalReference', 'isAuction', 'parkingSpace', 'labels', 'highlight', - 'newDevelopmentFinished'] - df = df[columns] # keep only specified columns - - # cast to int - df = df.astype({'price': 'int', 'size': 'int', 'priceByArea': 'int'}) - - # remove auction ads - df = df[df['isAuction'].isna()] - df = df.drop(columns=['isAuction']) - - # convert floors to numbers - df['floor'] = df['floor'].replace('ss', -1).replace('bj', 0).replace('en', 0.5) - - return df - - -def export_dataset(df, output_dir): - # export df for backup purposes - output_filename = os.path.join(output_dir, f'df_total_{int(time.time())}.csv') - df.to_csv(output_filename) - - -def get_oauth_token(api_key: str, secret: str) -> str: - message = f"{api_key}:{secret}" - - # deal with bytes-like object - message_bytes = 
message.encode('ascii') - base64_bytes = base64.b64encode(message_bytes) - base64_message = base64_bytes.decode('ascii') - - auth_header = f"Basic {base64_message}" - - headers_dict = {"Authorization": auth_header, - "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8"} - - params_dict = {"grant_type": "client_credentials", - "scope": "read"} - - try: - r = requests.post("https://api.idealista.com/oauth/token", - headers=headers_dict, - params=params_dict) - r.raise_for_status() - token = r.json()["access_token"] - except requests.exceptions.RequestException as e: - print(f"Connection error: '{str(e)}'") - raise +# find .env automagically by walking up directories until it's found, then +# load up the .env entries as environment variables +load_dotenv(find_dotenv()) - return token +class Datasource: + api_key = None + secret = None -def read_toml_config(file_path: str) -> dict: - with open(file_path, 'r') as file: - config_dict = toml.load(file) - return config_dict - - -def parse_filter_params(params_dict: dict) -> dict: - filtered_params = dict() - for dictionary in params_dict.values(): - for k, v in dictionary.items(): - if str(v): # keep non-empty values only - filtered_params[k] = v - return filtered_params - - -def define_search_url(country: str) -> str: - search_url = f'https://api.idealista.com/3.5/{country}/search' - return search_url - - -def get_results(url, params) -> dict: - token = get_oauth_token(IDEALISTA_API_KEY, IDEALISTA_SECRET) - headers_dict = {"Authorization": 'Bearer ' + token, - "Content-Type": "application/x-www-form-urlencoded" - } - try: - r = requests.post(url, headers=headers_dict, params=params) - r.raise_for_status() - - result = r.json() - except requests.exceptions.RequestException as e: - print(f"Connection error: '{str(e)}'") - raise - - return result - + logger = logging.getLogger(__name__) -@click.command() -@click.argument('input_filepath', type=click.Path(exists=True)) -@click.argument('output_filepath', type=click.Path()) -def main(input_filepath, output_filepath): + def __init__(self, name: str, config_filepath: str, data_dir: str): + self.name = name + self.config_filepath = config_filepath + self.data_dir = data_dir + self.df = None + + @property + def filtered_params(self): + return self.parse_filter_params( + params_dict=self.read_toml_config(file_path=self.config_filepath)) + + @property + def search_url(self): + return self.define_search_url() + + @staticmethod + def read_toml_config(file_path: str) -> dict: + with open(file_path, 'r') as file: + config_dict = toml.load(file) + return config_dict + + @staticmethod + def parse_filter_params(params_dict: dict) -> dict: + filtered_params = dict() + for dictionary in params_dict.values(): + for k, v in dictionary.items(): + if str(v): # keep non-empty values only + filtered_params[k] = v + return filtered_params + + def create_dataset(self): + pass + + def define_search_url(self) -> str: + pass + + def clean_dataset(self): + pass + + def export_dataset(self): + # export df for backup purposes + output_filename = os.path.join(self.data_dir, PROCESSED, f'df_total_{int(time.time())}.csv') + self.df.to_csv(output_filename) + + def get_oauth_token(self) -> str: + pass + + def get_results(self) -> dict: + pass + + def export_results(self, results: dict): + # export results from query + output_filename = os.path.join(self.data_dir, RAW, f'dump_{int(time.time())}.json') + self.logger.info(f'Exporting data to {output_filename}') + with open(output_filename, 'w') as f: + 
f.write(json.dumps(results, indent=4)) + + +class Idealista(Datasource): + api_key: str = os.environ['IDEALISTA_API_KEY'] + secret: str = os.environ['IDEALISTA_SECRET'] + + def __init__(self, name: str, config_filepath: str, data_dir: str): + super().__init__(name, config_filepath, data_dir) + + def create_dataset(self): + source_dir = os.path.join(self.data_dir, RAW) + json_files = [f for f in pathlib.Path(source_dir).glob("*.json")] + json_files.reverse() # put files in descending order + dfs = [] + for file in json_files: + with open(file, 'r') as f: + elements_dict = json.load(f)['elementList'] + dfs.append(pd.DataFrame.from_dict(elements_dict)) + self.df = pd.concat(dfs) + # removing duplicates based on propertyCode + self.df.drop_duplicates(subset=['propertyCode'], keep='first') + + def clean_dataset(self): + columns = ['propertyCode', 'floor', 'price', 'size', 'rooms', 'bathrooms', 'address', 'province', + 'municipality', 'district', 'latitude', 'longitude', 'showAddress', 'url', 'distance', 'description', + 'status', 'newDevelopment', 'hasLift', 'priceByArea', 'detailedType', 'hasPlan', 'hasStaging', + 'topNewDevelopment', 'topPlus', 'externalReference', 'isAuction', 'parkingSpace', 'labels', + 'highlight', 'newDevelopmentFinished'] + self.df = self.df[columns] # keep only specified columns + + # cast to int + self.df = self.df.astype({'price': 'int', 'size': 'int', 'priceByArea': 'int'}) + + # remove auction ads + self.df = self.df[self.df['isAuction'].isna()] + self.df = self.df.drop(columns=['isAuction']) + + # convert floors to numbers + self.df['floor'] = self.df['floor'].replace('ss', -1).replace('bj', 0).replace('en', 0.5) + + def get_oauth_token(self) -> str: + message = f"{self.api_key}:{self.secret}" + + # deal with bytes-like object + message_bytes = message.encode('ascii') + base64_bytes = base64.b64encode(message_bytes) + base64_message = base64_bytes.decode('ascii') + + auth_header = f"Basic {base64_message}" + + headers_dict = {"Authorization": auth_header, + "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8"} + + params_dict = {"grant_type": "client_credentials", + "scope": "read"} + + try: + r = requests.post("https://api.idealista.com/oauth/token", + headers=headers_dict, + params=params_dict) + r.raise_for_status() + token = r.json()["access_token"] + except requests.exceptions.RequestException as e: + self.logger.error(f"Connection error: '{str(e)}'") + raise + + return token + + def get_results(self) -> dict: + token = self.get_oauth_token() + headers_dict = {"Authorization": 'Bearer ' + token, + "Content-Type": "application/x-www-form-urlencoded" + } + try: + r = requests.post(self.search_url, headers=headers_dict, params=self.filtered_params) + r.raise_for_status() + + result = r.json() + except requests.exceptions.RequestException as e: + self.logger.error(f"Connection error: '{str(e)}'") + raise + + return result + + def define_search_url(self) -> str: + country = self.filtered_params['country'] + search_url = f'https://api.idealista.com/3.5/{country}/search' + return search_url + + +def main(): """ Runs data processing scripts to turn raw data from (../raw) into cleaned data ready to be analyzed (saved in ../processed). 
""" - logger = logging.getLogger(__name__) - - config_filepath = 'config.toml' - params = read_toml_config(config_filepath) - filtered_params = parse_filter_params(params) - url = define_search_url(country=filtered_params['country']) - logger.info(f'Getting results from {url}') - result = get_results(url, filtered_params) - - output_filename = f'data/raw/dump_{int(time.time())}.json' - logger.info(f'Exporting data to {output_filename}') - with open(output_filename, 'w') as f: - f.write(json.dumps(result, indent=4)) + idealista = Idealista(name='Idealista', config_filepath='config.toml', data_dir='data') + logger.info(f'Getting results from {idealista.name} website') + results = idealista.get_results() + logger.info(f'Exporting results from {idealista.name} website') + idealista.export_results(results) logger.info('Creating dataset...') - df = create_dataset(input_filepath) - + idealista.create_dataset() logger.info('Cleaning dataset...') - df = clean_dataset(df) - - logger.info(f'Exporting dataset to {output_filepath}') - export_dataset(df, output_filepath) + idealista.clean_dataset() + logger.info(f'Exporting dataset...') + idealista.export_dataset() if __name__ == '__main__': log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' logging.basicConfig(level=logging.INFO, format=log_fmt) - - # not used in this stub but often useful for finding various files - project_dir = Path(__file__).resolve().parents[2] - - # find .env automagically by walking up directories until it's found, then - # load up the .env entries as environment variables - load_dotenv(find_dotenv()) - - IDEALISTA_API_KEY: str = os.environ['IDEALISTA_API_KEY'] - IDEALISTA_SECRET: str = os.environ['IDEALISTA_SECRET'] + logger = logging.getLogger(__name__) main() From 647af41074796a313555142e33eb6bacf34a2ae6 Mon Sep 17 00:00:00 2001 From: matteorosato Date: Tue, 12 Mar 2024 11:11:57 +0100 Subject: [PATCH 05/28] Added information in README.md --- README.md | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8705000..59e1b1a 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,72 @@ Property finder ============================== -This project is dedicated to retrieving and analyzing property advertisements from real estate websites. +This project is dedicated to retrieving and analyzing property advertisements from real estate websites. It provides a +convenient way to gather information for analysis, research, or any other purposes related to the real estate domain. + +The project is developed entirely using Python and follows object-oriented programming (OOP) practices. The initial template is provided by [Cookiecutter Data Science](https://drivendata.github.io/cookiecutter-data-science/). + +## Who is this for? +This tool is intended for: +- Developers interested in real estate data extraction and analysis. +- Real estate agents/companies looking to integrate listing data into their systems. +- Anyone curious about exploring the world of real estate through data. + +## Fair Use Disclaimer +Note that this code is provided free of charge as is. For any bugs, see the issue tracker. + +## Setup and Use +To use the tool, follow these steps: + +1. Ensure you have Python 3.10 and pip installed on your system. +2. Clone the repository to your local machine: + ```shell + git clone https://github.com/matteorosato/property-finder.git + ``` + +3. Navigate to the project directory: + ``` + cd property-finder + ``` + +4. 
Create a virtual environment for the project: + ``` + python -m venv venv + ``` + +5. Activate the virtual environment: + - On Windows: + ``` + venv\Scripts\activate + ``` + - On macOS and Linux: + ``` + source venv/bin/activate + ``` + +6. Install the required dependencies by running: + ``` + pip install -r requirements.txt + ``` +7. Fill the `.env` file with the required environment variables. Use the `.env.example` file as reference. + +8. Fill the `config.toml` file according to your preferences and needs. + +9. Run the tool: + ``` + TODO + ``` + +## Supported websites +Currently, the following websites are supported: + - [Idealista](https://www.idealista.com/) + +### Idealista +This tool utilizes the APIs provided by Idealista to extract real estate listing data. To execute the tool, you need to obtain an API_KEY and SECRET by requesting access through the following link: [Idealista API Access Request](https://developers.idealista.com/access-request). + +Please note that the free access is limited to 100 requests per month and 1 request per second. Therefore, it's important to configure the filtering parameters carefully to avoid an unnecessary number of requests. + +For further information, refer to the documents located in the _references_ folder. Project Organization ------------ From aa4ed5a3172b9dec2aceb90cfad62c186ee97b8f Mon Sep 17 00:00:00 2001 From: matteorosato Date: Tue, 12 Mar 2024 11:12:50 +0100 Subject: [PATCH 06/28] Added guides from Idealista on how to configure APIs --- references/idealista/authentication.pdf | Bin 0 -> 8691 bytes references/idealista/search.pdf | Bin 0 -> 48194 bytes 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 references/idealista/authentication.pdf create mode 100644 references/idealista/search.pdf diff --git a/references/idealista/authentication.pdf b/references/idealista/authentication.pdf new file mode 100644 index 0000000000000000000000000000000000000000..79b3a853de6bbbc32d9bac590e2a33b73d0250be GIT binary patch literal 8691 zcmch7cQ~Be`u0|$1woJyWb{7Gn4(6F5}l}{4<X-u61_!8q7x+~ zga}bb`Hk%CefBx~obNj4pYNUPomH;2?)P5LdS=a2US4fQWl;zaM#a0Z_-2$!8~_Hm zTHm6Qk^;%=5x(>YA433GMg}DJvt^*G3L@MB4mg~f1PJ8g;{(KEaCSgf4||Y@oz2+| zL5LxNIA5F~VUnJgHO|is1G<5AaRe!tf(hslG)Ugn!xrQ5vjs+jbTKwKQ>ZusC=LO@ z#2`R%6ab161ERzLFc=t!6i3U*P+?qb&sf3#H+C2opbyp&>xQw#TK!~r)&j{}d1BzO z|A`#}34lSMKrs{mA|?hDhXG(v2oU~PW{7_sFdiUfCo3FA5o63ozC^BBf*FCE@ zlZGeM$Kf5f^a8H(fO-S2>xPjz-0!^mEOjLzS}KbLo0QA^#+6y=!vh0v&n92@XZ!x& zM+NZDcTWgEee`{%K%@r5e7bM{a0Zycxt7T{1)QvMuZC|Jh%H41IiC!eAGWrl z16#*M`vy)Vn~h;XGTy53{Aj%;4f~)OR%h<%>^p8weI7kOTuJ8M8yo8TiJR=(DcUUF z_ZHc<21glpT8wgCn7!OKbYw=%w)IHdJG~+Hb#H(jYnd#C{f>|8q(GF3(vlew2eZ?g z%1h;;+dYLTp=wJ>O%yxOYm6COuRfeQ^`4YA%{20rC}Tu>CBu4q%YJNPAcP~6g%BAYq3~uB_pU(GOR1z_PfIB_WT;h}kdg;P00D~}j9ueP)et0!Yse{7F zlZ^c_X<`Z1QUAe?o9n(eiNed@;lJ#;N%?~zw#U*qu^@6{; zTvvj6@AZ3WNm`d@foD%;7|0#p`r^4|ZO?RKwnK#`17;2TnngVFWEgpvkd2+PE8C9S z;hbPAV##POSJ|_4>5(GOPXIK&UUK8}6S&5iHymR2;6*3{vF?^egR9D`YD36JxsRbe zBwSq3M1aIYoHR=|gMN=7j@z(wwJa&CC@pdDhUBYTEJW;z!1!xR^7I*`Z?OZ6$@D^U zjnC;6s0_*>45=wzX~dt`ohIo&^izBWxP=}OG2I>FiBPjH$+@pV^aU_9*I}@n@|LFt zOA?J+w3FGMQ(R;FBWs)z%DN`1_ zPiHjYIneQ@m`oZ<69BQ3YR0I1G-s3$AMU4}vVCV=Wd_qpKV$8s_SK@SkP)<8Hv9 z2dsNNy1KAYwc_n`9+vFZmpVaJL_$l_33B;vr!!FgC{KZbiW`Cc_*R(W64bI;CV zB}l=+0pfGfPi!QHEOA$Iu_|njJH4L#qBLXHTrRm&W_yc*GKH-|-}5XnKv1B9ID7Xd zeB<7l5*v;#?b`BfN+nwc`Qij29*E#PfS99x-Fk0{m@}THanye(61SyW=19BXJk2t8 zPlfQak|QN%$r$E|(X_9s3Du>fdbqjSLTB0~G|@&>c`1v6|Ge%vqUv%?gCM&Rh`T+! 
[... git binary patch data for references/idealista/authentication.pdf and references/idealista/search.pdf omitted ...]
zZQHWAc<%U>2965{(1-S2s+jP`e7Al=>|CZ5^u=)K zma*#y0|LY&bIJ}KkrLRTARZiy;bMtQMQE5Uy=k%o`3t3d>o#Ppe^2Y@F8uha9-#5D z-9WahWWW$X4W-_rgF$v#`Gs%D2bm4HDjFK>k zdowVgP#m`DX!Oi;{tAB$_DTXCnvg14v|2_Sm~EHy-H%?v3&;u5ujJ6n%PeD z6?6q@mi0zW<@cQ6=F}ZBFI-p|a5$&vs~p|p=3C=yTX$Qkkge%d!G!J&e5&!lBDDgI zqY|s5puyS%vA=jOcsR|#Yt|F*a=Ein?oP$jH#fWVV)w$-3$ro;Ykb(GBL20fj+@U~DsRr#@aKGR^Yg0R zSc~aWlj+Us3>udlsb;)V8aZ4t%$==XKGG4IIn5i+_F>qw@zeJ;I@=Q#=h(b*pAK<8 zmXDtGMn!(;UiAj2`oGLsarVv<@I@bI)oeIre zj)J*HUtwN{>fsTUyI(t59-VnYz+eN~3mLK!$Gd)%yU2X4t{@^@meKd9D4NCLiTmj~wX(^s}qU zx(S2EvLId)iPyd_C`CE6cKiyb{sT1H_eNRCYsj{JcvcQv)!tY#@YtQ}LED`cz6aIv zzf>pJj@82J>DiNo>&q&=NMRBgD2R!+MKxPW!h@i7vum!^3P*CrWOAN1)ktsBJ!mr0 z-@8A6??^EI^U$~*YVc1wnjpaKJav%PKd@8Sv8B-zcW$G#hC(9Bf68ee`u2& zJxLUMD~8*@vw|4)1@`1lc$ZSh3a63o`xIM+d?SETLfZN_w>eu!r z6}UZ%z-_^&x+;D|^Zb}?Fh~c7$4NrvAaW?Glk$g@?BJFW(Uydz)-?c`xs!ef%Qhx; zB8BMaVhj2g6!^QaQAN*|F`;zaCl^HT9?5Gf={4Ga_vL4sir6a<_ zhy)lcAq7$=lZX_17w7-niYoNh9#$l>q&<%0O#yC?0(jmB2*JVdxE;gsCX;Mfz(53q z1^nX!L?RH>Fe4k_A21{uMdjnx2k86<4331ssNCKKgF~tE{}T)XlSfkL<0lvt3ZY`! z1(TPDQ5E>7b}$qIy&E4KCckw@!cXnsNCaeeAJhha0fQiR_W=e&<#*Ep2E(Dd#|s7{ zkb7Y$@Lm{tYo>pq4-A$^?}0%e(B1q4Zxzef{VtV(d*4OC_QI%s@ICLM_woQDkDvOKA!)Fu(y&DFBLlC?BfI^{rWQ|G+WcPSs zRA%k&7Y2sGcJlxRM!p!dFuhU{%8zf%V( Date: Tue, 12 Mar 2024 12:50:02 +0100 Subject: [PATCH 07/28] Added results pagination Redefined order of some methods --- src/data/make_dataset.py | 116 ++++++++++++++++++++++----------------- 1 file changed, 66 insertions(+), 50 deletions(-) diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py index 0993e61..3049d11 100644 --- a/src/data/make_dataset.py +++ b/src/data/make_dataset.py @@ -12,6 +12,7 @@ RAW = 'raw' # name of the folder for raw data PROCESSED = 'processed' # name of the folder for processed data +MAX_PAGES = 2 # limit of the ads pages to be requested # find .env automagically by walking up directories until it's found, then # load up the .env entries as environment variables @@ -54,6 +55,19 @@ def parse_filter_params(params_dict: dict) -> dict: filtered_params[k] = v return filtered_params + def get_oauth_token(self) -> str: + pass + + def get_results(self) -> dict: + pass + + def export_results(self, results: dict): + # export results from query + output_filename = os.path.join(self.data_dir, RAW, f'dump_{int(time.time())}.json') + self.logger.info(f'Exporting data to {output_filename}') + with open(output_filename, 'w') as f: + f.write(json.dumps(results, indent=4)) + def create_dataset(self): pass @@ -68,19 +82,6 @@ def export_dataset(self): output_filename = os.path.join(self.data_dir, PROCESSED, f'df_total_{int(time.time())}.csv') self.df.to_csv(output_filename) - def get_oauth_token(self) -> str: - pass - - def get_results(self) -> dict: - pass - - def export_results(self, results: dict): - # export results from query - output_filename = os.path.join(self.data_dir, RAW, f'dump_{int(time.time())}.json') - self.logger.info(f'Exporting data to {output_filename}') - with open(output_filename, 'w') as f: - f.write(json.dumps(results, indent=4)) - class Idealista(Datasource): api_key: str = os.environ['IDEALISTA_API_KEY'] @@ -89,36 +90,10 @@ class Idealista(Datasource): def __init__(self, name: str, config_filepath: str, data_dir: str): super().__init__(name, config_filepath, data_dir) - def create_dataset(self): - source_dir = os.path.join(self.data_dir, RAW) - json_files = [f for f in pathlib.Path(source_dir).glob("*.json")] - json_files.reverse() # put files in descending order - dfs = [] - for file in json_files: - with open(file, 'r') as f: - 
elements_dict = json.load(f)['elementList'] - dfs.append(pd.DataFrame.from_dict(elements_dict)) - self.df = pd.concat(dfs) - # removing duplicates based on propertyCode - self.df.drop_duplicates(subset=['propertyCode'], keep='first') - - def clean_dataset(self): - columns = ['propertyCode', 'floor', 'price', 'size', 'rooms', 'bathrooms', 'address', 'province', - 'municipality', 'district', 'latitude', 'longitude', 'showAddress', 'url', 'distance', 'description', - 'status', 'newDevelopment', 'hasLift', 'priceByArea', 'detailedType', 'hasPlan', 'hasStaging', - 'topNewDevelopment', 'topPlus', 'externalReference', 'isAuction', 'parkingSpace', 'labels', - 'highlight', 'newDevelopmentFinished'] - self.df = self.df[columns] # keep only specified columns - - # cast to int - self.df = self.df.astype({'price': 'int', 'size': 'int', 'priceByArea': 'int'}) - - # remove auction ads - self.df = self.df[self.df['isAuction'].isna()] - self.df = self.df.drop(columns=['isAuction']) - - # convert floors to numbers - self.df['floor'] = self.df['floor'].replace('ss', -1).replace('bj', 0).replace('en', 0.5) + def define_search_url(self) -> str: + country = self.filtered_params['country'] + search_url = f'https://api.idealista.com/3.5/{country}/search' + return search_url def get_oauth_token(self) -> str: message = f"{self.api_key}:{self.secret}" @@ -154,20 +129,61 @@ def get_results(self) -> dict: "Content-Type": "application/x-www-form-urlencoded" } try: - r = requests.post(self.search_url, headers=headers_dict, params=self.filtered_params) - r.raise_for_status() + elements = [] + result = self.search(headers_dict) # get results for the first page + self.logger.info(f"Available items: {result['total']} ({result['totalPages']} pages)") + elements.extend(result["elementList"]) + + for i in range(2, min(MAX_PAGES, result["totalPages"]) + 1): + self.filtered_params["numPage"] = i + result = self.search(headers_dict) # get results for the subsequent pages + elements.extend(result["elementList"]) + + result["elementList"] = elements # update dictionary with cumulative results + self.logger.info(f"Stored {len(elements)} items over a total of {result['total']} available") - result = r.json() except requests.exceptions.RequestException as e: self.logger.error(f"Connection error: '{str(e)}'") raise return result - def define_search_url(self) -> str: - country = self.filtered_params['country'] - search_url = f'https://api.idealista.com/3.5/{country}/search' - return search_url + def search(self, headers_dict) -> dict: + r = requests.post(self.search_url, headers=headers_dict, params=self.filtered_params) + r.raise_for_status() + result = r.json() + return result + + def create_dataset(self): + source_dir = os.path.join(self.data_dir, RAW) + json_files = [f for f in pathlib.Path(source_dir).glob("*.json")] + json_files.reverse() # put files in descending order + dfs = [] + for file in json_files: + with open(file, 'r') as f: + elements_dict = json.load(f)['elementList'] + dfs.append(pd.DataFrame.from_dict(elements_dict)) + self.df = pd.concat(dfs) + # removing duplicates based on propertyCode + self.df.drop_duplicates(subset=['propertyCode'], keep='first') + + def clean_dataset(self): + columns = ['propertyCode', 'floor', 'price', 'size', 'rooms', 'bathrooms', 'address', 'province', + 'municipality', 'district', 'latitude', 'longitude', 'showAddress', 'url', 'distance', 'description', + 'status', 'newDevelopment', 'hasLift', 'priceByArea', 'detailedType', 'hasPlan', 'hasStaging', + 'topNewDevelopment', 'topPlus', 
'externalReference', 'isAuction', 'parkingSpace', 'labels', + 'highlight', 'newDevelopmentFinished'] + self.df = self.df[columns] # keep only specified columns + + # cast to int + self.df = self.df.astype({'price': 'int', 'size': 'int', 'priceByArea': 'int'}) + + # remove auction ads + self.df = self.df[self.df['isAuction'].isna()] + self.df = self.df.drop(columns=['isAuction']) + + # convert floors to numbers + self.df['floor'] = self.df['floor'].replace('ss', -1).replace('bj', 0).replace('en', 0.5) def main(): From 8f1c70ced38f584b9935d02994366dc4f320791a Mon Sep 17 00:00:00 2001 From: matteorosato Date: Tue, 12 Mar 2024 17:51:41 +0100 Subject: [PATCH 08/28] filtered_params is not a property anymore modified parameters in config.toml --- config.toml | 12 ++++++------ src/data/make_dataset.py | 9 +++------ 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/config.toml b/config.toml index 7cb8bc2..ede34e3 100644 --- a/config.toml +++ b/config.toml @@ -20,17 +20,17 @@ adIds = "" # Filter by adid (multivalued field) [array] hasMultimedia = true # Retrieve properties with pictures or video or virtual tour [boolean] [HOME_FILTERS] -minSize = "" # min size (from 60 m2 to 1000m2) [double] -maxSize = "" # max size (from 60 m2 to 1000m2) [double] +minSize = 35 # min size (from 60 m2 to 1000m2) [double] +maxSize = 180 # max size (from 60 m2 to 1000m2) [double] virtualTour = "" # virtual tour [boolean] flat = true # property is a flat [boolean] penthouse = false # [boolean] duplex = true # [boolean] -studio = true # [boolean] +studio = "" # [boolean] chalet = true # [boolean] -countryHouse = false # [boolean] -bedrooms = "" # bedroom number (multivalued field: 0,1,2,3,4) [string] -bathrooms = "" # bathroom number (multivalued field: 0,1,2,3) [string] +countryHouse = true # [boolean] +bedrooms = "1,2,3,4" # bedroom number (multivalued field: 0,1,2,3,4) [string] +bathrooms = "1,2,3" # bathroom number (multivalued field: 0,1,2,3) [string] preservation = "" # property preservation (good, renew) [string] newDevelopment = "" # if true, return only new development properties [boolean] furnished = "" # (furnished, furnishedKitchen) [string] diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py index 3049d11..6b2f1a0 100644 --- a/src/data/make_dataset.py +++ b/src/data/make_dataset.py @@ -30,10 +30,7 @@ def __init__(self, name: str, config_filepath: str, data_dir: str): self.config_filepath = config_filepath self.data_dir = data_dir self.df = None - - @property - def filtered_params(self): - return self.parse_filter_params( + self.filtered_params = self.parse_filter_params( params_dict=self.read_toml_config(file_path=self.config_filepath)) @property @@ -140,7 +137,7 @@ def get_results(self) -> dict: elements.extend(result["elementList"]) result["elementList"] = elements # update dictionary with cumulative results - self.logger.info(f"Stored {len(elements)} items over a total of {result['total']} available") + self.logger.info(f"Got {len(elements)} items over a total of {result['total']} available") except requests.exceptions.RequestException as e: self.logger.error(f"Connection error: '{str(e)}'") @@ -165,7 +162,7 @@ def create_dataset(self): dfs.append(pd.DataFrame.from_dict(elements_dict)) self.df = pd.concat(dfs) # removing duplicates based on propertyCode - self.df.drop_duplicates(subset=['propertyCode'], keep='first') + self.df = self.df.drop_duplicates(subset=['propertyCode'], keep='first') def clean_dataset(self): columns = ['propertyCode', 'floor', 'price', 'size', 'rooms', 
'bathrooms', 'address', 'province', From f1b22aad711ce6a8e4f93e076c93b7315eb1e355 Mon Sep 17 00:00:00 2001 From: matteorosato Date: Wed, 13 Mar 2024 16:07:58 +0100 Subject: [PATCH 09/28] Changed the way duplicates are deleted --- README.md | 2 +- src/data/make_dataset.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 59e1b1a..d48a7b1 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ The project is developed entirely using Python and follows object-oriented progr ## Who is this for? This tool is intended for: -- Developers interested in real estate data extraction and analysis. +- Data scientists interested in real estate data extraction and analysis. - Real estate agents/companies looking to integrate listing data into their systems. - Anyone curious about exploring the world of real estate through data. diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py index 6b2f1a0..5d89896 100644 --- a/src/data/make_dataset.py +++ b/src/data/make_dataset.py @@ -161,16 +161,20 @@ def create_dataset(self): elements_dict = json.load(f)['elementList'] dfs.append(pd.DataFrame.from_dict(elements_dict)) self.df = pd.concat(dfs) - # removing duplicates based on propertyCode - self.df = self.df.drop_duplicates(subset=['propertyCode'], keep='first') def clean_dataset(self): + # removing duplicates based on propertyCode + duplicates = self.df.duplicated(subset=['propertyCode'], keep='first') + self.logger.debug(f'Found {sum(duplicates.values)} duplicates. Removing them...') + self.df = self.df[~duplicates.values] + + # keep only specified columns columns = ['propertyCode', 'floor', 'price', 'size', 'rooms', 'bathrooms', 'address', 'province', 'municipality', 'district', 'latitude', 'longitude', 'showAddress', 'url', 'distance', 'description', 'status', 'newDevelopment', 'hasLift', 'priceByArea', 'detailedType', 'hasPlan', 'hasStaging', 'topNewDevelopment', 'topPlus', 'externalReference', 'isAuction', 'parkingSpace', 'labels', 'highlight', 'newDevelopmentFinished'] - self.df = self.df[columns] # keep only specified columns + self.df = self.df[columns] # cast to int self.df = self.df.astype({'price': 'int', 'size': 'int', 'priceByArea': 'int'}) From c45a2e390369ef2c8de1b9bc00612ea2d1c7d7ff Mon Sep 17 00:00:00 2001 From: matteorosato Date: Wed, 20 Mar 2024 17:31:36 +0100 Subject: [PATCH 10/28] Major improvements in Idealista class Improved clean_dataset method Added minor utility methods Modified the main function Updated dependencies in requirements.txt --- requirements.txt | 3 +- src/data/make_dataset.py | 130 +++++++++++++++++++++++++-------------- 2 files changed, 86 insertions(+), 47 deletions(-) diff --git a/requirements.txt b/requirements.txt index f47dafa..242131d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,5 +8,6 @@ coverage awscli flake8 python-dotenv>=0.5.1 -pandas>=2.2.0 +pandas==2.1.4 +scikit-learn==0.24.2 toml \ No newline at end of file diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py index 5d89896..73e6a97 100644 --- a/src/data/make_dataset.py +++ b/src/data/make_dataset.py @@ -9,9 +9,11 @@ import requests import toml from dotenv import find_dotenv, load_dotenv +from sklearn.model_selection import train_test_split -RAW = 'raw' # name of the folder for raw data -PROCESSED = 'processed' # name of the folder for processed data +PROJECT_DIR = pathlib.Path(__file__).resolve().parents[2] +RAW_DIR = PROJECT_DIR.joinpath("data/raw") # name of the folder for raw data +PROCESSED_DIR = 
PROJECT_DIR.joinpath("data/processed") # name of the folder for processed data MAX_PAGES = 2 # limit of the ads pages to be requested # find .env automagically by walking up directories until it's found, then @@ -23,16 +25,16 @@ class Datasource: api_key = None secret = None - logger = logging.getLogger(__name__) - - def __init__(self, name: str, config_filepath: str, data_dir: str): + def __init__(self, name: str, config_filepath: str): self.name = name self.config_filepath = config_filepath - self.data_dir = data_dir - self.df = None self.filtered_params = self.parse_filter_params( params_dict=self.read_toml_config(file_path=self.config_filepath)) + @property + def logger(self): + return logging.getLogger(f'{__name__}.{self.__class__.__name__}') + @property def search_url(self): return self.define_search_url() @@ -60,7 +62,7 @@ def get_results(self) -> dict: def export_results(self, results: dict): # export results from query - output_filename = os.path.join(self.data_dir, RAW, f'dump_{int(time.time())}.json') + output_filename = RAW_DIR.joinpath(f'dump_{int(time.time())}.json') self.logger.info(f'Exporting data to {output_filename}') with open(output_filename, 'w') as f: f.write(json.dumps(results, indent=4)) @@ -71,21 +73,16 @@ def create_dataset(self): def define_search_url(self) -> str: pass - def clean_dataset(self): + def clean_dataset(self, df) -> pd.DataFrame: pass - def export_dataset(self): - # export df for backup purposes - output_filename = os.path.join(self.data_dir, PROCESSED, f'df_total_{int(time.time())}.csv') - self.df.to_csv(output_filename) - class Idealista(Datasource): api_key: str = os.environ['IDEALISTA_API_KEY'] secret: str = os.environ['IDEALISTA_SECRET'] - def __init__(self, name: str, config_filepath: str, data_dir: str): - super().__init__(name, config_filepath, data_dir) + def __init__(self, name: str, config_filepath: str): + super().__init__(name, config_filepath) def define_search_url(self) -> str: country = self.filtered_params['country'] @@ -123,8 +120,7 @@ def get_oauth_token(self) -> str: def get_results(self) -> dict: token = self.get_oauth_token() headers_dict = {"Authorization": 'Bearer ' + token, - "Content-Type": "application/x-www-form-urlencoded" - } + "Content-Type": "application/x-www-form-urlencoded"} try: elements = [] result = self.search(headers_dict) # get results for the first page @@ -151,8 +147,8 @@ def search(self, headers_dict) -> dict: result = r.json() return result - def create_dataset(self): - source_dir = os.path.join(self.data_dir, RAW) + def create_dataset(self) -> pd.DataFrame: + source_dir = RAW_DIR json_files = [f for f in pathlib.Path(source_dir).glob("*.json")] json_files.reverse() # put files in descending order dfs = [] @@ -160,54 +156,96 @@ def create_dataset(self): with open(file, 'r') as f: elements_dict = json.load(f)['elementList'] dfs.append(pd.DataFrame.from_dict(elements_dict)) - self.df = pd.concat(dfs) + df = pd.concat(dfs) + return df + + def clean_dataset(self, df) -> pd.DataFrame: - def clean_dataset(self): # removing duplicates based on propertyCode - duplicates = self.df.duplicated(subset=['propertyCode'], keep='first') - self.logger.debug(f'Found {sum(duplicates.values)} duplicates. Removing them...') - self.df = self.df[~duplicates.values] + duplicates = df.duplicated(subset=['propertyCode'], keep='first') + self.logger.info(f'Found {sum(duplicates.values)} duplicates. 
Removing them...') + df = df[~duplicates.values] # keep only specified columns - columns = ['propertyCode', 'floor', 'price', 'size', 'rooms', 'bathrooms', 'address', 'province', - 'municipality', 'district', 'latitude', 'longitude', 'showAddress', 'url', 'distance', 'description', - 'status', 'newDevelopment', 'hasLift', 'priceByArea', 'detailedType', 'hasPlan', 'hasStaging', - 'topNewDevelopment', 'topPlus', 'externalReference', 'isAuction', 'parkingSpace', 'labels', - 'highlight', 'newDevelopmentFinished'] - self.df = self.df[columns] + columns = ['floor', 'size', 'rooms', 'bathrooms', 'municipality', 'district', 'status', 'hasLift', + 'priceByArea', 'parkingSpace', 'price', 'isAuction', 'propertyCode'] + df = df[columns] - # cast to int - self.df = self.df.astype({'price': 'int', 'size': 'int', 'priceByArea': 'int'}) + # fill missing district by setting it equal to municipality + df.fillna({'district': df['municipality']}, inplace=True) # remove auction ads - self.df = self.df[self.df['isAuction'].isna()] - self.df = self.df.drop(columns=['isAuction']) + df = df[df['isAuction'].isna()] + df = df.drop(columns=['isAuction']) + + # extract info for parking space + def has_parking_space(x): + if isinstance(x, dict) and x['hasParkingSpace'] and x['isParkingSpaceIncludedInPrice']: + # keep only the ads with park included in the final price + return True + else: + return False - # convert floors to numbers - self.df['floor'] = self.df['floor'].replace('ss', -1).replace('bj', 0).replace('en', 0.5) + df['parkingSpace'] = df['parkingSpace'].apply(has_parking_space) + + # fill nan + df.fillna({'hasLift': False}, inplace=True) + + # cast to int + cols_to_int = ['price', 'size', 'priceByArea'] + for col in cols_to_int: + df[col] = df[col].astype('int') + + # convert to categories + cols_to_categories = ['floor', 'municipality', 'district', 'status'] + for col in cols_to_categories: + df[col] = df[col].astype('category') + + # apply One Hot Encoding to categories + df = pd.get_dummies(df, columns=cols_to_categories) + + # checking features which have nan values + nan_values = df.isna().sum() + if nan_values.any(): + self.logger.warning("There are still nan values in your dataset. Please check it") + + # use 'propertyCode' as index + df.set_index('propertyCode', inplace=True) + + return df + + @staticmethod + def create_train_test_df(df, test_size=0.2) -> tuple[pd.DataFrame, pd.DataFrame]: + # split the data into train and test set + df_train, df_test = train_test_split(df, test_size=test_size, random_state=11, shuffle=True) + return df_train, df_test def main(): - """ Runs data processing scripts to turn raw data from (../raw) into - cleaned data ready to be analyzed (saved in ../processed). 
- """ + logger = logging.getLogger(__name__) - idealista = Idealista(name='Idealista', config_filepath='config.toml', data_dir='data') + idealista = Idealista(name='Idealista', config_filepath='config.toml') logger.info(f'Getting results from {idealista.name} website') results = idealista.get_results() logger.info(f'Exporting results from {idealista.name} website') idealista.export_results(results) logger.info('Creating dataset...') - idealista.create_dataset() + df_raw = idealista.create_dataset() + logger.info('Exporting raw data...') + df_raw.to_csv(RAW_DIR.joinpath('raw_data.csv')) logger.info('Cleaning dataset...') - idealista.clean_dataset() - logger.info(f'Exporting dataset...') - idealista.export_dataset() + df_cleaned = idealista.clean_dataset(df_raw) + logger.info('Separating train and test datasets...') + df_train, df_test = idealista.create_train_test_df(df_cleaned) + logger.info('Exporting train data...') + df_train.to_csv(PROCESSED_DIR.joinpath('training_data.csv')) + logger.info('Exporting test data...') + df_test.to_csv(PROCESSED_DIR.joinpath('test_data.csv')) + logger.info('Done!') if __name__ == '__main__': log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' logging.basicConfig(level=logging.INFO, format=log_fmt) - logger = logging.getLogger(__name__) main() From 653f3e10262f76481dcb0c3b3370ff993414abd6 Mon Sep 17 00:00:00 2001 From: matteorosato Date: Wed, 20 Mar 2024 17:46:46 +0100 Subject: [PATCH 11/28] Added train_model.py: - defined 3 sample regressors - added ModelTrainer class which has methods for training and evaluating models - defined main function --- src/models/train_model.py | 91 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/src/models/train_model.py b/src/models/train_model.py index e69de29..50619a4 100644 --- a/src/models/train_model.py +++ b/src/models/train_model.py @@ -0,0 +1,91 @@ +# -*- coding: utf-8 -*- +import logging +import pickle +from pathlib import Path +import pandas as pd +from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, r2_score +from sklearn.model_selection import train_test_split, cross_val_score +from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor +from sklearn.metrics import mean_squared_error +from sklearn.neighbors import KNeighborsRegressor + +PROJECT_DIR = Path(__file__).resolve().parents[2] +PROCESSED_DIR = PROJECT_DIR.joinpath("data/processed") # name of the folder for processed data +MODELS_DIR = PROJECT_DIR.joinpath("models") # name of the folder for models +AVAILABLE_MODELS = { + # sample models, may be extended in the future + 'RandomForest': RandomForestRegressor(n_estimators=100, criterion='mse'), + 'ExtraTrees': ExtraTreesRegressor(n_estimators=100, criterion='mae'), + 'Knn': KNeighborsRegressor(n_neighbors=5, weights='distance', metric='minkowski') +} + + +class ModelTrainer: + + def __init__(self, model_name, target_name='price'): + if model_name not in AVAILABLE_MODELS.keys(): + raise ValueError(f'model_name must be one of {list(AVAILABLE_MODELS.keys())}') + self.model_name = model_name + self.target_name = target_name + self.model = AVAILABLE_MODELS[self.model_name] + + @property + def logger(self): + return logging.getLogger(f'{__name__}.{self.__class__.__name__}') + + def split_df(self, df): + X = df.drop(columns=self.target_name) + y = df[self.target_name] + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=555) + return X_train, X_test, y_train, y_test + + def fit_model(self, X, 
y): + return self.model.fit(X, y) + + def evaluate_model(self, X_train, X_test, y_train, y_test): + y_pred = self.model.predict(X_test).astype(int) + score = self.model.score(X_train, y_train) + self.logger.info(f'R2 score on training data: {round(score, 2)}') + cv_scores = -1 * cross_val_score(self.model, + X=pd.concat([X_train, X_test], axis=0), + y=pd.concat([y_train, y_test], axis=0), + scoring="neg_mean_absolute_error", + cv=5) + mean_cv_score = round(cv_scores.mean(), 2) + self.logger.info(f'Mean cross-validation score: {mean_cv_score}') + metrics = [r2_score, mean_absolute_percentage_error, mean_absolute_error, mean_squared_error] + metric_results = dict() + for metric in metrics: + metric_results[metric.__name__] = round(metric(y_test, y_pred), 2) + return metric_results + + def export_model(self): + output_filename = MODELS_DIR.joinpath(self.model_name + '.pkl') + with open(output_filename, 'wb') as f: + pickle.dump(self.model, f) + + def train(self, df): + X_train, X_val, y_train, y_val = self.split_df(df) + self.fit_model(X_train, y_train) + evaluation_dict = self.evaluate_model(X_train, X_val, y_train, y_val) + self.logger.info(f"Results on validation data: {evaluation_dict}") + + +def main(): + my_model = ModelTrainer(model_name='RandomForest', target_name='price') + logger.info('Loading training data...') + training_datapath = PROCESSED_DIR.joinpath('training_data.csv') + training_df = pd.read_csv(training_datapath, index_col='propertyCode') + logger.info('Training the model...') + my_model.train(training_df) + logger.info('Persisting the trained model...') + my_model.export_model() + logger.info('Done!') + + +if __name__ == '__main__': + log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + logging.basicConfig(level=logging.INFO, format=log_fmt) + logger = logging.getLogger(__name__) + + main() From 466b2017a26bb90ede0b4245ab1acb737dd7a125 Mon Sep 17 00:00:00 2001 From: matteorosato Date: Fri, 22 Mar 2024 11:40:54 +0100 Subject: [PATCH 12/28] separated build_features from clean_dataset removed "priceByArea" feature create_train_test_df method moved to base Datasource class now exported cleaned instead of raw data --- src/data/make_dataset.py | 44 ++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py index 73e6a97..50b2f0a 100644 --- a/src/data/make_dataset.py +++ b/src/data/make_dataset.py @@ -76,6 +76,12 @@ def define_search_url(self) -> str: def clean_dataset(self, df) -> pd.DataFrame: pass + @staticmethod + def create_train_test_df(df, test_size=0.2) -> tuple[pd.DataFrame, pd.DataFrame]: + # split the data into train and test set + df_train, df_test = train_test_split(df, test_size=test_size, random_state=11, shuffle=True) + return df_train, df_test + class Idealista(Datasource): api_key: str = os.environ['IDEALISTA_API_KEY'] @@ -167,8 +173,8 @@ def clean_dataset(self, df) -> pd.DataFrame: df = df[~duplicates.values] # keep only specified columns - columns = ['floor', 'size', 'rooms', 'bathrooms', 'municipality', 'district', 'status', 'hasLift', - 'priceByArea', 'parkingSpace', 'price', 'isAuction', 'propertyCode'] + columns = ['floor', 'size', 'rooms', 'bathrooms', 'address', 'municipality', 'district', 'status', 'hasLift', + 'price', 'priceByArea', 'parkingSpace', 'isAuction', 'url', 'propertyCode'] df = df[columns] # fill missing district by setting it equal to municipality @@ -193,13 +199,22 @@ def has_parking_space(x): # cast to int cols_to_int = 
['price', 'size', 'priceByArea'] - for col in cols_to_int: - df[col] = df[col].astype('int') + df[cols_to_int] = df[cols_to_int].astype('int') + + # use 'propertyCode' as index + df.set_index('propertyCode', inplace=True) + + return df + + def build_features(self, df) -> pd.DataFrame: + # keep only specified columns (features) + features = ['floor', 'size', 'rooms', 'bathrooms', 'municipality', 'district', 'status', 'hasLift', + 'parkingSpace', 'price'] + df = df[features] # convert to categories cols_to_categories = ['floor', 'municipality', 'district', 'status'] - for col in cols_to_categories: - df[col] = df[col].astype('category') + df[cols_to_categories] = df[cols_to_categories].astype('category') # apply One Hot Encoding to categories df = pd.get_dummies(df, columns=cols_to_categories) @@ -209,17 +224,8 @@ def has_parking_space(x): if nan_values.any(): self.logger.warning("There are still nan values in your dataset. Please check it") - # use 'propertyCode' as index - df.set_index('propertyCode', inplace=True) - return df - @staticmethod - def create_train_test_df(df, test_size=0.2) -> tuple[pd.DataFrame, pd.DataFrame]: - # split the data into train and test set - df_train, df_test = train_test_split(df, test_size=test_size, random_state=11, shuffle=True) - return df_train, df_test - def main(): logger = logging.getLogger(__name__) @@ -231,12 +237,14 @@ def main(): idealista.export_results(results) logger.info('Creating dataset...') df_raw = idealista.create_dataset() - logger.info('Exporting raw data...') - df_raw.to_csv(RAW_DIR.joinpath('raw_data.csv')) logger.info('Cleaning dataset...') df_cleaned = idealista.clean_dataset(df_raw) + logger.info('Exporting cleaned data...') + df_cleaned.to_csv(PROCESSED_DIR.joinpath('cleaned_data.csv')) + logger.info('Building features...') + df_processed = idealista.build_features(df_cleaned) logger.info('Separating train and test datasets...') - df_train, df_test = idealista.create_train_test_df(df_cleaned) + df_train, df_test = idealista.create_train_test_df(df_processed) logger.info('Exporting train data...') df_train.to_csv(PROCESSED_DIR.joinpath('training_data.csv')) logger.info('Exporting test data...') From 6cf01e7c44927152488f67593f7b23dfcebcee46 Mon Sep 17 00:00:00 2001 From: matteorosato Date: Fri, 22 Mar 2024 11:43:09 +0100 Subject: [PATCH 13/28] first version of predict_model.py - created Predictor class along with its methods - defined a main function --- src/models/predict_model.py | 100 ++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/src/models/predict_model.py b/src/models/predict_model.py index e69de29..9557752 100644 --- a/src/models/predict_model.py +++ b/src/models/predict_model.py @@ -0,0 +1,100 @@ +# -*- coding: utf-8 -*- +import logging +import os +import pickle +import time +from pathlib import Path +import pandas as pd +from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, r2_score +from sklearn.metrics import mean_squared_error + +PROJECT_DIR = Path(__file__).resolve().parents[2] +RAW_DIR = PROJECT_DIR.joinpath("data/raw") # name of the folder for raw data +PROCESSED_DIR = PROJECT_DIR.joinpath("data/processed") # name of the folder for processed data +MODELS_DIR = PROJECT_DIR.joinpath("models") # name of the folder for models + + +class Predictor: + + def __init__(self, model_filepath): + self.model_filepath = model_filepath + self.model_name = self.get_model_name(self.model_filepath) + self.model = self.load_model(self.model_filepath) + + @property + def 
logger(self): + return logging.getLogger(f'{__name__}.{self.__class__.__name__}') + + @staticmethod + def get_model_name(filepath): + filename = os.path.basename(filepath) + model_name = os.path.splitext(filename)[0] + return model_name + + @staticmethod + def load_model(model_filepath): + with open(model_filepath, 'rb') as f: + model = pickle.load(f) + return model + + @staticmethod + def evaluate_model(y_test, y_pred): + metrics = [r2_score, mean_absolute_percentage_error, mean_absolute_error, mean_squared_error] + metric_results = dict() + for metric in metrics: + metric_results[metric.__name__] = round(metric(y_test, y_pred), 2) + return metric_results + + @staticmethod + def generate_report(cleaned_df, predictions_df) -> pd.DataFrame: + # keep only specified columns + columns = ['municipality', 'address', 'size', 'url'] + df = cleaned_df[columns] + + # create a new column for including price difference (predicted vs real) + predictions_df['price_diff'] = predictions_df['predicted_price'] - predictions_df['price'] + + # add info about prices removing unneeded items (inner join) + df = df.join(predictions_df, how='inner').sort_values(by="price_diff", ascending=True) + + return df + + def predict(self, test_df) -> pd.DataFrame: + price = test_df.pop('price') + + self.logger.info(f"Run predictions...") + predicted_price = self.model.predict(test_df).astype(int) + + evaluation_dict = self.evaluate_model(y_test=price, y_pred=predicted_price) + self.logger.info(f"Results on test data: {evaluation_dict}") + + predictions_df = pd.DataFrame({'price': price, 'predicted_price': predicted_price}) + return predictions_df + + +def main(): + model_filepath = MODELS_DIR.joinpath('RandomForest' + '.pkl') + predictor = Predictor(model_filepath=model_filepath) + + test_datapath = PROCESSED_DIR.joinpath('test_data.csv') + test_df = pd.read_csv(test_datapath, index_col='propertyCode') + + logger.info(f'Predicting results with {predictor.model_name} model') + predictions_df = predictor.predict(test_df) + + cleaned_datapath = PROCESSED_DIR.joinpath('cleaned_data.csv') + cleaned_df = pd.read_csv(cleaned_datapath, index_col='propertyCode') + results_df = predictor.generate_report(cleaned_df, predictions_df) + + output_filepath = PROJECT_DIR.joinpath(f'results/report_{int(time.time())}.csv') + logger.info(f'Exporting final report to to {output_filepath}') + results_df.to_csv(output_filepath) + logger.info("Done!") + + +if __name__ == "__main__": + log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + logging.basicConfig(level=logging.INFO, format=log_fmt) + logger = logging.getLogger(__name__) + + main() From a7e41280e403dbd4f105b4853678229a80d99fd6 Mon Sep 17 00:00:00 2001 From: matteorosato Date: Fri, 22 Mar 2024 11:57:54 +0100 Subject: [PATCH 14/28] added constants.py refactored modules using constants --- src/constants.py | 8 ++++++++ src/data/make_dataset.py | 12 +++++------- src/models/predict_model.py | 9 ++------- src/models/train_model.py | 5 +---- 4 files changed, 16 insertions(+), 18 deletions(-) create mode 100644 src/constants.py diff --git a/src/constants.py b/src/constants.py new file mode 100644 index 0000000..3de92b9 --- /dev/null +++ b/src/constants.py @@ -0,0 +1,8 @@ +"""This module defines project-level constants""" +import pathlib + +PROJECT_DIR = pathlib.Path(__file__).resolve().parents[1] +RAW_DIR = PROJECT_DIR.joinpath("data/raw") # name of the folder for raw data +PROCESSED_DIR = PROJECT_DIR.joinpath("data/processed") # name of the folder for processed data +MODELS_DIR = 
PROJECT_DIR.joinpath("models") # name of the folder for models +RESULTS_DIR = PROJECT_DIR.joinpath("results") # name of the folder for storing results diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py index 50b2f0a..f8178e6 100644 --- a/src/data/make_dataset.py +++ b/src/data/make_dataset.py @@ -10,10 +10,8 @@ import toml from dotenv import find_dotenv, load_dotenv from sklearn.model_selection import train_test_split +from src.constants import RAW_DIR, PROCESSED_DIR -PROJECT_DIR = pathlib.Path(__file__).resolve().parents[2] -RAW_DIR = PROJECT_DIR.joinpath("data/raw") # name of the folder for raw data -PROCESSED_DIR = PROJECT_DIR.joinpath("data/processed") # name of the folder for processed data MAX_PAGES = 2 # limit of the ads pages to be requested # find .env automagically by walking up directories until it's found, then @@ -231,10 +229,10 @@ def main(): logger = logging.getLogger(__name__) idealista = Idealista(name='Idealista', config_filepath='config.toml') - logger.info(f'Getting results from {idealista.name} website') - results = idealista.get_results() - logger.info(f'Exporting results from {idealista.name} website') - idealista.export_results(results) + # logger.info(f'Getting results from {idealista.name} website') + # results = idealista.get_results() + # logger.info(f'Exporting results from {idealista.name} website') + # idealista.export_results(results) logger.info('Creating dataset...') df_raw = idealista.create_dataset() logger.info('Cleaning dataset...') diff --git a/src/models/predict_model.py b/src/models/predict_model.py index 9557752..ae07a2f 100644 --- a/src/models/predict_model.py +++ b/src/models/predict_model.py @@ -3,15 +3,10 @@ import os import pickle import time -from pathlib import Path import pandas as pd from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, r2_score from sklearn.metrics import mean_squared_error - -PROJECT_DIR = Path(__file__).resolve().parents[2] -RAW_DIR = PROJECT_DIR.joinpath("data/raw") # name of the folder for raw data -PROCESSED_DIR = PROJECT_DIR.joinpath("data/processed") # name of the folder for processed data -MODELS_DIR = PROJECT_DIR.joinpath("models") # name of the folder for models +from src.constants import PROJECT_DIR, PROCESSED_DIR, MODELS_DIR, RESULTS_DIR class Predictor: @@ -86,7 +81,7 @@ def main(): cleaned_df = pd.read_csv(cleaned_datapath, index_col='propertyCode') results_df = predictor.generate_report(cleaned_df, predictions_df) - output_filepath = PROJECT_DIR.joinpath(f'results/report_{int(time.time())}.csv') + output_filepath = RESULTS_DIR.joinpath(f'report_{int(time.time())}.csv') logger.info(f'Exporting final report to to {output_filepath}') results_df.to_csv(output_filepath) logger.info("Done!") diff --git a/src/models/train_model.py b/src/models/train_model.py index 50619a4..a414e89 100644 --- a/src/models/train_model.py +++ b/src/models/train_model.py @@ -1,17 +1,14 @@ # -*- coding: utf-8 -*- import logging import pickle -from pathlib import Path import pandas as pd from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, r2_score from sklearn.model_selection import train_test_split, cross_val_score from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor from sklearn.metrics import mean_squared_error from sklearn.neighbors import KNeighborsRegressor +from src.constants import PROCESSED_DIR, MODELS_DIR -PROJECT_DIR = Path(__file__).resolve().parents[2] -PROCESSED_DIR = PROJECT_DIR.joinpath("data/processed") # name of the folder for 
processed data -MODELS_DIR = PROJECT_DIR.joinpath("models") # name of the folder for models AVAILABLE_MODELS = { # sample models, may be extended in the future 'RandomForest': RandomForestRegressor(n_estimators=100, criterion='mse'), From f63367fe2ff8e05b6360d689ff788c1336ff9132 Mon Sep 17 00:00:00 2001 From: matteorosato Date: Fri, 22 Mar 2024 11:59:00 +0100 Subject: [PATCH 15/28] added results folder to .gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 7ebb815..6da6588 100644 --- a/.gitignore +++ b/.gitignore @@ -80,8 +80,9 @@ venv.bak/ # Jupyter NB Checkpoints .ipynb_checkpoints/ -# exclude data from source control by default +# exclude specific folders from source control by default /data/ +/results/ # Mac OS-specific storage files .DS_Store From 9760f1ebc612d98b32de47d062466c5d69b9c4b9 Mon Sep 17 00:00:00 2001 From: matteorosato Date: Fri, 22 Mar 2024 12:15:54 +0100 Subject: [PATCH 16/28] max_pages is now an instance attribute --- src/data/make_dataset.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py index f8178e6..7f97d48 100644 --- a/src/data/make_dataset.py +++ b/src/data/make_dataset.py @@ -10,9 +10,8 @@ import toml from dotenv import find_dotenv, load_dotenv from sklearn.model_selection import train_test_split -from src.constants import RAW_DIR, PROCESSED_DIR -MAX_PAGES = 2 # limit of the ads pages to be requested +from src.constants import RAW_DIR, PROCESSED_DIR # find .env automagically by walking up directories until it's found, then # load up the .env entries as environment variables @@ -23,9 +22,10 @@ class Datasource: api_key = None secret = None - def __init__(self, name: str, config_filepath: str): + def __init__(self, name: str, config_filepath: str, max_pages=2): self.name = name self.config_filepath = config_filepath + self.max_pages = max_pages # limit of the ads pages to be requested self.filtered_params = self.parse_filter_params( params_dict=self.read_toml_config(file_path=self.config_filepath)) @@ -131,7 +131,7 @@ def get_results(self) -> dict: self.logger.info(f"Available items: {result['total']} ({result['totalPages']} pages)") elements.extend(result["elementList"]) - for i in range(2, min(MAX_PAGES, result["totalPages"]) + 1): + for i in range(2, min(self.max_pages, result["totalPages"]) + 1): self.filtered_params["numPage"] = i result = self.search(headers_dict) # get results for the subsequent pages elements.extend(result["elementList"]) @@ -229,10 +229,10 @@ def main(): logger = logging.getLogger(__name__) idealista = Idealista(name='Idealista', config_filepath='config.toml') - # logger.info(f'Getting results from {idealista.name} website') - # results = idealista.get_results() - # logger.info(f'Exporting results from {idealista.name} website') - # idealista.export_results(results) + logger.info(f'Getting results from {idealista.name} website') + results = idealista.get_results() + logger.info(f'Exporting results from {idealista.name} website') + idealista.export_results(results) logger.info('Creating dataset...') df_raw = idealista.create_dataset() logger.info('Cleaning dataset...') From 9f19d393f569651f99cf2ede1d91c47224baccf6 Mon Sep 17 00:00:00 2001 From: matteorosato Date: Fri, 22 Mar 2024 12:35:15 +0100 Subject: [PATCH 17/28] minor changes in predict_model.py --- src/models/predict_model.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git 
a/src/models/predict_model.py b/src/models/predict_model.py index ae07a2f..bce2ca9 100644 --- a/src/models/predict_model.py +++ b/src/models/predict_model.py @@ -6,7 +6,7 @@ import pandas as pd from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, r2_score from sklearn.metrics import mean_squared_error -from src.constants import PROJECT_DIR, PROCESSED_DIR, MODELS_DIR, RESULTS_DIR +from src.constants import PROCESSED_DIR, MODELS_DIR, RESULTS_DIR class Predictor: @@ -50,19 +50,15 @@ def generate_report(cleaned_df, predictions_df) -> pd.DataFrame: predictions_df['price_diff'] = predictions_df['predicted_price'] - predictions_df['price'] # add info about prices removing unneeded items (inner join) - df = df.join(predictions_df, how='inner').sort_values(by="price_diff", ascending=True) + df = df.join(predictions_df, how='inner').sort_values(by='price_diff', ascending=True) return df def predict(self, test_df) -> pd.DataFrame: price = test_df.pop('price') - - self.logger.info(f"Run predictions...") predicted_price = self.model.predict(test_df).astype(int) - evaluation_dict = self.evaluate_model(y_test=price, y_pred=predicted_price) - self.logger.info(f"Results on test data: {evaluation_dict}") - + self.logger.info(f'Results on test data: {evaluation_dict}') predictions_df = pd.DataFrame({'price': price, 'predicted_price': predicted_price}) return predictions_df @@ -81,10 +77,10 @@ def main(): cleaned_df = pd.read_csv(cleaned_datapath, index_col='propertyCode') results_df = predictor.generate_report(cleaned_df, predictions_df) - output_filepath = RESULTS_DIR.joinpath(f'report_{int(time.time())}.csv') - logger.info(f'Exporting final report to to {output_filepath}') + output_filepath = RESULTS_DIR.joinpath(f'results_{int(time.time())}.csv') + logger.info(f'Exporting final results to {output_filepath}') results_df.to_csv(output_filepath) - logger.info("Done!") + logger.info('Done!') if __name__ == "__main__": From fefee775299b8dd73e95810e2adb69db08837ea5 Mon Sep 17 00:00:00 2001 From: matteorosato Date: Fri, 22 Mar 2024 12:50:35 +0100 Subject: [PATCH 18/28] added type hint to make_dataset.py methods --- src/data/make_dataset.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py index 7f97d48..1a7eb49 100644 --- a/src/data/make_dataset.py +++ b/src/data/make_dataset.py @@ -5,6 +5,7 @@ import os import pathlib import time +from typing import Union, Dict, Any, Tuple import pandas as pd import requests import toml @@ -22,7 +23,7 @@ class Datasource: api_key = None secret = None - def __init__(self, name: str, config_filepath: str, max_pages=2): + def __init__(self, name: str, config_filepath: Union[str, pathlib.Path], max_pages: int = 2): self.name = name self.config_filepath = config_filepath self.max_pages = max_pages # limit of the ads pages to be requested @@ -30,21 +31,21 @@ def __init__(self, name: str, config_filepath: str, max_pages=2): params_dict=self.read_toml_config(file_path=self.config_filepath)) @property - def logger(self): + def logger(self) -> logging.Logger: return logging.getLogger(f'{__name__}.{self.__class__.__name__}') @property - def search_url(self): + def search_url(self) -> str: return self.define_search_url() @staticmethod - def read_toml_config(file_path: str) -> dict: + def read_toml_config(file_path: Union[str, pathlib.Path]) -> Dict[str, Any]: with open(file_path, 'r') as file: config_dict = toml.load(file) return config_dict @staticmethod - def 
parse_filter_params(params_dict: dict) -> dict: + def parse_filter_params(params_dict: Dict[str, Dict[str, Any]]) -> Dict[str, Any]: filtered_params = dict() for dictionary in params_dict.values(): for k, v in dictionary.items(): @@ -75,8 +76,7 @@ def clean_dataset(self, df) -> pd.DataFrame: pass @staticmethod - def create_train_test_df(df, test_size=0.2) -> tuple[pd.DataFrame, pd.DataFrame]: - # split the data into train and test set + def create_train_test_df(df: pd.DataFrame, test_size: float = 0.2) -> Tuple[pd.DataFrame, pd.DataFrame]: df_train, df_test = train_test_split(df, test_size=test_size, random_state=11, shuffle=True) return df_train, df_test @@ -229,10 +229,10 @@ def main(): logger = logging.getLogger(__name__) idealista = Idealista(name='Idealista', config_filepath='config.toml') - logger.info(f'Getting results from {idealista.name} website') - results = idealista.get_results() - logger.info(f'Exporting results from {idealista.name} website') - idealista.export_results(results) + # logger.info(f'Getting results from {idealista.name} website') + # results = idealista.get_results() + # logger.info(f'Exporting results from {idealista.name} website') + # idealista.export_results(results) logger.info('Creating dataset...') df_raw = idealista.create_dataset() logger.info('Cleaning dataset...') From 8d7145ce1e6bab699907dd733e056c78b00e8a3d Mon Sep 17 00:00:00 2001 From: matteorosato Date: Fri, 22 Mar 2024 12:57:02 +0100 Subject: [PATCH 19/28] added type hint to train_model.py methods --- src/models/train_model.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/models/train_model.py b/src/models/train_model.py index a414e89..7de9fdb 100644 --- a/src/models/train_model.py +++ b/src/models/train_model.py @@ -2,6 +2,7 @@ import logging import pickle import pandas as pd +from typing import Tuple, Dict from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, r2_score from sklearn.model_selection import train_test_split, cross_val_score from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor @@ -19,7 +20,7 @@ class ModelTrainer: - def __init__(self, model_name, target_name='price'): + def __init__(self, model_name: str, target_name: str = 'price'): if model_name not in AVAILABLE_MODELS.keys(): raise ValueError(f'model_name must be one of {list(AVAILABLE_MODELS.keys())}') self.model_name = model_name @@ -27,26 +28,26 @@ def __init__(self, model_name, target_name='price'): self.model = AVAILABLE_MODELS[self.model_name] @property - def logger(self): + def logger(self) -> logging.Logger: return logging.getLogger(f'{__name__}.{self.__class__.__name__}') - def split_df(self, df): + def split_df(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]: X = df.drop(columns=self.target_name) y = df[self.target_name] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=555) return X_train, X_test, y_train, y_test - def fit_model(self, X, y): - return self.model.fit(X, y) + def fit_model(self, X: pd.DataFrame, y: pd.Series): + self.model.fit(X, y) - def evaluate_model(self, X_train, X_test, y_train, y_test): + def evaluate_model(self, X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> Dict[str, float]: y_pred = self.model.predict(X_test).astype(int) score = self.model.score(X_train, y_train) self.logger.info(f'R2 score on training data: {round(score, 2)}') cv_scores = -1 * cross_val_score(self.model, X=pd.concat([X_train, 
X_test], axis=0), y=pd.concat([y_train, y_test], axis=0), - scoring="neg_mean_absolute_error", + scoring='neg_mean_absolute_error', cv=5) mean_cv_score = round(cv_scores.mean(), 2) self.logger.info(f'Mean cross-validation score: {mean_cv_score}') @@ -61,7 +62,7 @@ def export_model(self): with open(output_filename, 'wb') as f: pickle.dump(self.model, f) - def train(self, df): + def train(self, df: pd.DataFrame): X_train, X_val, y_train, y_val = self.split_df(df) self.fit_model(X_train, y_train) evaluation_dict = self.evaluate_model(X_train, X_val, y_train, y_val) @@ -73,7 +74,7 @@ def main(): logger.info('Loading training data...') training_datapath = PROCESSED_DIR.joinpath('training_data.csv') training_df = pd.read_csv(training_datapath, index_col='propertyCode') - logger.info('Training the model...') + logger.info(f'Training the {my_model.model_name} model...') my_model.train(training_df) logger.info('Persisting the trained model...') my_model.export_model() From dfc81230b5cc32465a6e7028e8db56f6ff240423 Mon Sep 17 00:00:00 2001 From: matteorosato Date: Fri, 22 Mar 2024 13:02:48 +0100 Subject: [PATCH 20/28] added type hint to predict_model.py methods modified get_model_name method --- src/models/predict_model.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/models/predict_model.py b/src/models/predict_model.py index bce2ca9..7bbe4aa 100644 --- a/src/models/predict_model.py +++ b/src/models/predict_model.py @@ -3,7 +3,9 @@ import os import pickle import time +from pathlib import Path import pandas as pd +from typing import Dict from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, r2_score from sklearn.metrics import mean_squared_error from src.constants import PROCESSED_DIR, MODELS_DIR, RESULTS_DIR @@ -11,29 +13,27 @@ class Predictor: - def __init__(self, model_filepath): + def __init__(self, model_filepath: Path): self.model_filepath = model_filepath self.model_name = self.get_model_name(self.model_filepath) self.model = self.load_model(self.model_filepath) @property - def logger(self): + def logger(self) -> logging.Logger: return logging.getLogger(f'{__name__}.{self.__class__.__name__}') @staticmethod - def get_model_name(filepath): - filename = os.path.basename(filepath) - model_name = os.path.splitext(filename)[0] - return model_name + def get_model_name(filepath: Path) -> str: + return os.path.splitext(filepath.name)[0] @staticmethod - def load_model(model_filepath): + def load_model(model_filepath: Path): with open(model_filepath, 'rb') as f: model = pickle.load(f) return model @staticmethod - def evaluate_model(y_test, y_pred): + def evaluate_model(y_test: pd.Series, y_pred: pd.Series) -> Dict[str, float]: metrics = [r2_score, mean_absolute_percentage_error, mean_absolute_error, mean_squared_error] metric_results = dict() for metric in metrics: @@ -41,7 +41,7 @@ def evaluate_model(y_test, y_pred): return metric_results @staticmethod - def generate_report(cleaned_df, predictions_df) -> pd.DataFrame: + def generate_report(cleaned_df: pd.DataFrame, predictions_df: pd.DataFrame) -> pd.DataFrame: # keep only specified columns columns = ['municipality', 'address', 'size', 'url'] df = cleaned_df[columns] @@ -54,7 +54,7 @@ def generate_report(cleaned_df, predictions_df) -> pd.DataFrame: return df - def predict(self, test_df) -> pd.DataFrame: + def predict(self, test_df: pd.DataFrame) -> pd.DataFrame: price = test_df.pop('price') predicted_price = self.model.predict(test_df).astype(int) evaluation_dict = 
self.evaluate_model(y_test=price, y_pred=predicted_price) From 6330cd7bca8d84887927a208d4bac92bae55c9b4 Mon Sep 17 00:00:00 2001 From: matteorosato Date: Fri, 22 Mar 2024 17:45:58 +0100 Subject: [PATCH 21/28] removed some models from train_model.py --- src/models/train_model.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/models/train_model.py b/src/models/train_model.py index 7de9fdb..cc9ebc2 100644 --- a/src/models/train_model.py +++ b/src/models/train_model.py @@ -5,16 +5,13 @@ from typing import Tuple, Dict from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, r2_score from sklearn.model_selection import train_test_split, cross_val_score -from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor +from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import mean_squared_error -from sklearn.neighbors import KNeighborsRegressor from src.constants import PROCESSED_DIR, MODELS_DIR AVAILABLE_MODELS = { # sample models, may be extended in the future - 'RandomForest': RandomForestRegressor(n_estimators=100, criterion='mse'), - 'ExtraTrees': ExtraTreesRegressor(n_estimators=100, criterion='mae'), - 'Knn': KNeighborsRegressor(n_neighbors=5, weights='distance', metric='minkowski') + 'RandomForest': RandomForestRegressor(n_estimators=100, criterion='mse') } From bd03382ca1b49067448a53872cabfc491af45778 Mon Sep 17 00:00:00 2001 From: matteorosato Date: Fri, 22 Mar 2024 17:46:14 +0100 Subject: [PATCH 22/28] minor changes in config.toml --- config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.toml b/config.toml index ede34e3..85e38a6 100644 --- a/config.toml +++ b/config.toml @@ -6,8 +6,8 @@ country = "it" # Country code for the website (required) [string] operation = "sale" # Type of operation (required) [string] propertyType = "homes" # Type of property (required) [string] center = "40.353,18.174" # Geographic coordinates for search center (WGS84) (latitude, longitude) [string] +distance = 4000 # Distance to center, in meters (ratio) [integer]. 
Check also https://www.calcmaps.com/map-radius/ locale = "it" # Search language for summary [string] -distance = 3000 # Distance to center, in meters (ratio) [integer] locationId = "" # Idealista location code [string] maxItems = 50 # Items per page (maximum allowed: 50) [integer] numPage = "" # Page number for pagination (1, 2, 3..n) [integer] From b0934469d3a34dba922d77d149220ea90a1ef906 Mon Sep 17 00:00:00 2001 From: matteorosato Date: Mon, 25 Mar 2024 12:58:41 +0100 Subject: [PATCH 23/28] Changed references from "property-finder" "house-finder" added author email in setup.py minor changes in README.md --- Makefile | 2 +- README.md | 27 +++++++++++++++++++-------- docs/Makefile | 8 ++++---- docs/conf.py | 18 +++++++++--------- docs/index.rst | 4 ++-- docs/make.bat | 4 ++-- setup.py | 5 +++-- 7 files changed, 40 insertions(+), 28 deletions(-) diff --git a/Makefile b/Makefile index c6b2ae2..a375de2 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) BUCKET = [OPTIONAL] your-bucket-for-syncing-data (do not include 's3://') PROFILE = default -PROJECT_NAME = property-finder +PROJECT_NAME = house-finder PYTHON_INTERPRETER = python3 ifeq (,$(shell which conda)) diff --git a/README.md b/README.md index d48a7b1..a40c654 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,19 @@ -Property finder +House finder ============================== -This project is dedicated to retrieving and analyzing property advertisements from real estate websites. It provides a +This project is dedicated to retrieving and analyzing house advertisements from real estate websites. It provides a convenient way to gather information for analysis, research, or any other purposes related to the real estate domain. +Below are the main steps: + +1. **Data Collection**: Collects data from real estate websites using their APIs. +2. **Data Analysis**: Analyzes the collected data using Pandas and Machine Learning (ML) algorithms. +3. **Report Generation**: Generates a report summarizing the findings from the analysis. The project is developed entirely using Python and follows object-oriented programming (OOP) practices. The initial template is provided by [Cookiecutter Data Science](https://drivendata.github.io/cookiecutter-data-science/). ## Who is this for? -This tool is intended for: - Data scientists interested in real estate data extraction and analysis. -- Real estate agents/companies looking to integrate listing data into their systems. +- Real estate companies looking to integrate listing data into their systems. - Anyone curious about exploring the world of real estate through data. ## Fair Use Disclaimer @@ -21,12 +25,12 @@ To use the tool, follow these steps: 1. Ensure you have Python 3.10 and pip installed on your system. 2. Clone the repository to your local machine: ```shell - git clone https://github.com/matteorosato/property-finder.git + git clone https://github.com/matteorosato/house-finder.git ``` 3. Navigate to the project directory: ``` - cd property-finder + cd house-finder ``` 4. Create a virtual environment for the project: @@ -52,9 +56,16 @@ To use the tool, follow these steps: 8. Fill the `config.toml` file according to your preferences and needs. -9. Run the tool: +9. 
Run the tool (select one or more steps accordingly): ``` - TODO + # Download the data and create the dataset + python src/data/make_dataset.py + + # Train the model + python src/models/train_model.py + + # Make predictions + python src/models/predict_model.py ``` ## Supported websites diff --git a/docs/Makefile b/docs/Makefile index cb5a700..eef267f 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -77,17 +77,17 @@ qthelp: @echo @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/property-finder.qhcp" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/house-finder.qhcp" @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/property-finder.qhc" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/house-finder.qhc" devhelp: $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp @echo @echo "Build finished." @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/property-finder" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/property-finder" + @echo "# mkdir -p $$HOME/.local/share/devhelp/house-finder" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/house-finder" @echo "# devhelp" epub: diff --git a/docs/conf.py b/docs/conf.py index c63afa5..58c613e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Property finder documentation build configuration file, created by +# House finder documentation build configuration file, created by # sphinx-quickstart. # # This file is execfile()d with the current directory set to its containing dir. @@ -41,7 +41,7 @@ master_doc = 'index' # General information about the project. -project = u'Property finder' +project = u'House finder' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -164,7 +164,7 @@ # html_file_suffix = None # Output file base name for HTML help builder. -htmlhelp_basename = 'property-finderdoc' +htmlhelp_basename = 'house-finderdoc' # -- Options for LaTeX output -------------------------------------------------- @@ -184,8 +184,8 @@ # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ ('index', - 'property-finder.tex', - u'Property finder Documentation', + 'house-finder.tex', + u'House finder Documentation', u"Matteo Rosato", 'manual'), ] @@ -215,7 +215,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'property-finder', u'Property finder Documentation', + ('index', 'house-finder', u'House finder Documentation', [u"Matteo Rosato"], 1) ] @@ -229,9 +229,9 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'property-finder', u'Property finder Documentation', - u"Matteo Rosato", 'Property finder', - 'This project is dedicated to retrieving and analyzing property advertisements from real estate websites.', 'Miscellaneous'), + ('index', 'house-finder', u'House finder Documentation', + u"Matteo Rosato", 'House finder', + 'Project for retrieving and analyzing house ads from real estate websites.', 'Miscellaneous'), ] # Documents to append as an appendix to all manuals. 
diff --git a/docs/index.rst b/docs/index.rst index ef71dea..6940723 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,9 +1,9 @@ -.. Property finder documentation master file, created by +.. House finder documentation master file, created by sphinx-quickstart. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Property finder documentation! +House finder documentation! ============================================== Contents: diff --git a/docs/make.bat b/docs/make.bat index 584d7fd..48d2554 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -99,9 +99,9 @@ if "%1" == "qthelp" ( echo. echo.Build finished; now you can run "qcollectiongenerator" with the ^ .qhcp project file in %BUILDDIR%/qthelp, like this: - echo.^> qcollectiongenerator %BUILDDIR%\qthelp\property-finder.qhcp + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\house-finder.qhcp echo.To view the help file: - echo.^> assistant -collectionFile %BUILDDIR%\qthelp\property-finder.ghc + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\house-finder.ghc goto end ) diff --git a/setup.py b/setup.py index 08eeec2..2e6772d 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,8 @@ name='src', packages=find_packages(), version='0.1.0', - description='Project for retrieving and analyzing property advertisements from real estate websites.', + description='Project for retrieving and analyzing house ads from real estate websites.', author='Matteo Rosato', - license='AGPL-3.0 license', + author_email='matteorosato.dev@gmail.com', + license='AGPL-3.0 license' ) From 32ff8dd072903f2d5d124e1a40c5972f90956ba5 Mon Sep 17 00:00:00 2001 From: matteorosato Date: Tue, 26 Mar 2024 12:40:59 +0100 Subject: [PATCH 24/28] modified README.md modified requirements.txt added CHANGELOG.md --- CHANGELOG.md | 11 +++++++++++ README.md | 2 +- requirements.txt | 13 +++++++------ 3 files changed, 19 insertions(+), 7 deletions(-) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..0e46d4d --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,11 @@ +# Changelog + +## 0.1.0 (2024-03-26) +#### New Features + +* defined project structure +* added scripts for generating data +* added scripts for training and prediction + +#### Docs +* added README \ No newline at end of file diff --git a/README.md b/README.md index a40c654..ab8f1e8 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Note that this code is provided free of charge as is. For any bugs, see the issu ## Setup and Use To use the tool, follow these steps: -1. Ensure you have Python 3.10 and pip installed on your system. +1. Ensure you have Python 3.9 and pip installed on your system. 2. Clone the repository to your local machine: ```shell git clone https://github.com/matteorosato/house-finder.git diff --git a/requirements.txt b/requirements.txt index 242131d..ceb72b0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,12 +2,13 @@ -e . 
# external requirements -click -Sphinx -coverage -awscli -flake8 +# click +# Sphinx +# coverage +# awscli +# flake8 python-dotenv>=0.5.1 pandas==2.1.4 scikit-learn==0.24.2 -toml \ No newline at end of file +toml==0.10.2 +auto-changelog==0.6.0 \ No newline at end of file From 88422e83a98a5318d5634712fac35e4df6e15846 Mon Sep 17 00:00:00 2001 From: matteorosato Date: Tue, 26 Mar 2024 17:42:44 +0100 Subject: [PATCH 25/28] added run.py fixed make_dataset.py, train_model.py, predict_model.py changed criterion in RandomForestRegressor updated README.md updated dependencies in requirements.txt --- README.md | 44 ++++++++++++++++++++----------------- requirements.txt | 3 ++- src/data/make_dataset.py | 17 +++++++------- src/models/predict_model.py | 8 +++---- src/models/train_model.py | 12 +++++----- src/run.py | 19 ++++++++++++++++ 6 files changed, 63 insertions(+), 40 deletions(-) create mode 100644 src/run.py diff --git a/README.md b/README.md index ab8f1e8..4b0d801 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Note that this code is provided free of charge as is. For any bugs, see the issu ## Setup and Use To use the tool, follow these steps: -1. Ensure you have Python 3.9 and pip installed on your system. +1. Ensure you have Python 3.10 and pip installed on your system. 2. Clone the repository to your local machine: ```shell git clone https://github.com/matteorosato/house-finder.git @@ -33,39 +33,43 @@ To use the tool, follow these steps: cd house-finder ``` -4. Create a virtual environment for the project: - ``` - python -m venv venv - ``` +4. [Create a virtual environment](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/#create-a-new-virtual-environment) for the project: + - On Windows: + ``` + py -m venv .venv + ``` + - On macOS and Linux: + ``` + python3 -m venv .venv + ``` -5. Activate the virtual environment: +5. [Activate the virtual environment](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/#activate-a-virtual-environment): - On Windows: ``` - venv\Scripts\activate + .venv\Scripts\activate ``` - On macOS and Linux: ``` - source venv/bin/activate + source .venv/bin/activate ``` -6. Install the required dependencies by running: - ``` - pip install -r requirements.txt - ``` +6. Install the [required dependencies](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/#using-a-requirements-file) by running: + - On Windows: + ``` + py -m pip install -r requirements.txt + ``` + - On macOS and Linux: + ``` + python3 -m pip install -r requirements.txt + ``` 7. Fill the `.env` file with the required environment variables. Use the `.env.example` file as reference. 8. Fill the `config.toml` file according to your preferences and needs. -9. Run the tool (select one or more steps accordingly): +9. 
Run the tool: ``` # Download the data and create the dataset - python src/data/make_dataset.py - - # Train the model - python src/models/train_model.py - - # Make predictions - python src/models/predict_model.py + python src/run.py ``` ## Supported websites diff --git a/requirements.txt b/requirements.txt index ceb72b0..111aaaf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,8 +7,9 @@ # coverage # awscli # flake8 +requests python-dotenv>=0.5.1 pandas==2.1.4 -scikit-learn==0.24.2 +scikit-learn==1.3.2 toml==0.10.2 auto-changelog==0.6.0 \ No newline at end of file diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py index 1a7eb49..3575e97 100644 --- a/src/data/make_dataset.py +++ b/src/data/make_dataset.py @@ -12,7 +12,7 @@ from dotenv import find_dotenv, load_dotenv from sklearn.model_selection import train_test_split -from src.constants import RAW_DIR, PROCESSED_DIR +from src.constants import RAW_DIR, PROCESSED_DIR, PROJECT_DIR # find .env automagically by walking up directories until it's found, then # load up the .env entries as environment variables @@ -226,13 +226,15 @@ def build_features(self, df) -> pd.DataFrame: def main(): + log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + logging.basicConfig(level=logging.INFO, format=log_fmt) logger = logging.getLogger(__name__) - idealista = Idealista(name='Idealista', config_filepath='config.toml') - # logger.info(f'Getting results from {idealista.name} website') - # results = idealista.get_results() - # logger.info(f'Exporting results from {idealista.name} website') - # idealista.export_results(results) + idealista = Idealista(name='Idealista', config_filepath=PROJECT_DIR.joinpath('config.toml')) + logger.info(f'Getting results from {idealista.name} website') + results = idealista.get_results() + logger.info(f'Exporting results from {idealista.name} website') + idealista.export_results(results) logger.info('Creating dataset...') df_raw = idealista.create_dataset() logger.info('Cleaning dataset...') @@ -251,7 +253,4 @@ def main(): if __name__ == '__main__': - log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' - logging.basicConfig(level=logging.INFO, format=log_fmt) - main() diff --git a/src/models/predict_model.py b/src/models/predict_model.py index 7bbe4aa..ff77418 100644 --- a/src/models/predict_model.py +++ b/src/models/predict_model.py @@ -64,6 +64,10 @@ def predict(self, test_df: pd.DataFrame) -> pd.DataFrame: def main(): + log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + logging.basicConfig(level=logging.INFO, format=log_fmt) + logger = logging.getLogger(__name__) + model_filepath = MODELS_DIR.joinpath('RandomForest' + '.pkl') predictor = Predictor(model_filepath=model_filepath) @@ -84,8 +88,4 @@ def main(): if __name__ == "__main__": - log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - logging.basicConfig(level=logging.INFO, format=log_fmt) - logger = logging.getLogger(__name__) - main() diff --git a/src/models/train_model.py b/src/models/train_model.py index cc9ebc2..9ea7553 100644 --- a/src/models/train_model.py +++ b/src/models/train_model.py @@ -11,7 +11,7 @@ AVAILABLE_MODELS = { # sample models, may be extended in the future - 'RandomForest': RandomForestRegressor(n_estimators=100, criterion='mse') + 'RandomForest': RandomForestRegressor(n_estimators=100, criterion='friedman_mse') } @@ -37,7 +37,8 @@ def split_df(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Ser def fit_model(self, X: pd.DataFrame, y: pd.Series): 
self.model.fit(X, y) - def evaluate_model(self, X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> Dict[str, float]: + def evaluate_model(self, X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> \ + Dict[str, float]: y_pred = self.model.predict(X_test).astype(int) score = self.model.score(X_train, y_train) self.logger.info(f'R2 score on training data: {round(score, 2)}') @@ -67,6 +68,9 @@ def train(self, df: pd.DataFrame): def main(): + log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + logging.basicConfig(level=logging.INFO, format=log_fmt) + logger = logging.getLogger(__name__) my_model = ModelTrainer(model_name='RandomForest', target_name='price') logger.info('Loading training data...') training_datapath = PROCESSED_DIR.joinpath('training_data.csv') @@ -79,8 +83,4 @@ def main(): if __name__ == '__main__': - log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' - logging.basicConfig(level=logging.INFO, format=log_fmt) - logger = logging.getLogger(__name__) - main() diff --git a/src/run.py b/src/run.py new file mode 100644 index 0000000..19c7290 --- /dev/null +++ b/src/run.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- +from src.data import make_dataset +from src.models import train_model, predict_model + + +def main(): + """ + Wrapper for running the main functions of the tool. (Un)comment one or more step if (un)needed. + """ + # 1. Download the data and create the dataset + make_dataset.main() + # 2. Train the model + train_model.main() + # 3. Test the model -> get the report! + predict_model.main() + + +if __name__ == "__main__": + main() From ea93377ad42757325024db949fb251be68f7ead7 Mon Sep 17 00:00:00 2001 From: matteorosato Date: Wed, 3 Apr 2024 12:10:35 +0200 Subject: [PATCH 26/28] printed extra info in create_train_test_df method added docstrings for Datasource, ModelTrainer, Predictor classes --- src/data/make_dataset.py | 25 +++++++++++++++++-------- src/models/predict_model.py | 13 ++++++++++--- src/models/train_model.py | 7 +++++++ 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py index 3575e97..4c59196 100644 --- a/src/data/make_dataset.py +++ b/src/data/make_dataset.py @@ -23,10 +23,18 @@ class Datasource: api_key = None secret = None - def __init__(self, name: str, config_filepath: Union[str, pathlib.Path], max_pages: int = 2): + def __init__(self, name: str, config_filepath: Union[str, pathlib.Path], max_pages: int = 10): + """ + Initialize the Datasource object + + Args: + name (str): The name of the object. + config_filepath (Union[str, pathlib.Path]): The filepath of the configuration file. 
+ max_pages (int): The maximum number of pages of ads to be requested (for each request) + """ self.name = name self.config_filepath = config_filepath - self.max_pages = max_pages # limit of the ads pages to be requested + self.max_pages = max_pages self.filtered_params = self.parse_filter_params( params_dict=self.read_toml_config(file_path=self.config_filepath)) @@ -75,9 +83,13 @@ def define_search_url(self) -> str: def clean_dataset(self, df) -> pd.DataFrame: pass - @staticmethod - def create_train_test_df(df: pd.DataFrame, test_size: float = 0.2) -> Tuple[pd.DataFrame, pd.DataFrame]: + def create_train_test_df(self, df: pd.DataFrame, test_size: float = 0.2) -> Tuple[pd.DataFrame, pd.DataFrame]: df_train, df_test = train_test_split(df, test_size=test_size, random_state=11, shuffle=True) + + # print summary info + self.logger.info(f'Total items in the training dataset: {len(df_train)}') + self.logger.info(f'Total items in the test dataset: {len(df_test)}') + return df_train, df_test @@ -85,9 +97,6 @@ class Idealista(Datasource): api_key: str = os.environ['IDEALISTA_API_KEY'] secret: str = os.environ['IDEALISTA_SECRET'] - def __init__(self, name: str, config_filepath: str): - super().__init__(name, config_filepath) - def define_search_url(self) -> str: country = self.filtered_params['country'] search_url = f'https://api.idealista.com/3.5/{country}/search' @@ -230,7 +239,7 @@ def main(): logging.basicConfig(level=logging.INFO, format=log_fmt) logger = logging.getLogger(__name__) - idealista = Idealista(name='Idealista', config_filepath=PROJECT_DIR.joinpath('config.toml')) + idealista = Idealista(name='Idealista', config_filepath=PROJECT_DIR.joinpath('config.toml'), max_pages=10) logger.info(f'Getting results from {idealista.name} website') results = idealista.get_results() logger.info(f'Exporting results from {idealista.name} website') diff --git a/src/models/predict_model.py b/src/models/predict_model.py index ff77418..649e678 100644 --- a/src/models/predict_model.py +++ b/src/models/predict_model.py @@ -1,11 +1,12 @@ # -*- coding: utf-8 -*- import logging import os +import pathlib import pickle import time from pathlib import Path import pandas as pd -from typing import Dict +from typing import Dict, Union from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, r2_score from sklearn.metrics import mean_squared_error from src.constants import PROCESSED_DIR, MODELS_DIR, RESULTS_DIR @@ -13,7 +14,13 @@ class Predictor: - def __init__(self, model_filepath: Path): + def __init__(self, model_filepath: Union[str, pathlib.Path]): + """ + Initialize the Predictor object + + Args: + model_filepath (Union[str, pathlib.Path]): The filepath of the trained model. 
+ """ self.model_filepath = model_filepath self.model_name = self.get_model_name(self.model_filepath) self.model = self.load_model(self.model_filepath) @@ -50,7 +57,7 @@ def generate_report(cleaned_df: pd.DataFrame, predictions_df: pd.DataFrame) -> p predictions_df['price_diff'] = predictions_df['predicted_price'] - predictions_df['price'] # add info about prices removing unneeded items (inner join) - df = df.join(predictions_df, how='inner').sort_values(by='price_diff', ascending=True) + df = df.join(predictions_df, how='inner').sort_values(by='price_diff', ascending=False) return df diff --git a/src/models/train_model.py b/src/models/train_model.py index 9ea7553..7edb0ca 100644 --- a/src/models/train_model.py +++ b/src/models/train_model.py @@ -18,6 +18,13 @@ class ModelTrainer: def __init__(self, model_name: str, target_name: str = 'price'): + """ + Initialize a ModelTrainer object. + + Args: + model_name (str): The name of the model to be trained. + target_name (str): The name of the target variable to be predicted. + """ if model_name not in AVAILABLE_MODELS.keys(): raise ValueError(f'model_name must be one of {list(AVAILABLE_MODELS.keys())}') self.model_name = model_name From a4702ec29bd2da80d08e784f29125c04ece475c7 Mon Sep 17 00:00:00 2001 From: matteorosato Date: Wed, 3 Apr 2024 12:10:47 +0200 Subject: [PATCH 27/28] minor changes in config.toml --- config.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config.toml b/config.toml index 85e38a6..5278f63 100644 --- a/config.toml +++ b/config.toml @@ -5,8 +5,8 @@ country = "it" # Country code for the website (required) [string] operation = "sale" # Type of operation (required) [string] propertyType = "homes" # Type of property (required) [string] -center = "40.353,18.174" # Geographic coordinates for search center (WGS84) (latitude, longitude) [string] -distance = 4000 # Distance to center, in meters (ratio) [integer]. Check also https://www.calcmaps.com/map-radius/ +center = "40.36826,18.16143" # Geographic coordinates for search center (WGS84) (latitude, longitude) [string] +distance = 5800 # Distance to center, in meters (ratio) [integer]. Check also https://www.calcmaps.com/map-radius/ locale = "it" # Search language for summary [string] locationId = "" # Idealista location code [string] maxItems = 50 # Items per page (maximum allowed: 50) [integer] From dd48f1ff551da0326532d8f4c8cd504dd894631e Mon Sep 17 00:00:00 2001 From: matteorosato Date: Wed, 3 Apr 2024 12:13:30 +0200 Subject: [PATCH 28/28] modified date in CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e46d4d..3c72641 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## 0.1.0 (2024-03-26) +## 0.1.0 (2024-04-01) #### New Features * defined project structure