Merge pull request #230 from lryanle/133/clean-fb-attributes
133/clean fb attributes
lryanle authored Apr 16, 2024
2 parents b20d59f + d4db9a9 commit d147fbc
Showing 31 changed files with 6,844 additions and 1,572 deletions.
17 changes: 16 additions & 1 deletion backend/.env.example
@@ -1,4 +1,19 @@
 PASSWORD=PASSWORD
 USERNAME=USERNAME
 DB_URI=MONGODB_CONNECTION_STR_USING_${USERNAME}_AND_${PASSWORD}
-OPENAI_GPT_KEY=OPENAI_KEY
+
+OPENAI_GPT_KEY=API_KEY
+
+PROXY_USERNAME=USERNAME
+PROXY_PASSWORD=PASSWORD
+PROXY_PORT=PORT
+
+PROD_ENV=FALSE
+SCRAPE_MINUTES=6
+CLEAN_MINUTES=2
+ANALYZE_MINUTES=4
+
+SCRAPE_DUP_LIMIT=5
+CLEAN_CONS_ERR_LIMIT=20
+MODEL_WEIGHTS=0,60,40,10,20,10,0
+MODEL_BATCH_SIZE=15
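
The backend reads these values at import time via python-dotenv (pinned in requirements.txt below). A minimal sketch of consuming them, with fallbacks mirroring the orchestrator's defaults; the parse_weights helper is hypothetical, shown only to illustrate the comma-separated MODEL_WEIGHTS format:

import os

from dotenv import load_dotenv

load_dotenv()  # pulls .env into the process environment

# Cadence setting; the fallback mirrors the orchestrator's default below.
SCRAPE_MINUTES = float(os.environ.get("SCRAPE_MINUTES", 6))

# MODEL_WEIGHTS is a comma-separated list of integer weights, one per model slot.
# parse_weights is a hypothetical helper, not part of this PR.
def parse_weights(raw):
    return [int(w) for w in raw.split(",")]

weights = parse_weights(os.environ.get("MODEL_WEIGHTS", "0,60,40,10,20,10,0"))
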
2 changes: 0 additions & 2 deletions backend/Dockerfile.dev
@@ -24,8 +24,6 @@ COPY --from=dependencies /var/lang/lib/python3.11/site-packages /var/lang/lib/py
 WORKDIR /var/task
 COPY orchestrator.py ./
 COPY src ./src
-COPY app.py ./
-COPY .env ./
 
 CMD [ "app.craigslist" ]
 
13 changes: 7 additions & 6 deletions backend/Pipfile
@@ -9,12 +9,12 @@ dev = "docker build -f Dockerfile.dev --platform linux/amd64 -t smare ."
 stop = "docker rm -f smarecontainer"
 start = "docker run --name smarecontainer -d smare:latest"
 exec = "docker exec -it smarecontainer"
-craigslist = "pipenv run exec python3 -c 'import app; app.craigslist()'"
-facebook = "pipenv run exec python3 -c 'import app; app.facebook()'"
-cleaner = "pipenv run exec python3 -c 'import app; app.clean()'"
-model = "pipenv run exec python3 -c 'import app; app.model()'"
-cl_app = "pipenv run exec python3 -c 'import app; app.pipeline_craigslist()'"
-fb_app = "pipenv run exec python3 -c 'import app; app.pipeline_facebook()'"
+craigslist = "pipenv run exec python3 -c 'import orchestrator; orchestrator.craigslist()'"
+facebook = "pipenv run exec python3 -c 'import orchestrator; orchestrator.facebook()'"
+clean = "pipenv run exec python3 -c 'import orchestrator; orchestrator.clean()'"
+model = "pipenv run exec python3 -c 'import orchestrator; orchestrator.model()'"
+cl_app = "pipenv run exec python3 -c 'import orchestrator; orchestrator.smare_craigslist()'"
+fb_app = "pipenv run exec python3 -c 'import orchestrator; orchestrator.smare_facebook()'"
 
 [packages]
 selenium = "*"
@@ -29,6 +29,7 @@ fuzzywuzzy = "*"
 loguru = "*"
 openai = "*"
 urllib3 = "*"
+sendgrid = "*"
 
 [dev-packages]
 isort = "*"
530 changes: 372 additions & 158 deletions backend/Pipfile.lock

Large diffs are not rendered by default.

32 changes: 0 additions & 32 deletions backend/app.py

This file was deleted.

34 changes: 22 additions & 12 deletions backend/orchestrator.py
@@ -1,20 +1,22 @@
+import os
 from datetime import datetime, timedelta
 
 from dotenv import load_dotenv
 from src.cleaners.cleaner import run as run_cleaner
-from src.scrapers.scraper import run as run_scraper
 from src.models.model_manager import run as run_analyzer
+from src.scrapers.scraper import run as run_scraper
 from src.utilities.logger import SmareLogger
 
-SCRAPER_DURATION = 3 * 60
-CLEANER_DURATION = 2 * 60
-ANALYZER_DURATION = 8 * 60
+load_dotenv()
+
+SCRAPER_DURATION = float(os.environ.get("SCRAPE_MINUTES", 6)) * 60
+CLEANER_DURATION = float(os.environ.get("CLEAN_MINUTES", 2)) * 60
+ANALYZER_DURATION = float(os.environ.get("ANALYZE_MINUTES", 4)) * 60
 
 CL_SCRAPER_VERSION = 6
 FB_SCRAPER_VERSION = 6
 CLEANER_VERSION = 3
 
-DUPLICATE_TERMINATION_LIMIT = 5
 
 logger = SmareLogger()
 
@@ -25,20 +27,28 @@ def calculate_timestamp(seconds):
         logger.critical(f"Orchestrator failed to generate module termination-timestamp. Error: {e}")
 
 
-def facebook(termination_timestamp):
+def facebook(termination_timestamp=calculate_timestamp(7 * 24 * 60 * 60)):
     try:
-        run_scraper(termination_timestamp, "facebook", FB_SCRAPER_VERSION, DUPLICATE_TERMINATION_LIMIT)
+        run_scraper(termination_timestamp, "facebook", FB_SCRAPER_VERSION)
     except Exception as e:
         logger.critical(f"Orchestrator failed running facebook scraper. Error: {e}")
 
 
-def craigslist(termination_timestamp):
+def craigslist(termination_timestamp=calculate_timestamp(7 * 24 * 60 * 60)):
     try:
-        run_scraper(termination_timestamp, "craigslist", CL_SCRAPER_VERSION, DUPLICATE_TERMINATION_LIMIT)
+        run_scraper(termination_timestamp, "craigslist", CL_SCRAPER_VERSION)
     except Exception as e:
         logger.critical(f"Orchestrator failed running craigslist scraper. Error: {e}")
 
 
+def clean(termination_timestamp=calculate_timestamp(7 * 24 * 60 * 60)):
+    run_cleaner(termination_timestamp, CLEANER_VERSION)
+
+
+def model(termination_timestamp=calculate_timestamp(7 * 24 * 60 * 60)):
+    run_analyzer(termination_timestamp)
+
+
 def smare(scraper_name):
     try:
         if scraper_name == "facebook":
@@ -56,12 +66,12 @@ def smare(scraper_name):
         logger.critical(f"Orchestrator failed while running the scraper module. Error: {e}")
 
     try:
-        run_cleaner(calculate_timestamp(SCRAPER_DURATION), CLEANER_VERSION)
+        clean(calculate_timestamp(CLEANER_DURATION))
     except Exception as e:
         logger.critical(f"Orchestrator failed running the cleaner module. Error: {e}")
 
     try:
-        run_analyzer(calculate_timestamp(ANALYZER_DURATION))
+        model(calculate_timestamp(ANALYZER_DURATION))
     except Exception as e:
         logger.critical(f"Orchestrator failed running analyzer module (model manager). Error: {e}")
 
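
With the new keyword defaults, each pipeline stage is independently callable; this is what the renamed Pipfile scripts above invoke (e.g. pipenv run clean). A minimal usage sketch, assuming the src package resolves as it does inside the dev container:

import orchestrator

# No argument: falls back to the one-week default termination timestamp.
orchestrator.clean()

# Or bound the run explicitly, here to a 30-minute window.
orchestrator.model(orchestrator.calculate_timestamp(30 * 60))
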
40 changes: 28 additions & 12 deletions backend/requirements.txt
@@ -1,3 +1,5 @@
+annotated-types==0.6.0
+anyio==4.3.0
 astroid==3.1.0
 attrs==23.2.0
 beautifulsoup4==4.12.3
@@ -7,50 +9,64 @@ certifi==2024.2.2
 charset-normalizer==3.3.2
 click==8.1.7
 dill==0.3.8
+distro==1.9.0
 dnspython==2.6.1
 flake8==7.0.0
 fuzzywuzzy==0.18.0
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.0
 idna==3.6
-imbalanced-learn==0.12.0
+imbalanced-learn==0.12.2
 imblearn==0.0
 isort==5.13.2
-joblib==1.3.2
+joblib==1.4.0
 loguru==0.7.2
+markdown-it-py==3.0.0
 mccabe==0.7.0
+mdurl==0.1.2
 mypy==1.9.0
 mypy-extensions==1.0.0
 numpy==1.26.4
+openai==1.17.0
 outcome==1.3.0.post0
 packaging==24.0
 pandas==2.2.1
 pathspec==0.12.1
 platformdirs==4.2.0
 pycodestyle==2.11.1
 pydantic==2.6.4
 pydantic_core==2.16.3
 pyflakes==3.2.0
+Pygments==2.17.2
 pylint==3.1.0
-pymongo==4.6.2
+pymongo==4.6.3
 PySocks==1.7.1
 python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
+python-http-client==3.3.7
 pytz==2024.1
 requests==2.31.0
-scikit-learn==1.4.1.post1
-scipy==1.12.0
-selenium==4.18.1
+rich==13.7.1
+scikit-learn==1.4.2
+scipy==1.13.0
+selenium==4.19.0
+sendgrid==6.11.0
+shellingham==1.5.4
 six==1.16.0
+sniffio==1.3.1
 sortedcontainers==2.4.0
 soupsieve==2.5
-threadpoolctl==3.3.0
+starkbank-ecdsa==2.2.0
+threadpoolctl==3.4.0
 tomlkit==0.12.4
-trio==0.24.0
+tqdm==4.66.2
+trio==0.25.0
 trio-websocket==0.11.1
-typer==0.9.0
+typer==0.12.3
 types-beautifulsoup4==4.12.0.20240229
 types-html5lib==1.1.11.20240228
-typing_extensions==4.10.0
+typing_extensions==4.11.0
 tzdata==2024.1
+urllib3==2.2.1
 wsproto==1.2.0
-openai==1.14.2
-urllib3==1.26.18
69 changes: 44 additions & 25 deletions backend/src/cleaners/cleaner.py
@@ -1,3 +1,4 @@
+import os
 from datetime import datetime
 
 from ..utilities import database as db
@@ -8,7 +9,7 @@
 
 logger = logger.SmareLogger()
 
-CONSECUTIVE_ERROR_LIMIT = 20
+CONSECUTIVE_ERROR_LIMIT = int(os.environ.get("CLEAN_CONS_ERR_LIMIT", 20))
 
 
 class MakeModelException(Exception):
@@ -21,37 +22,41 @@ def clean(car):
         clean_car = {}
 
         if car["source"] == "facebook":
-            clean_car["year"] = fb.extract_year(car["title"])
-            attributes = clean_car["attributes"] = fb.extract_attributes(
-                car["attributes"]
-            )
-            make = clean_car["make"] = utils.extract_make(car["title"])
-            model = clean_car["model"] = fb.extract_model(
-                car["title"], clean_car["make"]
-            )
+            if "attributes" in car:
+                clean_car["attributes"] = fb.extract_attributes(car["attributes"])
+            clean_car["year"] = utils.extract_year(car["title"])
+            clean_car["make"] = utils.extract_make(car["title"])
+            clean_car["model"] = fb.extract_model(car["title"], clean_car["make"])
         elif car["source"] == "craigslist":
-            attributes = clean_car["attributes"] = cl.extract_attributes(
-                car["attributes"]
-            )
+            if "attributes" in car:
+                clean_car["attributes"] = cl.extract_attributes(car["attributes"])
 
-            make = clean_car["make"] = utils.extract_make(car["makemodel"])
-            model = clean_car["model"] = cl.extract_model(
-                car["makemodel"], clean_car["make"]
-            )
+            if "makemodel" in car:
+                clean_car["make"] = utils.extract_make(car["makemodel"])
+                clean_car["model"] = cl.extract_model(car["makemodel"], clean_car["make"])
+            else:
+                clean_car["make"] = utils.extract_make(car["title"])
+                clean_car["model"] = cl.extract_model(car["title"], clean_car["make"])
+
+            clean_car.update(cl.str_to_num(car))
 
-        if not attributes:
-            raise MakeModelException("Failed cleaning attributes")
+        if "attributes" not in clean_car and "attributes" not in car:
+            logger.warning(f"Attributes not found in car {car['_id']}")
 
-        if not make:
+        if not clean_car["make"]:
+            logger.debug(f"car: {car['makemodel']}")
             raise MakeModelException("Failed cleaning make")
 
-        if not model:
-            logger.debug(f"car {make}: {car['makemodel']}")
+        if not clean_car["model"]:
+            logger.debug(f"car {clean_car['make']}: {car['makemodel']}")
             raise MakeModelException("Failed cleaning model")
 
         clean_car["price"] = utils.clean_currency(car["price"])
-        clean_car["odometer"] = clean_car["attributes"]["odometer"]
 
+        if "attributes" in clean_car and "odometer" in clean_car["attributes"] and isinstance(clean_car["attributes"]["odometer"], int):
+            clean_car["odometer"] = clean_car["attributes"]["odometer"]
+        else:
+            clean_car["odometer"] = utils.clean_odometer(car["odometer"])
 
         return clean_car
     except MakeModelException as error:
@@ -62,6 +67,20 @@ def clean(car):
         return None
 
 
+def check(car):
+    required_checks = [
+        "price" in car and car["price"] > 0,
+        "odometer" in car and car["odometer"] > 0,
+        "year" in car and car["year"] >= 2000,
+        "make" in car,
+        "model" in car,
+    ]
+
+    logger.debug(f"Checks: {required_checks}")
+
+    return False not in required_checks
+
+
 def run(termination_timestamp, version):
     cars = db.find_cars_in_stage("scrape")
 
@@ -75,10 +94,10 @@ def run(termination_timestamp, version):
     for car in cars:
         try:
             clean_fields = clean(db.decode(car))
-            if clean_fields:
+            if clean_fields and check(clean_fields):
                 clean_fields["stage"] = "clean"
                 clean_fields["cleaner_version"] = version
-                # Initializing additional model fields and risk_score
+
                 clean_fields["model_scores"] = {}
                 clean_fields["model_versions"] = {}
                 for i in range(1, 8):
@@ -112,7 +131,7 @@ def run(termination_timestamp, version):
                 break
 
     logger.info(
-        f"Cleaning summary: {total_errs} errors, {total_cleaned} cleaned, "
+        f"Cleaning summary: {total_errs} errors, {total_cleaned} cleaned"
        f"{len(cars) - total_cleaned} unreached (due to errors or incomplete processing)."
     )
 
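
The new check() gate keeps a listing out of the "clean" stage unless every required field survives cleaning. A standalone illustration of the same predicate (logger omitted); the sample listings are hypothetical:

def check(car):
    required_checks = [
        "price" in car and car["price"] > 0,
        "odometer" in car and car["odometer"] > 0,
        "year" in car and car["year"] >= 2000,
        "make" in car,
        "model" in car,
    ]
    return False not in required_checks

assert check({"price": 8500, "odometer": 72000, "year": 2014, "make": "honda", "model": "civic"})
assert not check({"price": 8500, "year": 1999})  # missing odometer, year below the 2000 cutoff
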
