From fbd0dabf4c8f77affdd8411a17f9d2558757419d Mon Sep 17 00:00:00 2001 From: RusticPotato Date: Sat, 24 Feb 2024 12:59:38 -0500 Subject: [PATCH 01/39] schema updates --- .../docker-entrypoint-initdb.d/01_tables.sql | 150 ++++++++++++++++- src/app/schemas/highscores.py | 104 ++++++++++++ src/database/models/highscores.py | 116 ++++++++++++- src/main.py | 158 +++++++++++++++--- 4 files changed, 500 insertions(+), 28 deletions(-) diff --git a/mysql/docker-entrypoint-initdb.d/01_tables.sql b/mysql/docker-entrypoint-initdb.d/01_tables.sql index e671895..0ec099e 100644 --- a/mysql/docker-entrypoint-initdb.d/01_tables.sql +++ b/mysql/docker-entrypoint-initdb.d/01_tables.sql @@ -123,4 +123,152 @@ CREATE TABLE `playerHiscoreData` ( UNIQUE KEY `Unique_player_date` (`Player_id`,`ts_date`), CONSTRAINT `FK_Players_id` FOREIGN KEY (`Player_id`) REFERENCES `Players` (`id`) ON DELETE RESTRICT ON UPDATE RESTRICT ); -CREATE TRIGGER `hiscore_date_OnInsert` BEFORE INSERT ON `playerHiscoreData` FOR EACH ROW SET new.ts_date = DATE(new.timestamp); +CREATE TRIGGER `hiscore_date_OnInsert` BEFORE INSERT ON `playerHiscoreData` FOR EACH ROW SET new.ts_date = DATE(new.timestamp); + +start transaction; + +drop table if exists skills; +drop table if exists activities; +drop table if exists player_skills; +drop table if exists player_activities; +drop table if exists scraper_data; + +# done +CREATE TABLE scraper_data ( + scraper_id BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY, + created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + player_id SMALLINT UNSIGNED NOT NULL, + record_date DATE AS (DATE(created_at)) STORED, + UNIQUE KEY unique_player_per_day (player_id, record_date) +); + +CREATE TABLE skills ( + skill_id TINYINT UNSIGNED AUTO_INCREMENT PRIMARY KEY, # < 255 + skill_name VARCHAR(50) NOT NULL, + UNIQUE KEY unique_skill_name (skill_name) +); +INSERT INTO skills (skill_name) VALUES + ('total'), + ('attack'), + ('defence'), + ('strength'), + ('hitpoints'), + ('ranged'), + ('prayer'), + ('magic'), + ('cooking'), + ('woodcutting'), + ('fletching'), + ('fishing'), + ('firemaking'), + ('crafting'), + ('smithing'), + ('mining'), + ('herblore'), + ('agility'), + ('thieving'), + ('slayer'), + ('farming'), + ('runecraft'), + ('hunter'), + ('construction') +; + +CREATE TABLE activities ( + activity_id TINYINT UNSIGNED AUTO_INCREMENT PRIMARY KEY, # < 255 + activity_name VARCHAR(50) NOT NULL, + UNIQUE KEY unique_activity_name (activity_name) +); + +INSERT INTO activities (activity_name) VALUES + ('abyssal_sire'), + ('alchemical_hydra'), + ('artio'), + ('barrows_chests'), + ('bounty_hunter_hunter'), + ('bounty_hunter_rogue'), + ('bryophyta'), + ('callisto'), + ('calvarion'), + ('cerberus'), + ('chambers_of_xeric'), + ('chambers_of_xeric_challenge_mode'), + ('chaos_elemental'), + ('chaos_fanatic'), + ('commander_zilyana'), + ('corporeal_beast'), + ('crazy_archaeologist'), + ('cs_all'), + ('cs_beginner'), + ('cs_easy'), + ('cs_elite'), + ('cs_hard'), + ('cs_master'), + ('cs_medium'), + ('dagannoth_prime'), + ('dagannoth_rex'), + ('dagannoth_supreme'), + ('deranged_archaeologist'), + ('duke_sucellus'), + ('general_graardor'), + ('giant_mole'), + ('grotesque_guardians'), + ('hespori'), + ('kalphite_queen'), + ('king_black_dragon'), + ('kraken'), + ('kreearra'), + ('kril_tsutsaroth'), + ('league'), + ('lms_rank'), + ('mimic'), + ('nex'), + ('nightmare'), + ('obor'), + ('phantom_muspah'), + ('phosanis_nightmare'), + ('rifts_closed'), + ('sarachnis'), + ('scorpia'), + ('skotizo'), + ('soul_wars_zeal'), + ('spindel'), + 
('tempoross'), + ('the_corrupted_gauntlet'), + ('the_gauntlet'), + ('the_leviathan'), + ('the_whisperer'), + ('theatre_of_blood'), + ('theatre_of_blood_hard'), + ('thermonuclear_smoke_devil'), + ('tombs_of_amascut'), + ('tombs_of_amascut_expert'), + ('tzkal_zuk'), + ('tztok_jad'), + ('vardorvis'), + ('venenatis'), + ('vetion'), + ('vorkath'), + ('wintertodt'), + ('zalcano'), + ('zulrah') +; + +CREATE TABLE player_skills ( + scraper_id BIGINT UNSIGNED NOT NULL, + skill_id TINYINT UNSIGNED NOT NULL, + skill_value INT UNSIGNED NOT NULL DEFAULT 0, # < 200 000 000 + FOREIGN KEY (scraper_id) REFERENCES scraper_data(scraper_id) ON DELETE CASCADE, + FOREIGN KEY (skill_id) REFERENCES skills(skill_id) ON DELETE CASCADE, + PRIMARY KEY (scraper_id, skill_id) +); + +CREATE TABLE player_activities ( + scraper_id BIGINT UNSIGNED NOT NULL, + activity_id TINYINT UNSIGNED NOT NULL, + activity_value INT UNSIGNED NOT NULL DEFAULT 0, # some guy could get over 65k kc + FOREIGN KEY (scraper_id) REFERENCES scraper_data(scraper_id) ON DELETE CASCADE, + FOREIGN KEY (activity_id) REFERENCES activities(activity_id) ON DELETE CASCADE, + PRIMARY KEY (scraper_id, activity_id) +); +commit; \ No newline at end of file diff --git a/src/app/schemas/highscores.py b/src/app/schemas/highscores.py index 00fd30d..23bfa43 100644 --- a/src/app/schemas/highscores.py +++ b/src/app/schemas/highscores.py @@ -1,6 +1,7 @@ from datetime import datetime from pydantic import BaseModel, ConfigDict +from typing import Optional class playerHiscoreData(BaseModel): @@ -105,3 +106,106 @@ class playerHiscoreData(BaseModel): the_leviathan: int = 0 the_whisperer: int = 0 vardorvis: int = 0 + + +class ScraperDataBase(BaseModel): + player_id: int + + +class ScraperDataCreate(ScraperDataBase): + pass + + +class ScraperData(ScraperDataBase): + scraper_id: int + created_at: Optional[str] = None + record_date: Optional[str] = None + + class Config: + orm_mode = True + + +class SkillBase(BaseModel): + skill_name: str + + +class SkillCreate(SkillBase): + pass + + +class Skill(SkillBase): + skill_id: int + + class Config: + orm_mode = True + + +class ActivityBase(BaseModel): + activity_name: str + + +class ActivityCreate(ActivityBase): + pass + + +class Activity(ActivityBase): + activity_id: int + + class Config: + orm_mode = True + + +class PlayerSkillBase(BaseModel): + scraper_id: int + skill_id: int + skill_value: int + + +class PlayerSkillCreate(PlayerSkillBase): + pass + + +class PlayerSkill(PlayerSkillBase): + class Config: + orm_mode = True + + +class PlayerActivityBase(BaseModel): + scraper_id: int + activity_id: int + activity_value: int + + +class PlayerActivityCreate(PlayerActivityBase): + pass + + +class PlayerActivity(PlayerActivityBase): + class Config: + orm_mode = True + + +class PlayerBase(BaseModel): + name: str + possible_ban: Optional[bool] = None + confirmed_ban: Optional[bool] = None + confirmed_player: Optional[bool] = None + label_id: Optional[int] = None + label_jagex: Optional[int] = None + ironman: Optional[bool] = None + hardcore_ironman: Optional[bool] = None + ultimate_ironman: Optional[bool] = None + normalized_name: str + + +class PlayerCreate(PlayerBase): + pass + + +class Player(PlayerBase): + id: int + created_at: Optional[str] = None + updated_at: Optional[str] = None + + class Config: + orm_mode = True diff --git a/src/database/models/highscores.py b/src/database/models/highscores.py index ab5ff9b..1dab5a2 100644 --- a/src/database/models/highscores.py +++ b/src/database/models/highscores.py @@ -1,6 +1,18 @@ -from 
sqlalchemy import BigInteger, Column, Date, DateTime, Integer, func +from sqlalchemy import ( + BigInteger, + SmallInteger, + Column, + Date, + DateTime, + Integer, + func, + ForeignKey, +) +from sqlalchemy.orm import relationship +from sqlalchemy.schema import UniqueConstraint from database.database import Base +from sqlalchemy import String class PlayerHiscoreData(Base): @@ -105,3 +117,105 @@ class PlayerHiscoreData(Base): the_leviathan = Column(Integer, default=0) the_whisperer = Column(Integer, default=0) vardorvis = Column(Integer, default=0) + + +# CREATE TABLE scraper_data ( +# scraper_id BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY, +# created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, +# player_id SMALLINT UNSIGNED NOT NULL, +# record_date DATE AS (DATE(created_at)) STORED, +# UNIQUE KEY unique_player_per_day (player_id, record_date) +# ); +class ScraperData(Base): + __tablename__ = "scraper_data" + + scraper_id = Column(BigInteger, primary_key=True, autoincrement=True) + created_at = Column(DateTime, nullable=False, server_default=func.now()) + player_id = Column(SmallInteger, nullable=False) + record_date = Column(Date, nullable=True, server_onupdate=func.current_date()) + + __table_args__ = ( + UniqueConstraint("player_id", "record_date", name="unique_player_per_day"), + ) + + +# CREATE TABLE skills ( +# skill_id TINYINT UNSIGNED AUTO_INCREMENT PRIMARY KEY, # < 255 +# skill_name VARCHAR(50) NOT NULL, +# UNIQUE KEY unique_skill_name (skill_name) +# ); +class Skills(Base): + __tablename__ = "skills" + + skill_id = Column(SmallInteger, primary_key=True, autoincrement=True) + skill_name = Column(String(50), nullable=False) + + __table_args__ = (UniqueConstraint("skill_name", name="unique_skill_name"),) + + +# done +# CREATE TABLE activities ( +# activity_id TINYINT UNSIGNED AUTO_INCREMENT PRIMARY KEY, # < 255 +# activity_name VARCHAR(50) NOT NULL, +# UNIQUE KEY unique_activity_name (activity_name) +# ); +class Activities(Base): + __tablename__ = "activities" + + activity_id = Column(SmallInteger, primary_key=True, autoincrement=True) + activity_name = Column(String(50), nullable=False) + + __table_args__ = (UniqueConstraint("activity_name", name="unique_activity_name"),) + + +# CREATE TABLE player_skills ( +# scraper_id BIGINT UNSIGNED NOT NULL, +# skill_id TINYINT UNSIGNED NOT NULL, +# skill_value INT UNSIGNED NOT NULL DEFAULT 0, # < 200 000 000 +# FOREIGN KEY (scraper_id) REFERENCES scraper_data(scraper_id) ON DELETE CASCADE, +# FOREIGN KEY (skill_id) REFERENCES skills(skill_id) ON DELETE CASCADE, +# PRIMARY KEY (scraper_id, skill_id) +# ); +class PlayerSkills(Base): + __tablename__ = "player_skills" + + scraper_id = Column( + BigInteger, + ForeignKey("scraper_data.scraper_id", ondelete="CASCADE"), + primary_key=True, + ) + skill_id = Column( + SmallInteger, + ForeignKey("skills.skill_id", ondelete="CASCADE"), + primary_key=True, + ) + skill_value = Column(Integer, nullable=False, default=0) + + scraper_data = relationship("ScraperData", back_populates="player_skills") + skills = relationship("Skills", back_populates="player_skills") + + +# CREATE TABLE player_activities ( +# scraper_id BIGINT UNSIGNED NOT NULL, +# activity_id TINYINT UNSIGNED NOT NULL, +# activity_value INT UNSIGNED NOT NULL DEFAULT 0, # some guy could get over 65k kc +# FOREIGN KEY (scraper_id) REFERENCES scraper_data(scraper_id) ON DELETE CASCADE, +# FOREIGN KEY (activity_id) REFERENCES activities(activity_id) ON DELETE CASCADE, +# PRIMARY KEY (scraper_id, activity_id) +# ); + + +class 
PlayerActivities(Base): + __tablename__ = "player_activities" + + scraper_id = Column( + BigInteger, + ForeignKey("scraper_data.scraper_id", ondelete="CASCADE"), + primary_key=True, + ) + activity_id = Column( + SmallInteger, + ForeignKey("activities.activity_id", ondelete="CASCADE"), + primary_key=True, + ) + activity_value = Column(Integer, nullable=False, default=0) diff --git a/src/main.py b/src/main.py index 27c6247..33d0ede 100644 --- a/src/main.py +++ b/src/main.py @@ -6,10 +6,22 @@ from asyncio import Queue from aiokafka import AIOKafkaConsumer, AIOKafkaProducer -from app.schemas.highscores import playerHiscoreData as playerHiscoreDataSchema +from sqlalchemy import select +from app.schemas.highscores import ( + playerHiscoreData as playerHiscoreDataSchema, + PlayerActivityCreate, + ScraperDataCreate, + PlayerSkillCreate, + PlayerCreate, +) from core.config import settings from database.database import get_session -from database.models.highscores import PlayerHiscoreData +from database.models.highscores import ( + # PlayerHiscoreData, + PlayerActivities, + PlayerSkills, + ScraperData, +) from database.models.player import Player from sqlalchemy import insert, update from sqlalchemy.exc import IntegrityError, OperationalError @@ -18,6 +30,12 @@ logger = logging.getLogger(__name__) +# Global variables to cache the skill and activity names +SKILL_NAMES = None +ACTIVITY_NAMES = None +# Global lock for updating the cache +CACHE_UPDATE_LOCK = asyncio.Lock() + async def kafka_consumer(topic: str, group: str): consumer = AIOKafkaConsumer( @@ -61,6 +79,7 @@ async def send_messages(topic: str, producer: AIOKafkaProducer, send_queue: Queu await producer.send(topic, value=message) send_queue.task_done() + def log_speed( counter: int, start_time: float, _queue: Queue, topic: str, interval: int = 15 ) -> tuple[float, int]: @@ -85,30 +104,68 @@ def log_speed( # Return the current time and reset the counter to zero return time.time(), 0 -async def insert_data(batch: list[dict], error_queue:Queue): - try: - highscores:list[dict] = [msg.get("hiscores") for msg in batch] - players:list[dict] = [msg.get("player") for msg in batch] - highscores = [playerHiscoreDataSchema(**hs) for hs in highscores if hs] - highscores = [hs.model_dump(mode="json") for hs in highscores ] +async def insert_data(batch: list[dict], error_queue: Queue): + session: AsyncSession = await get_session() - session: AsyncSession = await get_session() - - logger.info(f"Received: {len(players)=}, {len(highscores)=}") + try: + # Transform the old data format into the new format + batch = [await transform_data(msg, session) for msg in batch] + + scraper_data_list: list[dict] = [msg.get("scraper_data") for msg in batch] + player_skills_list: list[dict] = [msg.get("player_skills") for msg in batch] + player_activities_list: list[dict] = [ + msg.get("player_activities") for msg in batch + ] + players: list[dict] = [msg.get("player") for msg in batch] + + scraper_data_list = [ScraperDataCreate(**sd) for sd in scraper_data_list if sd] + player_skills_list = [ + PlayerSkillCreate(**ps) for ps in player_skills_list if ps + ] + player_activities_list = [ + PlayerActivityCreate(**pa) for pa in player_activities_list if pa + ] + players = [PlayerCreate(**p) for p in players if p] + + logger.info( + f"Received: players={len(players)}, scraper_data={len(scraper_data_list)}, skills={len(player_skills_list)}, activities={len(player_activities_list)}" + ) # start a transaction async with session.begin(): - # insert into table values () - 
insert_sql:Insert = insert(PlayerHiscoreData) - insert_sql = insert_sql.values(highscores) - insert_sql = insert_sql.prefix_with("ignore") - await session.execute(insert_sql) - # update table + # insert into scraper_data table + for scraper_data in scraper_data_list: + insert_scraper_data = ( + insert(ScraperData) + .values(scraper_data.dict()) + .prefix_with("ignore") + ) + await session.execute(insert_scraper_data) + + # insert into player_skills table + for player_skill in player_skills_list: + insert_player_skill = ( + insert(PlayerSkills) + .values(player_skill.dict()) + .prefix_with("ignore") + ) + await session.execute(insert_player_skill) + + # insert into player_activities table + for player_activity in player_activities_list: + insert_player_activity = ( + insert(PlayerActivities) + .values(player_activity.dict()) + .prefix_with("ignore") + ) + await session.execute(insert_player_activity) + + # update Player table for player in players: - update_sql:Update = update(Player) - update_sql = update_sql.where(Player.id == player.get("id")) - update_sql = update_sql.values(player) + update_sql: Update = update(Player) + update_sql = update_sql.where(Player.id == player.id) + update_sql = update_sql.values(player.dict()) await session.execute(update_sql) except (OperationalError, IntegrityError) as e: for message in batch: @@ -124,6 +181,54 @@ async def insert_data(batch: list[dict], error_queue:Queue): logger.debug(f"Traceback: \n{traceback.format_exc()}") logger.info(f"error_qsize={error_queue.qsize()}, {message=}") + +async def transform_data(old_data: dict, session: AsyncSession) -> dict: + global SKILL_NAMES, ACTIVITY_NAMES + + # Fetch the skill and activity names from the database if they're not already cached + async with CACHE_UPDATE_LOCK: + if SKILL_NAMES is None: + skill_names = await session.execute(select(PlayerSkills.skill_name)) + SKILL_NAMES = [result[0] for result in skill_names.scalars().all()] + if ACTIVITY_NAMES is None: + activity_names = await session.execute( + select(PlayerActivities.activity_name) + ) + ACTIVITY_NAMES = [result[0] for result in activity_names.scalars().all()] + + # Transform the old data format into the new format + new_data = { + "scraper_data": { + "scraper_id": old_data.get("id"), + "created_at": old_data.get("timestamp"), + "player_id": old_data.get("Player_id"), + "record_date": old_data.get("ts_date"), + }, + "player_skills": [ + {"skill_id": i, "skill_value": old_data.get(skill)} + for i, skill in enumerate(SKILL_NAMES) + ], + "player_activities": [ + {"activity_id": i, "activity_value": old_data.get(activity)} + for i, activity in enumerate(ACTIVITY_NAMES) + ], + "player": { + "id": old_data.get("Player_id"), + }, + } + return new_data + + +async def update_cache(session: AsyncSession): + global SKILL_NAMES, ACTIVITY_NAMES + + # Fetch the skill and activity names from the database + skill_names = await session.execute(select(PlayerSkills.skill_name)) + SKILL_NAMES = [result[0] for result in skill_names.scalars().all()] + activity_names = await session.execute(select(PlayerActivities.activity_name)) + ACTIVITY_NAMES = [result[0] for result in activity_names.scalars().all()] + + async def process_data(receive_queue: Queue, error_queue: Queue): # Initialize counter and start time counter = 0 @@ -140,7 +245,7 @@ async def process_data(receive_queue: Queue, error_queue: Queue): start_time=start_time, _queue=receive_queue, topic="scraper", - interval=15 + interval=15, ) # Check if queue is empty @@ -150,8 +255,8 @@ async def 
process_data(receive_queue: Queue, error_queue: Queue): # Get a message from the chosen queue message: dict = await receive_queue.get() - - #TODO fix test data + + # TODO fix test data if settings.ENV != "PRD": player = message.get("player") player_id = player.get("id") @@ -159,21 +264,22 @@ async def process_data(receive_queue: Queue, error_queue: Queue): MAX_PLAYER_ID = 300 if not (MIN_PLAYER_ID < player_id <= MAX_PLAYER_ID): continue - + # batch message batch.append(message) now = time.time() # insert data in batches of N or interval of N - if len(batch) > 100 or now-start_time > 15: + if len(batch) > 100 or now - start_time > 15: async with semaphore: await insert_data(batch=batch, error_queue=error_queue) batch = [] - + receive_queue.task_done() counter += 1 + async def main(): # get kafka engine consumer = await kafka_consumer(topic="scraper", group="highscore-worker") From a12998e5e81892fb204fc3818c1704ee22413dbd Mon Sep 17 00:00:00 2001 From: RusticPotato Date: Sat, 24 Feb 2024 13:03:49 -0500 Subject: [PATCH 02/39] cleanup and moving add out to data --- .../docker-entrypoint-initdb.d/01_tables.sql | 108 ------------------ 1 file changed, 108 deletions(-) diff --git a/mysql/docker-entrypoint-initdb.d/01_tables.sql b/mysql/docker-entrypoint-initdb.d/01_tables.sql index 0ec099e..5369118 100644 --- a/mysql/docker-entrypoint-initdb.d/01_tables.sql +++ b/mysql/docker-entrypoint-initdb.d/01_tables.sql @@ -125,14 +125,6 @@ CREATE TABLE `playerHiscoreData` ( ); CREATE TRIGGER `hiscore_date_OnInsert` BEFORE INSERT ON `playerHiscoreData` FOR EACH ROW SET new.ts_date = DATE(new.timestamp); -start transaction; - -drop table if exists skills; -drop table if exists activities; -drop table if exists player_skills; -drop table if exists player_activities; -drop table if exists scraper_data; - # done CREATE TABLE scraper_data ( scraper_id BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY, @@ -147,112 +139,12 @@ CREATE TABLE skills ( skill_name VARCHAR(50) NOT NULL, UNIQUE KEY unique_skill_name (skill_name) ); -INSERT INTO skills (skill_name) VALUES - ('total'), - ('attack'), - ('defence'), - ('strength'), - ('hitpoints'), - ('ranged'), - ('prayer'), - ('magic'), - ('cooking'), - ('woodcutting'), - ('fletching'), - ('fishing'), - ('firemaking'), - ('crafting'), - ('smithing'), - ('mining'), - ('herblore'), - ('agility'), - ('thieving'), - ('slayer'), - ('farming'), - ('runecraft'), - ('hunter'), - ('construction') -; - CREATE TABLE activities ( activity_id TINYINT UNSIGNED AUTO_INCREMENT PRIMARY KEY, # < 255 activity_name VARCHAR(50) NOT NULL, UNIQUE KEY unique_activity_name (activity_name) ); -INSERT INTO activities (activity_name) VALUES - ('abyssal_sire'), - ('alchemical_hydra'), - ('artio'), - ('barrows_chests'), - ('bounty_hunter_hunter'), - ('bounty_hunter_rogue'), - ('bryophyta'), - ('callisto'), - ('calvarion'), - ('cerberus'), - ('chambers_of_xeric'), - ('chambers_of_xeric_challenge_mode'), - ('chaos_elemental'), - ('chaos_fanatic'), - ('commander_zilyana'), - ('corporeal_beast'), - ('crazy_archaeologist'), - ('cs_all'), - ('cs_beginner'), - ('cs_easy'), - ('cs_elite'), - ('cs_hard'), - ('cs_master'), - ('cs_medium'), - ('dagannoth_prime'), - ('dagannoth_rex'), - ('dagannoth_supreme'), - ('deranged_archaeologist'), - ('duke_sucellus'), - ('general_graardor'), - ('giant_mole'), - ('grotesque_guardians'), - ('hespori'), - ('kalphite_queen'), - ('king_black_dragon'), - ('kraken'), - ('kreearra'), - ('kril_tsutsaroth'), - ('league'), - ('lms_rank'), - ('mimic'), - ('nex'), - ('nightmare'), - ('obor'), 
- ('phantom_muspah'), - ('phosanis_nightmare'), - ('rifts_closed'), - ('sarachnis'), - ('scorpia'), - ('skotizo'), - ('soul_wars_zeal'), - ('spindel'), - ('tempoross'), - ('the_corrupted_gauntlet'), - ('the_gauntlet'), - ('the_leviathan'), - ('the_whisperer'), - ('theatre_of_blood'), - ('theatre_of_blood_hard'), - ('thermonuclear_smoke_devil'), - ('tombs_of_amascut'), - ('tombs_of_amascut_expert'), - ('tzkal_zuk'), - ('tztok_jad'), - ('vardorvis'), - ('venenatis'), - ('vetion'), - ('vorkath'), - ('wintertodt'), - ('zalcano'), - ('zulrah') -; CREATE TABLE player_skills ( scraper_id BIGINT UNSIGNED NOT NULL, From 49afe591c28c9c385a5b090b51817a6ce0c94bb3 Mon Sep 17 00:00:00 2001 From: RusticPotato Date: Sat, 24 Feb 2024 13:04:01 -0500 Subject: [PATCH 03/39] insert into activities and skills --- mysql/docker-entrypoint-initdb.d/02_data.sql | 102 +++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/mysql/docker-entrypoint-initdb.d/02_data.sql b/mysql/docker-entrypoint-initdb.d/02_data.sql index 475ba27..69af2b2 100644 --- a/mysql/docker-entrypoint-initdb.d/02_data.sql +++ b/mysql/docker-entrypoint-initdb.d/02_data.sql @@ -53,3 +53,105 @@ SET name = CONCAT('player', id), normalized_name = CONCAT('player', id) ; + +INSERT INTO skills (skill_name) VALUES + ('total'), + ('attack'), + ('defence'), + ('strength'), + ('hitpoints'), + ('ranged'), + ('prayer'), + ('magic'), + ('cooking'), + ('woodcutting'), + ('fletching'), + ('fishing'), + ('firemaking'), + ('crafting'), + ('smithing'), + ('mining'), + ('herblore'), + ('agility'), + ('thieving'), + ('slayer'), + ('farming'), + ('runecraft'), + ('hunter'), + ('construction') +; + + +INSERT INTO activities (activity_name) VALUES + ('abyssal_sire'), + ('alchemical_hydra'), + ('artio'), + ('barrows_chests'), + ('bounty_hunter_hunter'), + ('bounty_hunter_rogue'), + ('bryophyta'), + ('callisto'), + ('calvarion'), + ('cerberus'), + ('chambers_of_xeric'), + ('chambers_of_xeric_challenge_mode'), + ('chaos_elemental'), + ('chaos_fanatic'), + ('commander_zilyana'), + ('corporeal_beast'), + ('crazy_archaeologist'), + ('cs_all'), + ('cs_beginner'), + ('cs_easy'), + ('cs_elite'), + ('cs_hard'), + ('cs_master'), + ('cs_medium'), + ('dagannoth_prime'), + ('dagannoth_rex'), + ('dagannoth_supreme'), + ('deranged_archaeologist'), + ('duke_sucellus'), + ('general_graardor'), + ('giant_mole'), + ('grotesque_guardians'), + ('hespori'), + ('kalphite_queen'), + ('king_black_dragon'), + ('kraken'), + ('kreearra'), + ('kril_tsutsaroth'), + ('league'), + ('lms_rank'), + ('mimic'), + ('nex'), + ('nightmare'), + ('obor'), + ('phantom_muspah'), + ('phosanis_nightmare'), + ('rifts_closed'), + ('sarachnis'), + ('scorpia'), + ('skotizo'), + ('soul_wars_zeal'), + ('spindel'), + ('tempoross'), + ('the_corrupted_gauntlet'), + ('the_gauntlet'), + ('the_leviathan'), + ('the_whisperer'), + ('theatre_of_blood'), + ('theatre_of_blood_hard'), + ('thermonuclear_smoke_devil'), + ('tombs_of_amascut'), + ('tombs_of_amascut_expert'), + ('tzkal_zuk'), + ('tztok_jad'), + ('vardorvis'), + ('venenatis'), + ('vetion'), + ('vorkath'), + ('wintertodt'), + ('zalcano'), + ('zulrah') +; \ No newline at end of file From 0ce7dece956b0785d721e722c0d28a1ba2f6278c Mon Sep 17 00:00:00 2001 From: RusticPotato Date: Sat, 24 Feb 2024 13:07:27 -0500 Subject: [PATCH 04/39] move done --- mysql/docker-entrypoint-initdb.d/01_tables.sql | 1 - 1 file changed, 1 deletion(-) diff --git a/mysql/docker-entrypoint-initdb.d/01_tables.sql b/mysql/docker-entrypoint-initdb.d/01_tables.sql index 5369118..9277220 100644 
--- a/mysql/docker-entrypoint-initdb.d/01_tables.sql +++ b/mysql/docker-entrypoint-initdb.d/01_tables.sql @@ -125,7 +125,6 @@ CREATE TABLE `playerHiscoreData` ( ); CREATE TRIGGER `hiscore_date_OnInsert` BEFORE INSERT ON `playerHiscoreData` FOR EACH ROW SET new.ts_date = DATE(new.timestamp); -# done CREATE TABLE scraper_data ( scraper_id BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY, created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, From d0210b8c084cb6414dec87b440efc184a6cceb0d Mon Sep 17 00:00:00 2001 From: RusticPotato Date: Sat, 24 Feb 2024 13:52:11 -0500 Subject: [PATCH 05/39] bug fixes --- src/database/models/highscores.py | 6 ++++-- src/main.py | 8 +++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/database/models/highscores.py b/src/database/models/highscores.py index 1dab5a2..294f421 100644 --- a/src/database/models/highscores.py +++ b/src/database/models/highscores.py @@ -133,7 +133,7 @@ class ScraperData(Base): created_at = Column(DateTime, nullable=False, server_default=func.now()) player_id = Column(SmallInteger, nullable=False) record_date = Column(Date, nullable=True, server_onupdate=func.current_date()) - + player_skills = relationship("PlayerSkills", back_populates="scraper_data") __table_args__ = ( UniqueConstraint("player_id", "record_date", name="unique_player_per_day"), ) @@ -150,6 +150,8 @@ class Skills(Base): skill_id = Column(SmallInteger, primary_key=True, autoincrement=True) skill_name = Column(String(50), nullable=False) + player_skills = relationship("PlayerSkills", back_populates="skill") + __table_args__ = (UniqueConstraint("skill_name", name="unique_skill_name"),) @@ -192,7 +194,7 @@ class PlayerSkills(Base): skill_value = Column(Integer, nullable=False, default=0) scraper_data = relationship("ScraperData", back_populates="player_skills") - skills = relationship("Skills", back_populates="player_skills") + skill = relationship("Skills", back_populates="player_skills") # CREATE TABLE player_activities ( diff --git a/src/main.py b/src/main.py index 33d0ede..d929e1d 100644 --- a/src/main.py +++ b/src/main.py @@ -188,12 +188,14 @@ async def transform_data(old_data: dict, session: AsyncSession) -> dict: # Fetch the skill and activity names from the database if they're not already cached async with CACHE_UPDATE_LOCK: if SKILL_NAMES is None: - skill_names = await session.execute(select(PlayerSkills.skill_name)) + skill_names = await session.execute( + select(PlayerSkills.skill_value) + ) # Update this line SKILL_NAMES = [result[0] for result in skill_names.scalars().all()] if ACTIVITY_NAMES is None: activity_names = await session.execute( - select(PlayerActivities.activity_name) - ) + select(PlayerActivities.activity_value) + ) # And this line ACTIVITY_NAMES = [result[0] for result in activity_names.scalars().all()] # Transform the old data format into the new format From 1699d42e94d1a158679b71e24dd920a0820cf6ae Mon Sep 17 00:00:00 2001 From: RusticPotato Date: Sat, 24 Feb 2024 13:55:40 -0500 Subject: [PATCH 06/39] add debug logic --- src/main.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/main.py b/src/main.py index d929e1d..8e9ad8f 100644 --- a/src/main.py +++ b/src/main.py @@ -182,20 +182,25 @@ async def insert_data(batch: list[dict], error_queue: Queue): logger.info(f"error_qsize={error_queue.qsize()}, {message=}") +import logging + +logger = logging.getLogger(__name__) + + async def transform_data(old_data: dict, session: AsyncSession) -> dict: global SKILL_NAMES, ACTIVITY_NAMES + 
logger.debug(f"Input data: {old_data}") + # Fetch the skill and activity names from the database if they're not already cached async with CACHE_UPDATE_LOCK: if SKILL_NAMES is None: - skill_names = await session.execute( - select(PlayerSkills.skill_value) - ) # Update this line + skill_names = await session.execute(select(PlayerSkills.skill_value)) SKILL_NAMES = [result[0] for result in skill_names.scalars().all()] if ACTIVITY_NAMES is None: activity_names = await session.execute( select(PlayerActivities.activity_value) - ) # And this line + ) ACTIVITY_NAMES = [result[0] for result in activity_names.scalars().all()] # Transform the old data format into the new format @@ -218,6 +223,9 @@ async def transform_data(old_data: dict, session: AsyncSession) -> dict: "id": old_data.get("Player_id"), }, } + + logger.debug(f"Transformed data: {new_data}") + return new_data From 168fd214e765d9bfda65bd1851f9675dfd07e3cb Mon Sep 17 00:00:00 2001 From: extreme4all <40169115+extreme4all@users.noreply.github.com> Date: Sat, 24 Feb 2024 20:29:39 +0100 Subject: [PATCH 07/39] parse message from kafka --- src/main.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/main.py b/src/main.py index 8e9ad8f..a9ac9c1 100644 --- a/src/main.py +++ b/src/main.py @@ -28,8 +28,16 @@ from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.sql.expression import Insert, Update +from pydantic import BaseModel + logger = logging.getLogger(__name__) + +class Message(BaseModel): + hiscore: dict + player: dict | None + + # Global variables to cache the skill and activity names SKILL_NAMES = None ACTIVITY_NAMES = None @@ -105,7 +113,7 @@ def log_speed( return time.time(), 0 -async def insert_data(batch: list[dict], error_queue: Queue): +async def insert_data(batch: list[Message], error_queue: Queue): session: AsyncSession = await get_session() try: @@ -182,11 +190,6 @@ async def insert_data(batch: list[dict], error_queue: Queue): logger.info(f"error_qsize={error_queue.qsize()}, {message=}") -import logging - -logger = logging.getLogger(__name__) - - async def transform_data(old_data: dict, session: AsyncSession) -> dict: global SKILL_NAMES, ACTIVITY_NAMES @@ -247,7 +250,7 @@ async def process_data(receive_queue: Queue, error_queue: Queue): # limit the number of async insert_data calls semaphore = asyncio.Semaphore(5) - batch = [] + batch: list[Message] = [] # Run indefinitely while True: start_time, counter = log_speed( @@ -265,10 +268,11 @@ async def process_data(receive_queue: Queue, error_queue: Queue): # Get a message from the chosen queue message: dict = await receive_queue.get() + message: Message = Message(**message) # TODO fix test data if settings.ENV != "PRD": - player = message.get("player") + player = message.player player_id = player.get("id") MIN_PLAYER_ID = 0 MAX_PLAYER_ID = 300 From 2c86002cf98e0bc74d1e166dfc66586fd7321201 Mon Sep 17 00:00:00 2001 From: RusticPotato Date: Sat, 24 Feb 2024 15:20:45 -0500 Subject: [PATCH 08/39] removed commit --- mysql/docker-entrypoint-initdb.d/01_tables.sql | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mysql/docker-entrypoint-initdb.d/01_tables.sql b/mysql/docker-entrypoint-initdb.d/01_tables.sql index 9277220..b8c2c9c 100644 --- a/mysql/docker-entrypoint-initdb.d/01_tables.sql +++ b/mysql/docker-entrypoint-initdb.d/01_tables.sql @@ -161,5 +161,4 @@ CREATE TABLE player_activities ( FOREIGN KEY (scraper_id) REFERENCES scraper_data(scraper_id) ON DELETE CASCADE, FOREIGN KEY (activity_id) REFERENCES 
activities(activity_id) ON DELETE CASCADE, PRIMARY KEY (scraper_id, activity_id) -); -commit; \ No newline at end of file +); \ No newline at end of file From 57bcfe6791d9138560445e30de1e009e8828dc7a Mon Sep 17 00:00:00 2001 From: RusticPotato Date: Sat, 24 Feb 2024 15:23:52 -0500 Subject: [PATCH 09/39] collapsed basemodel --- src/app/schemas/highscores.py | 58 ++++++----------------------------- 1 file changed, 9 insertions(+), 49 deletions(-) diff --git a/src/app/schemas/highscores.py b/src/app/schemas/highscores.py index 23bfa43..822c853 100644 --- a/src/app/schemas/highscores.py +++ b/src/app/schemas/highscores.py @@ -108,15 +108,8 @@ class playerHiscoreData(BaseModel): vardorvis: int = 0 -class ScraperDataBase(BaseModel): +class ScraperData(BaseModel): player_id: int - - -class ScraperDataCreate(ScraperDataBase): - pass - - -class ScraperData(ScraperDataBase): scraper_id: int created_at: Optional[str] = None record_date: Optional[str] = None @@ -125,67 +118,42 @@ class Config: orm_mode = True -class SkillBase(BaseModel): - skill_name: str - - -class SkillCreate(SkillBase): - pass - - -class Skill(SkillBase): +class Skill(BaseModel): skill_id: int + skill_name: str class Config: orm_mode = True -class ActivityBase(BaseModel): - activity_name: str - - -class ActivityCreate(ActivityBase): - pass - - -class Activity(ActivityBase): +class Activity(BaseModel): activity_id: int + activity_name: str class Config: orm_mode = True -class PlayerSkillBase(BaseModel): +class PlayerSkill(BaseModel): scraper_id: int skill_id: int skill_value: int - -class PlayerSkillCreate(PlayerSkillBase): - pass - - -class PlayerSkill(PlayerSkillBase): class Config: orm_mode = True -class PlayerActivityBase(BaseModel): +class PlayerActivity(BaseModel): scraper_id: int activity_id: int activity_value: int - -class PlayerActivityCreate(PlayerActivityBase): - pass - - -class PlayerActivity(PlayerActivityBase): class Config: orm_mode = True -class PlayerBase(BaseModel): +class Player(BaseModel): + id: int name: str possible_ban: Optional[bool] = None confirmed_ban: Optional[bool] = None @@ -196,14 +164,6 @@ class PlayerBase(BaseModel): hardcore_ironman: Optional[bool] = None ultimate_ironman: Optional[bool] = None normalized_name: str - - -class PlayerCreate(PlayerBase): - pass - - -class Player(PlayerBase): - id: int created_at: Optional[str] = None updated_at: Optional[str] = None From f7719eda629234accdabaff3463a29c5463f9b7d Mon Sep 17 00:00:00 2001 From: RusticPotato Date: Sat, 24 Feb 2024 15:28:02 -0500 Subject: [PATCH 10/39] update to use tinyinteger --- src/database/models/highscores.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/database/models/highscores.py b/src/database/models/highscores.py index 294f421..6d17478 100644 --- a/src/database/models/highscores.py +++ b/src/database/models/highscores.py @@ -1,6 +1,6 @@ from sqlalchemy import ( BigInteger, - SmallInteger, + TinyInteger, Column, Date, DateTime, @@ -147,7 +147,7 @@ class ScraperData(Base): class Skills(Base): __tablename__ = "skills" - skill_id = Column(SmallInteger, primary_key=True, autoincrement=True) + skill_id = Column(TinyInteger, primary_key=True, autoincrement=True) skill_name = Column(String(50), nullable=False) player_skills = relationship("PlayerSkills", back_populates="skill") @@ -164,7 +164,7 @@ class Skills(Base): class Activities(Base): __tablename__ = "activities" - activity_id = Column(SmallInteger, primary_key=True, autoincrement=True) + activity_id = Column(TinyInteger, primary_key=True, 
autoincrement=True) activity_name = Column(String(50), nullable=False) __table_args__ = (UniqueConstraint("activity_name", name="unique_activity_name"),) @@ -187,7 +187,7 @@ class PlayerSkills(Base): primary_key=True, ) skill_id = Column( - SmallInteger, + TinyInteger, ForeignKey("skills.skill_id", ondelete="CASCADE"), primary_key=True, ) @@ -216,7 +216,7 @@ class PlayerActivities(Base): primary_key=True, ) activity_id = Column( - SmallInteger, + TinyInteger, ForeignKey("activities.activity_id", ondelete="CASCADE"), primary_key=True, ) From 90004b8288326fdf4e8ce376b86a3c36016c903d Mon Sep 17 00:00:00 2001 From: RusticPotato Date: Sat, 24 Feb 2024 15:31:17 -0500 Subject: [PATCH 11/39] fixed small and tiny to match sql --- src/database/models/highscores.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/database/models/highscores.py b/src/database/models/highscores.py index 6d17478..d4434d0 100644 --- a/src/database/models/highscores.py +++ b/src/database/models/highscores.py @@ -1,6 +1,7 @@ from sqlalchemy import ( BigInteger, TinyInteger, + SmallInteger, Column, Date, DateTime, From 60a20e627e69a00d4f1e31025d99170a2872b5ac Mon Sep 17 00:00:00 2001 From: RusticPotato Date: Sat, 24 Feb 2024 21:26:05 -0500 Subject: [PATCH 12/39] checkpoint --- .vscode/launch.json | 17 +- .vscode/tasks.json | 22 ++ Dockerfile | 4 + Makefile | 16 ++ docker-compose.yml | 5 + src/app/schemas/highscores.py | 33 +-- src/database/models/highscores.py | 24 +- src/main.py | 415 +++++++++++++++++++++--------- 8 files changed, 386 insertions(+), 150 deletions(-) create mode 100644 .vscode/tasks.json create mode 100644 Makefile diff --git a/.vscode/launch.json b/.vscode/launch.json index 8a94e49..fafe6d5 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -4,6 +4,21 @@ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 "version": "0.2.0", "configurations": [ + { + "name": "Python: Remote Attach", + "type": "python", + "request": "attach", + "connect": { + "host": "localhost", + "port": 5678 + }, + "pathMappings": [ + { + "localRoot": "${workspaceFolder}", + "remoteRoot": "/app" + } + ], + }, { "name": "Python: Current File", "type": "python", @@ -11,6 +26,6 @@ "program": "${file}", "console": "integratedTerminal", "justMyCode": true - } + }, ] } diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000..08a8c36 --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,22 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "label": "compose-up", + "type": "shell", + "command": "docker-compose down --volumes && docker-compose up --build -d", + "isBackground": true, + "problemMatcher": { + "owner": "custom", + "pattern": { + "regexp": "^(.*)$" + }, + "background": { + "activeOnStart": true, + "beginsPattern": "^(.*Starting development server.*)$", + "endsPattern": "^(.*Attaching to.*)$" + } + } + } + ] +} \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 6c3f442..47413f3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,6 +13,10 @@ WORKDIR /project COPY ./requirements.txt /project RUN pip install --no-cache-dir -r requirements.txt +# PTVSD is a Python debugger that can be used in a container +ARG INSTALL_PTVSD=false +RUN if [ "$INSTALL_PTVSD" = "true" ] ; then pip install debugpy ; fi + # copy the scripts to the folder COPY ./src /project/src diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..6bce355 --- /dev/null +++ b/Makefile @@ -0,0 +1,16 @@ +.PHONY: build up down clean cleanbuild + +build: + docker-compose build + +up: + 
docker-compose up -d + +down: + docker-compose down + +clean: + docker-compose down --volumes + +cleanbuild: clean + docker-compose up --build \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index b242d2d..bb7d8e9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -85,13 +85,18 @@ services: args: root_path: / api_port: 5000 + INSTALL_PTVSD: true # command: bash -c "apt update && apt install -y curl && sleep infinity" command: python src/main.py + ports: + - 5678:5678 environment: - KAFKA_HOST=kafka:9092 - DATABASE_URL=mysql+aiomysql://root:root_bot_buster@mysql:3306/playerdata - POOL_TIMEOUT=30 - POOL_RECYCLE=30 + - ENABLE_DEBUGPY=true + - PYDEVD_DISABLE_FILE_VALIDATION=1 networks: - botdetector-network volumes: diff --git a/src/app/schemas/highscores.py b/src/app/schemas/highscores.py index 822c853..0732dc5 100644 --- a/src/app/schemas/highscores.py +++ b/src/app/schemas/highscores.py @@ -2,6 +2,7 @@ from pydantic import BaseModel, ConfigDict from typing import Optional +from datetime import date class playerHiscoreData(BaseModel): @@ -108,51 +109,48 @@ class playerHiscoreData(BaseModel): vardorvis: int = 0 -class ScraperData(BaseModel): +class scraperData(BaseModel): + model_config = ConfigDict(from_attributes=True) + player_id: int scraper_id: int created_at: Optional[str] = None record_date: Optional[str] = None - class Config: - orm_mode = True +class skills(BaseModel): + model_config = ConfigDict(from_attributes=True) -class Skill(BaseModel): skill_id: int skill_name: str - class Config: - orm_mode = True +class activities(BaseModel): + model_config = ConfigDict(from_attributes=True) -class Activity(BaseModel): activity_id: int activity_name: str - class Config: - orm_mode = True +class playerSkills(BaseModel): + model_config = ConfigDict(from_attributes=True) -class PlayerSkill(BaseModel): scraper_id: int skill_id: int skill_value: int - class Config: - orm_mode = True +class playerActivities(BaseModel): + model_config = ConfigDict(from_attributes=True) -class PlayerActivity(BaseModel): scraper_id: int activity_id: int activity_value: int - class Config: - orm_mode = True +class player(BaseModel): + model_config = ConfigDict(from_attributes=True) -class Player(BaseModel): id: int name: str possible_ban: Optional[bool] = None @@ -166,6 +164,3 @@ class Player(BaseModel): normalized_name: str created_at: Optional[str] = None updated_at: Optional[str] = None - - class Config: - orm_mode = True diff --git a/src/database/models/highscores.py b/src/database/models/highscores.py index d4434d0..9c2e172 100644 --- a/src/database/models/highscores.py +++ b/src/database/models/highscores.py @@ -1,7 +1,4 @@ from sqlalchemy import ( - BigInteger, - TinyInteger, - SmallInteger, Column, Date, DateTime, @@ -9,6 +6,7 @@ func, ForeignKey, ) +from sqlalchemy.dialects.mysql import BIGINT, TINYINT, SMALLINT from sqlalchemy.orm import relationship from sqlalchemy.schema import UniqueConstraint @@ -19,11 +17,11 @@ class PlayerHiscoreData(Base): __tablename__ = "playerHiscoreData" - id = Column(BigInteger, primary_key=True, autoincrement=True) + id = Column(BIGINT, primary_key=True, autoincrement=True) timestamp = Column(DateTime, nullable=False, server_default=func.now()) ts_date = Column(Date, nullable=True) Player_id = Column(Integer, nullable=False) - total = Column(BigInteger, default=0) + total = Column(BIGINT, default=0) attack = Column(Integer, default=0) defence = Column(Integer, default=0) strength = Column(Integer, default=0) @@ -130,9 +128,9 @@ class 
PlayerHiscoreData(Base): class ScraperData(Base): __tablename__ = "scraper_data" - scraper_id = Column(BigInteger, primary_key=True, autoincrement=True) + scraper_id = Column(BIGINT, primary_key=True, autoincrement=True) created_at = Column(DateTime, nullable=False, server_default=func.now()) - player_id = Column(SmallInteger, nullable=False) + player_id = Column(SMALLINT, nullable=False) record_date = Column(Date, nullable=True, server_onupdate=func.current_date()) player_skills = relationship("PlayerSkills", back_populates="scraper_data") __table_args__ = ( @@ -148,7 +146,7 @@ class ScraperData(Base): class Skills(Base): __tablename__ = "skills" - skill_id = Column(TinyInteger, primary_key=True, autoincrement=True) + skill_id = Column(TINYINT, primary_key=True, autoincrement=True) skill_name = Column(String(50), nullable=False) player_skills = relationship("PlayerSkills", back_populates="skill") @@ -165,7 +163,7 @@ class Skills(Base): class Activities(Base): __tablename__ = "activities" - activity_id = Column(TinyInteger, primary_key=True, autoincrement=True) + activity_id = Column(TINYINT, primary_key=True, autoincrement=True) activity_name = Column(String(50), nullable=False) __table_args__ = (UniqueConstraint("activity_name", name="unique_activity_name"),) @@ -183,12 +181,12 @@ class PlayerSkills(Base): __tablename__ = "player_skills" scraper_id = Column( - BigInteger, + BIGINT, ForeignKey("scraper_data.scraper_id", ondelete="CASCADE"), primary_key=True, ) skill_id = Column( - TinyInteger, + TINYINT, ForeignKey("skills.skill_id", ondelete="CASCADE"), primary_key=True, ) @@ -212,12 +210,12 @@ class PlayerActivities(Base): __tablename__ = "player_activities" scraper_id = Column( - BigInteger, + BIGINT, ForeignKey("scraper_data.scraper_id", ondelete="CASCADE"), primary_key=True, ) activity_id = Column( - TinyInteger, + TINYINT, ForeignKey("activities.activity_id", ondelete="CASCADE"), primary_key=True, ) diff --git a/src/main.py b/src/main.py index a9ac9c1..23c6cf0 100644 --- a/src/main.py +++ b/src/main.py @@ -9,18 +9,19 @@ from sqlalchemy import select from app.schemas.highscores import ( playerHiscoreData as playerHiscoreDataSchema, - PlayerActivityCreate, - ScraperDataCreate, - PlayerSkillCreate, - PlayerCreate, + playerActivities as playerActivitiesSchema, + scraperData as scraperDataSchema, + playerSkills as playerSkillsSchema, + player as playerSchema, ) from core.config import settings from database.database import get_session from database.models.highscores import ( - # PlayerHiscoreData, PlayerActivities, PlayerSkills, ScraperData, + Skills, + Activities, ) from database.models.player import Player from sqlalchemy import insert, update @@ -30,19 +31,40 @@ from pydantic import BaseModel +import os +import debugpy + +# if os.getenv("ENABLE_DEBUGPY") == "true": +# debugpy.listen(("0.0.0.0", 5678)) +# print("Waiting for debugger to attach...") +# debugpy.wait_for_client() + logger = logging.getLogger(__name__) class Message(BaseModel): - hiscore: dict - player: dict | None + hiscores: playerHiscoreDataSchema | None + player: playerSchema | None + +class NewDataSchema(BaseModel): + scraper_data: scraperDataSchema + player_skills: list[playerSkillsSchema] + player_activities: list[playerActivitiesSchema] + player: playerSchema + + +from datetime import datetime, timedelta # Global variables to cache the skill and activity names -SKILL_NAMES = None -ACTIVITY_NAMES = None -# Global lock for updating the cache -CACHE_UPDATE_LOCK = asyncio.Lock() +SKILL_NAMES: list[playerSkillsSchema] = 
[] +ACTIVITY_NAMES: list[playerActivitiesSchema] = [] +# Global variables for the locks +SKILL_NAMES_LOCK = asyncio.Lock() +ACTIVITY_NAMES_LOCK = asyncio.Lock() +# Global variable to track when the cache was last updated +LAST_SKILL_NAMES_UPDATE = datetime.min +LAST_ACTIVITY_NAMES_UPDATE = datetime.min async def kafka_consumer(topic: str, group: str): @@ -84,7 +106,9 @@ async def send_messages(topic: str, producer: AIOKafkaProducer, send_queue: Queu await asyncio.sleep(1) continue message = await send_queue.get() - await producer.send(topic, value=message) + # Convert the Message object to a JSON serializable dictionary + message_dict = message.model_dump_json() + await producer.send(topic, value=message_dict) send_queue.task_done() @@ -113,68 +137,157 @@ def log_speed( return time.time(), 0 +# async def insert_data(batch: list[dict], error_queue: Queue): +# try: +# highscores: list[dict] = [msg.get("hiscores") for msg in batch] +# players: list[dict] = [msg.get("player") for msg in batch] + +# highscores = [playerHiscoreDataSchema(**hs) for hs in highscores if hs] +# highscores = [hs.model_dump(mode="json") for hs in highscores] + +# session: AsyncSession = await get_session() + +# logger.info(f"Received: {len(players)=}, {len(highscores)=}") + +# # start a transaction +# async with session.begin(): +# # insert into table values () +# insert_sql: Insert = insert(PlayerHiscoreData) +# insert_sql = insert_sql.values(highscores) +# insert_sql = insert_sql.prefix_with("ignore") +# await session.execute(insert_sql) +# # update table +# for player in players: +# update_sql: Update = update(Player) +# update_sql = update_sql.where(Player.id == player.get("id")) +# update_sql = update_sql.values(player) +# await session.execute(update_sql) +# except (OperationalError, IntegrityError) as e: +# for message in batch: +# await error_queue.put(message) + +# logger.error({"error": e}) +# logger.info(f"error_qsize={error_queue.qsize()}, {message=}") +# except Exception as e: +# for message in batch: +# await error_queue.put(message) + +# logger.error({"error": e}) +# logger.debug(f"Traceback: \n{traceback.format_exc()}") +# logger.info(f"error_qsize={error_queue.qsize()}, {message=}") + + +async def check_and_update_skill_cache(batch: list[Message], session: AsyncSession): + global SKILL_NAMES, LAST_SKILL_NAMES_UPDATE, SKILL_NAMES_LOCK, ACTIVITY_NAMES + + # Query the cache to get the skill IDs + skill_ids = {skill.name: skill.id for skill in SKILL_NAMES} if SKILL_NAMES else {} + + missing_skills = [ + skill + for message in batch + for skill in message.hiscores.model_fields.keys() + if skill + not in ["timestamp", "Player_id"] + [skill.skill_name for skill in SKILL_NAMES] + and skill not in skill_ids + ] + if missing_skills: + # Check if the cache was updated less than 10 minutes ago + if datetime.now() - LAST_SKILL_NAMES_UPDATE < timedelta(minutes=10): + logger.warning( + "Skill names cache update was called less than 10 minutes ago. Skipping batch." 
+ ) + return None # Or however you want to handle this case + + # Update the skill names cache + async with SKILL_NAMES_LOCK: + await update_skill_names(session) + LAST_SKILL_NAMES_UPDATE = datetime.now() + + # Query the cache again to get the updated skill IDs + skill_ids = ( + {skill.name: skill.id for skill in SKILL_NAMES} if SKILL_NAMES else {} + ) + + return skill_ids + + +async def check_and_update_activity_cache(batch: list[Message], session: AsyncSession): + global ACTIVITY_NAMES, LAST_ACTIVITY_NAMES_UPDATE, ACTIVITY_NAMES_LOCK, SKILL_NAMES + + # Query the cache to get the activity IDs + activity_ids = ( + {activity.name: activity.id for activity in ACTIVITY_NAMES} + if ACTIVITY_NAMES + else {} + ) + + # Check if any activity name in any message is not found in the cache + missing_activities = [ + activity + for message in batch + for activity in message.hiscores.model_fields.keys() + if activity + not in ["timestamp", "Player_id"] + [skill.skill_name for skill in SKILL_NAMES] + and activity not in activity_ids + ] + if missing_activities: + # Check if the cache was updated less than 10 minutes ago + if datetime.now() - LAST_ACTIVITY_NAMES_UPDATE < timedelta(minutes=10): + logger.warning( + "Activity names cache update was called less than 10 minutes ago. Skipping batch." + ) + return None # Or however you want to handle this case + + # Update the activity names cache + async with ACTIVITY_NAMES_LOCK: + await update_activity_names(session) + LAST_ACTIVITY_NAMES_UPDATE = datetime.now() + + # Query the cache again to get the updated activity IDs + activity_ids = ( + {activity.name: activity.id for activity in ACTIVITY_NAMES} + if ACTIVITY_NAMES + else {} + ) + + return activity_ids + + async def insert_data(batch: list[Message], error_queue: Queue): - session: AsyncSession = await get_session() + # debugpy.breakpoint() + global SKILL_NAMES, ACTIVITY_NAMES, LAST_SKILL_NAMES_UPDATE, LAST_ACTIVITY_NAMES_UPDATE try: - # Transform the old data format into the new format - batch = [await transform_data(msg, session) for msg in batch] - - scraper_data_list: list[dict] = [msg.get("scraper_data") for msg in batch] - player_skills_list: list[dict] = [msg.get("player_skills") for msg in batch] - player_activities_list: list[dict] = [ - msg.get("player_activities") for msg in batch - ] - players: list[dict] = [msg.get("player") for msg in batch] - - scraper_data_list = [ScraperDataCreate(**sd) for sd in scraper_data_list if sd] - player_skills_list = [ - PlayerSkillCreate(**ps) for ps in player_skills_list if ps - ] - player_activities_list = [ - PlayerActivityCreate(**pa) for pa in player_activities_list if pa - ] - players = [PlayerCreate(**p) for p in players if p] - - logger.info( - f"Received: players={len(players)}, scraper_data={len(scraper_data_list)}, skills={len(player_skills_list)}, activities={len(player_activities_list)}" - ) + session: AsyncSession = await get_session() + + # # Check and update the skill and activity caches + # if ( + # await check_and_update_skill_cache(batch, session) is None + # or await check_and_update_activity_cache(batch, session) is None + # ): + # return - # start a transaction + batch_return = await transform_data(batch, session) async with session.begin(): - # insert into scraper_data table - for scraper_data in scraper_data_list: - insert_scraper_data = ( - insert(ScraperData) - .values(scraper_data.dict()) - .prefix_with("ignore") - ) - await session.execute(insert_scraper_data) - - # insert into player_skills table - for player_skill in 
player_skills_list: - insert_player_skill = ( - insert(PlayerSkills) - .values(player_skill.dict()) - .prefix_with("ignore") - ) - await session.execute(insert_player_skill) - - # insert into player_activities table - for player_activity in player_activities_list: - insert_player_activity = ( - insert(PlayerActivities) - .values(player_activity.dict()) - .prefix_with("ignore") - ) - await session.execute(insert_player_activity) - - # update Player table - for player in players: - update_sql: Update = update(Player) - update_sql = update_sql.where(Player.id == player.id) - update_sql = update_sql.values(player.dict()) - await session.execute(update_sql) + for new_data in batch_return: + # insert into scraper_data table + scraper_data = new_data.scraper_data + session.add(scraper_data) + + # insert into player_skills table + player_skills = new_data.player_skills + session.bulk_save_objects(player_skills) + + # insert into player_activities table + player_activities = new_data.player_activities + session.bulk_save_objects(player_activities) + + # update Player table + player = new_data.player + session.merge(player) + + await session.commit() except (OperationalError, IntegrityError) as e: for message in batch: await error_queue.put(message) @@ -190,56 +303,117 @@ async def insert_data(batch: list[Message], error_queue: Queue): logger.info(f"error_qsize={error_queue.qsize()}, {message=}") -async def transform_data(old_data: dict, session: AsyncSession) -> dict: - global SKILL_NAMES, ACTIVITY_NAMES +async def transform_data( + old_data_list: list[Message], session: AsyncSession +) -> NewDataSchema: + global SKILL_NAMES, ACTIVITY_NAMES, LAST_CACHE_UPDATE + + new_data_list = [] + + for old_data in old_data_list: + + # Query the cache to get the skill and activity IDs + skill_ids = ( + {skill.name: skill.id for skill in SKILL_NAMES} if SKILL_NAMES else {} + ) + activity_ids = ( + {activity.name: activity.id for activity in ACTIVITY_NAMES} + if ACTIVITY_NAMES + else {} + ) + + # Transform the old data format into the new format + new_data = NewDataSchema( + **{ + "scraper_data": { + "scraper_id": old_data.player.id if old_data.player else None, + "created_at": ( + old_data.hiscores.timestamp.isoformat() + if old_data.hiscores + else None + ), + "player_id": ( + old_data.hiscores.Player_id if old_data.hiscores else None + ), + "record_date": ( + datetime.utcnow().isoformat() if old_data.hiscores else None + ), + }, + "player_skills": ( + [ + { + "skill_id": ( + skill_ids[skill.name] + if skill.name in skill_ids + else None + ), + "skill_value": ( + getattr(old_data.hiscores, skill.name, None) + if old_data.hiscores + else None + ), + } + for skill in SKILL_NAMES + ] + if SKILL_NAMES + else [] + ), + "player_activities": ( + [ + { + "activity_id": ( + activity_ids[activity.name] + if activity.name in activity_ids + else None + ), + "activity_value": ( + getattr(old_data.hiscores, activity.name, None) + if old_data.hiscores + else None + ), + } + for activity in ACTIVITY_NAMES + ] + if ACTIVITY_NAMES + else [] + ), + "player": { + "id": old_data.hiscores.Player_id if old_data.hiscores else None, + "name": old_data.player.name if old_data.player else None, + "normalized_name": ( + old_data.player.normalized_name if old_data.player else None + ), + }, + } + ) + + logger.debug(f"Transformed data: {new_data}") + new_data_list.append(new_data) + + return new_data_list - logger.debug(f"Input data: {old_data}") - # Fetch the skill and activity names from the database if they're not already cached - async 
with CACHE_UPDATE_LOCK: +async def update_skill_names(session: AsyncSession): + global SKILL_NAMES, SKILL_NAMES_LOCK + + async with SKILL_NAMES_LOCK: if SKILL_NAMES is None: - skill_names = await session.execute(select(PlayerSkills.skill_value)) - SKILL_NAMES = [result[0] for result in skill_names.scalars().all()] + skill_records = await session.execute(select(Skills)) + SKILL_NAMES = [ + playerSkillsSchema(**record) for record in skill_records.scalars().all() + ] + + +async def update_activity_names(session: AsyncSession): + global ACTIVITY_NAMES, ACTIVITY_NAMES_LOCK + + async with ACTIVITY_NAMES_LOCK: if ACTIVITY_NAMES is None: - activity_names = await session.execute( - select(PlayerActivities.activity_value) - ) - ACTIVITY_NAMES = [result[0] for result in activity_names.scalars().all()] - - # Transform the old data format into the new format - new_data = { - "scraper_data": { - "scraper_id": old_data.get("id"), - "created_at": old_data.get("timestamp"), - "player_id": old_data.get("Player_id"), - "record_date": old_data.get("ts_date"), - }, - "player_skills": [ - {"skill_id": i, "skill_value": old_data.get(skill)} - for i, skill in enumerate(SKILL_NAMES) - ], - "player_activities": [ - {"activity_id": i, "activity_value": old_data.get(activity)} - for i, activity in enumerate(ACTIVITY_NAMES) - ], - "player": { - "id": old_data.get("Player_id"), - }, - } - - logger.debug(f"Transformed data: {new_data}") - - return new_data - - -async def update_cache(session: AsyncSession): - global SKILL_NAMES, ACTIVITY_NAMES - - # Fetch the skill and activity names from the database - skill_names = await session.execute(select(PlayerSkills.skill_name)) - SKILL_NAMES = [result[0] for result in skill_names.scalars().all()] - activity_names = await session.execute(select(PlayerActivities.activity_name)) - ACTIVITY_NAMES = [result[0] for result in activity_names.scalars().all()] + activity_records = await session.execute(select(Activities)) + ACTIVITY_NAMES = [ + playerActivitiesSchema(**record) + for record in activity_records.scalars().all() + ] async def process_data(receive_queue: Queue, error_queue: Queue): @@ -268,12 +442,19 @@ async def process_data(receive_queue: Queue, error_queue: Queue): # Get a message from the chosen queue message: dict = await receive_queue.get() - message: Message = Message(**message) + # debugpy.breakpoint() + # make sure the message has the 'hiscores' key and it's not None + if "hiscores" not in message or message["hiscores"] is None: + continue + # Ensure the 'player.normalized_name' key exists in the message + if "player" in message and "normalized_name" not in message["player"]: + message["player"]["normalized_name"] = "" # or some default value + message: Message = Message(**message) # TODO fix test data if settings.ENV != "PRD": player = message.player - player_id = player.get("id") + player_id = player.id # Access the 'id' attribute directly MIN_PLAYER_ID = 0 MAX_PLAYER_ID = 300 if not (MIN_PLAYER_ID < player_id <= MAX_PLAYER_ID): From 6676d1dde5acf201627dcb52d401d5f033cf33f7 Mon Sep 17 00:00:00 2001 From: extreme4all <40169115+extreme4all@users.noreply.github.com> Date: Sun, 25 Feb 2024 12:16:01 +0100 Subject: [PATCH 13/39] added models & repositories --- .vscode/settings.json | 2 +- src/app/repositories/__init__.py | 0 src/app/repositories/abc.py | 52 ++++++++ src/app/repositories/activities.py | 39 ++++++ src/app/repositories/highscore.py | 31 +++++ src/app/repositories/player.py | 0 src/app/repositories/skills.py | 37 ++++++ src/app/schemas/input/__init__.py | 0 
src/app/schemas/input/activities.py | 16 +++ .../{highscores.py => input/highscore.py} | 61 +--------- src/app/schemas/input/player.py | 21 ++++ src/app/schemas/input/scraper_data.py | 12 ++ src/app/schemas/input/skills.py | 16 +++ src/app/schemas/output/__init__.py | 0 src/core/__init__.py | 1 - src/database/models/activities.py | 49 ++++++++ src/database/models/highscores.py | 113 +----------------- src/database/models/scraper_data.py | 29 +++++ src/database/models/skills.py | 47 ++++++++ src/main.py | 40 ++++--- 20 files changed, 377 insertions(+), 189 deletions(-) create mode 100644 src/app/repositories/__init__.py create mode 100644 src/app/repositories/abc.py create mode 100644 src/app/repositories/activities.py create mode 100644 src/app/repositories/highscore.py create mode 100644 src/app/repositories/player.py create mode 100644 src/app/repositories/skills.py create mode 100644 src/app/schemas/input/__init__.py create mode 100644 src/app/schemas/input/activities.py rename src/app/schemas/{highscores.py => input/highscore.py} (60%) create mode 100644 src/app/schemas/input/player.py create mode 100644 src/app/schemas/input/scraper_data.py create mode 100644 src/app/schemas/input/skills.py create mode 100644 src/app/schemas/output/__init__.py create mode 100644 src/database/models/activities.py create mode 100644 src/database/models/scraper_data.py create mode 100644 src/database/models/skills.py diff --git a/.vscode/settings.json b/.vscode/settings.json index a08d27f..9fde30e 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -5,7 +5,7 @@ "python.testing.unittestEnabled": false, "python.testing.pytestEnabled": true, "[python]": { - "editor.defaultFormatter": "ms-python.black-formatter", + "editor.defaultFormatter": "charliermarsh.ruff", "editor.formatOnSave": true, "editor.codeActionsOnSave": { "source.organizeImports": "explicit" diff --git a/src/app/repositories/__init__.py b/src/app/repositories/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/app/repositories/abc.py b/src/app/repositories/abc.py new file mode 100644 index 0000000..f6c6051 --- /dev/null +++ b/src/app/repositories/abc.py @@ -0,0 +1,52 @@ +from abc import ABC, abstractmethod + +from src.database.database import get_session + + +class ABCRepo(ABC): + """ + Abstract base class for repositories. + """ + + async def _get_session(self): + return await get_session() + + @abstractmethod + async def create(self, data): + """ + Creates a new entity. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError("Subclasses must implement the create method") + + @abstractmethod + async def request(self, id): + """ + Retrieves an entity by its ID. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError("Subclasses must implement the request method") + + @abstractmethod + async def update(self, id, data): + """ + Updates an existing entity. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError("Subclasses must implement the update method") + + @abstractmethod + async def delete(self, id): + """ + Deletes an entity by its ID. + + Raises: + NotImplementedError: This method must be implemented in subclasses. 
+ """ + raise NotImplementedError("Subclasses must implement the delete method") diff --git a/src/app/repositories/activities.py b/src/app/repositories/activities.py new file mode 100644 index 0000000..6278369 --- /dev/null +++ b/src/app/repositories/activities.py @@ -0,0 +1,39 @@ +from sqlalchemy import insert, select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.app.repositories.abc import ABCRepo +from src.app.schemas.input.activities import playerActivities +from src.database.models.activities import Activities as ActivitiesDB +from src.database.models.activities import PlayerActivities as playerActivitiesDB + + +class ActivitiesRepo(ABCRepo): + def __init__(self) -> None: + super().__init__() + + async def request(self, activity_id: int = None): + table = ActivitiesDB + sql = select(table) + + if activity_id: + sql = sql.where(table.activity_id == activity_id) + + async with self._get_session() as session: + session: AsyncSession + data = await session.execute(sql) + data = await data.all() + return data + + +class PlayerActivitiesRepo(ABCRepo): + def __init__(self) -> None: + super().__init__() + + async def create(self, data: list[playerActivities]): + data = [d.model_dump() for d in data] + table = playerActivitiesDB + sql = insert(table).values(data) + async with self._get_session() as session: + session: AsyncSession + await session.execute(sql) + return None diff --git a/src/app/repositories/highscore.py b/src/app/repositories/highscore.py new file mode 100644 index 0000000..a7af387 --- /dev/null +++ b/src/app/repositories/highscore.py @@ -0,0 +1,31 @@ +from sqlalchemy import insert, select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.app.repositories.abc import ABCRepo +from src.app.schemas.input.highscore import PlayerHiscoreData +from src.database.models.highscores import PlayerHiscoreData as PlayerHiscoreDataDB + + +class HighscoreRepo(ABCRepo): + def __init__(self) -> None: + super().__init__() + + async def request(self, id: list[int] = None): + table = PlayerHiscoreDataDB + sql = select(table) + sql = sql.where(table.id.in_(id)) if id else sql + + async with self._get_session() as session: + session: AsyncSession + data = await session.execute(sql) + data = await data.all() + return data + + async def create(self, data: list[PlayerHiscoreData]): + data = [d.model_dump() for d in data] + table = PlayerHiscoreDataDB + sql = insert(table) + async with self._get_session() as session: + session: AsyncSession + await session.execute(sql) + return None diff --git a/src/app/repositories/player.py b/src/app/repositories/player.py new file mode 100644 index 0000000..e69de29 diff --git a/src/app/repositories/skills.py b/src/app/repositories/skills.py new file mode 100644 index 0000000..f5bd07b --- /dev/null +++ b/src/app/repositories/skills.py @@ -0,0 +1,37 @@ +from sqlalchemy import insert, select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.app.repositories.abc import ABCRepo +from src.app.schemas.input.skills import PlayerSkills +from src.database.models.skills import PlayerSkills as PlayerSkillsDB +from src.database.models.skills import Skills as SkillsDB + + +class SkillsRepo(ABCRepo): + def __init__(self) -> None: + super().__init__() + + async def request(self, skill_id: int = None): + table = SkillsDB + sql = select(table) + if skill_id: + sql = sql.where(table.skill_id == skill_id) + async with self._get_session() as session: + session: AsyncSession + data = await session.execute(sql) + data = await data.all() + return data + + 
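src/app/repositories/player.py is added empty in this patch. For orientation only, a concrete repository for it could mirror the pattern of the repos above, assuming get_session() yields an AsyncSession usable as an async context manager as in the other classes; the PlayerRepo name, the model import paths, the explicit commits, and the method bodies are assumptions rather than part of the patch. ABCRepo declares create, request, update and delete as abstract, so a concrete class needs all four before it can be instantiated.

from sqlalchemy import delete, insert, select, update
from sqlalchemy.ext.asyncio import AsyncSession

from src.app.repositories.abc import ABCRepo
from src.app.schemas.input.player import Player
from src.database.models.player import Player as PlayerDB


class PlayerRepo(ABCRepo):
    """Hypothetical repository for player rows (sketch, not part of the patch)."""

    async def create(self, data: list[Player]) -> None:
        # batched INSERT of new players
        sql = insert(PlayerDB).values([d.model_dump() for d in data])
        async with self._get_session() as session:
            session: AsyncSession
            await session.execute(sql)
            await session.commit()

    async def request(self, id: list[int] = None) -> list[PlayerDB]:
        # fetch all players, or only the given ids
        sql = select(PlayerDB)
        if id:
            sql = sql.where(PlayerDB.id.in_(id))
        async with self._get_session() as session:
            session: AsyncSession
            result = await session.execute(sql)
            return result.scalars().all()

    async def update(self, id: int, data: Player) -> None:
        # update a single player row by primary key
        sql = update(PlayerDB).where(PlayerDB.id == id)
        sql = sql.values(data.model_dump(exclude_unset=True))
        async with self._get_session() as session:
            session: AsyncSession
            await session.execute(sql)
            await session.commit()

    async def delete(self, id: int) -> None:
        async with self._get_session() as session:
            session: AsyncSession
            await session.execute(delete(PlayerDB).where(PlayerDB.id == id))
            await session.commit()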
+class PlayerSkillsRepo(ABCRepo): + def __init__(self) -> None: + super().__init__() + + async def create(self, data: list[PlayerSkills]): + data = [d.model_dump() for d in data] + table = PlayerSkillsDB + sql = insert(table).values(data) + async with self._get_session() as session: + session: AsyncSession + await session.execute(sql) + return None diff --git a/src/app/schemas/input/__init__.py b/src/app/schemas/input/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/app/schemas/input/activities.py b/src/app/schemas/input/activities.py new file mode 100644 index 0000000..6b90da3 --- /dev/null +++ b/src/app/schemas/input/activities.py @@ -0,0 +1,16 @@ +from pydantic import BaseModel, ConfigDict + + +class Activities(BaseModel): + model_config = ConfigDict(from_attributes=True) + + activity_id: int + activity_name: str + + +class PlayerActivities(BaseModel): + model_config = ConfigDict(from_attributes=True) + + scraper_id: int + activity_id: int + activity_value: int diff --git a/src/app/schemas/highscores.py b/src/app/schemas/input/highscore.py similarity index 60% rename from src/app/schemas/highscores.py rename to src/app/schemas/input/highscore.py index 0732dc5..4d4f919 100644 --- a/src/app/schemas/highscores.py +++ b/src/app/schemas/input/highscore.py @@ -1,11 +1,9 @@ from datetime import datetime from pydantic import BaseModel, ConfigDict -from typing import Optional -from datetime import date -class playerHiscoreData(BaseModel): +class PlayerHiscoreData(BaseModel): model_config = ConfigDict(from_attributes=True) # id: Optional[int] = None @@ -107,60 +105,3 @@ class playerHiscoreData(BaseModel): the_leviathan: int = 0 the_whisperer: int = 0 vardorvis: int = 0 - - -class scraperData(BaseModel): - model_config = ConfigDict(from_attributes=True) - - player_id: int - scraper_id: int - created_at: Optional[str] = None - record_date: Optional[str] = None - - -class skills(BaseModel): - model_config = ConfigDict(from_attributes=True) - - skill_id: int - skill_name: str - - -class activities(BaseModel): - model_config = ConfigDict(from_attributes=True) - - activity_id: int - activity_name: str - - -class playerSkills(BaseModel): - model_config = ConfigDict(from_attributes=True) - - scraper_id: int - skill_id: int - skill_value: int - - -class playerActivities(BaseModel): - model_config = ConfigDict(from_attributes=True) - - scraper_id: int - activity_id: int - activity_value: int - - -class player(BaseModel): - model_config = ConfigDict(from_attributes=True) - - id: int - name: str - possible_ban: Optional[bool] = None - confirmed_ban: Optional[bool] = None - confirmed_player: Optional[bool] = None - label_id: Optional[int] = None - label_jagex: Optional[int] = None - ironman: Optional[bool] = None - hardcore_ironman: Optional[bool] = None - ultimate_ironman: Optional[bool] = None - normalized_name: str - created_at: Optional[str] = None - updated_at: Optional[str] = None diff --git a/src/app/schemas/input/player.py b/src/app/schemas/input/player.py new file mode 100644 index 0000000..8b18339 --- /dev/null +++ b/src/app/schemas/input/player.py @@ -0,0 +1,21 @@ +from typing import Optional + +from pydantic import BaseModel, ConfigDict + + +class Player(BaseModel): + model_config = ConfigDict(from_attributes=True) + + id: int + name: str + possible_ban: Optional[bool] = None + confirmed_ban: Optional[bool] = None + confirmed_player: Optional[bool] = None + label_id: Optional[int] = None + label_jagex: Optional[int] = None + ironman: Optional[bool] = None + 
hardcore_ironman: Optional[bool] = None + ultimate_ironman: Optional[bool] = None + normalized_name: Optional[bool] = None + created_at: Optional[str] = None + updated_at: Optional[str] = None diff --git a/src/app/schemas/input/scraper_data.py b/src/app/schemas/input/scraper_data.py new file mode 100644 index 0000000..978f2d2 --- /dev/null +++ b/src/app/schemas/input/scraper_data.py @@ -0,0 +1,12 @@ +from typing import Optional + +from pydantic import BaseModel, ConfigDict + + +class ScraperData(BaseModel): + model_config = ConfigDict(from_attributes=True) + + player_id: int + scraper_id: int + created_at: Optional[str] = None + record_date: Optional[str] = None diff --git a/src/app/schemas/input/skills.py b/src/app/schemas/input/skills.py new file mode 100644 index 0000000..a56b4d3 --- /dev/null +++ b/src/app/schemas/input/skills.py @@ -0,0 +1,16 @@ +from pydantic import BaseModel, ConfigDict + + +class Skills(BaseModel): + model_config = ConfigDict(from_attributes=True) + + skill_id: int + skill_name: str + + +class PlayerSkills(BaseModel): + model_config = ConfigDict(from_attributes=True) + + scraper_id: int + skill_id: int + skill_value: int diff --git a/src/app/schemas/output/__init__.py b/src/app/schemas/output/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/core/__init__.py b/src/core/__init__.py index 4d49c29..e69de29 100644 --- a/src/core/__init__.py +++ b/src/core/__init__.py @@ -1 +0,0 @@ -from . import logging \ No newline at end of file diff --git a/src/database/models/activities.py b/src/database/models/activities.py new file mode 100644 index 0000000..53d49ff --- /dev/null +++ b/src/database/models/activities.py @@ -0,0 +1,49 @@ +from database.database import Base +from sqlalchemy import ( + Column, + ForeignKey, + Integer, + String, +) +from sqlalchemy.dialects.mysql import BIGINT, TINYINT +from sqlalchemy.schema import UniqueConstraint + + +# CREATE TABLE activities ( +# activity_id TINYINT UNSIGNED AUTO_INCREMENT PRIMARY KEY, # < 255 +# activity_name VARCHAR(50) NOT NULL, +# UNIQUE KEY unique_activity_name (activity_name) +# ); +class Activities(Base): + __tablename__ = "activities" + + activity_id = Column(TINYINT, primary_key=True, autoincrement=True) + activity_name = Column(String(50), nullable=False) + + __table_args__ = (UniqueConstraint("activity_name", name="unique_activity_name"),) + + +# CREATE TABLE player_activities ( +# scraper_id BIGINT UNSIGNED NOT NULL, +# activity_id TINYINT UNSIGNED NOT NULL, +# activity_value INT UNSIGNED NOT NULL DEFAULT 0, # some guy could get over 65k kc +# FOREIGN KEY (scraper_id) REFERENCES scraper_data(scraper_id) ON DELETE CASCADE, +# FOREIGN KEY (activity_id) REFERENCES activities(activity_id) ON DELETE CASCADE, +# PRIMARY KEY (scraper_id, activity_id) +# ); + + +class PlayerActivities(Base): + __tablename__ = "player_activities" + + scraper_id = Column( + BIGINT, + ForeignKey("scraper_data.scraper_id", ondelete="CASCADE"), + primary_key=True, + ) + activity_id = Column( + TINYINT, + ForeignKey("activities.activity_id", ondelete="CASCADE"), + primary_key=True, + ) + activity_value = Column(Integer, nullable=False, default=0) diff --git a/src/database/models/highscores.py b/src/database/models/highscores.py index 9c2e172..571e28a 100644 --- a/src/database/models/highscores.py +++ b/src/database/models/highscores.py @@ -1,17 +1,12 @@ +from database.database import Base from sqlalchemy import ( Column, Date, DateTime, Integer, func, - ForeignKey, ) -from sqlalchemy.dialects.mysql import BIGINT, TINYINT, 
SMALLINT -from sqlalchemy.orm import relationship -from sqlalchemy.schema import UniqueConstraint - -from database.database import Base -from sqlalchemy import String +from sqlalchemy.dialects.mysql import BIGINT class PlayerHiscoreData(Base): @@ -116,107 +111,3 @@ class PlayerHiscoreData(Base): the_leviathan = Column(Integer, default=0) the_whisperer = Column(Integer, default=0) vardorvis = Column(Integer, default=0) - - -# CREATE TABLE scraper_data ( -# scraper_id BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY, -# created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, -# player_id SMALLINT UNSIGNED NOT NULL, -# record_date DATE AS (DATE(created_at)) STORED, -# UNIQUE KEY unique_player_per_day (player_id, record_date) -# ); -class ScraperData(Base): - __tablename__ = "scraper_data" - - scraper_id = Column(BIGINT, primary_key=True, autoincrement=True) - created_at = Column(DateTime, nullable=False, server_default=func.now()) - player_id = Column(SMALLINT, nullable=False) - record_date = Column(Date, nullable=True, server_onupdate=func.current_date()) - player_skills = relationship("PlayerSkills", back_populates="scraper_data") - __table_args__ = ( - UniqueConstraint("player_id", "record_date", name="unique_player_per_day"), - ) - - -# CREATE TABLE skills ( -# skill_id TINYINT UNSIGNED AUTO_INCREMENT PRIMARY KEY, # < 255 -# skill_name VARCHAR(50) NOT NULL, -# UNIQUE KEY unique_skill_name (skill_name) -# ); -class Skills(Base): - __tablename__ = "skills" - - skill_id = Column(TINYINT, primary_key=True, autoincrement=True) - skill_name = Column(String(50), nullable=False) - - player_skills = relationship("PlayerSkills", back_populates="skill") - - __table_args__ = (UniqueConstraint("skill_name", name="unique_skill_name"),) - - -# done -# CREATE TABLE activities ( -# activity_id TINYINT UNSIGNED AUTO_INCREMENT PRIMARY KEY, # < 255 -# activity_name VARCHAR(50) NOT NULL, -# UNIQUE KEY unique_activity_name (activity_name) -# ); -class Activities(Base): - __tablename__ = "activities" - - activity_id = Column(TINYINT, primary_key=True, autoincrement=True) - activity_name = Column(String(50), nullable=False) - - __table_args__ = (UniqueConstraint("activity_name", name="unique_activity_name"),) - - -# CREATE TABLE player_skills ( -# scraper_id BIGINT UNSIGNED NOT NULL, -# skill_id TINYINT UNSIGNED NOT NULL, -# skill_value INT UNSIGNED NOT NULL DEFAULT 0, # < 200 000 000 -# FOREIGN KEY (scraper_id) REFERENCES scraper_data(scraper_id) ON DELETE CASCADE, -# FOREIGN KEY (skill_id) REFERENCES skills(skill_id) ON DELETE CASCADE, -# PRIMARY KEY (scraper_id, skill_id) -# ); -class PlayerSkills(Base): - __tablename__ = "player_skills" - - scraper_id = Column( - BIGINT, - ForeignKey("scraper_data.scraper_id", ondelete="CASCADE"), - primary_key=True, - ) - skill_id = Column( - TINYINT, - ForeignKey("skills.skill_id", ondelete="CASCADE"), - primary_key=True, - ) - skill_value = Column(Integer, nullable=False, default=0) - - scraper_data = relationship("ScraperData", back_populates="player_skills") - skill = relationship("Skills", back_populates="player_skills") - - -# CREATE TABLE player_activities ( -# scraper_id BIGINT UNSIGNED NOT NULL, -# activity_id TINYINT UNSIGNED NOT NULL, -# activity_value INT UNSIGNED NOT NULL DEFAULT 0, # some guy could get over 65k kc -# FOREIGN KEY (scraper_id) REFERENCES scraper_data(scraper_id) ON DELETE CASCADE, -# FOREIGN KEY (activity_id) REFERENCES activities(activity_id) ON DELETE CASCADE, -# PRIMARY KEY (scraper_id, activity_id) -# ); - - -class PlayerActivities(Base): - 
__tablename__ = "player_activities" - - scraper_id = Column( - BIGINT, - ForeignKey("scraper_data.scraper_id", ondelete="CASCADE"), - primary_key=True, - ) - activity_id = Column( - TINYINT, - ForeignKey("activities.activity_id", ondelete="CASCADE"), - primary_key=True, - ) - activity_value = Column(Integer, nullable=False, default=0) diff --git a/src/database/models/scraper_data.py b/src/database/models/scraper_data.py new file mode 100644 index 0000000..139473a --- /dev/null +++ b/src/database/models/scraper_data.py @@ -0,0 +1,29 @@ +from database.database import Base +from sqlalchemy import ( + Column, + Date, + DateTime, + func, +) +from sqlalchemy.dialects.mysql import BIGINT, SMALLINT +from sqlalchemy.schema import UniqueConstraint + + +# CREATE TABLE scraper_data ( +# scraper_id BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY, +# created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, +# player_id SMALLINT UNSIGNED NOT NULL, +# record_date DATE AS (DATE(created_at)) STORED, +# UNIQUE KEY unique_player_per_day (player_id, record_date) +# ); +class ScraperData(Base): + __tablename__ = "scraper_data" + + scraper_id = Column(BIGINT, primary_key=True, autoincrement=True) + created_at = Column(DateTime, nullable=False, server_default=func.now()) + player_id = Column(SMALLINT, nullable=False) + record_date = Column(Date, nullable=True, server_onupdate=func.current_date()) + + __table_args__ = ( + UniqueConstraint("player_id", "record_date", name="unique_player_per_day"), + ) diff --git a/src/database/models/skills.py b/src/database/models/skills.py new file mode 100644 index 0000000..5301202 --- /dev/null +++ b/src/database/models/skills.py @@ -0,0 +1,47 @@ +from database.database import Base +from sqlalchemy import ( + Column, + ForeignKey, + Integer, + String, +) +from sqlalchemy.dialects.mysql import BIGINT, TINYINT +from sqlalchemy.schema import UniqueConstraint + + +# CREATE TABLE skills ( +# skill_id TINYINT UNSIGNED AUTO_INCREMENT PRIMARY KEY, # < 255 +# skill_name VARCHAR(50) NOT NULL, +# UNIQUE KEY unique_skill_name (skill_name) +# ); +class Skills(Base): + __tablename__ = "skills" + + skill_id = Column(TINYINT, primary_key=True, autoincrement=True) + skill_name = Column(String(50), nullable=False) + + __table_args__ = (UniqueConstraint("skill_name", name="unique_skill_name"),) + + +# CREATE TABLE player_skills ( +# scraper_id BIGINT UNSIGNED NOT NULL, +# skill_id TINYINT UNSIGNED NOT NULL, +# skill_value INT UNSIGNED NOT NULL DEFAULT 0, # < 200 000 000 +# FOREIGN KEY (scraper_id) REFERENCES scraper_data(scraper_id) ON DELETE CASCADE, +# FOREIGN KEY (skill_id) REFERENCES skills(skill_id) ON DELETE CASCADE, +# PRIMARY KEY (scraper_id, skill_id) +# ); +class PlayerSkills(Base): + __tablename__ = "player_skills" + + scraper_id = Column( + BIGINT, + ForeignKey("scraper_data.scraper_id", ondelete="CASCADE"), + primary_key=True, + ) + skill_id = Column( + TINYINT, + ForeignKey("skills.skill_id", ondelete="CASCADE"), + primary_key=True, + ) + skill_value = Column(Integer, nullable=False, default=0) diff --git a/src/main.py b/src/main.py index 23c6cf0..5028bdf 100644 --- a/src/main.py +++ b/src/main.py @@ -6,33 +6,31 @@ from asyncio import Queue from aiokafka import AIOKafkaConsumer, AIOKafkaProducer -from sqlalchemy import select from app.schemas.highscores import ( - playerHiscoreData as playerHiscoreDataSchema, + player as playerSchema, +) +from app.schemas.highscores import ( playerActivities as playerActivitiesSchema, - scraperData as scraperDataSchema, +) +from app.schemas.highscores 
import ( + playerHiscoreData as playerHiscoreDataSchema, +) +from app.schemas.highscores import ( playerSkills as playerSkillsSchema, - player as playerSchema, +) +from app.schemas.highscores import ( + scraperData as scraperDataSchema, ) from core.config import settings from database.database import get_session from database.models.highscores import ( - PlayerActivities, - PlayerSkills, - ScraperData, - Skills, Activities, + Skills, ) -from database.models.player import Player -from sqlalchemy import insert, update +from pydantic import BaseModel +from sqlalchemy import select from sqlalchemy.exc import IntegrityError, OperationalError from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.sql.expression import Insert, Update - -from pydantic import BaseModel - -import os -import debugpy # if os.getenv("ENABLE_DEBUGPY") == "true": # debugpy.listen(("0.0.0.0", 5678)) @@ -255,6 +253,16 @@ async def check_and_update_activity_cache(batch: list[Message], session: AsyncSe async def insert_data(batch: list[Message], error_queue: Queue): + """ + 1. check for duplicates in scraper_data[player_id, record_date], remove all duplicates + 2. start transaction + 3. for each player insert into scraper_data + 4. for each player get the scraper_id from scraper_data + 5. insert into player_skills (scraper_id, skill_id) values (), () + 6. insert into player_activities (scraper_id, activity_id) values (), () + + step 5 & 6 must be batched for all players at once + """ # debugpy.breakpoint() global SKILL_NAMES, ACTIVITY_NAMES, LAST_SKILL_NAMES_UPDATE, LAST_ACTIVITY_NAMES_UPDATE From dfee319a4f9fc136264714f9e639a0c5ec492986 Mon Sep 17 00:00:00 2001 From: extreme4all <40169115+extreme4all@users.noreply.github.com> Date: Sun, 25 Feb 2024 12:46:41 +0100 Subject: [PATCH 14/39] seperate kafka --- src/kafka.py | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 src/kafka.py diff --git a/src/kafka.py b/src/kafka.py new file mode 100644 index 0000000..50242d8 --- /dev/null +++ b/src/kafka.py @@ -0,0 +1,52 @@ +import asyncio +import json +import logging +from asyncio import Queue + +from aiokafka import AIOKafkaConsumer, AIOKafkaProducer +from core.config import settings + +logger = logging.getLogger(__name__) + + +async def kafka_consumer(topic: str, group: str): + consumer = AIOKafkaConsumer( + topic, + bootstrap_servers=[settings.KAFKA_HOST], + group_id=group, + value_deserializer=lambda x: json.loads(x.decode("utf-8")), + auto_offset_reset="earliest", + ) + await consumer.start() + return consumer + + +async def kafka_producer(): + producer = AIOKafkaProducer( + bootstrap_servers=[settings.KAFKA_HOST], + value_serializer=lambda v: json.dumps(v).encode(), + acks="all", + ) + await producer.start() + return producer + + +async def receive_messages( + consumer: AIOKafkaConsumer, receive_queue: Queue, error_queue: Queue +): + async for message in consumer: + if error_queue.qsize() > 100: + await asyncio.sleep(1) + continue + value = message.value + await receive_queue.put(value) + + +async def send_messages(topic: str, producer: AIOKafkaProducer, send_queue: Queue): + while True: + if send_queue.empty(): + await asyncio.sleep(1) + continue + message = await send_queue.get() + await producer.send(topic, value=message) + send_queue.task_done() From 59ee44d368aacd85e01eb82784ae47dff1516697 Mon Sep 17 00:00:00 2001 From: extreme4all <40169115+extreme4all@users.noreply.github.com> Date: Sun, 25 Feb 2024 12:46:49 +0100 Subject: [PATCH 15/39] docstrings --- 
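The insert_data docstring introduced above describes the intended write path: one scraper_data row per player, then a single batched insert into player_skills and one into player_activities. A condensed sketch of that flow follows, assuming get_session() returns an AsyncSession and that each incoming record already carries resolved skill and activity IDs; the function name, the record shape, and leaving duplicate suppression to the unique_player_per_day key are assumptions, not the final implementation.

from sqlalchemy import insert
from sqlalchemy.ext.asyncio import AsyncSession

from database.database import get_session
from database.models.activities import PlayerActivities
from database.models.scraper_data import ScraperData
from database.models.skills import PlayerSkills


async def insert_batch(records: list[dict]) -> None:
    # records: [{"player_id": 1, "skills": {skill_id: value}, "activities": {activity_id: value}}, ...]
    session: AsyncSession = await get_session()
    async with session.begin():
        skill_rows, activity_rows = [], []
        for record in records:
            # steps 3-4: insert scraper_data and flush to obtain the generated scraper_id
            row = ScraperData(player_id=record["player_id"])
            session.add(row)
            await session.flush()

            # steps 5-6: collect rows so each table gets a single batched INSERT
            skill_rows += [
                {"scraper_id": row.scraper_id, "skill_id": k, "skill_value": v}
                for k, v in record["skills"].items()
            ]
            activity_rows += [
                {"scraper_id": row.scraper_id, "activity_id": k, "activity_value": v}
                for k, v in record["activities"].items()
            ]

        if skill_rows:
            await session.execute(insert(PlayerSkills).values(skill_rows))
        if activity_rows:
            await session.execute(insert(PlayerActivities).values(activity_rows))
    # step 1 (duplicates) is assumed to be handled upstream or rejected by the
    # unique_player_per_day constraint on scraper_data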
src/app/repositories/activities.py | 35 +++++-- src/app/repositories/highscore.py | 51 +++++++-- src/app/repositories/skills.py | 31 +++++- src/main.py | 159 +++++++---------------------- 4 files changed, 132 insertions(+), 144 deletions(-) diff --git a/src/app/repositories/activities.py b/src/app/repositories/activities.py index 6278369..2cf6e0c 100644 --- a/src/app/repositories/activities.py +++ b/src/app/repositories/activities.py @@ -2,16 +2,27 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.app.repositories.abc import ABCRepo -from src.app.schemas.input.activities import playerActivities +from src.app.schemas.input.activities import PlayerActivities from src.database.models.activities import Activities as ActivitiesDB -from src.database.models.activities import PlayerActivities as playerActivitiesDB +from src.database.models.activities import PlayerActivities as PlayerActivitiesDB class ActivitiesRepo(ABCRepo): + """Repository for managing activity data.""" + def __init__(self) -> None: + """Initializes the ActivitiesRepo instance.""" super().__init__() - async def request(self, activity_id: int = None): + async def request(self, activity_id: int = None) -> list[ActivitiesDB]: + """Retrieves activity data from the database. + + Args: + activity_id: Optional activity ID to filter by. + + Returns: + A list of ActivitiesDB objects representing the retrieved activities. + """ table = ActivitiesDB sql = select(table) @@ -19,21 +30,29 @@ async def request(self, activity_id: int = None): sql = sql.where(table.activity_id == activity_id) async with self._get_session() as session: - session: AsyncSession + session: AsyncSession # Type annotation for clarity data = await session.execute(sql) data = await data.all() return data class PlayerActivitiesRepo(ABCRepo): + """Repository for managing player activity data.""" + def __init__(self) -> None: + """Initializes the PlayerActivitiesRepo instance.""" super().__init__() - async def create(self, data: list[playerActivities]): + async def create(self, data: list[PlayerActivities]) -> None: + """Creates new player activity entries in the database. + + Args: + data: A list of PlayerActivities objects containing player activity information. + """ data = [d.model_dump() for d in data] - table = playerActivitiesDB + table = PlayerActivitiesDB sql = insert(table).values(data) + async with self._get_session() as session: - session: AsyncSession + session: AsyncSession # Type annotation for clarity await session.execute(sql) - return None diff --git a/src/app/repositories/highscore.py b/src/app/repositories/highscore.py index a7af387..085c44a 100644 --- a/src/app/repositories/highscore.py +++ b/src/app/repositories/highscore.py @@ -1,31 +1,60 @@ -from sqlalchemy import insert, select +from sqlalchemy import insert, select, update from sqlalchemy.ext.asyncio import AsyncSession from src.app.repositories.abc import ABCRepo from src.app.schemas.input.highscore import PlayerHiscoreData +from src.app.schemas.input.player import Player from src.database.models.highscores import PlayerHiscoreData as PlayerHiscoreDataDB +from src.database.models.player import Player as PlayerDB class HighscoreRepo(ABCRepo): + """Repository for managing highscore data.""" + def __init__(self) -> None: + """Initializes the HighscoreRepo instance.""" super().__init__() - async def request(self, id: list[int] = None): + async def request(self, id: list[int] = None) -> list[PlayerHiscoreDataDB]: + """Retrieves highscore data from the database. 
+ + Args: + id: Optional list of highscore IDs to filter by. + + Returns: + A list of PlayerHiscoreDataDB objects representing the retrieved highscores. + """ table = PlayerHiscoreDataDB sql = select(table) - sql = sql.where(table.id.in_(id)) if id else sql + if id: + sql = sql.where(table.id.in_(id)) async with self._get_session() as session: - session: AsyncSession + session: AsyncSession # Type annotation for clarity data = await session.execute(sql) data = await data.all() return data - async def create(self, data: list[PlayerHiscoreData]): - data = [d.model_dump() for d in data] - table = PlayerHiscoreDataDB - sql = insert(table) + async def create( + self, highscore_data: list[PlayerHiscoreData], player_data: list[Player] + ) -> None: + """Creates new highscore entries and updates player data in the database. + + Args: + highscore_data: A list of PlayerHiscoreData objects containing highscore information. + player_data: A list of Player objects containing player information to update. + """ + table_highscore = PlayerHiscoreDataDB + table_player = PlayerDB + highscore_data = [d.model_dump() for d in highscore_data] + + sql_insert = insert(table_highscore).values(highscore_data) + sql_update = update(table_player) async with self._get_session() as session: - session: AsyncSession - await session.execute(sql) - return None + session: AsyncSession # Type annotation for clarity + await session.execute(sql_insert) # Insert highscore data + + for player in player_data: + sql_update = sql_update.where(table_player.id == player.id) + sql_update = sql_update.values(player.model_dump()) + await session.execute(sql_update) # Update player data diff --git a/src/app/repositories/skills.py b/src/app/repositories/skills.py index f5bd07b..c60737f 100644 --- a/src/app/repositories/skills.py +++ b/src/app/repositories/skills.py @@ -8,30 +8,51 @@ class SkillsRepo(ABCRepo): + """Repository for managing skill data.""" + def __init__(self) -> None: + """Initializes the SkillsRepo instance.""" super().__init__() - async def request(self, skill_id: int = None): + async def request(self, skill_id: int = None) -> list[SkillsDB]: + """Retrieves skill data from the database. + + Args: + skill_id: Optional skill ID to filter by. + + Returns: + A list of SkillsDB objects representing the retrieved skills. + """ table = SkillsDB sql = select(table) + if skill_id: sql = sql.where(table.skill_id == skill_id) + async with self._get_session() as session: - session: AsyncSession + session: AsyncSession # Type annotation for clarity data = await session.execute(sql) data = await data.all() return data class PlayerSkillsRepo(ABCRepo): + """Repository for managing player skill data.""" + def __init__(self) -> None: + """Initializes the PlayerSkillsRepo instance.""" super().__init__() - async def create(self, data: list[PlayerSkills]): + async def create(self, data: list[PlayerSkills]) -> None: + """Creates new player skill entries in the database. + + Args: + data: A list of PlayerSkills objects containing player skill information. 
+ """ data = [d.model_dump() for d in data] table = PlayerSkillsDB sql = insert(table).values(data) + async with self._get_session() as session: - session: AsyncSession + session: AsyncSession # Type annotation for clarity await session.execute(sql) - return None diff --git a/src/main.py b/src/main.py index 5028bdf..4a9b74b 100644 --- a/src/main.py +++ b/src/main.py @@ -4,23 +4,8 @@ import time import traceback from asyncio import Queue +from datetime import datetime, timedelta -from aiokafka import AIOKafkaConsumer, AIOKafkaProducer -from app.schemas.highscores import ( - player as playerSchema, -) -from app.schemas.highscores import ( - playerActivities as playerActivitiesSchema, -) -from app.schemas.highscores import ( - playerHiscoreData as playerHiscoreDataSchema, -) -from app.schemas.highscores import ( - playerSkills as playerSkillsSchema, -) -from app.schemas.highscores import ( - scraperData as scraperDataSchema, -) from core.config import settings from database.database import get_session from database.models.highscores import ( @@ -32,10 +17,10 @@ from sqlalchemy.exc import IntegrityError, OperationalError from sqlalchemy.ext.asyncio import AsyncSession -# if os.getenv("ENABLE_DEBUGPY") == "true": -# debugpy.listen(("0.0.0.0", 5678)) -# print("Waiting for debugger to attach...") -# debugpy.wait_for_client() +from src import kafka +from src.app.repositories.highscore import HighscoreRepo +from src.app.schemas.input.highscore import PlayerHiscoreData +from src.app.schemas.input.player import Player logger = logging.getLogger(__name__) @@ -52,8 +37,6 @@ class NewDataSchema(BaseModel): player: playerSchema -from datetime import datetime, timedelta - # Global variables to cache the skill and activity names SKILL_NAMES: list[playerSkillsSchema] = [] ACTIVITY_NAMES: list[playerActivitiesSchema] = [] @@ -65,51 +48,6 @@ class NewDataSchema(BaseModel): LAST_ACTIVITY_NAMES_UPDATE = datetime.min -async def kafka_consumer(topic: str, group: str): - consumer = AIOKafkaConsumer( - topic, - bootstrap_servers=[settings.KAFKA_HOST], - group_id=group, - value_deserializer=lambda x: json.loads(x.decode("utf-8")), - auto_offset_reset="earliest", - ) - await consumer.start() - return consumer - - -async def kafka_producer(): - producer = AIOKafkaProducer( - bootstrap_servers=[settings.KAFKA_HOST], - value_serializer=lambda v: json.dumps(v).encode(), - acks="all", - ) - await producer.start() - return producer - - -async def receive_messages( - consumer: AIOKafkaConsumer, receive_queue: Queue, error_queue: Queue -): - async for message in consumer: - if error_queue.qsize() > 100: - await asyncio.sleep(1) - continue - value = message.value - await receive_queue.put(value) - - -async def send_messages(topic: str, producer: AIOKafkaProducer, send_queue: Queue): - while True: - if send_queue.empty(): - await asyncio.sleep(1) - continue - message = await send_queue.get() - # Convert the Message object to a JSON serializable dictionary - message_dict = message.model_dump_json() - await producer.send(topic, value=message_dict) - send_queue.task_done() - - def log_speed( counter: int, start_time: float, _queue: Queue, topic: str, interval: int = 15 ) -> tuple[float, int]: @@ -135,44 +73,28 @@ def log_speed( return time.time(), 0 -# async def insert_data(batch: list[dict], error_queue: Queue): -# try: -# highscores: list[dict] = [msg.get("hiscores") for msg in batch] -# players: list[dict] = [msg.get("player") for msg in batch] - -# highscores = [playerHiscoreDataSchema(**hs) for hs in highscores if hs] -# 
highscores = [hs.model_dump(mode="json") for hs in highscores] - -# session: AsyncSession = await get_session() - -# logger.info(f"Received: {len(players)=}, {len(highscores)=}") - -# # start a transaction -# async with session.begin(): -# # insert into table values () -# insert_sql: Insert = insert(PlayerHiscoreData) -# insert_sql = insert_sql.values(highscores) -# insert_sql = insert_sql.prefix_with("ignore") -# await session.execute(insert_sql) -# # update table -# for player in players: -# update_sql: Update = update(Player) -# update_sql = update_sql.where(Player.id == player.get("id")) -# update_sql = update_sql.values(player) -# await session.execute(update_sql) -# except (OperationalError, IntegrityError) as e: -# for message in batch: -# await error_queue.put(message) +async def insert_data_v1(batch: list[Message], error_queue: Queue): + try: + highscores = [ + PlayerHiscoreData(**msg.hiscores) for msg in batch if msg.hiscores + ] + players = [Player(**msg.player) for msg in batch] + logger.info(f"Received: {len(players)=}, {len(highscores)=}") + repo = HighscoreRepo() + await repo.create(highscore_data=highscores, player_data=players) + except (OperationalError, IntegrityError) as e: + for message in batch: + await error_queue.put(message) -# logger.error({"error": e}) -# logger.info(f"error_qsize={error_queue.qsize()}, {message=}") -# except Exception as e: -# for message in batch: -# await error_queue.put(message) + logger.error({"error": e}) + logger.info(f"error_qsize={error_queue.qsize()}, {message=}") + except Exception as e: + for message in batch: + await error_queue.put(message) -# logger.error({"error": e}) -# logger.debug(f"Traceback: \n{traceback.format_exc()}") -# logger.info(f"error_qsize={error_queue.qsize()}, {message=}") + logger.error({"error": e}) + logger.debug(f"Traceback: \n{traceback.format_exc()}") + logger.info(f"error_qsize={error_queue.qsize()}, {message=}") async def check_and_update_skill_cache(batch: list[Message], session: AsyncSession): @@ -252,7 +174,7 @@ async def check_and_update_activity_cache(batch: list[Message], session: AsyncSe return activity_ids -async def insert_data(batch: list[Message], error_queue: Queue): +async def insert_data_v2(batch: list[Message], error_queue: Queue): """ 1. check for duplicates in scraper_data[player_id, record_date], remove all duplicates 2. start transaction @@ -260,11 +182,15 @@ async def insert_data(batch: list[Message], error_queue: Queue): 4. for each player get the scraper_id from scraper_data 5. insert into player_skills (scraper_id, skill_id) values (), () 6. 
insert into player_activities (scraper_id, activity_id) values (), () - + step 5 & 6 must be batched for all players at once """ # debugpy.breakpoint() - global SKILL_NAMES, ACTIVITY_NAMES, LAST_SKILL_NAMES_UPDATE, LAST_ACTIVITY_NAMES_UPDATE + global \ + SKILL_NAMES, \ + ACTIVITY_NAMES, \ + LAST_SKILL_NAMES_UPDATE, \ + LAST_ACTIVITY_NAMES_UPDATE try: session: AsyncSession = await get_session() @@ -319,7 +245,6 @@ async def transform_data( new_data_list = [] for old_data in old_data_list: - # Query the cache to get the skill and activity IDs skill_ids = ( {skill.name: skill.id for skill in SKILL_NAMES} if SKILL_NAMES else {} @@ -450,15 +375,8 @@ async def process_data(receive_queue: Queue, error_queue: Queue): # Get a message from the chosen queue message: dict = await receive_queue.get() - # debugpy.breakpoint() - # make sure the message has the 'hiscores' key and it's not None - if "hiscores" not in message or message["hiscores"] is None: - continue - # Ensure the 'player.normalized_name' key exists in the message - if "player" in message and "normalized_name" not in message["player"]: - message["player"]["normalized_name"] = "" # or some default value - message: Message = Message(**message) + # TODO fix test data if settings.ENV != "PRD": player = message.player @@ -476,7 +394,8 @@ async def process_data(receive_queue: Queue, error_queue: Queue): # insert data in batches of N or interval of N if len(batch) > 100 or now - start_time > 15: async with semaphore: - await insert_data(batch=batch, error_queue=error_queue) + await insert_data_v1(batch=batch, error_queue=error_queue) + await insert_data_v2(batch=batch, error_queue=error_queue) batch = [] receive_queue.task_done() @@ -485,19 +404,19 @@ async def process_data(receive_queue: Queue, error_queue: Queue): async def main(): # get kafka engine - consumer = await kafka_consumer(topic="scraper", group="highscore-worker") - producer = await kafka_producer() + consumer = await kafka.kafka_consumer(topic="scraper", group="highscore-worker") + producer = await kafka.kafka_producer() receive_queue = Queue(maxsize=100) send_queue = Queue(maxsize=100) asyncio.create_task( - receive_messages( + kafka.receive_messages( consumer=consumer, receive_queue=receive_queue, error_queue=send_queue ) ) asyncio.create_task( - send_messages(topic="scraper", producer=producer, send_queue=send_queue) + kafka.send_messages(topic="scraper", producer=producer, send_queue=send_queue) ) asyncio.create_task( process_data(receive_queue=receive_queue, error_queue=send_queue) From 490c76f07188f9a187575c43867510d3ff654520 Mon Sep 17 00:00:00 2001 From: RusticPotato Date: Tue, 27 Feb 2024 15:15:30 -0500 Subject: [PATCH 16/39] checkpoint --- src/main.py | 91 +++++++++++++++++++++++++++-------------------------- 1 file changed, 46 insertions(+), 45 deletions(-) diff --git a/src/main.py b/src/main.py index 4a9b74b..f6c268d 100644 --- a/src/main.py +++ b/src/main.py @@ -9,8 +9,8 @@ from core.config import settings from database.database import get_session from database.models.highscores import ( - Activities, - Skills, + Activities as ActivitiesDB, + Skills as SkillsDB, ) from pydantic import BaseModel from sqlalchemy import select @@ -18,28 +18,32 @@ from sqlalchemy.ext.asyncio import AsyncSession from src import kafka +# schemas import from src.app.repositories.highscore import HighscoreRepo from src.app.schemas.input.highscore import PlayerHiscoreData from src.app.schemas.input.player import Player +from src.app.schemas.input.scraper_data import ScraperData +from 
src.app.schemas.input.activities import Activities, PlayerActivities +from src.app.schemas.input.skills import Skills, PlayerSkills logger = logging.getLogger(__name__) class Message(BaseModel): - hiscores: playerHiscoreDataSchema | None - player: playerSchema | None + hiscores: PlayerHiscoreData | None + player: Player | None class NewDataSchema(BaseModel): - scraper_data: scraperDataSchema - player_skills: list[playerSkillsSchema] - player_activities: list[playerActivitiesSchema] - player: playerSchema + scraper_data: ScraperData + player_skills: list[PlayerSkills] + player_activities: list[PlayerActivities] + player: Player # Global variables to cache the skill and activity names -SKILL_NAMES: list[playerSkillsSchema] = [] -ACTIVITY_NAMES: list[playerActivitiesSchema] = [] +SKILL_NAMES: list[Skills] = [] +ACTIVITY_NAMES: list[Activities] = [] # Global variables for the locks SKILL_NAMES_LOCK = asyncio.Lock() ACTIVITY_NAMES_LOCK = asyncio.Lock() @@ -185,42 +189,42 @@ async def insert_data_v2(batch: list[Message], error_queue: Queue): step 5 & 6 must be batched for all players at once """ - # debugpy.breakpoint() - global \ - SKILL_NAMES, \ - ACTIVITY_NAMES, \ - LAST_SKILL_NAMES_UPDATE, \ - LAST_ACTIVITY_NAMES_UPDATE - try: session: AsyncSession = await get_session() - # # Check and update the skill and activity caches - # if ( - # await check_and_update_skill_cache(batch, session) is None - # or await check_and_update_activity_cache(batch, session) is None - # ): - # return - - batch_return = await transform_data(batch, session) + # Step 1: Check for duplicates in scraper_data[player_id, record_date], remove all duplicates + for message in batch: + existing_data = await session.query(ScraperData).filter( + ScraperData.player_id == message.player_id, + ScraperData.record_date == message.record_date + ).first() + if existing_data: + session.delete(existing_data) + await session.commit() + + # Step 2: Start transaction async with session.begin(): - for new_data in batch_return: - # insert into scraper_data table - scraper_data = new_data.scraper_data + for message in batch: + # Step 3: For each player insert into scraper_data + scraper_data = ScraperData( + player_id=message.player_id, + record_date=message.record_date + ) session.add(scraper_data) + await session.flush() - # insert into player_skills table - player_skills = new_data.player_skills - session.bulk_save_objects(player_skills) + # Step 4: For each player get the scraper_id from scraper_data + scraper_id = scraper_data.scraper_id - # insert into player_activities table - player_activities = new_data.player_activities - session.bulk_save_objects(player_activities) + # Step 5 & 6: Insert into player_skills and player_activities + # Assuming you have the skills and activities data in the message + player_skills = [PlayerSkill(scraper_id=scraper_id, skill_id=skill_id) for skill_id in message.skills] + player_activities = [PlayerActivity(scraper_id=scraper_id, activity_id=activity_id) for activity_id in message.activities] - # update Player table - player = new_data.player - session.merge(player) + session.bulk_save_objects(player_skills) + session.bulk_save_objects(player_activities) + # Commit the transaction await session.commit() except (OperationalError, IntegrityError) as e: for message in batch: @@ -236,7 +240,6 @@ async def insert_data_v2(batch: list[Message], error_queue: Queue): logger.debug(f"Traceback: \n{traceback.format_exc()}") logger.info(f"error_qsize={error_queue.qsize()}, {message=}") - async def transform_data( 
old_data_list: list[Message], session: AsyncSession ) -> NewDataSchema: @@ -324,16 +327,15 @@ async def transform_data( new_data_list.append(new_data) return new_data_list - - +## todo: verify this is rigth async def update_skill_names(session: AsyncSession): global SKILL_NAMES, SKILL_NAMES_LOCK async with SKILL_NAMES_LOCK: if SKILL_NAMES is None: - skill_records = await session.execute(select(Skills)) + skill_records = await session.execute(select(SkillsDB)) SKILL_NAMES = [ - playerSkillsSchema(**record) for record in skill_records.scalars().all() + SkillsDB(**record) for record in skill_records.scalars().all() ] @@ -342,13 +344,12 @@ async def update_activity_names(session: AsyncSession): async with ACTIVITY_NAMES_LOCK: if ACTIVITY_NAMES is None: - activity_records = await session.execute(select(Activities)) + activity_records = await session.execute(select(ActivitiesDB)) ACTIVITY_NAMES = [ - playerActivitiesSchema(**record) + ActivitiesDB(**record) for record in activity_records.scalars().all() ] - async def process_data(receive_queue: Queue, error_queue: Queue): # Initialize counter and start time counter = 0 From 894cbc284980adea78e1bad326a5997e0a866696 Mon Sep 17 00:00:00 2001 From: extreme4all <40169115+extreme4all@users.noreply.github.com> Date: Sat, 24 Feb 2024 20:40:01 +0100 Subject: [PATCH 17/39] left some notes behind --- src/app/schemas/input/highscore.py | 1 + src/main.py | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/app/schemas/input/highscore.py b/src/app/schemas/input/highscore.py index 4d4f919..6cc8743 100644 --- a/src/app/schemas/input/highscore.py +++ b/src/app/schemas/input/highscore.py @@ -1,4 +1,5 @@ from datetime import datetime +from typing import Optional from pydantic import BaseModel, ConfigDict diff --git a/src/main.py b/src/main.py index f6c268d..e5e5e50 100644 --- a/src/main.py +++ b/src/main.py @@ -6,14 +6,16 @@ from asyncio import Queue from datetime import datetime, timedelta + from core.config import settings from database.database import get_session from database.models.highscores import ( Activities as ActivitiesDB, Skills as SkillsDB, ) +from database.models.player import Player from pydantic import BaseModel -from sqlalchemy import select +from sqlalchemy import insert, select, update from sqlalchemy.exc import IntegrityError, OperationalError from sqlalchemy.ext.asyncio import AsyncSession @@ -76,8 +78,19 @@ def log_speed( # Return the current time and reset the counter to zero return time.time(), 0 - async def insert_data_v1(batch: list[Message], error_queue: Queue): + """ + 1. check for duplicates in scraper_data[player_id, record_date], remove all duplicates + 2. start transaction + 3. for each player insert into scraper_data + 4. for each player get the scraper_id from scraper_data + 5. insert into player_skills (scraper_id, skill_id) values (), () + 6. 
insert into player_activities (scraper_id, activity_id) values (), () + + step 5 & 6 must be batched for all players at once + """ + session: AsyncSession = await get_session() + logger.debug(batch[:1]) try: highscores = [ PlayerHiscoreData(**msg.hiscores) for msg in batch if msg.hiscores From 457351c71dc0d062ca45f07bd2c8a72537a40334 Mon Sep 17 00:00:00 2001 From: RusticPotato Date: Tue, 27 Feb 2024 16:13:29 -0500 Subject: [PATCH 18/39] Add launch configurations for development and running the application --- .vscode/launch.json | 25 ++++++++++ .vscode/tasks.json | 17 +++---- Dockerfile | 4 +- Makefile | 14 +++++- docker-compose-dev.yml | 111 +++++++++++++++++++++++++++++++++++++++++ docker-compose.yml | 8 +-- 6 files changed, 161 insertions(+), 18 deletions(-) create mode 100644 docker-compose-dev.yml diff --git a/.vscode/launch.json b/.vscode/launch.json index fafe6d5..f0973e3 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -27,5 +27,30 @@ "console": "integratedTerminal", "justMyCode": true }, + { + "name": "Compose Up Dev", + "type": "python", + "request": "launch", + "program": "${workspaceFolder}/src/main.py", // replace with your script + "console": "integratedTerminal", + "justMyCode": true, + "preLaunchTask": "compose-up-dev" // name of the task to run before launching + }, + { + "name": "Run Dev", + "type": "python", + "request": "launch", + "program": "${workspaceFolder}/src/main.py", + "console": "integratedTerminal", + "justMyCode": true, + "args": ["--root_path", "/", "--api_port", "5000"], + "env": { + "KAFKA_HOST": "kafka:9092", + "DATABASE_URL": "mysql+aiomysql://root:root_bot_buster@mysql:3306/playerdata", + "POOL_TIMEOUT": "30", + "POOL_RECYCLE": "30" + } + } ] } + diff --git a/.vscode/tasks.json b/.vscode/tasks.json index 08a8c36..ee19907 100644 --- a/.vscode/tasks.json +++ b/.vscode/tasks.json @@ -6,17 +6,12 @@ "type": "shell", "command": "docker-compose down --volumes && docker-compose up --build -d", "isBackground": true, - "problemMatcher": { - "owner": "custom", - "pattern": { - "regexp": "^(.*)$" - }, - "background": { - "activeOnStart": true, - "beginsPattern": "^(.*Starting development server.*)$", - "endsPattern": "^(.*Attaching to.*)$" - } - } + }, + { + "label": "compose-up-dev", + "type": "shell", + "command": "docker-compose -f docker-compose-dev.yml down --volumes && docker-compose -f docker-compose-dev.yml up --build -d", + "isBackground": true, } ] } \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 47413f3..8ccac08 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,8 +13,8 @@ WORKDIR /project COPY ./requirements.txt /project RUN pip install --no-cache-dir -r requirements.txt -# PTVSD is a Python debugger that can be used in a container -ARG INSTALL_PTVSD=false +# # PTVSD is a Python debugger that can be used in a container +# ARG INSTALL_PTVSD=false RUN if [ "$INSTALL_PTVSD" = "true" ] ; then pip install debugpy ; fi # copy the scripts to the folder diff --git a/Makefile b/Makefile index 6bce355..6e3a830 100644 --- a/Makefile +++ b/Makefile @@ -6,6 +6,9 @@ build: up: docker-compose up -d +updev: + docker-compose -f docker-compose-dev.yml up -d + down: docker-compose down @@ -13,4 +16,13 @@ clean: docker-compose down --volumes cleanbuild: clean - docker-compose up --build \ No newline at end of file + docker-compose up --build + +create: + python3 -m venv .venv + +activate: + source .venv/bin/activate + +requirements: + pip install -r requirements.txt \ No newline at end of file diff --git a/docker-compose-dev.yml 
b/docker-compose-dev.yml new file mode 100644 index 0000000..11b8e37 --- /dev/null +++ b/docker-compose-dev.yml @@ -0,0 +1,111 @@ +version: '3' +services: + kafka: + container_name: kafka + image: bitnami/kafka:3.5.1-debian-11-r3 + environment: + - ALLOW_PLAINTEXT_LISTENER=yes + - KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093,EXTERNAL://:9094 + - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT,EXTERNAL:PLAINTEXT + - KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092,EXTERNAL://localhost:9094 + - KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE=false + # volumes: + # - ./kafka:/bitnami/kafka:rw + expose: + - 9094 + - 9092 + healthcheck: + test: ["CMD", "kafka-topics.sh", "--list", "--bootstrap-server", "localhost:9092"] + interval: 30s + timeout: 10s + retries: 5 + networks: + - botdetector-network + + kafdrop: + container_name: kafdrop + image: obsidiandynamics/kafdrop:latest + environment: + - KAFKA_BROKERCONNECT=kafka:9092 + - JVM_OPTS=-Xms32M -Xmx64M + - SERVER_SERVLET_CONTEXTPATH=/ + ports: + - 9999:9000 + restart: on-failure + networks: + - botdetector-network + depends_on: + kafka: + condition: service_healthy + + kafka_setup: + container_name: kafka_setup + image: bot-detector/kafka_setup + build: + context: ./kafka_setup + command: ["python", "setup_kafka.py"] + environment: + - KAFKA_BROKER=kafka:9092 + networks: + - botdetector-network + depends_on: + kafka: + condition: service_healthy + + mysql: + container_name: database + build: + context: ./mysql + image: bot-detector/mysql:latest + environment: + - MYSQL_ROOT_PASSWORD=root_bot_buster + volumes: + - ./mysql/docker-entrypoint-initdb.d:/docker-entrypoint-initdb.d + # - ./mysql/mount:/var/lib/mysql # creates persistence + # ports: + # - 3306:3306 + expose: + - 3306 + networks: + - botdetector-network + healthcheck: + test: ["CMD-SHELL", "mysqladmin ping -h localhost -u root -proot_bot_buster"] + interval: 10s + retries: 3 + start_period: 30s + timeout: 5s + + # worker: + # container_name: worker + # image: bot-detector/highscore_worker + # build: + # context: . 
+ # dockerfile: Dockerfile + # target: base + # args: + # root_path: / + # api_port: 5000 + # # INSTALL_PTVSD: true + # # command: bash -c "apt update && apt install -y curl && sleep infinity" + # command: python src/main.py + # # ports: + # # - 5678:5678 + # environment: + # - KAFKA_HOST=kafka:9092 + # - DATABASE_URL=mysql+aiomysql://root:root_bot_buster@mysql:3306/playerdata + # - POOL_TIMEOUT=30 + # - POOL_RECYCLE=30 + # # - ENABLE_DEBUGPY=true + # # - PYDEVD_DISABLE_FILE_VALIDATION=1 + # networks: + # - botdetector-network + # volumes: + # - ./src:/project/src + # depends_on: + # kafka: + # condition: service_healthy + # mysql: + # condition: service_healthy + +networks: + botdetector-network: diff --git a/docker-compose.yml b/docker-compose.yml index bb7d8e9..9e19330 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -88,15 +88,15 @@ services: INSTALL_PTVSD: true # command: bash -c "apt update && apt install -y curl && sleep infinity" command: python src/main.py - ports: - - 5678:5678 + # ports: + # - 5678:5678 environment: - KAFKA_HOST=kafka:9092 - DATABASE_URL=mysql+aiomysql://root:root_bot_buster@mysql:3306/playerdata - POOL_TIMEOUT=30 - POOL_RECYCLE=30 - - ENABLE_DEBUGPY=true - - PYDEVD_DISABLE_FILE_VALIDATION=1 + # - ENABLE_DEBUGPY=true + # - PYDEVD_DISABLE_FILE_VALIDATION=1 networks: - botdetector-network volumes: From 0f6df52c38e335a79f6ee2321cd5a3b1750d4186 Mon Sep 17 00:00:00 2001 From: RusticPotato Date: Tue, 27 Feb 2024 17:05:30 -0500 Subject: [PATCH 19/39] working debug --- .vscode/launch.json | 7 ++++--- Makefile | 6 +++--- docker-compose-dev.yml | 39 ++++++++++++++++++++++----------------- 3 files changed, 29 insertions(+), 23 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index f0973e3..ad89780 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -45,11 +45,12 @@ "justMyCode": true, "args": ["--root_path", "/", "--api_port", "5000"], "env": { - "KAFKA_HOST": "kafka:9092", - "DATABASE_URL": "mysql+aiomysql://root:root_bot_buster@mysql:3306/playerdata", + "KAFKA_HOST": "localhost:9094", + "DATABASE_URL": "mysql+aiomysql://root:root_bot_buster@localhost:3306/playerdata", "POOL_TIMEOUT": "30", "POOL_RECYCLE": "30" - } + }, + "cwd": "${workspaceFolder}/" } ] } diff --git a/Makefile b/Makefile index 6e3a830..e74f0ee 100644 --- a/Makefile +++ b/Makefile @@ -6,9 +6,6 @@ build: up: docker-compose up -d -updev: - docker-compose -f docker-compose-dev.yml up -d - down: docker-compose down @@ -18,6 +15,9 @@ clean: cleanbuild: clean docker-compose up --build +updev: clean + docker-compose -f docker-compose-dev.yml up -d + create: python3 -m venv .venv diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml index 11b8e37..a39f159 100644 --- a/docker-compose-dev.yml +++ b/docker-compose-dev.yml @@ -14,6 +14,9 @@ services: expose: - 9094 - 9092 + ports: + # - 9092:9092 + - 9094:9094 healthcheck: test: ["CMD", "kafka-topics.sh", "--list", "--bootstrap-server", "localhost:9092"] interval: 30s @@ -22,21 +25,21 @@ services: networks: - botdetector-network - kafdrop: - container_name: kafdrop - image: obsidiandynamics/kafdrop:latest - environment: - - KAFKA_BROKERCONNECT=kafka:9092 - - JVM_OPTS=-Xms32M -Xmx64M - - SERVER_SERVLET_CONTEXTPATH=/ - ports: - - 9999:9000 - restart: on-failure - networks: - - botdetector-network - depends_on: - kafka: - condition: service_healthy + # kafdrop: + # container_name: kafdrop + # image: obsidiandynamics/kafdrop:latest + # environment: + # - KAFKA_BROKERCONNECT=kafka:9092 + # - JVM_OPTS=-Xms32M -Xmx64M + # - 
SERVER_SERVLET_CONTEXTPATH=/ + # ports: + # - 9999:9000 + # restart: on-failure + # networks: + # - botdetector-network + # depends_on: + # kafka: + # condition: service_healthy kafka_setup: container_name: kafka_setup @@ -48,6 +51,8 @@ services: - KAFKA_BROKER=kafka:9092 networks: - botdetector-network + # ports: + # - 9092:9092 depends_on: kafka: condition: service_healthy @@ -62,8 +67,8 @@ services: volumes: - ./mysql/docker-entrypoint-initdb.d:/docker-entrypoint-initdb.d # - ./mysql/mount:/var/lib/mysql # creates persistence - # ports: - # - 3306:3306 + ports: + - 3306:3306 expose: - 3306 networks: From 8b7074f6f69e4559f6fb3bd45938254b9d8f9070 Mon Sep 17 00:00:00 2001 From: RusticPotato Date: Tue, 27 Feb 2024 18:13:59 -0500 Subject: [PATCH 20/39] Refactor due to circular import error, Update Kafka setup and add my_kafka module --- kafka_setup/setup_kafka.py | 2 +- src/kafka_setup/setup_kafka.py | 134 +++++++++++++++++++++++++++++++++ src/{kafka.py => my_kafka.py} | 5 +- 3 files changed, 139 insertions(+), 2 deletions(-) create mode 100644 src/kafka_setup/setup_kafka.py rename src/{kafka.py => my_kafka.py} (82%) diff --git a/kafka_setup/setup_kafka.py b/kafka_setup/setup_kafka.py index 1cca1c9..c99de1f 100644 --- a/kafka_setup/setup_kafka.py +++ b/kafka_setup/setup_kafka.py @@ -4,7 +4,7 @@ import zipfile from queue import Queue -from kafka import KafkaProducer +from my_kafka import KafkaProducer from kafka.admin import KafkaAdminClient, NewTopic diff --git a/src/kafka_setup/setup_kafka.py b/src/kafka_setup/setup_kafka.py new file mode 100644 index 0000000..c99de1f --- /dev/null +++ b/src/kafka_setup/setup_kafka.py @@ -0,0 +1,134 @@ +# setup_kafka.py +import json +import os +import zipfile +from queue import Queue + +from my_kafka import KafkaProducer +from kafka.admin import KafkaAdminClient, NewTopic + + +def create_topics(): + # Get the Kafka broker address from the environment variable + kafka_broker = os.environ.get("KAFKA_BROKER", "localhost:9094") + + # Create Kafka topics + admin_client = KafkaAdminClient(bootstrap_servers=kafka_broker) + + topics = admin_client.list_topics() + print("existing topics", topics) + + if not topics == []: + admin_client.delete_topics(topics) + + res = admin_client.create_topics( + [ + NewTopic( + name="player", + num_partitions=3, + replication_factor=1, + ), + NewTopic( + name="scraper", + num_partitions=4, + replication_factor=1, + ), + NewTopic( + name="reports", + num_partitions=4, + replication_factor=1, + ), + ] + ) + + print("created_topic", res) + + topics = admin_client.list_topics() + print("all topics", topics) + return + + +def extract_zip(extract_to: str): + current_dir = "./kafka_data" # Get the current working directory + + # Find zip file in the current directory + zip_files = [file for file in os.listdir(current_dir) if file.endswith(".zip")] + + if not zip_files: + print("No zip file found in the current directory") + return + + # Select the first zip file found + for zip_file in zip_files: + print(f"extracting: {zip_file}") + zip_file_path = os.path.join(current_dir, zip_file) + # Create the extraction directory if it doesn't exist + if not os.path.exists(extract_to): + os.makedirs(extract_to) + + # Extract zipfile + with zipfile.ZipFile(zip_file_path, "r") as zip_ref: + zip_ref.extractall(extract_to) + return + + +def get_messages_from_json(path: str, send_queue: Queue): + paths = [] + for file_name in os.listdir(path): + print(f"{file_name=}") + if file_name.endswith(".json"): + file_path = os.path.join(path, file_name) + 
paths.append(file_path) + + for _path in paths: + print(f"{_path=}") + with open(_path) as file: + data = json.load(file) + print(f"{_path}:{len(data)}") + _ = [send_queue.put(item=d) for d in data] + return + + +def kafka_producer(): + kafka_broker = os.environ.get("KAFKA_BROKER", "localhost:9094") + producer = KafkaProducer( + bootstrap_servers=kafka_broker, + value_serializer=lambda x: json.dumps(x).encode(), + ) + return producer + + +def send_messages(producer: KafkaProducer, send_queue: Queue, topic: str = "scraper"): + while True: + if send_queue.empty(): + break + + if send_queue.qsize() % 100 == 0: + print(f"{send_queue.qsize()=}") + message = send_queue.get() + producer.send(topic=topic, value=message) + send_queue.task_done() + + +def insert_data(): + send_queue = Queue() + extract_to = "kafka_data" + producer = kafka_producer() + + print("extract_zip") + extract_zip(extract_to) + print("get_messages_from_json") + get_messages_from_json(extract_to, send_queue=send_queue) + print("send_messages") + send_messages(producer=producer, send_queue=send_queue) + + +def main(): + print("create_topics") + create_topics() + print("insert_data") + insert_data() + print("done") + + +main() diff --git a/src/kafka.py b/src/my_kafka.py similarity index 82% rename from src/kafka.py rename to src/my_kafka.py index 50242d8..0f18878 100644 --- a/src/kafka.py +++ b/src/my_kafka.py @@ -2,6 +2,9 @@ import json import logging from asyncio import Queue +from main import Message +from app.schemas.input.highscore import PlayerHiscoreData +from app.schemas.input.player import Player from aiokafka import AIOKafkaConsumer, AIOKafkaProducer from core.config import settings @@ -24,7 +27,7 @@ async def kafka_consumer(topic: str, group: str): async def kafka_producer(): producer = AIOKafkaProducer( bootstrap_servers=[settings.KAFKA_HOST], - value_serializer=lambda v: json.dumps(v).encode(), + value_serializer=lambda v: v.model_dump_json().encode() if isinstance(v, (Message, PlayerHiscoreData, Player)) else json.dumps(v).encode(), acks="all", ) await producer.start() From aba6336fa4be2497b77e040dc2e60f0e04ba406f Mon Sep 17 00:00:00 2001 From: RusticPotato Date: Tue, 27 Feb 2024 18:14:51 -0500 Subject: [PATCH 21/39] refactor for debug, reverting functions as _v1 and making a _v2 --- src/app/repositories/abc.py | 2 +- src/app/repositories/highscore.py | 10 +-- src/main.py | 140 +++++++++++++++++++++--------- 3 files changed, 106 insertions(+), 46 deletions(-) diff --git a/src/app/repositories/abc.py b/src/app/repositories/abc.py index f6c6051..f187a68 100644 --- a/src/app/repositories/abc.py +++ b/src/app/repositories/abc.py @@ -1,6 +1,6 @@ from abc import ABC, abstractmethod -from src.database.database import get_session +from database.database import get_session class ABCRepo(ABC): diff --git a/src/app/repositories/highscore.py b/src/app/repositories/highscore.py index 085c44a..38ec8aa 100644 --- a/src/app/repositories/highscore.py +++ b/src/app/repositories/highscore.py @@ -1,11 +1,11 @@ from sqlalchemy import insert, select, update from sqlalchemy.ext.asyncio import AsyncSession -from src.app.repositories.abc import ABCRepo -from src.app.schemas.input.highscore import PlayerHiscoreData -from src.app.schemas.input.player import Player -from src.database.models.highscores import PlayerHiscoreData as PlayerHiscoreDataDB -from src.database.models.player import Player as PlayerDB +from app.repositories.abc import ABCRepo +from app.schemas.input.highscore import PlayerHiscoreData +from app.schemas.input.player import 
Player +from database.models.highscores import PlayerHiscoreData as PlayerHiscoreDataDB +from database.models.player import Player as PlayerDB class HighscoreRepo(ABCRepo): diff --git a/src/main.py b/src/main.py index e5e5e50..7e6b601 100644 --- a/src/main.py +++ b/src/main.py @@ -9,24 +9,24 @@ from core.config import settings from database.database import get_session -from database.models.highscores import ( - Activities as ActivitiesDB, - Skills as SkillsDB, -) +from database.models.highscores import PlayerHiscoreData from database.models.player import Player +from database.models.skills import PlayerSkills as PlayerSkillsDB, Skills as SkillsDB +from database.models.activities import PlayerActivities as PlayerActivitiesDB, Activities as ActivitiesDB from pydantic import BaseModel -from sqlalchemy import insert, select, update +from sqlalchemy import insert, update from sqlalchemy.exc import IntegrityError, OperationalError from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.sql.expression import Insert, Update -from src import kafka +import my_kafka as my_kafka # schemas import -from src.app.repositories.highscore import HighscoreRepo -from src.app.schemas.input.highscore import PlayerHiscoreData -from src.app.schemas.input.player import Player -from src.app.schemas.input.scraper_data import ScraperData -from src.app.schemas.input.activities import Activities, PlayerActivities -from src.app.schemas.input.skills import Skills, PlayerSkills +from app.repositories.highscore import HighscoreRepo +from app.schemas.input.highscore import PlayerHiscoreData +from app.schemas.input.player import Player +from app.schemas.input.scraper_data import ScraperData +from app.schemas.input.activities import Activities, PlayerActivities +from app.schemas.input.skills import Skills, PlayerSkills logger = logging.getLogger(__name__) @@ -79,26 +79,30 @@ def log_speed( return time.time(), 0 async def insert_data_v1(batch: list[Message], error_queue: Queue): - """ - 1. check for duplicates in scraper_data[player_id, record_date], remove all duplicates - 2. start transaction - 3. for each player insert into scraper_data - 4. for each player get the scraper_id from scraper_data - 5. insert into player_skills (scraper_id, skill_id) values (), () - 6. 
insert into player_activities (scraper_id, activity_id) values (), () - - step 5 & 6 must be batched for all players at once - """ - session: AsyncSession = await get_session() - logger.debug(batch[:1]) try: - highscores = [ - PlayerHiscoreData(**msg.hiscores) for msg in batch if msg.hiscores - ] - players = [Player(**msg.player) for msg in batch] + highscores:list[dict] = [msg.get("hiscores") for msg in batch] + players:list[dict] = [msg.get("player") for msg in batch] + + highscores = [PlayerHiscoreData(**hs) for hs in highscores if hs] + highscores = [hs.model_dump(mode="json") for hs in highscores ] + + session: AsyncSession = await get_session() + logger.info(f"Received: {len(players)=}, {len(highscores)=}") - repo = HighscoreRepo() - await repo.create(highscore_data=highscores, player_data=players) + + # start a transaction + async with session.begin(): + # insert into table values () + insert_sql:Insert = insert(PlayerHiscoreData) # fixing v1, currently debugging here + insert_sql = insert_sql.values(highscores) + insert_sql = insert_sql.prefix_with("ignore") + await session.execute(insert_sql) + # update table + for player in players: + update_sql:Update = update(Player) + update_sql = update_sql.where(Player.id == player.get("id")) + update_sql = update_sql.values(player) + await session.execute(update_sql) except (OperationalError, IntegrityError) as e: for message in batch: await error_queue.put(message) @@ -363,7 +367,57 @@ async def update_activity_names(session: AsyncSession): for record in activity_records.scalars().all() ] -async def process_data(receive_queue: Queue, error_queue: Queue): +async def process_data_v1(receive_queue: Queue, error_queue: Queue): + # Initialize counter and start time + counter = 0 + start_time = time.time() + + # limit the number of async insert_data calls + semaphore = asyncio.Semaphore(5) + + batch = [] + # Run indefinitely + while True: + start_time, counter = log_speed( + counter=counter, + start_time=start_time, + _queue=receive_queue, + topic="scraper", + interval=15 + ) + + # Check if queue is empty + if receive_queue.empty(): + await asyncio.sleep(1) + continue + + # Get a message from the chosen queue + message: dict = await receive_queue.get() + + #TODO fix test data + if settings.ENV != "PRD": + player = message.get("player") + player_id = player.get("id") + MIN_PLAYER_ID = 0 + MAX_PLAYER_ID = 300 + if not (MIN_PLAYER_ID < player_id <= MAX_PLAYER_ID): + continue + + # batch message + batch.append(message) + + now = time.time() + + # insert data in batches of N or interval of N + if len(batch) > 100 or now-start_time > 15: + async with semaphore: + await insert_data_v1(batch=batch, error_queue=error_queue) + batch = [] + + receive_queue.task_done() + counter += 1 + +async def process_data_v2(receive_queue: Queue, error_queue: Queue): # Initialize counter and start time counter = 0 start_time = time.time() @@ -388,15 +442,18 @@ async def process_data(receive_queue: Queue, error_queue: Queue): continue # Get a message from the chosen queue - message: dict = await receive_queue.get() message: Message = Message(**message) # TODO fix test data if settings.ENV != "PRD": - player = message.player - player_id = player.id # Access the 'id' attribute directly + continue_flag = False MIN_PLAYER_ID = 0 MAX_PLAYER_ID = 300 + # original + + player = message.player + player_id = player.id # Access the 'id' attribute directly + if not (MIN_PLAYER_ID < player_id <= MAX_PLAYER_ID): continue @@ -408,7 +465,6 @@ async def process_data(receive_queue: Queue, 
error_queue: Queue): # insert data in batches of N or interval of N if len(batch) > 100 or now - start_time > 15: async with semaphore: - await insert_data_v1(batch=batch, error_queue=error_queue) await insert_data_v2(batch=batch, error_queue=error_queue) batch = [] @@ -418,24 +474,28 @@ async def process_data(receive_queue: Queue, error_queue: Queue): async def main(): # get kafka engine - consumer = await kafka.kafka_consumer(topic="scraper", group="highscore-worker") - producer = await kafka.kafka_producer() + consumer = await my_kafka.kafka_consumer(topic="scraper", group="highscore-worker") + producer = await my_kafka.kafka_producer() receive_queue = Queue(maxsize=100) send_queue = Queue(maxsize=100) asyncio.create_task( - kafka.receive_messages( + my_kafka.receive_messages( consumer=consumer, receive_queue=receive_queue, error_queue=send_queue ) ) asyncio.create_task( - kafka.send_messages(topic="scraper", producer=producer, send_queue=send_queue) + my_kafka.send_messages(topic="scraper", producer=producer, send_queue=send_queue) ) asyncio.create_task( - process_data(receive_queue=receive_queue, error_queue=send_queue) + process_data_v1(receive_queue=receive_queue, error_queue=send_queue) ) + # asyncio.create_task( + # process_data_v2(receive_queue=receive_queue, error_queue=send_queue) + # ) + while True: await asyncio.sleep(60) From cb176fc7c055b209e5b0d3e31a3493165b46ee91 Mon Sep 17 00:00:00 2001 From: RusticPotato Date: Tue, 27 Feb 2024 18:20:04 -0500 Subject: [PATCH 22/39] working v1 --- src/main.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main.py b/src/main.py index 7e6b601..de92fa1 100644 --- a/src/main.py +++ b/src/main.py @@ -9,8 +9,8 @@ from core.config import settings from database.database import get_session -from database.models.highscores import PlayerHiscoreData -from database.models.player import Player +from database.models.highscores import PlayerHiscoreData as PlayerHiscoreDataDB +from database.models.player import Player as PlayerDB from database.models.skills import PlayerSkills as PlayerSkillsDB, Skills as SkillsDB from database.models.activities import PlayerActivities as PlayerActivitiesDB, Activities as ActivitiesDB from pydantic import BaseModel @@ -93,14 +93,14 @@ async def insert_data_v1(batch: list[Message], error_queue: Queue): # start a transaction async with session.begin(): # insert into table values () - insert_sql:Insert = insert(PlayerHiscoreData) # fixing v1, currently debugging here + insert_sql:Insert = insert(PlayerHiscoreDataDB) # fixing v1, currently debugging here insert_sql = insert_sql.values(highscores) insert_sql = insert_sql.prefix_with("ignore") await session.execute(insert_sql) # update table for player in players: - update_sql:Update = update(Player) - update_sql = update_sql.where(Player.id == player.get("id")) + update_sql:Update = update(PlayerDB) + update_sql = update_sql.where(PlayerDB.id == player.get("id")) update_sql = update_sql.values(player) await session.execute(update_sql) except (OperationalError, IntegrityError) as e: From 038b0b64dd137eaae31c2d62ba37380a5d553b8e Mon Sep 17 00:00:00 2001 From: RusticPotato Date: Tue, 27 Feb 2024 20:18:52 -0500 Subject: [PATCH 23/39] currently on transform data --- src/main.py | 287 +++++++++++++++++++++++----------------------------- 1 file changed, 129 insertions(+), 158 deletions(-) diff --git a/src/main.py b/src/main.py index de92fa1..a8f4a7e 100644 --- a/src/main.py +++ b/src/main.py @@ -14,7 +14,7 @@ from database.models.skills import 
PlayerSkills as PlayerSkillsDB, Skills as SkillsDB from database.models.activities import PlayerActivities as PlayerActivitiesDB, Activities as ActivitiesDB from pydantic import BaseModel -from sqlalchemy import insert, update +from sqlalchemy import insert, update, select from sqlalchemy.exc import IntegrityError, OperationalError from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.sql.expression import Insert, Update @@ -53,7 +53,6 @@ class NewDataSchema(BaseModel): LAST_SKILL_NAMES_UPDATE = datetime.min LAST_ACTIVITY_NAMES_UPDATE = datetime.min - def log_speed( counter: int, start_time: float, _queue: Queue, topic: str, interval: int = 15 ) -> tuple[float, int]: @@ -118,84 +117,86 @@ async def insert_data_v1(batch: list[Message], error_queue: Queue): logger.info(f"error_qsize={error_queue.qsize()}, {message=}") -async def check_and_update_skill_cache(batch: list[Message], session: AsyncSession): - global SKILL_NAMES, LAST_SKILL_NAMES_UPDATE, SKILL_NAMES_LOCK, ACTIVITY_NAMES - - # Query the cache to get the skill IDs - skill_ids = {skill.name: skill.id for skill in SKILL_NAMES} if SKILL_NAMES else {} - - missing_skills = [ - skill - for message in batch - for skill in message.hiscores.model_fields.keys() - if skill - not in ["timestamp", "Player_id"] + [skill.skill_name for skill in SKILL_NAMES] - and skill not in skill_ids - ] - if missing_skills: - # Check if the cache was updated less than 10 minutes ago - if datetime.now() - LAST_SKILL_NAMES_UPDATE < timedelta(minutes=10): - logger.warning( - "Skill names cache update was called less than 10 minutes ago. Skipping batch." - ) - return None # Or however you want to handle this case - - # Update the skill names cache - async with SKILL_NAMES_LOCK: - await update_skill_names(session) - LAST_SKILL_NAMES_UPDATE = datetime.now() - - # Query the cache again to get the updated skill IDs - skill_ids = ( - {skill.name: skill.id for skill in SKILL_NAMES} if SKILL_NAMES else {} - ) - - return skill_ids - - -async def check_and_update_activity_cache(batch: list[Message], session: AsyncSession): - global ACTIVITY_NAMES, LAST_ACTIVITY_NAMES_UPDATE, ACTIVITY_NAMES_LOCK, SKILL_NAMES - - # Query the cache to get the activity IDs - activity_ids = ( - {activity.name: activity.id for activity in ACTIVITY_NAMES} - if ACTIVITY_NAMES - else {} - ) - - # Check if any activity name in any message is not found in the cache - missing_activities = [ - activity - for message in batch - for activity in message.hiscores.model_fields.keys() - if activity - not in ["timestamp", "Player_id"] + [skill.skill_name for skill in SKILL_NAMES] - and activity not in activity_ids - ] - if missing_activities: - # Check if the cache was updated less than 10 minutes ago - if datetime.now() - LAST_ACTIVITY_NAMES_UPDATE < timedelta(minutes=10): - logger.warning( - "Activity names cache update was called less than 10 minutes ago. Skipping batch." 
- ) - return None # Or however you want to handle this case - - # Update the activity names cache - async with ACTIVITY_NAMES_LOCK: - await update_activity_names(session) - LAST_ACTIVITY_NAMES_UPDATE = datetime.now() - - # Query the cache again to get the updated activity IDs - activity_ids = ( - {activity.name: activity.id for activity in ACTIVITY_NAMES} - if ACTIVITY_NAMES - else {} - ) - - return activity_ids - - -async def insert_data_v2(batch: list[Message], error_queue: Queue): +# async def check_and_update_skill_cache(batch: list[Message], session: AsyncSession): +# global SKILL_NAMES, LAST_SKILL_NAMES_UPDATE, SKILL_NAMES_LOCK, ACTIVITY_NAMES + +# # Query the cache to get the skill IDs +# skill_ids = {skill.name: skill.id for skill in SKILL_NAMES} if SKILL_NAMES else {} + +# missing_skills = [ +# skill +# for message in batch +# if message.hiscores is not None +# for skill in message.hiscores.model_fields.keys() +# if skill +# not in ["timestamp", "Player_id"] + [skill.skill_name for skill in SKILL_NAMES] +# and skill not in skill_ids +# ] +# if missing_skills: +# # Check if the cache was updated less than 10 minutes ago +# if datetime.now() - LAST_SKILL_NAMES_UPDATE < timedelta(minutes=10): +# logger.warning( +# "Skill names cache update was called less than 10 minutes ago. Skipping batch." +# ) +# return None # Or however you want to handle this case + +# # Update the skill names cache +# async with SKILL_NAMES_LOCK: +# await update_skill_names(session) +# LAST_SKILL_NAMES_UPDATE = datetime.now() + +# # Query the cache again to get the updated skill IDs +# skill_ids = ( +# {skill.name: skill.id for skill in SKILL_NAMES} if SKILL_NAMES else {} +# ) + +# return skill_ids + + +# async def check_and_update_activity_cache(batch: list[Message], session: AsyncSession): +# global ACTIVITY_NAMES, LAST_ACTIVITY_NAMES_UPDATE, ACTIVITY_NAMES_LOCK, SKILL_NAMES + +# # Query the cache to get the activity IDs +# activity_ids = ( +# {activity.name: activity.id for activity in ACTIVITY_NAMES} +# if ACTIVITY_NAMES +# else {} +# ) + +# # Check if any activity name in any message is not found in the cache +# missing_activities = [ +# activity +# for message in batch +# if message.hiscores is not None +# for activity in message.hiscores.model_fields.keys() +# if activity +# not in ["timestamp", "Player_id"] + [skill.skill_name for skill in SKILL_NAMES] +# and activity not in activity_ids +# ] +# if missing_activities: +# # Check if the cache was updated less than 10 minutes ago +# if datetime.now() - LAST_ACTIVITY_NAMES_UPDATE < timedelta(minutes=10): +# logger.warning( +# "Activity names cache update was called less than 10 minutes ago. Skipping batch." +# ) +# return None # Or however you want to handle this case + +# # Update the activity names cache +# async with ACTIVITY_NAMES_LOCK: +# await update_activity_names(session) +# LAST_ACTIVITY_NAMES_UPDATE = datetime.now() + +# # Query the cache again to get the updated activity IDs +# activity_ids = ( +# {activity.name: activity.id for activity in ACTIVITY_NAMES} +# if ACTIVITY_NAMES +# else {} +# ) + +# return activity_ids + + +async def insert_data_v2(batch: list[dict], error_queue: Queue): """ 1. check for duplicates in scraper_data[player_id, record_date], remove all duplicates 2. 
start transaction @@ -207,42 +208,11 @@ async def insert_data_v2(batch: list[Message], error_queue: Queue): step 5 & 6 must be batched for all players at once """ try: + + messages = [Message(**msg) for msg in batch] session: AsyncSession = await get_session() - - # Step 1: Check for duplicates in scraper_data[player_id, record_date], remove all duplicates - for message in batch: - existing_data = await session.query(ScraperData).filter( - ScraperData.player_id == message.player_id, - ScraperData.record_date == message.record_date - ).first() - if existing_data: - session.delete(existing_data) - await session.commit() - - # Step 2: Start transaction - async with session.begin(): - for message in batch: - # Step 3: For each player insert into scraper_data - scraper_data = ScraperData( - player_id=message.player_id, - record_date=message.record_date - ) - session.add(scraper_data) - await session.flush() - - # Step 4: For each player get the scraper_id from scraper_data - scraper_id = scraper_data.scraper_id - - # Step 5 & 6: Insert into player_skills and player_activities - # Assuming you have the skills and activities data in the message - player_skills = [PlayerSkill(scraper_id=scraper_id, skill_id=skill_id) for skill_id in message.skills] - player_activities = [PlayerActivity(scraper_id=scraper_id, activity_id=activity_id) for activity_id in message.activities] - - session.bulk_save_objects(player_skills) - session.bulk_save_objects(player_activities) - - # Commit the transaction - await session.commit() + transformed_data: list[NewDataSchema] = await transform_data(messages, session) + print(transformed_data) except (OperationalError, IntegrityError) as e: for message in batch: await error_queue.put(message) @@ -265,15 +235,6 @@ async def transform_data( new_data_list = [] for old_data in old_data_list: - # Query the cache to get the skill and activity IDs - skill_ids = ( - {skill.name: skill.id for skill in SKILL_NAMES} if SKILL_NAMES else {} - ) - activity_ids = ( - {activity.name: activity.id for activity in ACTIVITY_NAMES} - if ACTIVITY_NAMES - else {} - ) # Transform the old data format into the new format new_data = NewDataSchema( @@ -296,8 +257,8 @@ async def transform_data( [ { "skill_id": ( - skill_ids[skill.name] - if skill.name in skill_ids + skill[skill.name] + if skill.name in skill else None ), "skill_value": ( @@ -315,8 +276,8 @@ async def transform_data( [ { "activity_id": ( - activity_ids[activity.name] - if activity.name in activity_ids + activity[activity.name] + if activity.name in activity else None ), "activity_value": ( @@ -349,23 +310,27 @@ async def update_skill_names(session: AsyncSession): global SKILL_NAMES, SKILL_NAMES_LOCK async with SKILL_NAMES_LOCK: - if SKILL_NAMES is None: - skill_records = await session.execute(select(SkillsDB)) - SKILL_NAMES = [ - SkillsDB(**record) for record in skill_records.scalars().all() - ] + if SKILL_NAMES is None or not SKILL_NAMES: + try: + skill_records = await session.execute(select(SkillsDB)) + SKILL_NAMES = [Skills(**record.__dict__) for record in skill_records.scalars().all()] + print(SKILL_NAMES) + except Exception as e: + print(f"Error updating skill names: {e}") async def update_activity_names(session: AsyncSession): global ACTIVITY_NAMES, ACTIVITY_NAMES_LOCK async with ACTIVITY_NAMES_LOCK: - if ACTIVITY_NAMES is None: - activity_records = await session.execute(select(ActivitiesDB)) - ACTIVITY_NAMES = [ - ActivitiesDB(**record) - for record in activity_records.scalars().all() - ] + try: + if ACTIVITY_NAMES is None or 
not ACTIVITY_NAMES: + activity_records = await session.execute(select(ActivitiesDB)) + ACTIVITY_NAMES = [Activities(**record.__dict__) for record in activity_records.scalars().all()] + print(ACTIVITY_NAMES) + except Exception as e: + print(f"Error updating activity names: {e}") + async def process_data_v1(receive_queue: Queue, error_queue: Queue): # Initialize counter and start time @@ -425,7 +390,7 @@ async def process_data_v2(receive_queue: Queue, error_queue: Queue): # limit the number of async insert_data calls semaphore = asyncio.Semaphore(5) - batch: list[Message] = [] + batch = [] # Run indefinitely while True: start_time, counter = log_speed( @@ -433,7 +398,7 @@ async def process_data_v2(receive_queue: Queue, error_queue: Queue): start_time=start_time, _queue=receive_queue, topic="scraper", - interval=15, + interval=15 ) # Check if queue is empty @@ -442,32 +407,28 @@ async def process_data_v2(receive_queue: Queue, error_queue: Queue): continue # Get a message from the chosen queue - message: Message = Message(**message) - - # TODO fix test data + message: dict = await receive_queue.get() + + #TODO fix test data if settings.ENV != "PRD": - continue_flag = False + player = message.get("player") + player_id = player.get("id") MIN_PLAYER_ID = 0 MAX_PLAYER_ID = 300 - # original - - player = message.player - player_id = player.id # Access the 'id' attribute directly - if not (MIN_PLAYER_ID < player_id <= MAX_PLAYER_ID): continue - + # batch message batch.append(message) now = time.time() # insert data in batches of N or interval of N - if len(batch) > 100 or now - start_time > 15: + if len(batch) > 100 or now-start_time > 15: async with semaphore: await insert_data_v2(batch=batch, error_queue=error_queue) batch = [] - + receive_queue.task_done() counter += 1 @@ -480,6 +441,8 @@ async def main(): receive_queue = Queue(maxsize=100) send_queue = Queue(maxsize=100) + + asyncio.create_task( my_kafka.receive_messages( consumer=consumer, receive_queue=receive_queue, error_queue=send_queue @@ -488,17 +451,25 @@ async def main(): asyncio.create_task( my_kafka.send_messages(topic="scraper", producer=producer, send_queue=send_queue) ) - asyncio.create_task( - process_data_v1(receive_queue=receive_queue, error_queue=send_queue) - ) - # asyncio.create_task( - # process_data_v2(receive_queue=receive_queue, error_queue=send_queue) + # process_data_v1(receive_queue=receive_queue, error_queue=send_queue) # ) + asyncio.create_task( + process_data_v2(receive_queue=receive_queue, error_queue=send_queue) + ) + while True: await asyncio.sleep(60) +async def init(): + session = await get_session() + try: + await update_skill_names(session) + await update_activity_names(session) + finally: + await session.close() if __name__ == "__main__": + asyncio.run(init()) asyncio.run(main()) From 7effe05a7a2535aad33d29ff4b84e52bea9181c7 Mon Sep 17 00:00:00 2001 From: RusticPotato Date: Wed, 28 Feb 2024 06:56:49 -0500 Subject: [PATCH 24/39] bug fixes --- src/app/schemas/input/activities.py | 4 +- src/app/schemas/input/scraper_data.py | 2 +- src/app/schemas/input/skills.py | 4 +- src/main.py | 154 +++++++++++--------------- 4 files changed, 72 insertions(+), 92 deletions(-) diff --git a/src/app/schemas/input/activities.py b/src/app/schemas/input/activities.py index 6b90da3..21f20bc 100644 --- a/src/app/schemas/input/activities.py +++ b/src/app/schemas/input/activities.py @@ -1,5 +1,5 @@ from pydantic import BaseModel, ConfigDict - +from typing import Optional class Activities(BaseModel): model_config = 
ConfigDict(from_attributes=True) @@ -11,6 +11,6 @@ class Activities(BaseModel): class PlayerActivities(BaseModel): model_config = ConfigDict(from_attributes=True) - scraper_id: int + scraper_id: Optional[int] = None activity_id: int activity_value: int diff --git a/src/app/schemas/input/scraper_data.py b/src/app/schemas/input/scraper_data.py index 978f2d2..2156fd7 100644 --- a/src/app/schemas/input/scraper_data.py +++ b/src/app/schemas/input/scraper_data.py @@ -7,6 +7,6 @@ class ScraperData(BaseModel): model_config = ConfigDict(from_attributes=True) player_id: int - scraper_id: int + scraper_id: Optional[int] = None created_at: Optional[str] = None record_date: Optional[str] = None diff --git a/src/app/schemas/input/skills.py b/src/app/schemas/input/skills.py index a56b4d3..06f9309 100644 --- a/src/app/schemas/input/skills.py +++ b/src/app/schemas/input/skills.py @@ -1,5 +1,5 @@ from pydantic import BaseModel, ConfigDict - +from typing import Optional class Skills(BaseModel): model_config = ConfigDict(from_attributes=True) @@ -11,6 +11,6 @@ class Skills(BaseModel): class PlayerSkills(BaseModel): model_config = ConfigDict(from_attributes=True) - scraper_id: int + scraper_id: Optional[int] = None skill_id: int skill_value: int diff --git a/src/main.py b/src/main.py index a8f4a7e..759185c 100644 --- a/src/main.py +++ b/src/main.py @@ -6,6 +6,7 @@ from asyncio import Queue from datetime import datetime, timedelta +import concurrent.futures from core.config import settings from database.database import get_session @@ -13,6 +14,7 @@ from database.models.player import Player as PlayerDB from database.models.skills import PlayerSkills as PlayerSkillsDB, Skills as SkillsDB from database.models.activities import PlayerActivities as PlayerActivitiesDB, Activities as ActivitiesDB +from database.models.scraper_data import ScraperData as ScraperDataDB from pydantic import BaseModel from sqlalchemy import insert, update, select from sqlalchemy.exc import IntegrityError, OperationalError @@ -53,6 +55,15 @@ class NewDataSchema(BaseModel): LAST_SKILL_NAMES_UPDATE = datetime.min LAST_ACTIVITY_NAMES_UPDATE = datetime.min +class SingletonLoop: + _loop = None + + @classmethod + def get_loop(cls): + if cls._loop is None: + cls._loop = asyncio.get_running_loop() + return cls._loop + def log_speed( counter: int, start_time: float, _queue: Queue, topic: str, interval: int = 15 ) -> tuple[float, int]: @@ -197,6 +208,7 @@ async def insert_data_v1(batch: list[Message], error_queue: Queue): async def insert_data_v2(batch: list[dict], error_queue: Queue): + global SKILL_NAMES, ACTIVITY_NAMES """ 1. check for duplicates in scraper_data[player_id, record_date], remove all duplicates 2. 
start transaction @@ -210,13 +222,48 @@ async def insert_data_v2(batch: list[dict], error_queue: Queue): try: messages = [Message(**msg) for msg in batch] + # session: AsyncSession = await get_session() + # transformed_data: list[NewDataSchema] = await transform_data(messages, session) + new_data_list = [] + + # remove duplicates and remove players with no hiscores + messages = [msg for msg in messages if msg.hiscores is not None] + messages = [msg for msg in messages if msg.player is not None] + + start_time = time.time() + new_data_list = [transform_message_to_new_data(msg) for msg in messages] + end_time = time.time() + session: AsyncSession = await get_session() - transformed_data: list[NewDataSchema] = await transform_data(messages, session) - print(transformed_data) + print(f"Time taken: {end_time - start_time} seconds") + # async with session.begin(): + for new_data in new_data_list: + # Map ScraperData to ScraperDataDB + scraper_data_db = ScraperDataDB(**new_data.scraper_data.model_dump()) + session.add(scraper_data_db) + await session.flush() # Flush the session to get the ID of the newly inserted scraper_data_db + + # Map Player to PlayerDB + player_db = PlayerDB(**new_data.player.model_dump()) + session.add(player_db) + + # Map each PlayerSkills to PlayerSkillsDB + for player_skill in new_data.player_skills: + player_skill_db = PlayerSkillsDB(scraper_id=scraper_data_db.scraper_id, **player_skill.model_dump()) + session.add(player_skill_db) + + # Map each PlayerActivities to PlayerActivitiesDB + for player_activity in new_data.player_activities: + player_activity_db = PlayerActivitiesDB(scraper_id=scraper_data_db.scraper_id, **player_activity.model_dump()) + session.add(player_activity_db) + + await session.commit() + except (OperationalError, IntegrityError) as e: for message in batch: await error_queue.put(message) + logger.error({"error": e}) logger.error({"error": e}) logger.info(f"error_qsize={error_queue.qsize()}, {message=}") except Exception as e: @@ -227,85 +274,21 @@ async def insert_data_v2(batch: list[dict], error_queue: Queue): logger.debug(f"Traceback: \n{traceback.format_exc()}") logger.info(f"error_qsize={error_queue.qsize()}, {message=}") -async def transform_data( - old_data_list: list[Message], session: AsyncSession -) -> NewDataSchema: - global SKILL_NAMES, ACTIVITY_NAMES, LAST_CACHE_UPDATE - - new_data_list = [] - - for old_data in old_data_list: - - # Transform the old data format into the new format - new_data = NewDataSchema( - **{ - "scraper_data": { - "scraper_id": old_data.player.id if old_data.player else None, - "created_at": ( - old_data.hiscores.timestamp.isoformat() - if old_data.hiscores - else None - ), - "player_id": ( - old_data.hiscores.Player_id if old_data.hiscores else None - ), - "record_date": ( - datetime.utcnow().isoformat() if old_data.hiscores else None - ), - }, - "player_skills": ( - [ - { - "skill_id": ( - skill[skill.name] - if skill.name in skill - else None - ), - "skill_value": ( - getattr(old_data.hiscores, skill.name, None) - if old_data.hiscores - else None - ), - } - for skill in SKILL_NAMES - ] - if SKILL_NAMES - else [] - ), - "player_activities": ( - [ - { - "activity_id": ( - activity[activity.name] - if activity.name in activity - else None - ), - "activity_value": ( - getattr(old_data.hiscores, activity.name, None) - if old_data.hiscores - else None - ), - } - for activity in ACTIVITY_NAMES - ] - if ACTIVITY_NAMES - else [] - ), - "player": { - "id": old_data.hiscores.Player_id if old_data.hiscores else None, - 
"name": old_data.player.name if old_data.player else None, - "normalized_name": ( - old_data.player.normalized_name if old_data.player else None - ), - }, - } - ) +def transform_message_to_new_data(msg): + scraper_data = ScraperData(player_id=msg.player.id) - logger.debug(f"Transformed data: {new_data}") - new_data_list.append(new_data) + # Create a set of the attribute names in msg.hiscores + hiscores_attributes = set(msg.hiscores.__dict__.keys()) + + # Only create PlayerSkills and PlayerActivities objects for skills and activities in hiscores_attributes + player_skills = [PlayerSkills(skill_id=skill.skill_id, skill_value=getattr(msg.hiscores, skill.skill_name)) for skill in SKILL_NAMES if skill.skill_name in hiscores_attributes] + player_activities = [PlayerActivities(activity_id=activity.activity_id, activity_value=getattr(msg.hiscores, activity.activity_name)) for activity in ACTIVITY_NAMES if activity.activity_name in hiscores_attributes] + + player = Player(**msg.player.model_dump()) + new_data = NewDataSchema(scraper_data=scraper_data, player_skills=player_skills, player_activities=player_activities, player=player) + + return new_data - return new_data_list -## todo: verify this is rigth async def update_skill_names(session: AsyncSession): global SKILL_NAMES, SKILL_NAMES_LOCK @@ -314,11 +297,10 @@ async def update_skill_names(session: AsyncSession): try: skill_records = await session.execute(select(SkillsDB)) SKILL_NAMES = [Skills(**record.__dict__) for record in skill_records.scalars().all()] - print(SKILL_NAMES) + # print(SKILL_NAMES) except Exception as e: print(f"Error updating skill names: {e}") - async def update_activity_names(session: AsyncSession): global ACTIVITY_NAMES, ACTIVITY_NAMES_LOCK @@ -327,11 +309,10 @@ async def update_activity_names(session: AsyncSession): if ACTIVITY_NAMES is None or not ACTIVITY_NAMES: activity_records = await session.execute(select(ActivitiesDB)) ACTIVITY_NAMES = [Activities(**record.__dict__) for record in activity_records.scalars().all()] - print(ACTIVITY_NAMES) + # print(ACTIVITY_NAMES) except Exception as e: print(f"Error updating activity names: {e}") - async def process_data_v1(receive_queue: Queue, error_queue: Queue): # Initialize counter and start time counter = 0 @@ -426,7 +407,8 @@ async def process_data_v2(receive_queue: Queue, error_queue: Queue): # insert data in batches of N or interval of N if len(batch) > 100 or now-start_time > 15: async with semaphore: - await insert_data_v2(batch=batch, error_queue=error_queue) + task = asyncio.create_task(insert_data_v2(batch=batch, error_queue=error_queue)) + await task batch = [] receive_queue.task_done() @@ -441,8 +423,6 @@ async def main(): receive_queue = Queue(maxsize=100) send_queue = Queue(maxsize=100) - - asyncio.create_task( my_kafka.receive_messages( consumer=consumer, receive_queue=receive_queue, error_queue=send_queue @@ -451,7 +431,7 @@ async def main(): asyncio.create_task( my_kafka.send_messages(topic="scraper", producer=producer, send_queue=send_queue) ) - # asyncio.create_task( + # loop.create_task( # process_data_v1(receive_queue=receive_queue, error_queue=send_queue) # ) From 86974efa63229c32ddc16cf44b664483380be0af Mon Sep 17 00:00:00 2001 From: RusticPotato Date: Wed, 28 Feb 2024 20:32:23 -0500 Subject: [PATCH 25/39] fixed import --- src/kafka_setup/setup_kafka.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kafka_setup/setup_kafka.py b/src/kafka_setup/setup_kafka.py index c99de1f..1cca1c9 100644 --- a/src/kafka_setup/setup_kafka.py +++ 
b/src/kafka_setup/setup_kafka.py @@ -4,7 +4,7 @@ import zipfile from queue import Queue -from my_kafka import KafkaProducer +from kafka import KafkaProducer from kafka.admin import KafkaAdminClient, NewTopic From 8f365b4608dbe81247f46744d8c9dc47836c7775 Mon Sep 17 00:00:00 2001 From: RusticPotato Date: Wed, 28 Feb 2024 22:06:10 -0500 Subject: [PATCH 26/39] i believe its working --- docker-compose.yml | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 9e19330..b529b77 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -14,6 +14,9 @@ services: expose: - 9094 - 9092 + ports: + - 9092:9092 + - 9094:9094 healthcheck: test: ["CMD", "kafka-topics.sh", "--list", "--bootstrap-server", "localhost:9092"] interval: 30s @@ -22,21 +25,21 @@ services: networks: - botdetector-network - kafdrop: - container_name: kafdrop - image: obsidiandynamics/kafdrop:latest - environment: - - KAFKA_BROKERCONNECT=kafka:9092 - - JVM_OPTS=-Xms32M -Xmx64M - - SERVER_SERVLET_CONTEXTPATH=/ - ports: - - 9999:9000 - restart: on-failure - networks: - - botdetector-network - depends_on: - kafka: - condition: service_healthy + # kafdrop: + # container_name: kafdrop + # image: obsidiandynamics/kafdrop:latest + # environment: + # - KAFKA_BROKERCONNECT=kafka:9092 + # - JVM_OPTS=-Xms32M -Xmx64M + # - SERVER_SERVLET_CONTEXTPATH=/ + # ports: + # - 9999:9000 + # restart: on-failure + # networks: + # - botdetector-network + # depends_on: + # kafka: + # condition: service_healthy kafka_setup: container_name: kafka_setup From 9a2c37681c429b7f172fb8f42b943c9c7398e74d Mon Sep 17 00:00:00 2001 From: RusticPotato Date: Wed, 28 Feb 2024 22:06:58 -0500 Subject: [PATCH 27/39] working imports --- docker-compose.yml | 4 +- kafka_setup/setup_kafka.py | 2 +- src/app/schemas/input/scraper_data.py | 6 +-- src/database/database.py | 8 ++++ src/database/models/activities.py | 2 +- src/database/models/scraper_data.py | 15 ++++--- src/database/models/skills.py | 2 +- src/main.py | 65 +++++++++++++++++---------- 8 files changed, 67 insertions(+), 37 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index b529b77..60ef33d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -65,8 +65,8 @@ services: volumes: - ./mysql/docker-entrypoint-initdb.d:/docker-entrypoint-initdb.d # - ./mysql/mount:/var/lib/mysql # creates persistence - # ports: - # - 3306:3306 + ports: + - 3306:3306 expose: - 3306 networks: diff --git a/kafka_setup/setup_kafka.py b/kafka_setup/setup_kafka.py index c99de1f..1cca1c9 100644 --- a/kafka_setup/setup_kafka.py +++ b/kafka_setup/setup_kafka.py @@ -4,7 +4,7 @@ import zipfile from queue import Queue -from my_kafka import KafkaProducer +from kafka import KafkaProducer from kafka.admin import KafkaAdminClient, NewTopic diff --git a/src/app/schemas/input/scraper_data.py b/src/app/schemas/input/scraper_data.py index 2156fd7..8f1e1a9 100644 --- a/src/app/schemas/input/scraper_data.py +++ b/src/app/schemas/input/scraper_data.py @@ -1,12 +1,12 @@ from typing import Optional from pydantic import BaseModel, ConfigDict - +from datetime import datetime class ScraperData(BaseModel): model_config = ConfigDict(from_attributes=True) player_id: int scraper_id: Optional[int] = None - created_at: Optional[str] = None - record_date: Optional[str] = None + created_at: str = datetime.now().isoformat() #datetime.now().strftime('%Y-%m-%d %H:%M:%S') + # record_date: Optional[str] = None diff --git a/src/database/database.py 
b/src/database/database.py index e198895..d16735c 100644 --- a/src/database/database.py +++ b/src/database/database.py @@ -4,6 +4,14 @@ from core.config import settings +class SessionContextManager: + async def __aenter__(self): + self.session = await get_session() + return self.session + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await self.session.close() + # Create an async SQLAlchemy engine engine = create_async_engine( settings.DATABASE_URL, diff --git a/src/database/models/activities.py b/src/database/models/activities.py index 53d49ff..b44089c 100644 --- a/src/database/models/activities.py +++ b/src/database/models/activities.py @@ -20,7 +20,7 @@ class Activities(Base): activity_id = Column(TINYINT, primary_key=True, autoincrement=True) activity_name = Column(String(50), nullable=False) - __table_args__ = (UniqueConstraint("activity_name", name="unique_activity_name"),) + # __table_args__ = (UniqueConstraint("activity_name", name="unique_activity_name"),) # CREATE TABLE player_activities ( diff --git a/src/database/models/scraper_data.py b/src/database/models/scraper_data.py index 139473a..3e81776 100644 --- a/src/database/models/scraper_data.py +++ b/src/database/models/scraper_data.py @@ -7,7 +7,8 @@ ) from sqlalchemy.dialects.mysql import BIGINT, SMALLINT from sqlalchemy.schema import UniqueConstraint - +from datetime import datetime +from typing import Optional # CREATE TABLE scraper_data ( # scraper_id BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY, @@ -22,8 +23,12 @@ class ScraperData(Base): scraper_id = Column(BIGINT, primary_key=True, autoincrement=True) created_at = Column(DateTime, nullable=False, server_default=func.now()) player_id = Column(SMALLINT, nullable=False) - record_date = Column(Date, nullable=True, server_onupdate=func.current_date()) + # record_date = Column(Date, nullable=True, server_onupdate=func.current_date()) + + # __table_args__ = ( + # UniqueConstraint("player_id", "record_date", name="unique_player_per_day"), + # ) - __table_args__ = ( - UniqueConstraint("player_id", "record_date", name="unique_player_per_day"), - ) + # __table_args__ = ( + # UniqueConstraint("player_id", "record_date", name="unique_player_per_day"), + # ) diff --git a/src/database/models/skills.py b/src/database/models/skills.py index 5301202..16132f5 100644 --- a/src/database/models/skills.py +++ b/src/database/models/skills.py @@ -20,7 +20,7 @@ class Skills(Base): skill_id = Column(TINYINT, primary_key=True, autoincrement=True) skill_name = Column(String(50), nullable=False) - __table_args__ = (UniqueConstraint("skill_name", name="unique_skill_name"),) + # __table_args__ = (UniqueConstraint("skill_name", name="unique_skill_name"),) # CREATE TABLE player_skills ( diff --git a/src/main.py b/src/main.py index 759185c..4fc63ca 100644 --- a/src/main.py +++ b/src/main.py @@ -20,6 +20,7 @@ from sqlalchemy.exc import IntegrityError, OperationalError from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.sql.expression import Insert, Update +from database.database import SessionContextManager import my_kafka as my_kafka # schemas import @@ -34,6 +35,9 @@ class Message(BaseModel): + def __hash__(self): + return hash((self.player.id, self.hiscores, self.player)) + hiscores: PlayerHiscoreData | None player: Player | None @@ -227,6 +231,7 @@ async def insert_data_v2(batch: list[dict], error_queue: Queue): new_data_list = [] # remove duplicates and remove players with no hiscores + # messages = list(set(messages)) messages = [msg for msg in messages if msg.hiscores is not 
None] messages = [msg for msg in messages if msg.player is not None] @@ -234,30 +239,43 @@ async def insert_data_v2(batch: list[dict], error_queue: Queue): new_data_list = [transform_message_to_new_data(msg) for msg in messages] end_time = time.time() - session: AsyncSession = await get_session() + # session: AsyncSession = await get_session() print(f"Time taken: {end_time - start_time} seconds") # async with session.begin(): - for new_data in new_data_list: - # Map ScraperData to ScraperDataDB - scraper_data_db = ScraperDataDB(**new_data.scraper_data.model_dump()) - session.add(scraper_data_db) - await session.flush() # Flush the session to get the ID of the newly inserted scraper_data_db - - # Map Player to PlayerDB - player_db = PlayerDB(**new_data.player.model_dump()) - session.add(player_db) - - # Map each PlayerSkills to PlayerSkillsDB - for player_skill in new_data.player_skills: - player_skill_db = PlayerSkillsDB(scraper_id=scraper_data_db.scraper_id, **player_skill.model_dump()) - session.add(player_skill_db) - - # Map each PlayerActivities to PlayerActivitiesDB - for player_activity in new_data.player_activities: - player_activity_db = PlayerActivitiesDB(scraper_id=scraper_data_db.scraper_id, **player_activity.model_dump()) - session.add(player_activity_db) - - await session.commit() + async with SessionContextManager() as session: + for new_data in new_data_list: + # Map ScraperData to ScraperDataDB + scraper_data_db = ScraperDataDB( + **new_data.scraper_data.model_dump() + ) + session.add(scraper_data_db) + await session.flush() # Flush the session to get the ID of the newly inserted scraper_data_db + + # Map Player to PlayerDB + stmt = select(PlayerDB).where(PlayerDB.id == new_data.player.id) + result = await session.execute(stmt) + player_db = result.scalars().first() + + if player_db is None: + player_db = PlayerDB(**new_data.player.model_dump()) + session.add(player_db) + else: + for key, value in new_data.player.model_dump().items(): + setattr(player_db, key, value) + + for player_skill in new_data.player_skills: + player_skill_dict = player_skill.model_dump() + player_skill_dict.pop('scraper_id', None) + player_skill_db = PlayerSkillsDB(scraper_id=scraper_data_db.scraper_id, **player_skill_dict) + session.add(player_skill_db) + + for player_activity in new_data.player_activities: + player_activity_dict = player_activity.model_dump() + player_activity_dict.pop('scraper_id', None) + player_activity_db = PlayerActivitiesDB(scraper_id=scraper_data_db.scraper_id, **player_activity_dict) + session.add(player_activity_db) + + await session.commit() except (OperationalError, IntegrityError) as e: for message in batch: @@ -407,8 +425,7 @@ async def process_data_v2(receive_queue: Queue, error_queue: Queue): # insert data in batches of N or interval of N if len(batch) > 100 or now-start_time > 15: async with semaphore: - task = asyncio.create_task(insert_data_v2(batch=batch, error_queue=error_queue)) - await task + await insert_data_v2(batch=batch, error_queue=error_queue) batch = [] receive_queue.task_done() From daa45b0b23bfa8d8422e23adbc800d0258bfda15 Mon Sep 17 00:00:00 2001 From: RusticPotato Date: Wed, 28 Feb 2024 22:14:13 -0500 Subject: [PATCH 28/39] add all to session after loop --- src/main.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/main.py b/src/main.py index 4fc63ca..a7179bf 100644 --- a/src/main.py +++ b/src/main.py @@ -243,6 +243,9 @@ async def insert_data_v2(batch: list[dict], error_queue: Queue): print(f"Time taken: {end_time - 
start_time} seconds") # async with session.begin(): async with SessionContextManager() as session: + player_skills_dbs = [] + player_activities_dbs = [] + for new_data in new_data_list: # Map ScraperData to ScraperDataDB scraper_data_db = ScraperDataDB( @@ -267,13 +270,16 @@ async def insert_data_v2(batch: list[dict], error_queue: Queue): player_skill_dict = player_skill.model_dump() player_skill_dict.pop('scraper_id', None) player_skill_db = PlayerSkillsDB(scraper_id=scraper_data_db.scraper_id, **player_skill_dict) - session.add(player_skill_db) + player_skills_dbs.append(player_skill_db) for player_activity in new_data.player_activities: player_activity_dict = player_activity.model_dump() player_activity_dict.pop('scraper_id', None) player_activity_db = PlayerActivitiesDB(scraper_id=scraper_data_db.scraper_id, **player_activity_dict) - session.add(player_activity_db) + player_activities_dbs.append(player_activity_db) + + session.add_all(player_skills_dbs) + session.add_all(player_activities_dbs) await session.commit() From c3fd07967383c9952aeb916eaa1c618450815983 Mon Sep 17 00:00:00 2001 From: RusticPotato Date: Wed, 28 Feb 2024 23:08:16 -0500 Subject: [PATCH 29/39] pass session, error handling sessions --- src/main.py | 100 ++++++++++++++++++++++++++++------------------------ 1 file changed, 53 insertions(+), 47 deletions(-) diff --git a/src/main.py b/src/main.py index a7179bf..f668f65 100644 --- a/src/main.py +++ b/src/main.py @@ -211,7 +211,7 @@ async def insert_data_v1(batch: list[Message], error_queue: Queue): # return activity_ids -async def insert_data_v2(batch: list[dict], error_queue: Queue): +async def insert_data_v2(session, batch: list[dict], error_queue: Queue): global SKILL_NAMES, ACTIVITY_NAMES """ 1. check for duplicates in scraper_data[player_id, record_date], remove all duplicates @@ -242,55 +242,58 @@ async def insert_data_v2(batch: list[dict], error_queue: Queue): # session: AsyncSession = await get_session() print(f"Time taken: {end_time - start_time} seconds") # async with session.begin(): - async with SessionContextManager() as session: - player_skills_dbs = [] - player_activities_dbs = [] - - for new_data in new_data_list: - # Map ScraperData to ScraperDataDB - scraper_data_db = ScraperDataDB( - **new_data.scraper_data.model_dump() - ) - session.add(scraper_data_db) - await session.flush() # Flush the session to get the ID of the newly inserted scraper_data_db - - # Map Player to PlayerDB - stmt = select(PlayerDB).where(PlayerDB.id == new_data.player.id) - result = await session.execute(stmt) - player_db = result.scalars().first() - - if player_db is None: - player_db = PlayerDB(**new_data.player.model_dump()) - session.add(player_db) - else: - for key, value in new_data.player.model_dump().items(): - setattr(player_db, key, value) - - for player_skill in new_data.player_skills: - player_skill_dict = player_skill.model_dump() - player_skill_dict.pop('scraper_id', None) - player_skill_db = PlayerSkillsDB(scraper_id=scraper_data_db.scraper_id, **player_skill_dict) - player_skills_dbs.append(player_skill_db) - - for player_activity in new_data.player_activities: - player_activity_dict = player_activity.model_dump() - player_activity_dict.pop('scraper_id', None) - player_activity_db = PlayerActivitiesDB(scraper_id=scraper_data_db.scraper_id, **player_activity_dict) - player_activities_dbs.append(player_activity_db) - - session.add_all(player_skills_dbs) - session.add_all(player_activities_dbs) - - await session.commit() + player_skills_dbs = [] + player_activities_dbs 
= [] + + for new_data in new_data_list: + # Map ScraperData to ScraperDataDB + print(f"Scraper data: {new_data.scraper_data}") + scraper_data_db = ScraperDataDB( + **new_data.scraper_data.model_dump() + ) + session.add(scraper_data_db) + await session.flush() # Flush the session to get the ID of the newly inserted scraper_data_db + + # Map Player to PlayerDB + stmt = select(PlayerDB).where(PlayerDB.id == new_data.player.id) + result = await session.execute(stmt) + player_db = result.scalars().first() + + if player_db is None: + player_db = PlayerDB(**new_data.player.model_dump()) + session.add(player_db) + else: + for key, value in new_data.player.model_dump().items(): + setattr(player_db, key, value) + + for player_skill in new_data.player_skills: + player_skill_dict = player_skill.model_dump() + player_skill_dict.pop('scraper_id', None) + player_skill_db = PlayerSkillsDB(scraper_id=scraper_data_db.scraper_id, **player_skill_dict) + player_skills_dbs.append(player_skill_db) + + for player_activity in new_data.player_activities: + player_activity_dict = player_activity.model_dump() + player_activity_dict.pop('scraper_id', None) + player_activity_db = PlayerActivitiesDB(scraper_id=scraper_data_db.scraper_id, **player_activity_dict) + player_activities_dbs.append(player_activity_db) + + session.add_all(player_skills_dbs) + session.add_all(player_activities_dbs) + + await session.commit() except (OperationalError, IntegrityError) as e: + if not session.is_active: + session = await get_session() for message in batch: await error_queue.put(message) - logger.error({"error": e}) logger.error({"error": e}) logger.info(f"error_qsize={error_queue.qsize()}, {message=}") except Exception as e: + if not session.is_active: + session = await get_session() for message in batch: await error_queue.put(message) @@ -428,11 +431,14 @@ async def process_data_v2(receive_queue: Queue, error_queue: Queue): now = time.time() - # insert data in batches of N or interval of N - if len(batch) > 100 or now-start_time > 15: - async with semaphore: - await insert_data_v2(batch=batch, error_queue=error_queue) - batch = [] + # Create a session outside the loop + async with SessionContextManager() as session: + # insert data in batches of N or interval of N + if len(batch) > 100 or now-start_time > 15: + async with semaphore: + # Pass the session to insert_data_v2 + await insert_data_v2(session, batch=batch, error_queue=error_queue) + batch = [] receive_queue.task_done() counter += 1 From d6b5f02539e035949d95f4be5d7cb46802007140 Mon Sep 17 00:00:00 2001 From: extreme4all <> Date: Fri, 1 Mar 2024 00:01:59 +0100 Subject: [PATCH 30/39] working v1 --- Makefile | 4 + docker-compose.yml | 2 + kafka_setup/setup_kafka.py | 2 +- src/app/repositories/abc.py | 4 +- src/app/repositories/activities.py | 4 +- src/app/repositories/highscore.py | 20 +- src/app/repositories/scraper_data.py | 37 +++ src/app/repositories/skills.py | 4 +- src/app/schemas/input/highscore.py | 1 - src/app/schemas/input/message.py | 8 + src/core/__init__.py | 1 + src/database/database.py | 7 +- src/main.py | 427 +++++++-------------------- 13 files changed, 176 insertions(+), 345 deletions(-) create mode 100644 src/app/repositories/scraper_data.py create mode 100644 src/app/schemas/input/message.py diff --git a/Makefile b/Makefile index e74f0ee..493baaa 100644 --- a/Makefile +++ b/Makefile @@ -9,6 +9,10 @@ up: down: docker-compose down +docker-restart: + docker compose down + docker compose up --build -d + clean: docker-compose down --volumes diff --git 
a/docker-compose.yml b/docker-compose.yml index 9e19330..89ac12a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -62,6 +62,8 @@ services: volumes: - ./mysql/docker-entrypoint-initdb.d:/docker-entrypoint-initdb.d # - ./mysql/mount:/var/lib/mysql # creates persistence + ports: + - 3307:3306 # ports: # - 3306:3306 expose: diff --git a/kafka_setup/setup_kafka.py b/kafka_setup/setup_kafka.py index c99de1f..1cca1c9 100644 --- a/kafka_setup/setup_kafka.py +++ b/kafka_setup/setup_kafka.py @@ -4,7 +4,7 @@ import zipfile from queue import Queue -from my_kafka import KafkaProducer +from kafka import KafkaProducer from kafka.admin import KafkaAdminClient, NewTopic diff --git a/src/app/repositories/abc.py b/src/app/repositories/abc.py index f187a68..c615ab2 100644 --- a/src/app/repositories/abc.py +++ b/src/app/repositories/abc.py @@ -1,6 +1,6 @@ from abc import ABC, abstractmethod -from database.database import get_session +from database.database import SessionFactory class ABCRepo(ABC): @@ -9,7 +9,7 @@ class ABCRepo(ABC): """ async def _get_session(self): - return await get_session() + return SessionFactory() @abstractmethod async def create(self, data): diff --git a/src/app/repositories/activities.py b/src/app/repositories/activities.py index 2cf6e0c..bbb7336 100644 --- a/src/app/repositories/activities.py +++ b/src/app/repositories/activities.py @@ -29,7 +29,7 @@ async def request(self, activity_id: int = None) -> list[ActivitiesDB]: if activity_id: sql = sql.where(table.activity_id == activity_id) - async with self._get_session() as session: + async with await self._get_session() as session: session: AsyncSession # Type annotation for clarity data = await session.execute(sql) data = await data.all() @@ -53,6 +53,6 @@ async def create(self, data: list[PlayerActivities]) -> None: table = PlayerActivitiesDB sql = insert(table).values(data) - async with self._get_session() as session: + async with await self._get_session() as session: session: AsyncSession # Type annotation for clarity await session.execute(sql) diff --git a/src/app/repositories/highscore.py b/src/app/repositories/highscore.py index 38ec8aa..8d90dd0 100644 --- a/src/app/repositories/highscore.py +++ b/src/app/repositories/highscore.py @@ -1,11 +1,11 @@ -from sqlalchemy import insert, select, update -from sqlalchemy.ext.asyncio import AsyncSession - from app.repositories.abc import ABCRepo from app.schemas.input.highscore import PlayerHiscoreData from app.schemas.input.player import Player +from database.database import SessionFactory from database.models.highscores import PlayerHiscoreData as PlayerHiscoreDataDB from database.models.player import Player as PlayerDB +from sqlalchemy import insert, select, update +from sqlalchemy.ext.asyncio import AsyncSession class HighscoreRepo(ABCRepo): @@ -29,7 +29,7 @@ async def request(self, id: list[int] = None) -> list[PlayerHiscoreDataDB]: if id: sql = sql.where(table.id.in_(id)) - async with self._get_session() as session: + async with await self._get_session() as session: session: AsyncSession # Type annotation for clarity data = await session.execute(sql) data = await data.all() @@ -49,8 +49,11 @@ async def create( highscore_data = [d.model_dump() for d in highscore_data] sql_insert = insert(table_highscore).values(highscore_data) + sql_insert = sql_insert.prefix_with("IGNORE") + sql_update = update(table_player) - async with self._get_session() as session: + + async with await self._get_session() as session: session: AsyncSession # Type annotation for clarity await 
session.execute(sql_insert) # Insert highscore data @@ -58,3 +61,10 @@ async def create( sql_update = sql_update.where(table_player.id == player.id) sql_update = sql_update.values(player.model_dump()) await session.execute(sql_update) # Update player data + await session.commit() + + async def update(self, id, data): + return await super().update(id, data) + + async def delete(self, id): + return await super().delete(id) diff --git a/src/app/repositories/scraper_data.py b/src/app/repositories/scraper_data.py new file mode 100644 index 0000000..5959083 --- /dev/null +++ b/src/app/repositories/scraper_data.py @@ -0,0 +1,37 @@ +from sqlalchemy import insert, select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.app.repositories.abc import ABCRepo +from src.app.schemas.input.scraper_data import ScraperData +from src.database.models.scraper_data import ScraperData as ScraperDataDB + + +class ScraperDataRepo(ABCRepo): + """Repository for managing skill data.""" + + def __init__(self) -> None: + """Initializes the SkillsRepo instance.""" + super().__init__() + + async def request(self, scraper_id: int = None) -> list[ScraperData]: + """ """ + table = ScraperDataDB + sql = select(table) + sql = sql.where(ScraperDataDB.scraper_id == scraper_id) + + async with await self._get_session() as session: + session: AsyncSession + data = await session.execute(sql) + data = await data.all() + return [ScraperData(**d) for d in data] + + async def create(self, data: list[ScraperData]) -> None: + table = ScraperDataDB + sql = insert(table) + sql = sql.values([d.model_dump() for d in data]) + sql = sql.prefix_with("IGNORE") + + async with await self._get_session() as session: + session: AsyncSession + data = await session.execute(sql) + return diff --git a/src/app/repositories/skills.py b/src/app/repositories/skills.py index c60737f..5fe5c14 100644 --- a/src/app/repositories/skills.py +++ b/src/app/repositories/skills.py @@ -29,7 +29,7 @@ async def request(self, skill_id: int = None) -> list[SkillsDB]: if skill_id: sql = sql.where(table.skill_id == skill_id) - async with self._get_session() as session: + async with await self._get_session() as session: session: AsyncSession # Type annotation for clarity data = await session.execute(sql) data = await data.all() @@ -53,6 +53,6 @@ async def create(self, data: list[PlayerSkills]) -> None: table = PlayerSkillsDB sql = insert(table).values(data) - async with self._get_session() as session: + async with await self._get_session() as session: session: AsyncSession # Type annotation for clarity await session.execute(sql) diff --git a/src/app/schemas/input/highscore.py b/src/app/schemas/input/highscore.py index 6cc8743..4d4f919 100644 --- a/src/app/schemas/input/highscore.py +++ b/src/app/schemas/input/highscore.py @@ -1,5 +1,4 @@ from datetime import datetime -from typing import Optional from pydantic import BaseModel, ConfigDict diff --git a/src/app/schemas/input/message.py b/src/app/schemas/input/message.py new file mode 100644 index 0000000..9b1e7ce --- /dev/null +++ b/src/app/schemas/input/message.py @@ -0,0 +1,8 @@ +from app.schemas.input.highscore import PlayerHiscoreData +from app.schemas.input.player import Player +from pydantic import BaseModel + + +class Message(BaseModel): + hiscores: PlayerHiscoreData | None + player: Player | None diff --git a/src/core/__init__.py b/src/core/__init__.py index e69de29..5496911 100644 --- a/src/core/__init__.py +++ b/src/core/__init__.py @@ -0,0 +1 @@ +from . 
import logging diff --git a/src/database/database.py b/src/database/database.py index e198895..bae4d41 100644 --- a/src/database/database.py +++ b/src/database/database.py @@ -1,9 +1,8 @@ +from core.config import settings from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import sessionmaker -from core.config import settings - # Create an async SQLAlchemy engine engine = create_async_engine( settings.DATABASE_URL, @@ -23,8 +22,8 @@ # async def get_session() -> AsyncSession: # async with SessionFactory() as session: # yield session -async def get_session() -> AsyncSession: - return SessionFactory() +async def get_session(): + yield SessionFactory() Base = declarative_base() diff --git a/src/main.py b/src/main.py index 759185c..c99e875 100644 --- a/src/main.py +++ b/src/main.py @@ -1,69 +1,26 @@ import asyncio -import json import logging import time import traceback from asyncio import Queue -from datetime import datetime, timedelta -import concurrent.futures +import my_kafka as my_kafka +# schemas import +from app.repositories.highscore import HighscoreRepo +from app.schemas.input.message import Message from core.config import settings from database.database import get_session -from database.models.highscores import PlayerHiscoreData as PlayerHiscoreDataDB +from database.models.activities import PlayerActivities as PlayerActivitiesDB from database.models.player import Player as PlayerDB -from database.models.skills import PlayerSkills as PlayerSkillsDB, Skills as SkillsDB -from database.models.activities import PlayerActivities as PlayerActivitiesDB, Activities as ActivitiesDB from database.models.scraper_data import ScraperData as ScraperDataDB -from pydantic import BaseModel -from sqlalchemy import insert, update, select +from database.models.skills import PlayerSkills as PlayerSkillsDB from sqlalchemy.exc import IntegrityError, OperationalError from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.sql.expression import Insert, Update - -import my_kafka as my_kafka -# schemas import -from app.repositories.highscore import HighscoreRepo -from app.schemas.input.highscore import PlayerHiscoreData -from app.schemas.input.player import Player -from app.schemas.input.scraper_data import ScraperData -from app.schemas.input.activities import Activities, PlayerActivities -from app.schemas.input.skills import Skills, PlayerSkills logger = logging.getLogger(__name__) -class Message(BaseModel): - hiscores: PlayerHiscoreData | None - player: Player | None - - -class NewDataSchema(BaseModel): - scraper_data: ScraperData - player_skills: list[PlayerSkills] - player_activities: list[PlayerActivities] - player: Player - - -# Global variables to cache the skill and activity names -SKILL_NAMES: list[Skills] = [] -ACTIVITY_NAMES: list[Activities] = [] -# Global variables for the locks -SKILL_NAMES_LOCK = asyncio.Lock() -ACTIVITY_NAMES_LOCK = asyncio.Lock() -# Global variable to track when the cache was last updated -LAST_SKILL_NAMES_UPDATE = datetime.min -LAST_ACTIVITY_NAMES_UPDATE = datetime.min - -class SingletonLoop: - _loop = None - - @classmethod - def get_loop(cls): - if cls._loop is None: - cls._loop = asyncio.get_running_loop() - return cls._loop - def log_speed( counter: int, start_time: float, _queue: Queue, topic: str, interval: int = 15 ) -> tuple[float, int]: @@ -88,31 +45,16 @@ def log_speed( # Return the current time and reset the counter to zero return time.time(), 0 + async def 
insert_data_v1(batch: list[Message], error_queue: Queue): try: - highscores:list[dict] = [msg.get("hiscores") for msg in batch] - players:list[dict] = [msg.get("player") for msg in batch] - - highscores = [PlayerHiscoreData(**hs) for hs in highscores if hs] - highscores = [hs.model_dump(mode="json") for hs in highscores ] + highscores = [msg.hiscores for msg in batch if msg.hiscores] + players = [msg.player for msg in batch if msg.player] - session: AsyncSession = await get_session() - logger.info(f"Received: {len(players)=}, {len(highscores)=}") - # start a transaction - async with session.begin(): - # insert into table values () - insert_sql:Insert = insert(PlayerHiscoreDataDB) # fixing v1, currently debugging here - insert_sql = insert_sql.values(highscores) - insert_sql = insert_sql.prefix_with("ignore") - await session.execute(insert_sql) - # update table - for player in players: - update_sql:Update = update(PlayerDB) - update_sql = update_sql.where(PlayerDB.id == player.get("id")) - update_sql = update_sql.values(player) - await session.execute(update_sql) + repo = HighscoreRepo() + await repo.create(highscore_data=highscores, player_data=players) except (OperationalError, IntegrityError) as e: for message in batch: await error_queue.put(message) @@ -128,192 +70,81 @@ async def insert_data_v1(batch: list[Message], error_queue: Queue): logger.info(f"error_qsize={error_queue.qsize()}, {message=}") -# async def check_and_update_skill_cache(batch: list[Message], session: AsyncSession): -# global SKILL_NAMES, LAST_SKILL_NAMES_UPDATE, SKILL_NAMES_LOCK, ACTIVITY_NAMES - -# # Query the cache to get the skill IDs -# skill_ids = {skill.name: skill.id for skill in SKILL_NAMES} if SKILL_NAMES else {} - -# missing_skills = [ -# skill -# for message in batch -# if message.hiscores is not None -# for skill in message.hiscores.model_fields.keys() -# if skill -# not in ["timestamp", "Player_id"] + [skill.skill_name for skill in SKILL_NAMES] -# and skill not in skill_ids -# ] -# if missing_skills: -# # Check if the cache was updated less than 10 minutes ago -# if datetime.now() - LAST_SKILL_NAMES_UPDATE < timedelta(minutes=10): -# logger.warning( -# "Skill names cache update was called less than 10 minutes ago. Skipping batch." 
-# ) -# return None # Or however you want to handle this case - -# # Update the skill names cache -# async with SKILL_NAMES_LOCK: -# await update_skill_names(session) -# LAST_SKILL_NAMES_UPDATE = datetime.now() - -# # Query the cache again to get the updated skill IDs -# skill_ids = ( -# {skill.name: skill.id for skill in SKILL_NAMES} if SKILL_NAMES else {} -# ) - -# return skill_ids - - -# async def check_and_update_activity_cache(batch: list[Message], session: AsyncSession): -# global ACTIVITY_NAMES, LAST_ACTIVITY_NAMES_UPDATE, ACTIVITY_NAMES_LOCK, SKILL_NAMES - -# # Query the cache to get the activity IDs -# activity_ids = ( -# {activity.name: activity.id for activity in ACTIVITY_NAMES} -# if ACTIVITY_NAMES -# else {} -# ) - -# # Check if any activity name in any message is not found in the cache -# missing_activities = [ -# activity -# for message in batch -# if message.hiscores is not None -# for activity in message.hiscores.model_fields.keys() -# if activity -# not in ["timestamp", "Player_id"] + [skill.skill_name for skill in SKILL_NAMES] -# and activity not in activity_ids -# ] -# if missing_activities: -# # Check if the cache was updated less than 10 minutes ago -# if datetime.now() - LAST_ACTIVITY_NAMES_UPDATE < timedelta(minutes=10): -# logger.warning( -# "Activity names cache update was called less than 10 minutes ago. Skipping batch." -# ) -# return None # Or however you want to handle this case - -# # Update the activity names cache -# async with ACTIVITY_NAMES_LOCK: -# await update_activity_names(session) -# LAST_ACTIVITY_NAMES_UPDATE = datetime.now() - -# # Query the cache again to get the updated activity IDs -# activity_ids = ( -# {activity.name: activity.id for activity in ACTIVITY_NAMES} -# if ACTIVITY_NAMES -# else {} -# ) - -# return activity_ids - - -async def insert_data_v2(batch: list[dict], error_queue: Queue): - global SKILL_NAMES, ACTIVITY_NAMES - """ - 1. check for duplicates in scraper_data[player_id, record_date], remove all duplicates - 2. start transaction - 3. for each player insert into scraper_data - 4. for each player get the scraper_id from scraper_data - 5. insert into player_skills (scraper_id, skill_id) values (), () - 6. 
insert into player_activities (scraper_id, activity_id) values (), () - - step 5 & 6 must be batched for all players at once - """ - try: - - messages = [Message(**msg) for msg in batch] - # session: AsyncSession = await get_session() - # transformed_data: list[NewDataSchema] = await transform_data(messages, session) - new_data_list = [] - - # remove duplicates and remove players with no hiscores - messages = [msg for msg in messages if msg.hiscores is not None] - messages = [msg for msg in messages if msg.player is not None] - - start_time = time.time() - new_data_list = [transform_message_to_new_data(msg) for msg in messages] - end_time = time.time() - - session: AsyncSession = await get_session() - print(f"Time taken: {end_time - start_time} seconds") - # async with session.begin(): - for new_data in new_data_list: - # Map ScraperData to ScraperDataDB - scraper_data_db = ScraperDataDB(**new_data.scraper_data.model_dump()) - session.add(scraper_data_db) - await session.flush() # Flush the session to get the ID of the newly inserted scraper_data_db - - # Map Player to PlayerDB - player_db = PlayerDB(**new_data.player.model_dump()) - session.add(player_db) - - # Map each PlayerSkills to PlayerSkillsDB - for player_skill in new_data.player_skills: - player_skill_db = PlayerSkillsDB(scraper_id=scraper_data_db.scraper_id, **player_skill.model_dump()) - session.add(player_skill_db) - - # Map each PlayerActivities to PlayerActivitiesDB - for player_activity in new_data.player_activities: - player_activity_db = PlayerActivitiesDB(scraper_id=scraper_data_db.scraper_id, **player_activity.model_dump()) - session.add(player_activity_db) - - await session.commit() - - except (OperationalError, IntegrityError) as e: - for message in batch: - await error_queue.put(message) - - logger.error({"error": e}) - logger.error({"error": e}) - logger.info(f"error_qsize={error_queue.qsize()}, {message=}") - except Exception as e: - for message in batch: - await error_queue.put(message) - - logger.error({"error": e}) - logger.debug(f"Traceback: \n{traceback.format_exc()}") - logger.info(f"error_qsize={error_queue.qsize()}, {message=}") - -def transform_message_to_new_data(msg): - scraper_data = ScraperData(player_id=msg.player.id) - - # Create a set of the attribute names in msg.hiscores - hiscores_attributes = set(msg.hiscores.__dict__.keys()) - - # Only create PlayerSkills and PlayerActivities objects for skills and activities in hiscores_attributes - player_skills = [PlayerSkills(skill_id=skill.skill_id, skill_value=getattr(msg.hiscores, skill.skill_name)) for skill in SKILL_NAMES if skill.skill_name in hiscores_attributes] - player_activities = [PlayerActivities(activity_id=activity.activity_id, activity_value=getattr(msg.hiscores, activity.activity_name)) for activity in ACTIVITY_NAMES if activity.activity_name in hiscores_attributes] - - player = Player(**msg.player.model_dump()) - new_data = NewDataSchema(scraper_data=scraper_data, player_skills=player_skills, player_activities=player_activities, player=player) - - return new_data - -async def update_skill_names(session: AsyncSession): - global SKILL_NAMES, SKILL_NAMES_LOCK - - async with SKILL_NAMES_LOCK: - if SKILL_NAMES is None or not SKILL_NAMES: - try: - skill_records = await session.execute(select(SkillsDB)) - SKILL_NAMES = [Skills(**record.__dict__) for record in skill_records.scalars().all()] - # print(SKILL_NAMES) - except Exception as e: - print(f"Error updating skill names: {e}") - -async def update_activity_names(session: AsyncSession): - global 
ACTIVITY_NAMES, ACTIVITY_NAMES_LOCK - - async with ACTIVITY_NAMES_LOCK: - try: - if ACTIVITY_NAMES is None or not ACTIVITY_NAMES: - activity_records = await session.execute(select(ActivitiesDB)) - ACTIVITY_NAMES = [Activities(**record.__dict__) for record in activity_records.scalars().all()] - # print(ACTIVITY_NAMES) - except Exception as e: - print(f"Error updating activity names: {e}") - -async def process_data_v1(receive_queue: Queue, error_queue: Queue): +# async def insert_data_v2(batch: list[dict], error_queue: Queue): +# """ +# 1. check for duplicates in scraper_data[player_id, record_date], remove all duplicates +# 2. start transaction +# 3. for each player insert into scraper_data +# 4. for each player get the scraper_id from scraper_data +# 5. insert into player_skills (scraper_id, skill_id) values (), () +# 6. insert into player_activities (scraper_id, activity_id) values (), () + +# step 5 & 6 must be batched for all players at once +# """ +# try: +# messages = [Message(**msg) for msg in batch] +# # session: AsyncSession = await get_session() +# # transformed_data: list[NewDataSchema] = await transform_data(messages, session) +# new_data_list = [] + +# # remove duplicates and remove players with no hiscores +# messages = [msg for msg in messages if msg.hiscores is not None] +# messages = [msg for msg in messages if msg.player is not None] + +# start_time = time.time() + +# new_data_list = [transform_message_to_new_data(msg) for msg in messages] +# end_time = time.time() + +# session: AsyncSession = await get_session() +# print(f"Time taken: {end_time - start_time} seconds") +# # async with session.begin(): +# for new_data in new_data_list: +# # Map ScraperData to ScraperDataDB +# scraper_data_db = ScraperDataDB(**new_data.scraper_data.model_dump()) +# session.add(scraper_data_db) +# await ( +# session.flush() +# ) # Flush the session to get the ID of the newly inserted scraper_data_db + +# # Map Player to PlayerDB +# player_db = PlayerDB(**new_data.player.model_dump()) +# session.add(player_db) + +# # Map each PlayerSkills to PlayerSkillsDB +# for player_skill in new_data.player_skills: +# player_skill_db = PlayerSkillsDB( +# scraper_id=scraper_data_db.scraper_id, **player_skill.model_dump() +# ) +# session.add(player_skill_db) + +# # Map each PlayerActivities to PlayerActivitiesDB +# for player_activity in new_data.player_activities: +# player_activity_db = PlayerActivitiesDB( +# scraper_id=scraper_data_db.scraper_id, +# **player_activity.model_dump(), +# ) +# session.add(player_activity_db) + +# await session.commit() + +# except (OperationalError, IntegrityError) as e: +# for message in batch: +# await error_queue.put(message) + +# logger.error({"error": e}) +# logger.error({"error": e}) +# logger.info(f"error_qsize={error_queue.qsize()}, {message=}") +# except Exception as e: +# for message in batch: +# await error_queue.put(message) + +# logger.error({"error": e}) +# logger.debug(f"Traceback: \n{traceback.format_exc()}") +# logger.info(f"error_qsize={error_queue.qsize()}, {message=}") + + +async def process_data(receive_queue: Queue, error_queue: Queue): # Initialize counter and start time counter = 0 start_time = time.time() @@ -329,7 +160,7 @@ async def process_data_v1(receive_queue: Queue, error_queue: Queue): start_time=start_time, _queue=receive_queue, topic="scraper", - interval=15 + interval=15, ) # Check if queue is empty @@ -338,79 +169,29 @@ async def process_data_v1(receive_queue: Queue, error_queue: Queue): continue # Get a message from the chosen queue - 
message: dict = await receive_queue.get() - - #TODO fix test data + message = await receive_queue.get() + message = Message(**message) + + # TODO fix test data if settings.ENV != "PRD": - player = message.get("player") - player_id = player.get("id") + player_id = message.player.id MIN_PLAYER_ID = 0 MAX_PLAYER_ID = 300 if not (MIN_PLAYER_ID < player_id <= MAX_PLAYER_ID): continue - + # batch message batch.append(message) now = time.time() # insert data in batches of N or interval of N - if len(batch) > 100 or now-start_time > 15: + if len(batch) > 100 or now - start_time > 15: async with semaphore: await insert_data_v1(batch=batch, error_queue=error_queue) + # await insert_data_v2(batch=batch, error_queue=error_queue) batch = [] - - receive_queue.task_done() - counter += 1 - -async def process_data_v2(receive_queue: Queue, error_queue: Queue): - # Initialize counter and start time - counter = 0 - start_time = time.time() - - # limit the number of async insert_data calls - semaphore = asyncio.Semaphore(5) - - batch = [] - # Run indefinitely - while True: - start_time, counter = log_speed( - counter=counter, - start_time=start_time, - _queue=receive_queue, - topic="scraper", - interval=15 - ) - - # Check if queue is empty - if receive_queue.empty(): - await asyncio.sleep(1) - continue - # Get a message from the chosen queue - message: dict = await receive_queue.get() - - #TODO fix test data - if settings.ENV != "PRD": - player = message.get("player") - player_id = player.get("id") - MIN_PLAYER_ID = 0 - MAX_PLAYER_ID = 300 - if not (MIN_PLAYER_ID < player_id <= MAX_PLAYER_ID): - continue - - # batch message - batch.append(message) - - now = time.time() - - # insert data in batches of N or interval of N - if len(batch) > 100 or now-start_time > 15: - async with semaphore: - task = asyncio.create_task(insert_data_v2(batch=batch, error_queue=error_queue)) - await task - batch = [] - receive_queue.task_done() counter += 1 @@ -429,27 +210,17 @@ async def main(): ) ) asyncio.create_task( - my_kafka.send_messages(topic="scraper", producer=producer, send_queue=send_queue) + my_kafka.send_messages( + topic="scraper", producer=producer, send_queue=send_queue + ) ) - # loop.create_task( - # process_data_v1(receive_queue=receive_queue, error_queue=send_queue) - # ) - asyncio.create_task( - process_data_v2(receive_queue=receive_queue, error_queue=send_queue) + process_data(receive_queue=receive_queue, error_queue=send_queue) ) while True: await asyncio.sleep(60) -async def init(): - session = await get_session() - try: - await update_skill_names(session) - await update_activity_names(session) - finally: - await session.close() if __name__ == "__main__": - asyncio.run(init()) asyncio.run(main()) From 77550ca3beea7743e9ddf8609eb5d12c3e48ac76 Mon Sep 17 00:00:00 2001 From: extreme4all <> Date: Fri, 1 Mar 2024 01:18:12 +0100 Subject: [PATCH 31/39] high level outline --- src/app/repositories/scraper_data.py | 79 ++++++++++++++++++++++++--- src/app/schemas/input/scraper_data.py | 12 ---- src/app/schemas/scraper_data.py | 17 ++++++ src/database/models/scraper_data.py | 8 ++- src/main.py | 25 +++++++++ 5 files changed, 118 insertions(+), 23 deletions(-) delete mode 100644 src/app/schemas/input/scraper_data.py create mode 100644 src/app/schemas/scraper_data.py diff --git a/src/app/repositories/scraper_data.py b/src/app/repositories/scraper_data.py index 5959083..c3587ea 100644 --- a/src/app/repositories/scraper_data.py +++ b/src/app/repositories/scraper_data.py @@ -1,8 +1,13 @@ -from sqlalchemy import insert, 
select +from datetime import datetime + +from app.schemas.scraper_data import ScraperCreate, ScraperData +from sqlalchemy import and_, insert, select from sqlalchemy.ext.asyncio import AsyncSession from src.app.repositories.abc import ABCRepo -from src.app.schemas.input.scraper_data import ScraperData +from src.app.repositories.activities import PlayerActivities +from src.app.repositories.skills import PlayerSkills +from src.app.schemas.input.highscore import PlayerHiscoreData from src.database.models.scraper_data import ScraperData as ScraperDataDB @@ -25,13 +30,71 @@ async def request(self, scraper_id: int = None) -> list[ScraperData]: data = await data.all() return [ScraperData(**d) for d in data] - async def create(self, data: list[ScraperData]) -> None: + async def create( + self, highscore_data: list[PlayerHiscoreData], player_data: list + ) -> None: + repo_skills = PlayerSkills() + repo_activities = PlayerActivities() + + date_fmt = "%Y-%m-%d %HH:%MM:%SS" + data = [ + { + "record": ScraperCreate( + player_id=d.Player_id, + created_at=datetime.strptime(d.timestamp, date_fmt), + ), + "highscore": d.model_dump(), + } + for d in highscore_data + ] + table = ScraperDataDB - sql = insert(table) - sql = sql.values([d.model_dump() for d in data]) - sql = sql.prefix_with("IGNORE") + + sql_insert = insert(table) + sql_insert = sql_insert.values([d["record"].model_dump() for d in data]) + sql_insert = sql_insert.prefix_with("IGNORE") + + sql_select = select(table) + sql_insert = sql_select.where async with await self._get_session() as session: session: AsyncSession - data = await session.execute(sql) - return + # insert scraperdata + await session.execute(sql_insert) + + # get scraper_id + for d in data: + _d: ScraperCreate = d["record"] + date = datetime.fromisoformat(_d.created_at).date() + sql_select = select(table) + sql_select = sql_select.where( + and_(table.player_id == _d.player_id, table.record_date == date) + ) + result = await session.execute(sql_select) + result = result.first() + scraper_id = "" # TODO from result + SKILLS = [] # hardcoded + ACTIVITIES = [] # hardcoded + + d["skills"] = [ + { + "name": s, + "skill_value": d["highscore"].get(s), + "scraper_id": scraper_id, + } + for s in SKILLS + ] + d["activities"] = [ + { + "name": a, + "activity_value": d["highscore"].get(a), + "scraper_id": scraper_id, + } + for a in ACTIVITIES + ] + d.pop("highscore") # cleanup memory + # TODO: bulk insert values with repo skills & activities + # REPO skills & activities must convert skill/activity_name to id + # REPO skills & activities must take a pydantic class so we should convert above to class name:str, skill/activity_value:int, scraper_id:int + await session.commit() + return data diff --git a/src/app/schemas/input/scraper_data.py b/src/app/schemas/input/scraper_data.py deleted file mode 100644 index 8f1e1a9..0000000 --- a/src/app/schemas/input/scraper_data.py +++ /dev/null @@ -1,12 +0,0 @@ -from typing import Optional - -from pydantic import BaseModel, ConfigDict -from datetime import datetime - -class ScraperData(BaseModel): - model_config = ConfigDict(from_attributes=True) - - player_id: int - scraper_id: Optional[int] = None - created_at: str = datetime.now().isoformat() #datetime.now().strftime('%Y-%m-%d %H:%M:%S') - # record_date: Optional[str] = None diff --git a/src/app/schemas/scraper_data.py b/src/app/schemas/scraper_data.py new file mode 100644 index 0000000..77aecf2 --- /dev/null +++ b/src/app/schemas/scraper_data.py @@ -0,0 +1,17 @@ +from datetime import date, datetime + 
+from pydantic import BaseModel, ConfigDict + + +class ScraperCreate(BaseModel): + player_id: int + created_at: datetime + + +class ScraperData(ScraperCreate): + model_config = ConfigDict(from_attributes=True) + + # ScraperCreate.player_id + # ScraperCreate.created_at + scraper_id: int + record_date: date diff --git a/src/database/models/scraper_data.py b/src/database/models/scraper_data.py index 3e81776..012d70b 100644 --- a/src/database/models/scraper_data.py +++ b/src/database/models/scraper_data.py @@ -1,3 +1,6 @@ +from datetime import datetime +from typing import Optional + from database.database import Base from sqlalchemy import ( Column, @@ -7,8 +10,7 @@ ) from sqlalchemy.dialects.mysql import BIGINT, SMALLINT from sqlalchemy.schema import UniqueConstraint -from datetime import datetime -from typing import Optional + # CREATE TABLE scraper_data ( # scraper_id BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY, @@ -23,7 +25,7 @@ class ScraperData(Base): scraper_id = Column(BIGINT, primary_key=True, autoincrement=True) created_at = Column(DateTime, nullable=False, server_default=func.now()) player_id = Column(SMALLINT, nullable=False) - # record_date = Column(Date, nullable=True, server_onupdate=func.current_date()) + record_date = Column(Date, nullable=True) # __table_args__ = ( # UniqueConstraint("player_id", "record_date", name="unique_player_per_day"), diff --git a/src/main.py b/src/main.py index c99e875..415f936 100644 --- a/src/main.py +++ b/src/main.py @@ -8,6 +8,7 @@ # schemas import from app.repositories.highscore import HighscoreRepo +from app.repositories.scraper_data import ScraperDataRepo from app.schemas.input.message import Message from core.config import settings from database.database import get_session @@ -70,6 +71,30 @@ async def insert_data_v1(batch: list[Message], error_queue: Queue): logger.info(f"error_qsize={error_queue.qsize()}, {message=}") +async def insert_data_v2(batch: list[Message], error_queue: Queue): + try: + highscores = [msg.hiscores for msg in batch if msg.hiscores] + players = [msg.player for msg in batch if msg.player] + + logger.info(f"Received: {len(players)=}, {len(highscores)=}") + + repo = ScraperDataRepo() + await repo.create(highscore_data=highscores, player_data=players) + except (OperationalError, IntegrityError) as e: + for message in batch: + await error_queue.put(message) + + logger.error({"error": e}) + logger.info(f"error_qsize={error_queue.qsize()}, {message=}") + except Exception as e: + for message in batch: + await error_queue.put(message) + + logger.error({"error": e}) + logger.debug(f"Traceback: \n{traceback.format_exc()}") + logger.info(f"error_qsize={error_queue.qsize()}, {message=}") + + # async def insert_data_v2(batch: list[dict], error_queue: Queue): # """ # 1. check for duplicates in scraper_data[player_id, record_date], remove all duplicates From 69049e8ef806ab97e9b21a422b1c1c0f4ee1401e Mon Sep 17 00:00:00 2001 From: extreme4all <> Date: Fri, 1 Mar 2024 01:19:18 +0100 Subject: [PATCH 32/39] cleanup --- src/main.py | 74 ----------------------------------------------------- 1 file changed, 74 deletions(-) diff --git a/src/main.py b/src/main.py index 415f936..5a188c5 100644 --- a/src/main.py +++ b/src/main.py @@ -95,80 +95,6 @@ async def insert_data_v2(batch: list[Message], error_queue: Queue): logger.info(f"error_qsize={error_queue.qsize()}, {message=}") -# async def insert_data_v2(batch: list[dict], error_queue: Queue): -# """ -# 1. check for duplicates in scraper_data[player_id, record_date], remove all duplicates -# 2. 
start transaction -# 3. for each player insert into scraper_data -# 4. for each player get the scraper_id from scraper_data -# 5. insert into player_skills (scraper_id, skill_id) values (), () -# 6. insert into player_activities (scraper_id, activity_id) values (), () - -# step 5 & 6 must be batched for all players at once -# """ -# try: -# messages = [Message(**msg) for msg in batch] -# # session: AsyncSession = await get_session() -# # transformed_data: list[NewDataSchema] = await transform_data(messages, session) -# new_data_list = [] - -# # remove duplicates and remove players with no hiscores -# messages = [msg for msg in messages if msg.hiscores is not None] -# messages = [msg for msg in messages if msg.player is not None] - -# start_time = time.time() - -# new_data_list = [transform_message_to_new_data(msg) for msg in messages] -# end_time = time.time() - -# session: AsyncSession = await get_session() -# print(f"Time taken: {end_time - start_time} seconds") -# # async with session.begin(): -# for new_data in new_data_list: -# # Map ScraperData to ScraperDataDB -# scraper_data_db = ScraperDataDB(**new_data.scraper_data.model_dump()) -# session.add(scraper_data_db) -# await ( -# session.flush() -# ) # Flush the session to get the ID of the newly inserted scraper_data_db - -# # Map Player to PlayerDB -# player_db = PlayerDB(**new_data.player.model_dump()) -# session.add(player_db) - -# # Map each PlayerSkills to PlayerSkillsDB -# for player_skill in new_data.player_skills: -# player_skill_db = PlayerSkillsDB( -# scraper_id=scraper_data_db.scraper_id, **player_skill.model_dump() -# ) -# session.add(player_skill_db) - -# # Map each PlayerActivities to PlayerActivitiesDB -# for player_activity in new_data.player_activities: -# player_activity_db = PlayerActivitiesDB( -# scraper_id=scraper_data_db.scraper_id, -# **player_activity.model_dump(), -# ) -# session.add(player_activity_db) - -# await session.commit() - -# except (OperationalError, IntegrityError) as e: -# for message in batch: -# await error_queue.put(message) - -# logger.error({"error": e}) -# logger.error({"error": e}) -# logger.info(f"error_qsize={error_queue.qsize()}, {message=}") -# except Exception as e: -# for message in batch: -# await error_queue.put(message) - -# logger.error({"error": e}) -# logger.debug(f"Traceback: \n{traceback.format_exc()}") -# logger.info(f"error_qsize={error_queue.qsize()}, {message=}") - - async def process_data(receive_queue: Queue, error_queue: Queue): # Initialize counter and start time counter = 0 From c8ba48d074ccca8b193ae4dbf95562c401e53f7a Mon Sep 17 00:00:00 2001 From: extreme4all <> Date: Fri, 1 Mar 2024 01:20:59 +0100 Subject: [PATCH 33/39] ruff fix --- src/app/repositories/highscore.py | 1 - src/database/models/activities.py | 1 - src/database/models/scraper_data.py | 3 --- src/database/models/skills.py | 1 - src/main.py | 6 ------ 5 files changed, 12 deletions(-) diff --git a/src/app/repositories/highscore.py b/src/app/repositories/highscore.py index 8d90dd0..ad5f7b6 100644 --- a/src/app/repositories/highscore.py +++ b/src/app/repositories/highscore.py @@ -1,7 +1,6 @@ from app.repositories.abc import ABCRepo from app.schemas.input.highscore import PlayerHiscoreData from app.schemas.input.player import Player -from database.database import SessionFactory from database.models.highscores import PlayerHiscoreData as PlayerHiscoreDataDB from database.models.player import Player as PlayerDB from sqlalchemy import insert, select, update diff --git a/src/database/models/activities.py 
b/src/database/models/activities.py index b44089c..1053c13 100644 --- a/src/database/models/activities.py +++ b/src/database/models/activities.py @@ -6,7 +6,6 @@ String, ) from sqlalchemy.dialects.mysql import BIGINT, TINYINT -from sqlalchemy.schema import UniqueConstraint # CREATE TABLE activities ( diff --git a/src/database/models/scraper_data.py b/src/database/models/scraper_data.py index 012d70b..04f77df 100644 --- a/src/database/models/scraper_data.py +++ b/src/database/models/scraper_data.py @@ -1,5 +1,3 @@ -from datetime import datetime -from typing import Optional from database.database import Base from sqlalchemy import ( @@ -9,7 +7,6 @@ func, ) from sqlalchemy.dialects.mysql import BIGINT, SMALLINT -from sqlalchemy.schema import UniqueConstraint # CREATE TABLE scraper_data ( diff --git a/src/database/models/skills.py b/src/database/models/skills.py index 16132f5..b664999 100644 --- a/src/database/models/skills.py +++ b/src/database/models/skills.py @@ -6,7 +6,6 @@ String, ) from sqlalchemy.dialects.mysql import BIGINT, TINYINT -from sqlalchemy.schema import UniqueConstraint # CREATE TABLE skills ( diff --git a/src/main.py b/src/main.py index 5a188c5..d1903e2 100644 --- a/src/main.py +++ b/src/main.py @@ -11,13 +11,7 @@ from app.repositories.scraper_data import ScraperDataRepo from app.schemas.input.message import Message from core.config import settings -from database.database import get_session -from database.models.activities import PlayerActivities as PlayerActivitiesDB -from database.models.player import Player as PlayerDB -from database.models.scraper_data import ScraperData as ScraperDataDB -from database.models.skills import PlayerSkills as PlayerSkillsDB from sqlalchemy.exc import IntegrityError, OperationalError -from sqlalchemy.ext.asyncio import AsyncSession logger = logging.getLogger(__name__) From 6727d04122840e4ffe97da9ce1ba425d1e10c46f Mon Sep 17 00:00:00 2001 From: extreme4all <40169115+extreme4all@users.noreply.github.com> Date: Sun, 3 Mar 2024 00:59:41 +0100 Subject: [PATCH 34/39] fully working --- src/app/repositories/activities.py | 33 +++++-- src/app/repositories/highscore.py | 40 ++++---- src/app/repositories/scraper_data.py | 140 ++++++++++++--------------- src/app/repositories/skills.py | 35 +++++-- src/main.py | 54 ++++++++++- 5 files changed, 186 insertions(+), 116 deletions(-) diff --git a/src/app/repositories/activities.py b/src/app/repositories/activities.py index bbb7336..78f4d0a 100644 --- a/src/app/repositories/activities.py +++ b/src/app/repositories/activities.py @@ -1,11 +1,10 @@ +from app.repositories.abc import ABCRepo +from app.schemas.input.activities import Activities, PlayerActivities +from database.models.activities import Activities as ActivitiesDB +from database.models.activities import PlayerActivities as PlayerActivitiesDB from sqlalchemy import insert, select from sqlalchemy.ext.asyncio import AsyncSession -from src.app.repositories.abc import ABCRepo -from src.app.schemas.input.activities import PlayerActivities -from src.database.models.activities import Activities as ActivitiesDB -from src.database.models.activities import PlayerActivities as PlayerActivitiesDB - class ActivitiesRepo(ABCRepo): """Repository for managing activity data.""" @@ -14,7 +13,10 @@ def __init__(self) -> None: """Initializes the ActivitiesRepo instance.""" super().__init__() - async def request(self, activity_id: int = None) -> list[ActivitiesDB]: + async def create(self, data): + return await super().create(data) + + async def request(self, activity_id: 
int = None) -> list[Activities]: """Retrieves activity data from the database. Args: @@ -32,8 +34,14 @@ async def request(self, activity_id: int = None) -> list[ActivitiesDB]: async with await self._get_session() as session: session: AsyncSession # Type annotation for clarity data = await session.execute(sql) - data = await data.all() - return data + data = data.scalars() + return [Activities(**d.__dict__) for d in data] + + async def update(self, id, data): + return await super().update(id, data) + + async def delete(self, id): + return await super().delete(id) class PlayerActivitiesRepo(ABCRepo): @@ -56,3 +64,12 @@ async def create(self, data: list[PlayerActivities]) -> None: async with await self._get_session() as session: session: AsyncSession # Type annotation for clarity await session.execute(sql) + + async def request(self, id): + return await super().request(id) + + async def update(self, id, data): + return await super().update(id, data) + + async def delete(self, id): + return await super().delete(id) diff --git a/src/app/repositories/highscore.py b/src/app/repositories/highscore.py index ad5f7b6..84a8c05 100644 --- a/src/app/repositories/highscore.py +++ b/src/app/repositories/highscore.py @@ -14,26 +14,6 @@ def __init__(self) -> None: """Initializes the HighscoreRepo instance.""" super().__init__() - async def request(self, id: list[int] = None) -> list[PlayerHiscoreDataDB]: - """Retrieves highscore data from the database. - - Args: - id: Optional list of highscore IDs to filter by. - - Returns: - A list of PlayerHiscoreDataDB objects representing the retrieved highscores. - """ - table = PlayerHiscoreDataDB - sql = select(table) - if id: - sql = sql.where(table.id.in_(id)) - - async with await self._get_session() as session: - session: AsyncSession # Type annotation for clarity - data = await session.execute(sql) - data = await data.all() - return data - async def create( self, highscore_data: list[PlayerHiscoreData], player_data: list[Player] ) -> None: @@ -62,6 +42,26 @@ async def create( await session.execute(sql_update) # Update player data await session.commit() + async def request(self, id: list[int] = None) -> list[PlayerHiscoreDataDB]: + """Retrieves highscore data from the database. + + Args: + id: Optional list of highscore IDs to filter by. + + Returns: + A list of PlayerHiscoreDataDB objects representing the retrieved highscores. 
+ """ + table = PlayerHiscoreDataDB + sql = select(table) + if id: + sql = sql.where(table.id.in_(id)) + + async with await self._get_session() as session: + session: AsyncSession # Type annotation for clarity + data = await session.execute(sql) + data = await data.all() + return data + async def update(self, id, data): return await super().update(id, data) diff --git a/src/app/repositories/scraper_data.py b/src/app/repositories/scraper_data.py index c3587ea..e919faa 100644 --- a/src/app/repositories/scraper_data.py +++ b/src/app/repositories/scraper_data.py @@ -1,14 +1,18 @@ -from datetime import datetime +import logging +from app.repositories.abc import ABCRepo +from app.schemas.input.activities import PlayerActivities +from app.schemas.input.player import Player +from app.schemas.input.skills import PlayerSkills from app.schemas.scraper_data import ScraperCreate, ScraperData -from sqlalchemy import and_, insert, select +from database.models.activities import PlayerActivities as PlayerActivitiesDB +from database.models.player import Player as PlayerDB +from database.models.scraper_data import ScraperData as ScraperDataDB +from database.models.skills import PlayerSkills as PlayerSkillsDB +from sqlalchemy import and_, insert, select, update from sqlalchemy.ext.asyncio import AsyncSession -from src.app.repositories.abc import ABCRepo -from src.app.repositories.activities import PlayerActivities -from src.app.repositories.skills import PlayerSkills -from src.app.schemas.input.highscore import PlayerHiscoreData -from src.database.models.scraper_data import ScraperData as ScraperDataDB +logger = logging.getLogger(__name__) class ScraperDataRepo(ABCRepo): @@ -18,83 +22,65 @@ def __init__(self) -> None: """Initializes the SkillsRepo instance.""" super().__init__() - async def request(self, scraper_id: int = None) -> list[ScraperData]: - """ """ - table = ScraperDataDB - sql = select(table) - sql = sql.where(ScraperDataDB.scraper_id == scraper_id) - - async with await self._get_session() as session: - session: AsyncSession - data = await session.execute(sql) - data = await data.all() - return [ScraperData(**d) for d in data] + async def request(self, id): + return await super().request(id) async def create( - self, highscore_data: list[PlayerHiscoreData], player_data: list + self, + highscore_data: list[ + tuple[list[PlayerSkills], list[PlayerActivities], ScraperCreate] + ], + player_data: list[Player], ) -> None: - repo_skills = PlayerSkills() - repo_activities = PlayerActivities() - - date_fmt = "%Y-%m-%d %HH:%MM:%SS" - data = [ - { - "record": ScraperCreate( - player_id=d.Player_id, - created_at=datetime.strptime(d.timestamp, date_fmt), - ), - "highscore": d.model_dump(), - } - for d in highscore_data - ] - table = ScraperDataDB - sql_insert = insert(table) - sql_insert = sql_insert.values([d["record"].model_dump() for d in data]) - sql_insert = sql_insert.prefix_with("IGNORE") - - sql_select = select(table) - sql_insert = sql_select.where - async with await self._get_session() as session: session: AsyncSession - # insert scraperdata - await session.execute(sql_insert) - - # get scraper_id - for d in data: - _d: ScraperCreate = d["record"] - date = datetime.fromisoformat(_d.created_at).date() - sql_select = select(table) - sql_select = sql_select.where( - and_(table.player_id == _d.player_id, table.record_date == date) + skills = [] + activities = [] + for data in highscore_data: + # insert scraperdata + await session.execute( + insert(table).values(data[2].model_dump()).prefix_with("ignore") 
+ ) + scraper_record = await session.execute( + select(table).where( + and_( + table.player_id == data[2].player_id, + table.record_date == data[2].created_at.date(), + ) + ) ) - result = await session.execute(sql_select) - result = result.first() - scraper_id = "" # TODO from result - SKILLS = [] # hardcoded - ACTIVITIES = [] # hardcoded + scraper_record = scraper_record.scalars() + scraper_record = [ScraperData(**s.__dict__) for s in scraper_record] + assert len(scraper_record) == 1 + scraper_record = scraper_record[0] + + for d in data[0]: + d.scraper_id = scraper_record.scraper_id + skills.append(d.model_dump()) + for d in data[1]: + d.scraper_id = scraper_record.scraper_id + activities.append(d.model_dump()) + + await session.execute( + insert(PlayerActivitiesDB).values(activities).prefix_with("ignore") + ) + await session.execute( + insert(PlayerSkillsDB).values(skills).prefix_with("ignore") + ) - d["skills"] = [ - { - "name": s, - "skill_value": d["highscore"].get(s), - "scraper_id": scraper_id, - } - for s in SKILLS - ] - d["activities"] = [ - { - "name": a, - "activity_value": d["highscore"].get(a), - "scraper_id": scraper_id, - } - for a in ACTIVITIES - ] - d.pop("highscore") # cleanup memory - # TODO: bulk insert values with repo skills & activities - # REPO skills & activities must convert skill/activity_name to id - # REPO skills & activities must take a pydantic class so we should convert above to class name:str, skill/activity_value:int, scraper_id:int + for player in player_data: + await session.execute( + update(PlayerDB) + .values(player.model_dump()) + .where(PlayerDB.id == player.id) + ) await session.commit() - return data + return None + + async def update(self, id, data): + return await super().update(id, data) + + async def delete(self, id): + return await super().delete(id) diff --git a/src/app/repositories/skills.py b/src/app/repositories/skills.py index 5fe5c14..aa98ec2 100644 --- a/src/app/repositories/skills.py +++ b/src/app/repositories/skills.py @@ -1,10 +1,13 @@ +import logging + +from app.repositories.abc import ABCRepo +from app.schemas.input.skills import PlayerSkills, Skills +from database.models.skills import PlayerSkills as PlayerSkillsDB +from database.models.skills import Skills as SkillsDB from sqlalchemy import insert, select from sqlalchemy.ext.asyncio import AsyncSession -from src.app.repositories.abc import ABCRepo -from src.app.schemas.input.skills import PlayerSkills -from src.database.models.skills import PlayerSkills as PlayerSkillsDB -from src.database.models.skills import Skills as SkillsDB +logger = logging.getLogger(__name__) class SkillsRepo(ABCRepo): @@ -14,7 +17,10 @@ def __init__(self) -> None: """Initializes the SkillsRepo instance.""" super().__init__() - async def request(self, skill_id: int = None) -> list[SkillsDB]: + async def create(self, data): + return await super().create(data) + + async def request(self, skill_id: int = None) -> list[Skills]: """Retrieves skill data from the database. 
Args: @@ -32,8 +38,14 @@ async def request(self, skill_id: int = None) -> list[SkillsDB]: async with await self._get_session() as session: session: AsyncSession # Type annotation for clarity data = await session.execute(sql) - data = await data.all() - return data + data = data.scalars() + return [Skills(**d.__dict__) for d in data] + + async def update(self, id, data): + return await super().update(id, data) + + async def delete(self, id): + return await super().delete(id) class PlayerSkillsRepo(ABCRepo): @@ -56,3 +68,12 @@ async def create(self, data: list[PlayerSkills]) -> None: async with await self._get_session() as session: session: AsyncSession # Type annotation for clarity await session.execute(sql) + + async def request(self, id): + return await super().request(id) + + async def update(self, id, data): + return await super().update(id, data) + + async def delete(self, id): + return await super().delete(id) diff --git a/src/main.py b/src/main.py index d1903e2..f41240a 100644 --- a/src/main.py +++ b/src/main.py @@ -5,11 +5,16 @@ from asyncio import Queue import my_kafka as my_kafka +from app.repositories.activities import ActivitiesRepo # schemas import from app.repositories.highscore import HighscoreRepo from app.repositories.scraper_data import ScraperDataRepo +from app.repositories.skills import SkillsRepo +from app.schemas.input.activities import Activities, PlayerActivities from app.schemas.input.message import Message +from app.schemas.input.skills import PlayerSkills, Skills +from app.schemas.scraper_data import ScraperCreate from core.config import settings from sqlalchemy.exc import IntegrityError, OperationalError @@ -72,8 +77,48 @@ async def insert_data_v2(batch: list[Message], error_queue: Queue): logger.info(f"Received: {len(players)=}, {len(highscores)=}") - repo = ScraperDataRepo() - await repo.create(highscore_data=highscores, player_data=players) + scraper_repo = ScraperDataRepo() + + skills_repo = SkillsRepo() + activities_repo = ActivitiesRepo() + + skills = {s.skill_name: s for s in await skills_repo.request()} + + activities = {a.activity_name: a for a in await activities_repo.request()} + + highscore_data = [] + scraper_data = [] + for highscore in highscores: + player_skills: list[PlayerSkills] = [] + player_activities: list[PlayerActivities] = [] + scraper_data = ScraperCreate( + player_id=highscore.Player_id, created_at=highscore.timestamp + ) + _highscore = highscore.model_dump() + assert isinstance(_highscore, dict) + logger.info(_highscore) + for k, v in _highscore.items(): + if k in skills.keys(): + skill = skills.get(k) + assert isinstance(skill, Skills) + player_skills.append( + PlayerSkills( + scraper_id=None, skill_id=skill.skill_id, skill_value=v + ) + ) + if k in activities.keys(): + activity = activities.get(k) + assert isinstance(activity, Activities) + player_activities.append( + PlayerActivities( + scraper_id=None, + activity_id=activity.activity_id, + activity_value=v, + ) + ) + highscore_data.append((player_skills, player_activities, scraper_data)) + logger.info(f"{highscore_data[0]}, {players[0]}") + await scraper_repo.create(highscore_data=highscore_data, player_data=players) except (OperationalError, IntegrityError) as e: for message in batch: await error_queue.put(message) @@ -133,8 +178,9 @@ async def process_data(receive_queue: Queue, error_queue: Queue): # insert data in batches of N or interval of N if len(batch) > 100 or now - start_time > 15: async with semaphore: - await insert_data_v1(batch=batch, error_queue=error_queue) - # 
await insert_data_v2(batch=batch, error_queue=error_queue) + # await insert_data_v1(batch=batch, error_queue=error_queue) + await insert_data_v2(batch=batch, error_queue=error_queue) + break batch = [] receive_queue.task_done() From 9132e877c473590a2c8c5dcbb79a45a042c61121 Mon Sep 17 00:00:00 2001 From: extreme4all <40169115+extreme4all@users.noreply.github.com> Date: Sun, 3 Mar 2024 11:20:44 +0100 Subject: [PATCH 35/39] remove breaks --- src/main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/main.py b/src/main.py index f41240a..85058fd 100644 --- a/src/main.py +++ b/src/main.py @@ -178,9 +178,8 @@ async def process_data(receive_queue: Queue, error_queue: Queue): # insert data in batches of N or interval of N if len(batch) > 100 or now - start_time > 15: async with semaphore: - # await insert_data_v1(batch=batch, error_queue=error_queue) + await insert_data_v1(batch=batch, error_queue=error_queue) await insert_data_v2(batch=batch, error_queue=error_queue) - break batch = [] receive_queue.task_done() From d08feca9e3dd04b2480bb8c52be8f1f28f7db16c Mon Sep 17 00:00:00 2001 From: extreme4all <40169115+extreme4all@users.noreply.github.com> Date: Sun, 3 Mar 2024 11:47:26 +0100 Subject: [PATCH 36/39] player_id => int --- mysql/docker-entrypoint-initdb.d/01_tables.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mysql/docker-entrypoint-initdb.d/01_tables.sql b/mysql/docker-entrypoint-initdb.d/01_tables.sql index b8c2c9c..f7bd314 100644 --- a/mysql/docker-entrypoint-initdb.d/01_tables.sql +++ b/mysql/docker-entrypoint-initdb.d/01_tables.sql @@ -128,8 +128,8 @@ CREATE TRIGGER `hiscore_date_OnInsert` BEFORE INSERT ON `playerHiscoreData` FOR CREATE TABLE scraper_data ( scraper_id BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY, created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, - player_id SMALLINT UNSIGNED NOT NULL, record_date DATE AS (DATE(created_at)) STORED, + player_id INT UNSIGNED NOT NULL, UNIQUE KEY unique_player_per_day (player_id, record_date) ); From 5f963b9de43086be3c31e981979b51921f58ad69 Mon Sep 17 00:00:00 2001 From: extreme4all <> Date: Sat, 9 Mar 2024 12:17:03 +0100 Subject: [PATCH 37/39] added execution time debug --- src/main.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/main.py b/src/main.py index 85058fd..736209e 100644 --- a/src/main.py +++ b/src/main.py @@ -3,6 +3,7 @@ import time import traceback from asyncio import Queue +from functools import wraps import my_kafka as my_kafka from app.repositories.activities import ActivitiesRepo @@ -21,6 +22,20 @@ logger = logging.getLogger(__name__) +def async_timeit(func): + @wraps(func) + async def wrapper(*args, **kwargs): + start_time = time.time() + result = await func(*args, **kwargs) + end_time = time.time() + logger.debug( + f"Execution time for {func.__name__}: {end_time - start_time} seconds" + ) + return result + + return wrapper + + def log_speed( counter: int, start_time: float, _queue: Queue, topic: str, interval: int = 15 ) -> tuple[float, int]: @@ -46,6 +61,7 @@ def log_speed( return time.time(), 0 +@async_timeit async def insert_data_v1(batch: list[Message], error_queue: Queue): try: highscores = [msg.hiscores for msg in batch if msg.hiscores] @@ -70,6 +86,7 @@ async def insert_data_v1(batch: list[Message], error_queue: Queue): logger.info(f"error_qsize={error_queue.qsize()}, {message=}") +@async_timeit async def insert_data_v2(batch: list[Message], error_queue: Queue): try: highscores = [msg.hiscores for msg in batch if msg.hiscores] From 
16e8c7fbb70cb8ca870d9dd3a566067e28ffb51e Mon Sep 17 00:00:00 2001 From: extreme4all <> Date: Sat, 9 Mar 2024 13:18:52 +0100 Subject: [PATCH 38/39] add timing decorator --- src/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main.py b/src/main.py index 736209e..124d6e0 100644 --- a/src/main.py +++ b/src/main.py @@ -113,7 +113,7 @@ async def insert_data_v2(batch: list[Message], error_queue: Queue): ) _highscore = highscore.model_dump() assert isinstance(_highscore, dict) - logger.info(_highscore) + # logger.info(_highscore) for k, v in _highscore.items(): if k in skills.keys(): skill = skills.get(k) @@ -134,7 +134,7 @@ async def insert_data_v2(batch: list[Message], error_queue: Queue): ) ) highscore_data.append((player_skills, player_activities, scraper_data)) - logger.info(f"{highscore_data[0]}, {players[0]}") + # logger.info(f"{highscore_data[0]}, {players[0]}") await scraper_repo.create(highscore_data=highscore_data, player_data=players) except (OperationalError, IntegrityError) as e: for message in batch: From 1940c98d9f1454636a587de24a1a7de7d6563232 Mon Sep 17 00:00:00 2001 From: extreme4all <40169115+extreme4all@users.noreply.github.com> Date: Sat, 16 Mar 2024 13:01:48 +0100 Subject: [PATCH 39/39] total will be greater than unsigned int --- mysql/docker-entrypoint-initdb.d/02_data.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mysql/docker-entrypoint-initdb.d/02_data.sql b/mysql/docker-entrypoint-initdb.d/02_data.sql index 69af2b2..172f519 100644 --- a/mysql/docker-entrypoint-initdb.d/02_data.sql +++ b/mysql/docker-entrypoint-initdb.d/02_data.sql @@ -55,7 +55,7 @@ SET ; INSERT INTO skills (skill_name) VALUES - ('total'), + -- ('total'), ('attack'), ('defence'), ('strength'),
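
Note on the final hunk: the 'total' seed row is removed because skill_value in player_skills was declared as INT UNSIGNED (maximum 4,294,967,295), while overall experience across 23 skills capped at 200,000,000 XP each can reach 4,600,000,000, which no longer fits in that column. A minimal sketch of the alternative fix (widening the column rather than dropping the row), given here only as an illustrative assumption and not as part of this patch series:

-- hypothetical follow-up: widen skill_value so a 'total' row would fit
ALTER TABLE player_skills
    MODIFY COLUMN skill_value BIGINT UNSIGNED NOT NULL DEFAULT 0;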