From 4981fd00e3b57e89d4c94a3633329a2e23f476db Mon Sep 17 00:00:00 2001 From: "Martindale, Nathan" Date: Sun, 22 Oct 2023 10:12:31 -0400 Subject: [PATCH 1/7] Add portions of boilerplate for sqlalchemy store --- curifactory/dbschema.py | 74 +++++++++++++++++++++++++++++++++++++++++ curifactory/store.py | 17 ++++++++++ requirements.txt | 1 + 3 files changed, 92 insertions(+) create mode 100644 curifactory/dbschema.py diff --git a/curifactory/dbschema.py b/curifactory/dbschema.py new file mode 100644 index 0000000..5a8f632 --- /dev/null +++ b/curifactory/dbschema.py @@ -0,0 +1,74 @@ +"""The metadata and table definitions for sqlalchemy for the experiment store.""" + +# TODO: will need to decide beteen Core and ORM usage +# from sqlalchemy import MetaData, Table, Column + + +# metadata_obj = MetaData() +# +# store_info = Table( +# +# ) +# + +# https://docs.sqlalchemy.org/en/20/tutorial/metadata.html + +from sqlalchemy.orm import ( + DeclarativeBase, + ForeignKey, + Mapped, + mapped_column, + relationship, +) + +# TODO: it may be useful to have a module that just tracks per-version schemas, +# and this file loads in the correct version based on dbinfo? + + +class Base(DeclarativeBase): + pass + + +class DBInfo(Base): + """We probably expect to have a "version", "user", and "machine" properties.""" + + __tablename__ = "db_info" + + id: Mapped[int] = mapped_column(primary_key=True) + key: Mapped[str] + value: Mapped[str] + + +class Run(Base): + __tablename__ = "run" + + id: Mapped[int] = mapped_column( + primary_key=True + ) # TODO id should maybe actually be ref name + reference: Mapped[str] + experiment_name: Mapped[str] + run_number: Mapped[int] + timestamp: Mapped[str] # TODO: maybe we can make this an actual datetime stamp now? + commit: Mapped[str] + param_files: Mapped[list["RunParamFileNames"]] = relationship(back_populates="run") + + # params + + full_store: Mapped[bool] + status: Mapped[str] + cli: Mapped[str] + hostname: Mapped[str] + user: Mapped[str] + notes: Mapped[str] + + +class RunParamFileNames(Base): + __tablename__ = "run_param_file_name" + + id: Mapped[int] = mapped_column(primary_key=True) + + run_id = mapped_column(ForeignKey("run.id")) + run: Mapped[Run] = relationship(back_populates="param_files") + + +# TODO: need a run param_file_name to paramname and param hash diff --git a/curifactory/store.py b/curifactory/store.py index 9e5941f..5de349c 100644 --- a/curifactory/store.py +++ b/curifactory/store.py @@ -3,9 +3,26 @@ import json import os +from sqlalchemy import create_engine + from curifactory import utils +class SQLStore: + """EXPERIMENTAL, making an sqlite version of the data below.""" + + def __init__(self, manager_cache_path: str): + self.path = manager_cache_path + """The location to store the ``store.db``.""" + + if self.path[-1] != "/": + self.path += "/" + + self.path += "store.db" + + self.engine = create_engine(f"sqlite:///{self.path}") + + class ManagerStore: """Manages the mini database of metadata on previous experiment runs. This is how we keep track of experiment run numbers etc. A metadata block for each run is stored in diff --git a/requirements.txt b/requirements.txt index 65645d2..9e1d71d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,3 +20,4 @@ pyarrow>=7 lxml openpyxl tables +sqlalchemy From ce9306f616c826ee90777f37192db91c85c595bc Mon Sep 17 00:00:00 2001 From: "Martindale, Nathan" Date: Sun, 29 Oct 2023 09:27:27 -0400 Subject: [PATCH 2/7] Change dbschema to use sqlalchemy core instead of orm --- curifactory/dbschema.py | 94 +++++++++++++---------------------------- 1 file changed, 29 insertions(+), 65 deletions(-) diff --git a/curifactory/dbschema.py b/curifactory/dbschema.py index 5a8f632..349ffc0 100644 --- a/curifactory/dbschema.py +++ b/curifactory/dbschema.py @@ -1,74 +1,38 @@ """The metadata and table definitions for sqlalchemy for the experiment store.""" # TODO: will need to decide beteen Core and ORM usage -# from sqlalchemy import MetaData, Table, Column +from sqlalchemy import Boolean, Column, DateTime, Integer, MetaData, String, Table +metadata_obj = MetaData() -# metadata_obj = MetaData() -# -# store_info = Table( -# -# ) -# - -# https://docs.sqlalchemy.org/en/20/tutorial/metadata.html - -from sqlalchemy.orm import ( - DeclarativeBase, - ForeignKey, - Mapped, - mapped_column, - relationship, +store_info = Table( + "store_info", + metadata_obj, + Column("key", String, primary_key=True), + Column("value", String), ) -# TODO: it may be useful to have a module that just tracks per-version schemas, -# and this file loads in the correct version based on dbinfo? - - -class Base(DeclarativeBase): - pass - - -class DBInfo(Base): - """We probably expect to have a "version", "user", and "machine" properties.""" - - __tablename__ = "db_info" - - id: Mapped[int] = mapped_column(primary_key=True) - key: Mapped[str] - value: Mapped[str] - - -class Run(Base): - __tablename__ = "run" - - id: Mapped[int] = mapped_column( - primary_key=True - ) # TODO id should maybe actually be ref name - reference: Mapped[str] - experiment_name: Mapped[str] - run_number: Mapped[int] - timestamp: Mapped[str] # TODO: maybe we can make this an actual datetime stamp now? - commit: Mapped[str] - param_files: Mapped[list["RunParamFileNames"]] = relationship(back_populates="run") - - # params - - full_store: Mapped[bool] - status: Mapped[str] - cli: Mapped[str] - hostname: Mapped[str] - user: Mapped[str] - notes: Mapped[str] - - -class RunParamFileNames(Base): - __tablename__ = "run_param_file_name" - - id: Mapped[int] = mapped_column(primary_key=True) - - run_id = mapped_column(ForeignKey("run.id")) - run: Mapped[Run] = relationship(back_populates="param_files") +runs_table = Table( + "run", + metadata_obj, + Column("id", Integer, primary_key=True), + Column("reference", String), + Column("experiment_name", String), + Column("run_number", Integer), + Column("timestamp", DateTime), + Column("commit", String), + Column("param_files", String), # NOTE: this will be a json.dumps, + # since this is likely to change in later cf versions, I don't want + # to bother correctly normalizing this part of the table, since I + # don't think there will be need to query on it anyway. + Column("full_store", Boolean), + Column("status", String), + Column("cli", String), + Column("hostname", String), + Column("user", String), + Column("notes", String), +) -# TODO: need a run param_file_name to paramname and param hash +# https://docs.sqlalchemy.org/en/20/tutorial/metadata.html +# https://docs.sqlalchemy.org/en/20/tutorial/data_insert.html#tutorial-core-insert From 9332e7e411de8a813bf8628712375337adede46a Mon Sep 17 00:00:00 2001 From: "Martindale, Nathan" Date: Sun, 29 Oct 2023 09:47:50 -0400 Subject: [PATCH 3/7] Implement get_run on sqlstore --- curifactory/store.py | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/curifactory/store.py b/curifactory/store.py index 5de349c..9ba5cb4 100644 --- a/curifactory/store.py +++ b/curifactory/store.py @@ -3,9 +3,10 @@ import json import os -from sqlalchemy import create_engine +from sqlalchemy import create_engine, select from curifactory import utils +from curifactory.dbschema import runs_table class SQLStore: @@ -22,6 +23,36 @@ def __init__(self, manager_cache_path: str): self.engine = create_engine(f"sqlite:///{self.path}") + def get_run(self, ref_name: str) -> tuple[dict, int]: + """Get the metadata block for the run with the specified reference name. + + Args: + ref_name (str): The run reference name, following the [experiment_name]_[run_number]_[timestamp] format. + + Returns: + A dictionary (metadata block) for the run with the requested reference name, and the + index of the run in the table. + """ + # https://docs.sqlalchemy.org/en/20/tutorial/data_select.html + + with self.engine.connect() as conn: + stmt = select(runs_table).where( + runs_table.c.reference == ref_name + ) # TODO: do I need to use prepare? + result = conn.execute(stmt) + + # if we didn't get any rows back, this run doesn't exist. + if len(result) == 0: + return None, -1 + + run = result[ + 0 + ]._asdict() # NOTE: documented function of namedtuple, _ here doesn't imply hidden/not intended for use + run.param_files = json.loads(run.param_files) + return run, run.id + + return None, -1 + class ManagerStore: """Manages the mini database of metadata on previous experiment runs. This is how we From 9c0e82ee75f851301bc6aff56130a26c9fb02c99 Mon Sep 17 00:00:00 2001 From: "Martindale, Nathan" Date: Mon, 6 Nov 2023 21:30:32 -0500 Subject: [PATCH 4/7] Add add_run function to SQLStore --- curifactory/dbschema.py | 2 ++ curifactory/store.py | 76 +++++++++++++++++++++++++++++++++++++++-- test/test_store.py | 15 ++++++++ 3 files changed, 91 insertions(+), 2 deletions(-) create mode 100644 test/test_store.py diff --git a/curifactory/dbschema.py b/curifactory/dbschema.py index 349ffc0..46d75db 100644 --- a/curifactory/dbschema.py +++ b/curifactory/dbschema.py @@ -25,6 +25,8 @@ # since this is likely to change in later cf versions, I don't want # to bother correctly normalizing this part of the table, since I # don't think there will be need to query on it anyway. + Column("params", String), + Column("workdir_dirty", Boolean), Column("full_store", Boolean), Column("status", String), Column("cli", String), diff --git a/curifactory/store.py b/curifactory/store.py index 9ba5cb4..b6876b3 100644 --- a/curifactory/store.py +++ b/curifactory/store.py @@ -3,10 +3,10 @@ import json import os -from sqlalchemy import create_engine, select +from sqlalchemy import create_engine, func, insert, select from curifactory import utils -from curifactory.dbschema import runs_table +from curifactory.dbschema import metadata_obj, runs_table class SQLStore: @@ -23,6 +23,11 @@ def __init__(self, manager_cache_path: str): self.engine = create_engine(f"sqlite:///{self.path}") + self._ensure_tables() + + def _ensure_tables(self): + metadata_obj.create_all(self.engine) + def get_run(self, ref_name: str) -> tuple[dict, int]: """Get the metadata block for the run with the specified reference name. @@ -53,6 +58,73 @@ def get_run(self, ref_name: str) -> tuple[dict, int]: return None, -1 + def add_run(self, mngr) -> dict: + """Add a new metadata block to the store for the passed ``ArtifactManager`` instance. + + Note that this automatically calls the ``save()`` function. + + Args: + mngr (ArtifactManager): The manager to grab run metadata from. + + Returns: + The newly created dictionary (metadata block) for the current manager's run. + """ + + # get the new run number + with self.engine.connect() as conn: + stmt = select(func.count(runs_table.c.id)).where( + runs_table.c.experiment_name == mngr.experiment_name + ) + result = conn.execute(stmt) + + # TODO: (11/6/2023) none of these should be set here... + run_count = result.all()[0][0] + mngr.experiment_run_number = run_count + 1 + mngr.git_commit_hash = utils.get_current_commit() + mngr.git_workdir_dirty = utils.check_git_dirty_workingdir() + + # insert the new entry + with self.engine.connect() as conn: + stmt = insert(runs_table).values( + reference=mngr.get_reference_name(), + experiment_name=mngr.experiment_name, + run_number=mngr.experiment_run_number, + timestamp=mngr.run_timestamp, + commit=mngr.git_commit_hash, + workdir_dirty=mngr.git_workdir_dirty, + param_files=str(mngr.parameter_files), + params=str(mngr.param_file_param_sets), + full_store=mngr.store_full, + status="incomplete", + cli=mngr.run_line, + hostname=mngr.hostname, + notes=mngr.notes, + ) + conn.execute(stmt) + + # create the metadata block + run_dict = { + "reference": mngr.get_reference_name(), + "experiment_name": mngr.experiment_name, + "run_number": mngr.experiment_run_number, + "timestamp": mngr.get_str_timestamp(), + "commit": mngr.git_commit_hash, + "workdir_dirty": mngr.git_workdir_dirty, + "param_files": mngr.parameter_files, + "params": mngr.param_file_param_sets, + "full_store": mngr.store_full, + "status": "incomplete", + "cli": mngr.run_line, + "hostname": mngr.hostname, + "notes": mngr.notes, + } + + # sanitize reproduction cli command + if mngr.store_full: + run_dict = self._get_reproduction_line(mngr, run_dict) + + return run_dict + class ManagerStore: """Manages the mini database of metadata on previous experiment runs. This is how we diff --git a/test/test_store.py b/test/test_store.py new file mode 100644 index 0000000..1bc72b5 --- /dev/null +++ b/test/test_store.py @@ -0,0 +1,15 @@ +from curifactory.experiment import run_experiment +from curifactory.store import SQLStore + + +def test_add_run(configured_test_manager): + store = SQLStore(configured_test_manager.manager_cache_path) + + results, mngr = run_experiment( + "simple_cache", + ["simple_cache"], + param_set_names=["thing1", "thing2"], + mngr=configured_test_manager, + ) + + store.add_run(mngr) From 8c38d53d32f709afd81e96b067bc82e95d9d0a79 Mon Sep 17 00:00:00 2001 From: "Martindale, Nathan" Date: Mon, 6 Nov 2023 21:38:45 -0500 Subject: [PATCH 5/7] Potentially fix CI issues --- test/conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/conftest.py b/test/conftest.py index ad557a2..da3c5bf 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -73,6 +73,7 @@ def configured_test_manager( ): shutil.rmtree("test/examples/data", ignore_errors=True) shutil.rmtree("test/examples/notebooks", ignore_errors=True) + shutil.rmtree("test/examples/reports", ignore_errors=True) mock = mocker.patch("curifactory.utils.get_configuration") mock.return_value = configuration @@ -80,6 +81,7 @@ def configured_test_manager( shutil.rmtree("test/examples/data", ignore_errors=True) shutil.rmtree("test/examples/notebooks", ignore_errors=True) + shutil.rmtree("test/examples/reports", ignore_errors=True) @pytest.fixture() From dd80c4abb580abdf777994362599da1811382839 Mon Sep 17 00:00:00 2001 From: "Martindale, Nathan" Date: Wed, 8 Nov 2023 20:14:04 -0500 Subject: [PATCH 6/7] Add missing reproduction line function to sql store --- curifactory/store.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/curifactory/store.py b/curifactory/store.py index b6876b3..f45a615 100644 --- a/curifactory/store.py +++ b/curifactory/store.py @@ -125,6 +125,26 @@ def add_run(self, mngr) -> dict: return run_dict + # NOTE: we have to call this both from add_run and update_run because manager stores itself on init, but if + # someone _later_ sets store_full (maybe in a live run) we need to be able to handle this being added to the run_info + def _get_reproduction_line(self, mngr, run: dict) -> dict: + sanitized_run_line = mngr.run_line + if "--overwrite " in sanitized_run_line: + sanitized_run_line = sanitized_run_line.replace("--overwrite ", "") + if sanitized_run_line.endswith("--overwrite"): + sanitized_run_line = sanitized_run_line[:-12] + sanitized_run_line = sanitized_run_line.replace("--store-full ", "") + if sanitized_run_line.endswith("--store-full"): + sanitized_run_line = sanitized_run_line[:-13] + + cache_path = mngr.get_run_output_path() + mngr.reproduction_line = ( + f"{sanitized_run_line} --cache {cache_path}/artifacts --dry-cache" + ) + + run["reproduce"] = mngr.reproduction_line + return run + class ManagerStore: """Manages the mini database of metadata on previous experiment runs. This is how we From 5e81fa5eae7194d4fb3993a0d715ecfbbace06f7 Mon Sep 17 00:00:00 2001 From: "Martindale, Nathan" Date: Wed, 8 Nov 2023 21:11:38 -0500 Subject: [PATCH 7/7] Create basic RunMetadata class This makes it a lot easier to interface with the sql database since we can use dataclasses' asdict, and have a function to sanitize if needed. Also this partially begins to address #65 --- curifactory/dbschema.py | 9 +++-- curifactory/run.py | 74 +++++++++++++++++++++++++++++++++++++++++ curifactory/store.py | 23 ++++++------- 3 files changed, 92 insertions(+), 14 deletions(-) create mode 100644 curifactory/run.py diff --git a/curifactory/dbschema.py b/curifactory/dbschema.py index 46d75db..456f2b6 100644 --- a/curifactory/dbschema.py +++ b/curifactory/dbschema.py @@ -15,8 +15,7 @@ runs_table = Table( "run", metadata_obj, - Column("id", Integer, primary_key=True), - Column("reference", String), + Column("reference", String, primary_key=True), Column("experiment_name", String), Column("run_number", Integer), Column("timestamp", DateTime), @@ -33,6 +32,12 @@ Column("hostname", String), Column("user", String), Column("notes", String), + Column("uncommited_patch", String), + Column("pip_freeze", String), + Column("conda_env", String), + Column("conda_env_full", String), + Column("os", String), + Column("reproduce", String), ) diff --git a/curifactory/run.py b/curifactory/run.py new file mode 100644 index 0000000..06c79be --- /dev/null +++ b/curifactory/run.py @@ -0,0 +1,74 @@ +"""Class for tracking run metadata.""" + +from dataclasses import dataclass +from datetime import datetime + + +@dataclass +class RunMetadata: + """Data structure for tracking all the relevant metadata for a run, + making it easier/providing a consistent interface for accessing + the information and converting it into formats necessary for saving. + """ + + reference: str + """The full reference name of the experiment, usually + ``[experiment_name]_[run_number]_[timestamp]``.""" + experiment_name: str + """The name of the experiment and/or the prefix used for caching.""" + run_number: int + """The run counter for experiments with the given name.""" + timestamp: datetime + """The datetime timestamp for when the manager is initialized (and usually + also when the experiment starts running.)""" + + param_files: list[str] + """The list of parameter file names (minus extension, as they would be + passed into the CLI.)""" + params: dict[str, list[list[str, str]]] + """A dictionary of parameter file names for keys, where each value is an array of arrays, + each inner array containing the parameter set name and the parameter set hash, for the + parameter sets that come from that parameter file. + + e.g. ``{"my_param_file": [ [ "my_param_set", "44b5e428e7165975a3e4f0d1674dbe5f" ] ] }`` + """ + full_store: bool + """Whether this store was being fully exported or not.""" + + commit: str + """The current git commit hash.""" + workdir_dirty: bool + """True if there are uncommited changes in the git repo.""" + uncommited_patch: str + """The output of ``git diff -p`` at runtime, to help more precisely reconstruct current codebase.""" + + status: str + """Ran status: success/incomplete/error/etc.""" + cli: str + """The CLI line this run was created with.""" + reproduce: str + """The translated CLI line to reproduce this run.""" + + hostname: str + """The name of the machine this experiment ran on.""" + user: str + """The name of the user account the experiment was run with.""" + notes: str + """User-entered notes associated with a session/run to output into the report etc.""" + + pip_freeze: str + """The output from a ``pip freeze`` command.""" + conda_env: str + """The output from ``conda env export --from-history``.""" + conda_env_full: str + """The output from ``conda env export``.""" + os: str + """The name of the current OS running curifactory.""" + + def as_sql_safe_dict(self) -> dict: + """Meant to be used when inserting/updating values in the Runs + sql table. + + The targeted column names can be found in dbschema.py. + """ + pass diff --git a/curifactory/store.py b/curifactory/store.py index f45a615..c541bb4 100644 --- a/curifactory/store.py +++ b/curifactory/store.py @@ -12,6 +12,7 @@ class SQLStore: """EXPERIMENTAL, making an sqlite version of the data below.""" + # TODO: (11/8/2023) make this take the full store path instead def __init__(self, manager_cache_path: str): self.path = manager_cache_path """The location to store the ``store.db``.""" @@ -26,9 +27,11 @@ def __init__(self, manager_cache_path: str): self._ensure_tables() def _ensure_tables(self): + """Check for the existence of (and create if necessary) all of the tables + listed in dbscheme.py""" metadata_obj.create_all(self.engine) - def get_run(self, ref_name: str) -> tuple[dict, int]: + def get_run(self, ref_name: str) -> dict: """Get the metadata block for the run with the specified reference name. Args: @@ -41,22 +44,18 @@ def get_run(self, ref_name: str) -> tuple[dict, int]: # https://docs.sqlalchemy.org/en/20/tutorial/data_select.html with self.engine.connect() as conn: - stmt = select(runs_table).where( - runs_table.c.reference == ref_name - ) # TODO: do I need to use prepare? + stmt = select(runs_table).where(runs_table.c.reference == ref_name) + # TODO: do I need to use prepare? result = conn.execute(stmt) # if we didn't get any rows back, this run doesn't exist. if len(result) == 0: - return None, -1 + return None - run = result[ - 0 - ]._asdict() # NOTE: documented function of namedtuple, _ here doesn't imply hidden/not intended for use + run = result[0]._asdict() + # NOTE: documented function of namedtuple, _ here doesn't imply hidden/not intended for use run.param_files = json.loads(run.param_files) - return run, run.id - - return None, -1 + return run def add_run(self, mngr) -> dict: """Add a new metadata block to the store for the passed ``ArtifactManager`` instance. @@ -72,7 +71,7 @@ def add_run(self, mngr) -> dict: # get the new run number with self.engine.connect() as conn: - stmt = select(func.count(runs_table.c.id)).where( + stmt = select(func.count()).where( runs_table.c.experiment_name == mngr.experiment_name ) result = conn.execute(stmt)