diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 1c0d5c1a..76d30044 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -3,8 +3,15 @@ updates: - package-ecosystem: pip directory: "/" schedule: - interval: daily + interval: monthly open-pull-requests-limit: 10 target-branch: master labels: - dependency_updates +- package-ecosystem: github-actions + directory: "/" + schedule: + interval: monthly + target-branch: master + labels: + - CI diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 291a54a8..76f70ebf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,17 +11,22 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 30 steps: - - uses: actions/checkout@v1 + + - uses: actions/checkout@v3 + - name: Set up Python 3.8 - uses: actions/setup-python@v1 + uses: actions/setup-python@v4 with: python-version: 3.8 + cache: 'pip' + cache-dependency-path: | + **/requirements*.txt - name: Install dependencies run: | python -m pip install --upgrade pip pip install -r requirements.txt - pip install --ignore-installed .[test,dev] + pip install .[test,dev] - name: Run pre-commit run: | @@ -44,4 +49,4 @@ jobs: - name: Run tests with pytest run: | # run tests with pytest, reporting coverage and timings - py.test -rs -vvv --durations=0 --cov=./modnet/ + pytest -m "not slow" -rs -vvv --durations=0 --cov=./modnet/ diff --git a/.gitignore b/.gitignore index f5b186f2..35cf3e19 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,12 @@ +### Custom +modnet/data/ +.mypy_cache + ### Python template # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class -modnet/data/ # Distribution / packaging build/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 753957da..6a752f83 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,7 +18,7 @@ repos: - id: check-symlinks - id: end-of-file-fixer - - repo: https://gitlab.com/pycqa/flake8 - rev: '3.9.2' + - repo: 
https://github.com/pycqa/flake8 + rev: '6.0.0' hooks: - id: flake8 diff --git a/README.md b/README.md index a08c34ba..3457a3c0 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # MODNet: Material Optimal Descriptor Network -[![arXiv](https://img.shields.io/badge/arXiv-2004.14766-brightgreen)](https://arxiv.org/abs/2004.14766) [![Build Status](https://img.shields.io/github/workflow/status/ppdebreuck/modnet/Run%20tests?logo=github)](https://github.com/ppdebreuck/modnet/actions?query=branch%3Amaster+) [![Read the Docs](https://img.shields.io/readthedocs/modnet)](https://modnet.readthedocs.io/en/latest/) +[![arXiv](https://img.shields.io/badge/arXiv-2004.14766-brightgreen)](https://arxiv.org/abs/2004.14766) [![Build Status](https://img.shields.io/github/actions/workflow/status/ppdebreuck/modnet/ci.yml?logo=github&branch=main)](https://github.com/ppdebreuck/modnet/actions?query=branch%3Amaster+) [![Read the Docs](https://img.shields.io/readthedocs/modnet)](https://modnet.readthedocs.io/en/latest/) ## Introduction @@ -47,14 +47,6 @@ Activate the environment: conda activate modnet ``` -Then, install pymatgen v2020.8.13 with conda, which will bundle several pre-built dependencies (e.g., numpy, scipy): - -```shell -conda install -c conda-forge pymatgen=2020.8.13 -``` - -(you could alternatively do this step with `pip install pymatgen==2020.8.13`). 
- Finally, install MODNet from PyPI with `pip`: ```shell diff --git a/docs/requirements.txt b/docs/requirements.txt index cfcee751..a77c0214 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,3 @@ -sphinx~=4.4 +sphinx~=5.3 sphinx-rtd-theme~=1.0 sphinxcontrib-napoleon~=0.7 diff --git a/modnet/__init__.py b/modnet/__init__.py index 3cb7d95e..fe087a7d 100644 --- a/modnet/__init__.py +++ b/modnet/__init__.py @@ -1 +1 @@ -__version__ = "0.1.13" +__version__ = "0.2.0~develop" diff --git a/modnet/featurizers/featurizers.py b/modnet/featurizers/featurizers.py index 0835668c..f90e7b6d 100644 --- a/modnet/featurizers/featurizers.py +++ b/modnet/featurizers/featurizers.py @@ -70,7 +70,7 @@ def featurize(self, df: pd.DataFrame) -> pd.DataFrame: Arguments: df: the input dataframe with a `"structure"` column - containing `pymatgen.Structure` objects. + containing pymatgen `Structure` objects. Returns: The featurized DataFrame. @@ -137,7 +137,7 @@ def featurize_composition(self, df: pd.DataFrame) -> pd.DataFrame: Arguments: df: the input dataframe with a `"structure"` column - containing `pymatgen.Structure` objects. + containing pymatgen `Structure` objects. Returns: pandas.DataFrame: the decorated DataFrame, or an empty @@ -184,7 +184,7 @@ def featurize_structure(self, df: pd.DataFrame) -> pd.DataFrame: Arguments: df: the input dataframe with a `"structure"` column - containing `pymatgen.Structure` objects. + containing pymatgen `Structure` objects. Returns: pandas.DataFrame: the decorated DataFrame. @@ -206,7 +206,7 @@ def featurize_site( Arguments: df: the input dataframe with a `"structure"` column - containing `pymatgen.Structure` objects. + containing pymatgen `Structure` objects. aliases: optional dictionary to map matminer output column names to new aliases, mostly used for backwards-compatibility. 
diff --git a/modnet/featurizers/presets/__init__.py b/modnet/featurizers/presets/__init__.py index 26479a66..f1417fb7 100644 --- a/modnet/featurizers/presets/__init__.py +++ b/modnet/featurizers/presets/__init__.py @@ -1,8 +1,20 @@ -__all__ = ("FEATURIZER_PRESETS",) +__all__ = ( + "FEATURIZER_PRESETS", + "DEFAULT_FEATURIZER", + "DEFAULT_COMPOSITION_ONLY_FEATURIZER", +) +from typing import Dict, Type from .debreuck_2020 import DeBreuck2020Featurizer, CompositionOnlyFeaturizer +from .matminer_2023 import Matminer2023Featurizer, CompositionOnlyMatminer2023Featurizer +from modnet.featurizers import MODFeaturizer -FEATURIZER_PRESETS = { +DEFAULT_FEATURIZER: str = "Matminer2023" +DEFAULT_COMPOSITION_ONLY_FEATURIZER: str = "CompositionOnlyMatminer2023" + +FEATURIZER_PRESETS: Dict[str, Type[MODFeaturizer]] = { "DeBreuck2020": DeBreuck2020Featurizer, "CompositionOnly": CompositionOnlyFeaturizer, + "Matminer2023": Matminer2023Featurizer, + "CompositionOnlyMatminer2023": CompositionOnlyMatminer2023Featurizer, } diff --git a/modnet/featurizers/presets/debreuck_2020.py b/modnet/featurizers/presets/debreuck_2020.py index 8e888e3a..dd588f14 100644 --- a/modnet/featurizers/presets/debreuck_2020.py +++ b/modnet/featurizers/presets/debreuck_2020.py @@ -1,26 +1,59 @@ """ This submodule contains the DeBreuck2020Featurizer class. """ import numpy as np -from pymatgen.core.periodic_table import Element -from pymatgen.analysis.local_env import VoronoiNN import modnet.featurizers import contextlib +import warnings class DeBreuck2020Featurizer(modnet.featurizers.MODFeaturizer): - """Featurizer presets used for the paper 'Machine learning - materials properties for small datasets' by Pierre-Paul De Breuck, - Geoffroy Hautier & Gian-Marco Rignanese, arXiv:2004.14766 (2020). 
+ """Featurizer presets used for the paper + + **Materials property prediction for limited datasets enabled + by feature selection and joint learning with MODNet**, + Pierre-Paul De Breuck, Geoffroy Hautier & Gian-Marco Rignanese + npj Comp. Mat. 7(1) 1-8 (2021) + 10.1038/s41524-021-00552-2 Uses most of the featurizers implemented by matminer at the time of writing with their default hyperparameters and presets. """ - def __init__(self, fast_oxid=False): - super().__init__() + package_version_requirements = {"matminer": "==0.6.2"} + + def __init__(self, fast_oxid: bool = False): + """Creates the featurizer and imports all featurizer functions. + + Parameters: + fast_oxid: Whether to use the accelerated oxidation state parameters within + pymatgen when constructing features that constrain oxidation states such + that all sites with the same species in a structure will have the same + oxidation state (recommended if featurizing any structure + with large unit cells). + + """ + import matminer + + if matminer.__version__ != self.package_version_requirements[ + "matminer" + ].replace("==", ""): + warnings.warn( + f"The {self.__class__.__name__} preset was written for and tested only with matminer{self.package_version_requirements['matminer']}.\n" + "Newer versions of matminer will not work, and older versions may not be compatible with newer MODNet versions due to other conflicts.\n" + "To use this featurizer robustly, please install `modnet==0.1.13` with its pinned dependencies.\n\n" + "This preset will now be initialised without importing matminer featurizers to enable use with existing previously featurized data, " + "but attempts to perform further featurization will result in an error." 
+ ) + else: + super().__init__() + self.load_featurizers() + self.fast_oxid = fast_oxid + + def load_featurizers(self): with contextlib.redirect_stdout(None): + from pymatgen.analysis.local_env import VoronoiNN from matminer.featurizers.composition import ( AtomicOrbitals, AtomicPackingEfficiency, @@ -117,13 +150,14 @@ def __init__(self, fast_oxid=False): OPSiteFingerprint(), VoronoiFingerprint(), ) - self.fast_oxid = fast_oxid def featurize_composition(self, df): """Applies the preset composition featurizers to the input dataframe, renames some fields and cleans the output dataframe. """ + from pymatgen.core.periodic_table import Element + df = super().featurize_composition(df) _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4} @@ -151,18 +185,21 @@ def featurize_structure(self, df): df = super().featurize_structure(df) - dist = df["RadialDistributionFunction|radial distribution function"].iloc[0][ - "distances" - ][:50] - for i, d in enumerate(dist): - _rdf_key = "RadialDistributionFunction|radial distribution function|d_{:.2f}".format( - d - ) - df[_rdf_key] = df[ - "RadialDistributionFunction|radial distribution function" - ].apply(lambda x: x["distribution"][i]) + if "RadialDistributionFunction|radial distribution function" in df: + dist = df["RadialDistributionFunction|radial distribution function"].iloc[ + 0 + ]["distances"][:50] + for i, d in enumerate(dist): + _rdf_key = "RadialDistributionFunction|radial distribution function|d_{:.2f}".format( + d + ) + df[_rdf_key] = df[ + "RadialDistributionFunction|radial distribution function" + ].apply(lambda x: x["distribution"][i]) - df = df.drop("RadialDistributionFunction|radial distribution function", axis=1) + df = df.drop( + "RadialDistributionFunction|radial distribution function", axis=1 + ) _crystal_system = { "cubic": 1, @@ -210,6 +247,20 @@ def featurize_site(self, df): class CompositionOnlyFeaturizer(DeBreuck2020Featurizer): + """This subclass simply disables structure and site-level features + from the main 
`DeBreuck2020Featurizer` class. + + **Materials property prediction for limited datasets enabled + by feature selection and joint learning with MODNet** + Pierre-Paul De Breuck, Geoffroy Hautier & Gian-Marco Rignanese + npj Comp. Mat. 7(1) 1-8 (2021) + 10.1038/s41524-021-00552-2 + + Uses most of the featurizers implemented by matminer at the time of + writing with their default hyperparameters and presets. + + """ + def __init__(self): super().__init__() self.oxid_composition_featurizers = () diff --git a/modnet/featurizers/presets/matminer_2023.py b/modnet/featurizers/presets/matminer_2023.py new file mode 100644 index 00000000..14b0f3cf --- /dev/null +++ b/modnet/featurizers/presets/matminer_2023.py @@ -0,0 +1,231 @@ +""" This submodule contains the `Matminer2023Featurizer` class. """ + +import numpy as np +import modnet.featurizers +import contextlib + + +class Matminer2023Featurizer(modnet.featurizers.MODFeaturizer): + """A "kitchen-sink" featurizer for features implemented in matminer + at time of creation (matminer v0.8.0 from late 2022/early 2023). + + Follows the same philosophy and featurizer list as the `DeBreuck2020Featurizer` + but with many features changing their underlying matminer implementation, + definition and behaviour since the creation of the former featurizer. + + """ + + def __init__(self, fast_oxid: bool = False): + """Creates the featurizer and imports all featurizer functions. + + Parameters: + fast_oxid: Whether to use the accelerated oxidation state parameters within + pymatgen when constructing features that constrain oxidation states such + that all sites with the same species in a structure will have the same + oxidation state (recommended if featurizing any structure + with large unit cells). 
+ + """ + + super().__init__() + self.load_featurizers() + self.fast_oxid = fast_oxid + + def load_featurizers(self): + with contextlib.redirect_stdout(None): + from pymatgen.analysis.local_env import VoronoiNN + from matminer.featurizers.composition import ( + AtomicOrbitals, + AtomicPackingEfficiency, + BandCenter, + # CohesiveEnergy, - This descriptor was not used in the paper preset + # ElectronAffinity, - This descriptor was not used in the paper preset + ElectronegativityDiff, + ElementFraction, + ElementProperty, + IonProperty, + Miedema, + OxidationStates, + Stoichiometry, + TMetalFraction, + ValenceOrbital, + YangSolidSolution, + ) + from matminer.featurizers.structure import ( + # BagofBonds, - This descriptor was not used in the paper preset + BondFractions, + ChemicalOrdering, + CoulombMatrix, + DensityFeatures, + EwaldEnergy, + GlobalSymmetryFeatures, + MaximumPackingEfficiency, + # PartialRadialDistributionFunction, + RadialDistributionFunction, + SineCoulombMatrix, + StructuralHeterogeneity, + XRDPowderPattern, + ) + + from matminer.featurizers.site import ( + AGNIFingerprints, + AverageBondAngle, + AverageBondLength, + BondOrientationalParameter, + ChemEnvSiteFingerprint, + CoordinationNumber, + CrystalNNFingerprint, + GaussianSymmFunc, + GeneralizedRadialDistributionFunction, + LocalPropertyDifference, + OPSiteFingerprint, + VoronoiFingerprint, + ) + + self.composition_featurizers = ( + AtomicOrbitals(), + AtomicPackingEfficiency(), + BandCenter(), + ElementFraction(), + ElementProperty.from_preset("magpie"), + IonProperty(), + Miedema(), + Stoichiometry(), + TMetalFraction(), + ValenceOrbital(), + YangSolidSolution(), + ) + + self.oxid_composition_featurizers = ( + ElectronegativityDiff(), + OxidationStates(), + ) + + self.structure_featurizers = ( + DensityFeatures(), + GlobalSymmetryFeatures(), + RadialDistributionFunction(), + CoulombMatrix(), + # PartialRadialDistributionFunction(), + SineCoulombMatrix(), + EwaldEnergy(), + BondFractions(), + 
StructuralHeterogeneity(), + MaximumPackingEfficiency(), + ChemicalOrdering(), + XRDPowderPattern(), + # BagofBonds(), + ) + + # Patch for matminer: see https://github.com/hackingmaterials/matminer/issues/864 + self.structure_featurizers[0].desired_features = None + self.structure_featurizers[1].desired_features = None + + self.site_featurizers = ( + AGNIFingerprints(), + AverageBondAngle(VoronoiNN()), + AverageBondLength(VoronoiNN()), + BondOrientationalParameter(), + ChemEnvSiteFingerprint.from_preset("simple"), + CoordinationNumber(), + CrystalNNFingerprint.from_preset("ops"), + GaussianSymmFunc(), + GeneralizedRadialDistributionFunction.from_preset("gaussian"), + LocalPropertyDifference(), + OPSiteFingerprint(), + VoronoiFingerprint(), + ) + + def featurize_composition(self, df): + """Applies the preset composition featurizers to the input dataframe, + renames some fields and cleans the output dataframe. + + """ + from pymatgen.core.periodic_table import Element + + df = super().featurize_composition(df) + + _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4} + df["AtomicOrbitals|HOMO_character"] = df["AtomicOrbitals|HOMO_character"].map( + _orbitals + ) + df["AtomicOrbitals|LUMO_character"] = df["AtomicOrbitals|LUMO_character"].map( + _orbitals + ) + + df["AtomicOrbitals|HOMO_element"] = df["AtomicOrbitals|HOMO_element"].apply( + lambda x: -1 if not isinstance(x, str) else Element(x).Z + ) + df["AtomicOrbitals|LUMO_element"] = df["AtomicOrbitals|LUMO_element"].apply( + lambda x: -1 if not isinstance(x, str) else Element(x).Z + ) + + return modnet.featurizers.clean_df(df) + + def featurize_structure(self, df): + """Applies the preset structural featurizers to the input dataframe, + renames some fields and cleans the output dataframe. 
+ + """ + + if self.structure_featurizers: + df = super().featurize_structure(df) + + _crystal_system = { + "cubic": 1, + "tetragonal": 2, + "orthorombic": 3, + "hexagonal": 4, + "trigonal": 5, + "monoclinic": 6, + "triclinic": 7, + } + + def _int_map(x): + if x == np.nan: + return 0 + elif x: + return 1 + else: + return 0 + + df["GlobalSymmetryFeatures|crystal_system"] = df[ + "GlobalSymmetryFeatures|crystal_system" + ].map(_crystal_system) + df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[ + "GlobalSymmetryFeatures|is_centrosymmetric" + ].map(_int_map) + + return modnet.featurizers.clean_df(df) + + def featurize_site(self, df): + """Applies the preset site featurizers to the input dataframe, + renames some fields and cleans the output dataframe. + + """ + + # rename some features for backwards compatibility with pretrained models + aliases = { + "GeneralizedRadialDistributionFunction": "GeneralizedRDF", + "AGNIFingerprints": "AGNIFingerPrint", + "BondOrientationalParameter": "BondOrientationParameter", + } + df = super().featurize_site(df, aliases=aliases) + df = df.loc[:, (df != 0).any(axis=0)] + + return modnet.featurizers.clean_df(df) + + +class CompositionOnlyMatminer2023Featurizer(Matminer2023Featurizer): + """This subclass simply disables structure and site-level features + from the main `Matminer2023Featurizer` class. + + This should yield identical results to the original 2020 version. 
+ + """ + + def __init__(self): + super().__init__() + self.oxid_composition_featurizers = () + self.structure_featurizers = () + self.site_featurizers = () diff --git a/modnet/preprocessing.py b/modnet/preprocessing.py index 2fb9c686..bdf3bb88 100644 --- a/modnet/preprocessing.py +++ b/modnet/preprocessing.py @@ -13,7 +13,7 @@ from typing import Dict, List, Union, Optional, Callable, Hashable, Iterable, Tuple from functools import partial -from pymatgen import Structure, Composition +from pymatgen.core import Structure, Composition from sklearn.feature_selection import mutual_info_regression, mutual_info_classif from sklearn.utils import resample @@ -28,6 +28,7 @@ from modnet import __version__ from modnet.utils import LOG + DATABASE = pd.DataFrame([]) @@ -539,14 +540,14 @@ def merge_ranked(lists: List[List[Hashable]]) -> List[Hashable]: class MODData: - """The MODData class takes takes a list of `pymatgen.Structure` + """The MODData class takes a list of pymatgen `Structure` objects and creates a `pandas.DataFrame` that contains many matminer features per structure. It then uses mutual information between features and targets, and between the features themselves, to perform feature selection using relevance-redundancy indices. Attributes: - df_structure (pd.DataFrame): dataframe storing the `pymatgen.Structure` + df_structure (pd.DataFrame): dataframe storing the pymatgen `Structure` representations for each structured, indexed by ID. df_targets (pd.Dataframe): dataframe storing the prediction targets per structure, indexed by ID. 
@@ -600,7 +601,11 @@ def __init__( """ - from modnet.featurizers.presets import FEATURIZER_PRESETS + from modnet.featurizers.presets import ( + FEATURIZER_PRESETS, + DEFAULT_FEATURIZER, + DEFAULT_COMPOSITION_ONLY_FEATURIZER, + ) self.__modnet_version__ = __version__ self.df_featurized = df_featurized @@ -644,9 +649,11 @@ def __init__( self.featurizer = featurizer elif featurizer is None and self.df_featurized is None: if getattr(self, "_composition_only", False): - self.featurizer = FEATURIZER_PRESETS["CompositionOnly"]() + self.featurizer = FEATURIZER_PRESETS[ + DEFAULT_COMPOSITION_ONLY_FEATURIZER + ]() else: - self.featurizer = FEATURIZER_PRESETS["DeBreuck2020"]() + self.featurizer = FEATURIZER_PRESETS[DEFAULT_FEATURIZER]() if self.featurizer is not None: LOG.info(f"Loaded {self.featurizer.__class__.__name__} featurizer.") @@ -906,12 +913,12 @@ def rebalance(self): @property def structures(self) -> List[Union[Structure, CompositionContainer]]: - """Returns the list of `pymatgen.Structure` objects.""" + """Returns the list of pymatgen `Structure` objects.""" return list(self.df_structure["structure"]) @property def compositions(self) -> List[Union[Structure, CompositionContainer]]: - """Returns the list of materials as`pymatgen.Composition` objects.""" + """Returns the list of materials as pymatgen `Composition` objects.""" return [s.composition for s in self.df_structure["structure"]] @property diff --git a/modnet/tests/conftest.py b/modnet/tests/conftest.py index d855b78c..d5061c90 100644 --- a/modnet/tests/conftest.py +++ b/modnet/tests/conftest.py @@ -9,14 +9,22 @@ "d7d75e646dbde539645c8c0b065fd82cbe93f81d3500809655bd13d0acf2027c" "1786091a73f53985b08868c5be431a3c700f7f1776002df28ebf3a12a79ab1a1" ), - "MP_2018.6_small.zip": ( + "MP_2018.6_small_2020.zip": ( "0efc2ce998faaadc9cf54a25e1db80834c5f53b1298da0e824ee2675124f47c8" "3fce2a86971a92eb3d0a860d29e0eb37683aa47ec80af2b6c8dee879584b1491" ), - "MP_2018.6_small_composition.zip": ( + "MP_2018.6_small_2023.zip": 
( + "47e3f34fe31679575b3143ea410d61d73158794018bdb09d362672fd3e048bbc" + "b2d7967bceb664918b1af77e2e6c853f3ae642a42cd8ce47db304c5612cb3aeb" + ), + "MP_2018.6_small_composition_2020.zip": ( "59f8c4e546df005799e3fb7a1e64daa0edfece48fa346ab0d2efe92aa107d0d1" "b14bb16f56bfe3f54e5a9020d088a268536f6ad86134e264ed7547b4fd583c79" ), + "MP_2018.6_small_composition_2023.zip": ( + "519e6bc8c2f7277e8f9d9f8e99d4def3fc088de1978857bfaef2aa0ff2db873e" + "c5c4bb3beeda58f1508d2ea06a98aa4743b80f991fe25007fd8f0bfa11d92edd" + ), } @@ -46,21 +54,43 @@ def subset_moddata(): @pytest.fixture(scope="function") -def small_moddata(): +def small_moddata_2023(): + """Loads the small 5-structure featurized subset of MP.2018.6 for use + in other tests, checking only the hash, updated for 2023. + + """ + return _load_moddata("MP_2018.6_small_2023.zip") + + +small_moddata = small_moddata_2023 +"""Alias for new data.""" + + +@pytest.fixture(scope="function") +def small_moddata_composition_2023(): + """Loads the small 5-structure featurized subset of MP.2018.6 composition only for use + in other tests, checking only the hash, updated for 2023. + + """ + return _load_moddata("MP_2018.6_small_composition_2023.zip") + + +@pytest.fixture(scope="function") +def small_moddata_2020(): """Loads the small 5-structure featurized subset of MP.2018.6 for use in other tests, checking only the hash. """ - return _load_moddata("MP_2018.6_small.zip") + return _load_moddata("MP_2018.6_small_2020.zip") @pytest.fixture(scope="function") -def small_moddata_composition(): +def small_moddata_composition_2020(): """Loads the small 5-structure featurized subset of MP.2018.6 composition only for use in other tests, checking only the hash. 
""" - return _load_moddata("MP_2018.6_small_composition.zip") + return _load_moddata("MP_2018.6_small_composition_2020.zip") @pytest.fixture(scope="module") diff --git a/modnet/tests/data/MP_2018.6_small.zip b/modnet/tests/data/MP_2018.6_small_2020.zip similarity index 100% rename from modnet/tests/data/MP_2018.6_small.zip rename to modnet/tests/data/MP_2018.6_small_2020.zip diff --git a/modnet/tests/data/MP_2018.6_small_2023.zip b/modnet/tests/data/MP_2018.6_small_2023.zip new file mode 100644 index 00000000..34586f51 Binary files /dev/null and b/modnet/tests/data/MP_2018.6_small_2023.zip differ diff --git a/modnet/tests/data/MP_2018.6_small_composition.zip b/modnet/tests/data/MP_2018.6_small_composition_2020.zip similarity index 100% rename from modnet/tests/data/MP_2018.6_small_composition.zip rename to modnet/tests/data/MP_2018.6_small_composition_2020.zip diff --git a/modnet/tests/data/MP_2018.6_small_composition_2023.zip b/modnet/tests/data/MP_2018.6_small_composition_2023.zip new file mode 100644 index 00000000..c7a111d6 Binary files /dev/null and b/modnet/tests/data/MP_2018.6_small_composition_2023.zip differ diff --git a/modnet/tests/test_benchmark.py b/modnet/tests/test_benchmark.py index 8ac6fe58..5492a61b 100644 --- a/modnet/tests/test_benchmark.py +++ b/modnet/tests/test_benchmark.py @@ -2,6 +2,7 @@ import pytest +@pytest.mark.slow def test_train_small_model_benchmark(small_moddata, tf_session): """Tests the `matbench_benchmark()` method with optional arguments.""" from modnet.matbench.benchmark import matbench_benchmark @@ -81,6 +82,7 @@ def test_train_small_ensemblemodel_benchmark(small_moddata, tf_session): assert all(len(results[key]) == 5 for key in expected_keys) +@pytest.mark.slow def test_train_small_model_benchmark_with_extra_args(small_moddata): """Tests the `matbench_benchmark()` method with some extra settings, parallelised over 2 jobs. 
@@ -130,6 +132,7 @@ def test_train_small_model_benchmark_with_extra_args(small_moddata): assert all(len(results[key]) == 5 for key in expected_keys) +@pytest.mark.slow def test_ga_benchmark(small_moddata, tf_session): """Tests the `matbench_benchmark()` method with the GA strategy.""" from modnet.matbench.benchmark import matbench_benchmark @@ -156,7 +159,7 @@ def test_ga_benchmark(small_moddata, tf_session): "refit": False, }, fast=True, - n_jobs=2, + n_jobs=1, ) expected_keys = ( diff --git a/modnet/tests/test_model.py b/modnet/tests/test_model.py index 73f58c30..971bc2a8 100644 --- a/modnet/tests/test_model.py +++ b/modnet/tests/test_model.py @@ -19,7 +19,7 @@ def test_train_small_model_single_target(subset_moddata, tf_session): n_feat=10, ) - model.fit(data, epochs=5) + model.fit(data, epochs=2) model.predict(data) @@ -48,7 +48,7 @@ def is_metal(egap): n_feat=10, ) - model.fit(data, epochs=5) + model.fit(data, epochs=2) def test_train_small_model_multi_target(subset_moddata, tf_session): @@ -68,10 +68,11 @@ def test_train_small_model_multi_target(subset_moddata, tf_session): n_feat=10, ) - model.fit(data, epochs=5) + model.fit(data, epochs=2) model.predict(data) +@pytest.mark.slow def test_train_small_model_presets(subset_moddata, tf_session): """Tests the `fit_preset()` method.""" from modnet.model_presets import gen_presets @@ -80,7 +81,7 @@ def test_train_small_model_presets(subset_moddata, tf_session): modified_presets = gen_presets(100, 100)[:2] for ind, preset in enumerate(modified_presets): - modified_presets[ind]["epochs"] = 5 + modified_presets[ind]["epochs"] = 2 data = subset_moddata # set 'optimal' features manually @@ -127,7 +128,7 @@ def test_model_integration(subset_moddata, tf_session): n_feat=10, ) - model.fit(data, epochs=5) + model.fit(data, epochs=2) model.save("test") loaded_model = MODNetModel.load("test") @@ -152,7 +153,7 @@ def test_train_small_bayesian_single_target(subset_moddata, tf_session): n_feat=10, ) - model.fit(data, epochs=5) + 
model.fit(data, epochs=2) model.predict(data) model.predict(data, return_unc=True) @@ -182,7 +183,7 @@ def is_metal(egap): n_feat=10, ) - model.fit(data, epochs=5) + model.fit(data, epochs=2) model.predict(data) model.predict(data, return_unc=True) @@ -204,7 +205,7 @@ def test_train_small_bayesian_multi_target(subset_moddata, tf_session): n_feat=10, ) - model.fit(data, epochs=5) + model.fit(data, epochs=2) model.predict(data) model.predict(data, return_unc=True) @@ -228,7 +229,7 @@ def test_train_small_bootstrap_single_target(subset_moddata, tf_session): bootstrap=True, ) - model.fit(data, epochs=5) + model.fit(data, epochs=2) model.predict(data) model.predict(data, return_unc=True) @@ -260,7 +261,7 @@ def is_metal(egap): bootstrap=True, ) - model.fit(data, epochs=5) + model.fit(data, epochs=2) model.predict(data) model.predict(data, return_unc=True) @@ -284,16 +285,18 @@ def test_train_small_bootstrap_multi_target(small_moddata, tf_session): bootstrap=True, ) - model.fit(data, epochs=5) + model.fit(data, epochs=2) model.predict(data, return_unc=True) @pytest.mark.slow def test_train_small_bootstrap_presets(small_moddata, tf_session): """Tests the `fit_preset()` method.""" + import time from modnet.model_presets import gen_presets from modnet.models import EnsembleMODNetModel + start = time.time() modified_presets = gen_presets(100, 100)[:2] for ind, preset in enumerate(modified_presets): @@ -313,6 +316,7 @@ def test_train_small_bootstrap_presets(small_moddata, tf_session): n_models=2, bootstrap=True, ) + print(f"{time.time() - start} elapsed after model creation.") # nested=0/False -> no inner loop, so only 1 model # nested=1/True -> inner loop, but default n_folds so 5 @@ -324,6 +328,7 @@ def test_train_small_bootstrap_presets(small_moddata, tf_session): val_fraction=0.2, n_jobs=2, ) + print(f"{time.time() - start} elapsed nested {num_nested}, {nested_option}") models = results[0] assert len(models) == len(modified_presets) assert len(models[0]) == num_nested 
diff --git a/modnet/tests/test_preprocessing.py b/modnet/tests/test_preprocessing.py index 41ff27ce..16fafdb1 100644 --- a/modnet/tests/test_preprocessing.py +++ b/modnet/tests/test_preprocessing.py @@ -291,15 +291,19 @@ def test_load_moddata_zip(subset_moddata): assert len(data.df_targets) == 100 -def test_small_moddata_featurization(small_moddata): +def test_small_moddata_featurization(small_moddata_2023): """This test creates a new MODData from the MP 2018.6 structures.""" - old = small_moddata + from modnet.featurizers.presets import Matminer2023Featurizer + + old = small_moddata_2023 structures = old.structures targets = old.targets names = old.names - new = MODData(structures, targets, target_names=names) + new = MODData( + structures, targets, target_names=names, featurizer=Matminer2023Featurizer() + ) new.featurize(fast=False, n_jobs=1) new_cols = sorted(new.df_featurized.columns.tolist()) @@ -312,22 +316,31 @@ def test_small_moddata_featurization(small_moddata): # assert relative error below 3 percent for col in new.df_featurized.columns: - assert ( - np.absolute( - (new.df_featurized[col].to_numpy() - old.df_featurized[col].to_numpy()) - / (old.df_featurized[col].to_numpy() + 1e-6) - ).max() - < 0.03 - ) + if col in old.df_featurized.columns: + assert ( + np.max( + np.absolute( + ( + new.df_featurized[col].to_numpy() + - old.df_featurized[col].to_numpy() + ) + / (old.df_featurized[col].to_numpy() + 1e-6) + ) + ) + < 0.03 + ) -def test_small_moddata_composition_featurization(small_moddata_composition): +def test_small_moddata_composition_featurization(small_moddata_composition_2023): """This test creates a new MODData from the MP 2018.6 structures.""" + from modnet.featurizers.presets import CompositionOnlyMatminer2023Featurizer - reference = small_moddata_composition + reference = small_moddata_composition_2023 compositions = reference.compositions - new = MODData(materials=compositions) + new = MODData( + materials=compositions, 
featurizer=CompositionOnlyMatminer2023Featurizer() + ) new.featurize(fast=False, n_jobs=1) new_cols = sorted(new.df_featurized.columns.tolist()) @@ -351,7 +364,7 @@ def test_small_moddata_composition_featurization(small_moddata_composition): ) -def test_small_moddata_feature_selection_classif(small_moddata): +def test_small_moddata_feature_selection_classif(small_moddata_2023): """This test creates classifier MODData and test the feature selection method""" x1 = np.array([0] * 500 + [1] * 500 + [2] * 500, dtype="float") @@ -466,13 +479,13 @@ def test_moddata_splits(subset_moddata): break -def test_precomputed_cross_nmi(small_moddata): +def test_precomputed_cross_nmi(small_moddata_2020): new = MODData( - materials=small_moddata.structures, - targets=small_moddata.targets, - target_names=small_moddata.names, - df_featurized=small_moddata.df_featurized, + materials=small_moddata_2020.structures, + targets=small_moddata_2020.targets, + target_names=small_moddata_2020.names, + df_featurized=small_moddata_2020.df_featurized, ) new.feature_selection(5, use_precomputed_cross_nmi=True) diff --git a/requirements.txt b/requirements.txt index 8b477f81..10b00d63 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ tensorflow==2.10.0 tensorflow-probability==0.18.0 -pandas==1.5.0 -pymatgen==2020.8.13 -matminer==0.6.5 -numpy==1.23.4 -scikit-learn==0.23.2 +pandas==1.5.2 +pymatgen==2022.9.21 +matminer==0.8.0 +numpy==1.23.5 +scikit-learn==1.2.0 diff --git a/setup.py b/setup.py index 45d311d3..285ddfbd 100644 --- a/setup.py +++ b/setup.py @@ -12,11 +12,8 @@ version = re.search('__version__ = "(.*)"', lines).group(1) -tests_require = ["pytest>=6.0", "pytest-cov>=2.10", "flake8>=3.8"] - -dev_require = [ - "pre-commit~=2.11", -] +tests_require = ("pytest>=6.0", "pytest-cov>=2.10", "flake8>=3.8") +dev_require = ("pre-commit~=2.11",) setuptools.setup( name="modnet", @@ -34,13 +31,13 @@ include_package_data=True, packages=setuptools.find_packages(), install_requires=[ - 
"pandas>=0.25.3", - "tensorflow>=2.4", - "tensorflow-probability>=0.12", - "pymatgen>=2020,<2020.9", - "matminer>=0.6.2", - "numpy>=1.18.3", - "scikit-learn>=0.23,<0.24", + "pandas~=1.5", + "tensorflow~=2.10", + "tensorflow-probability~=0.18", + "pymatgen~=2022.9", + "matminer~=0.8", + "numpy~=1.23", + "scikit-learn~=1.1", ], tests_require=tests_require, test_suite="modnet.tests",