From 5a8971cb0d6e12e444bf842e69e5aee4a60ba195 Mon Sep 17 00:00:00 2001 From: Jack Sundberg Date: Sat, 16 Apr 2022 19:44:42 -0400 Subject: [PATCH 1/3] add partial site stats fp --- matminer/featurizers/structure/sites.py | 174 ++++++++++++++++++++++-- 1 file changed, 159 insertions(+), 15 deletions(-) diff --git a/matminer/featurizers/structure/sites.py b/matminer/featurizers/structure/sites.py index 57f9f04fd..6911fc3bc 100644 --- a/matminer/featurizers/structure/sites.py +++ b/matminer/featurizers/structure/sites.py @@ -2,8 +2,10 @@ Structure featurizers based on aggregating site features. """ +import itertools import numpy as np from pymatgen.analysis.local_env import VoronoiNN +from pymatgen.core.periodic_table import Element, Specie from matminer.featurizers.base import BaseFeaturizer from matminer.featurizers.site import ( @@ -147,8 +149,8 @@ def citations(self): def implementors(self): return ["Nils E. R. Zimmermann", "Alireza Faghaninia", "Anubhav Jain", "Logan Ward", "Alex Dunn"] - @staticmethod - def from_preset(preset, **kwargs): + @classmethod + def from_preset(cls, preset, **kwargs): """ Create a SiteStatsFingerprint class according to a preset @@ -158,37 +160,37 @@ def from_preset(preset, **kwargs): """ if preset == "SOAP_formation_energy": - return SiteStatsFingerprint(SOAP.from_preset("formation_energy"), **kwargs) + return cls(SOAP.from_preset("formation_energy"), **kwargs) elif preset == "CrystalNNFingerprint_cn": - return SiteStatsFingerprint(CrystalNNFingerprint.from_preset("cn", cation_anion=False), **kwargs) + return cls(CrystalNNFingerprint.from_preset("cn", cation_anion=False), **kwargs) elif preset == "CrystalNNFingerprint_cn_cation_anion": - return SiteStatsFingerprint(CrystalNNFingerprint.from_preset("cn", cation_anion=True), **kwargs) + return cls(CrystalNNFingerprint.from_preset("cn", cation_anion=True), **kwargs) elif preset == "CrystalNNFingerprint_ops": - return SiteStatsFingerprint(CrystalNNFingerprint.from_preset("ops", cation_anion=False), **kwargs) + return cls(CrystalNNFingerprint.from_preset("ops", cation_anion=False), **kwargs) elif preset == "CrystalNNFingerprint_ops_cation_anion": - return SiteStatsFingerprint(CrystalNNFingerprint.from_preset("ops", cation_anion=True), **kwargs) + return cls(CrystalNNFingerprint.from_preset("ops", cation_anion=True), **kwargs) elif preset == "OPSiteFingerprint": - return SiteStatsFingerprint(OPSiteFingerprint(), **kwargs) + return cls(OPSiteFingerprint(), **kwargs) elif preset == "LocalPropertyDifference_ward-prb-2017": - return SiteStatsFingerprint( + return cls( LocalPropertyDifference.from_preset("ward-prb-2017"), stats=["minimum", "maximum", "range", "mean", "avg_dev"], ) elif preset == "CoordinationNumber_ward-prb-2017": - return SiteStatsFingerprint( + return cls( CoordinationNumber(nn=VoronoiNN(weight="area"), use_weights="effective"), stats=["minimum", "maximum", "range", "mean", "avg_dev"], ) elif preset == "Composition-dejong2016_AD": - return SiteStatsFingerprint( + return cls( LocalPropertyDifference( properties=[ "Number", @@ -204,7 +206,7 @@ def from_preset(preset, **kwargs): ) elif preset == "Composition-dejong2016_SD": - return SiteStatsFingerprint( + return cls( LocalPropertyDifference( properties=[ "Number", @@ -220,13 +222,13 @@ def from_preset(preset, **kwargs): ) elif preset == "BondLength-dejong2016": - return SiteStatsFingerprint( + return cls( AverageBondLength(VoronoiNN()), stats=["holder_mean::%d" % d for d in range(-4, 4 + 1)] + ["std_dev", "geom_std_dev"], ) elif preset == "BondAngle-dejong2016": - return SiteStatsFingerprint( + return cls( AverageBondAngle(VoronoiNN()), stats=["holder_mean::%d" % d for d in range(-4, 4 + 1)] + ["std_dev", "geom_std_dev"], ) @@ -236,8 +238,150 @@ def from_preset(preset, **kwargs): # One of the various Coordination Number presets: # MinimumVIRENN, MinimumDistanceNN, JmolNN, VoronoiNN, etc. try: - return SiteStatsFingerprint(CoordinationNumber.from_preset(preset), **kwargs) + return cls(CoordinationNumber.from_preset(preset), **kwargs) except Exception: pass raise ValueError("Unrecognized preset!") + + +class PartialsSiteStatsFingerprint(SiteStatsFingerprint): + """ + Computes statistics of properties across all sites in a structure, and + breaks these down by element. This featurizer first uses a site featurizer + class (see site.py for options) to compute features of each site of a + specific element in a structure, and then computes features of the entire + structure by measuring statistics of each attribute. + Features: + - Returns each statistic of each site feature, broken down by element + """ + + def __init__( + self, + site_featurizer, + stats=("mean", "std_dev"), + min_oxi=None, + max_oxi=None, + covariance=False, + include_elems=(), + exclude_elems=(), + ): + """ + Args: + site_featurizer (BaseFeaturizer): a site-based featurizer + stats ([str]): list of weighted statistics to compute for each feature. + If stats is None, a list is returned for each features + that contains the calculated feature for each site in the + structure. + *Note for nth mode, stat must be 'n*_mode'; e.g. stat='2nd_mode' + min_oxi (int): minimum site oxidation state for inclusion (e.g., + zero means metals/cations only) + max_oxi (int): maximum site oxidation state for inclusion + covariance (bool): Whether to compute the covariance of site features + """ + + self.include_elems = list(include_elems) # Makes sure the element lists are ordered + self.exclude_elems = list(exclude_elems) + super().__init__(site_featurizer, stats, min_oxi, max_oxi, covariance) + + def fit(self, X, y=None): + """Define the list of elements to be included in the PRDF. By default, + the PRDF will include all of the elements in `X` + Args: + X: (numpy array nx1) structures used in the training set. Each entry + must be Pymatgen Structure objects. + y: *Not used* + fit_kwargs: *not used* + Returns: + self + """ + + # This method largely copies code from the partial-RDF fingerprint + + # Initialize list with included elements + elements = {Element(e) for e in self.include_elems} + + # Get all of elements that appaer + for strc in X: + elements.update([e.element if isinstance(e, Specie) else e for e in strc.composition.keys()]) + + # Remove the elements excluded by the user + elements.difference_update([Element(e) for e in self.exclude_elems]) + + # Store the elements + self.elements_ = [e.symbol for e in sorted(elements)] + + return self + + def featurize(self, s): + """ + Get PSSF of the input structure. + Args: + s: Pymatgen Structure object. + Returns: + pssf: 1D array of each element's ssf + """ + + if not s.is_ordered: + raise ValueError("Disordered structure support not built yet") + if self.elements_ is None: + raise Exception("You must run 'fit' first!") + + output = [] + for e in self.elements_: + pssf_stats = self.compute_pssf(s, e) + output.append(pssf_stats) + + return np.hstack(output) + + def compute_pssf(self, s, e): + + # This code is extremely similar to super().featurize(). The key + # difference is that only one specific element is analyzed. + + # Get each feature for each site + vals = [[] for t in self._site_labels] + for i, site in enumerate(s.sites): + if site.specie.symbol == e: + opvalstmp = self.site_featurizer.featurize(s, i) + for j, opval in enumerate(opvalstmp): + if opval is None: + vals[j].append(0.0) + else: + vals[j].append(opval) + + # If the user does not request statistics, return the site features now + if self.stats is None: + return vals + + # Compute the requested statistics + stats = [] + for op in vals: + for stat in self.stats: + stats.append(PropertyStats().calc_stat(op, stat)) + + # If desired, compute covariances + if self.covariance: + if len(s) == 1: + stats.extend([0] * int(len(vals) * (len(vals) - 1) / 2)) + else: + covar = np.cov(vals) + tri_ind = np.triu_indices(len(vals), 1) + stats.extend(covar[tri_ind].tolist()) + + return stats + + def feature_labels(self): + if self.elements_ is None: + raise Exception("You must run 'fit' first!") + + labels = [] + for e in self.elements_: + e_labels = [f"{e} {l}" for l in super().feature_labels()] + for l in e_labels: + labels.append(l) + + return labels + + def implementors(self): + return ["Jack Sundberg"] From 45afff749f829e95b8f9ca1507829d50098c3e8c Mon Sep 17 00:00:00 2001 From: jacksund Date: Thu, 28 Apr 2022 12:06:20 -0400 Subject: [PATCH 2/3] add unittest for partialsitestats --- matminer/featurizers/structure/sites.py | 31 +++-- .../featurizers/structure/tests/test_sites.py | 114 +++++++++++++++++- 2 files changed, 127 insertions(+), 18 deletions(-) diff --git a/matminer/featurizers/structure/sites.py b/matminer/featurizers/structure/sites.py index 6911fc3bc..e91332f98 100644 --- a/matminer/featurizers/structure/sites.py +++ b/matminer/featurizers/structure/sites.py @@ -279,8 +279,8 @@ def __init__( max_oxi (int): maximum site oxidation state for inclusion covariance (bool): Whether to compute the covariance of site features """ - - self.include_elems = list(include_elems) # Makes sure the element lists are ordered + + self.include_elems = list(include_elems) self.exclude_elems = list(exclude_elems) super().__init__(site_featurizer, stats, min_oxi, max_oxi, covariance) @@ -292,27 +292,24 @@ def fit(self, X, y=None): must be Pymatgen Structure objects. y: *Not used* fit_kwargs: *not used* - Returns: - self """ - + # This method largely copies code from the partial-RDF fingerprint # Initialize list with included elements - elements = {Element(e) for e in self.include_elems} - - # Get all of elements that appaer - for strc in X: - elements.update([e.element if isinstance(e, Specie) else e for e in strc.composition.keys()]) + elements = [Element(e) for e in self.include_elems] - # Remove the elements excluded by the user - elements.difference_update([Element(e) for e in self.exclude_elems]) + # Get all of elements that appear + for structure in X: + for element in structure.composition.elements: + if isinstance(element, Specie): + element = element.element # converts from Specie to Element object + if element not in elements and element.name not in self.exclude_elems: + elements.append(element) # Store the elements self.elements_ = [e.symbol for e in sorted(elements)] - return self - def featurize(self, s): """ Get PSSF of the input structure. @@ -335,8 +332,8 @@ def featurize(self, s): return np.hstack(output) def compute_pssf(self, s, e): - - # This code is extremely similar to super().featurize(). The key + + # This code is extremely similar to super().featurize(). The key # difference is that only one specific element is analyzed. # Get each feature for each site @@ -374,7 +371,7 @@ def compute_pssf(self, s, e): def feature_labels(self): if self.elements_ is None: raise Exception("You must run 'fit' first!") - + labels = [] for e in self.elements_: e_labels = [f"{e} {l}" for l in super().feature_labels()] diff --git a/matminer/featurizers/structure/tests/test_sites.py b/matminer/featurizers/structure/tests/test_sites.py index f34ff7341..0e81f1954 100644 --- a/matminer/featurizers/structure/tests/test_sites.py +++ b/matminer/featurizers/structure/tests/test_sites.py @@ -3,7 +3,10 @@ import numpy as np from matminer.featurizers.site import SiteElementalProperty -from matminer.featurizers.structure.sites import SiteStatsFingerprint +from matminer.featurizers.structure.sites import ( + SiteStatsFingerprint, + PartialsSiteStatsFingerprint, +) from matminer.featurizers.structure.tests.base import StructureFeaturesTest @@ -111,5 +114,114 @@ def test_ward_prb_2017_efftcn(self): self.assertArrayAlmostEqual([12, 12, 0, 12, 0], features) +class PartialStructureSitesFeaturesTest(StructureFeaturesTest): + def test_partialsitestatsfingerprint(self): + # Test matrix. + op_struct_fp = PartialsSiteStatsFingerprint.from_preset("OPSiteFingerprint", stats=None) + op_struct_fp.fit([self.diamond]) + opvals = op_struct_fp.featurize(self.diamond) + _ = op_struct_fp.feature_labels() + self.assertAlmostEqual(opvals[10][0], 0.9995, places=7) + self.assertAlmostEqual(opvals[10][1], 0.9995, places=7) + opvals = op_struct_fp.featurize(self.nacl) + self.assertAlmostEqual(opvals[18][0], 0.9995, places=7) + self.assertAlmostEqual(opvals[18][1], 0.9995, places=7) + opvals = op_struct_fp.featurize(self.cscl) + self.assertAlmostEqual(opvals[22][0], 0.9995, places=7) + self.assertAlmostEqual(opvals[22][1], 0.9995, places=7) + + # Test stats. + op_struct_fp = PartialsSiteStatsFingerprint.from_preset("OPSiteFingerprint") + op_struct_fp.fit([self.diamond]) + opvals = op_struct_fp.featurize(self.diamond) + self.assertAlmostEqual(opvals[0], 0.0005, places=7) + self.assertAlmostEqual(opvals[1], 0, places=7) + self.assertAlmostEqual(opvals[2], 0.0005, places=7) + self.assertAlmostEqual(opvals[3], 0.0, places=7) + self.assertAlmostEqual(opvals[4], 0.0005, places=7) + self.assertAlmostEqual(opvals[18], 0.0805, places=7) + self.assertAlmostEqual(opvals[20], 0.9995, places=7) + self.assertAlmostEqual(opvals[21], 0, places=7) + self.assertAlmostEqual(opvals[22], 0.0075, places=7) + self.assertAlmostEqual(opvals[24], 0.2355, places=7) + self.assertAlmostEqual(opvals[-1], 0.0, places=7) + + # Test coordination number + cn_fp = PartialsSiteStatsFingerprint.from_preset("JmolNN", stats=("mean",)) + cn_fp.fit([self.diamond]) + cn_vals = cn_fp.featurize(self.diamond) + self.assertEqual(cn_vals[0], 4.0) + + # Test the covariance + prop_fp = PartialsSiteStatsFingerprint( + SiteElementalProperty(properties=["Number", "AtomicWeight"]), + stats=["mean"], + covariance=True, + ) + prop_fp.fit([self.diamond]) + + # Test the feature labels + labels = prop_fp.feature_labels() + self.assertEqual(3, len(labels)) + + # Test a structure with all the same type (cov should be zero) + features = prop_fp.featurize(self.diamond) + self.assertArrayAlmostEqual(features, [6, 12.0107, 0]) + + # Test a structure with only one atom (cov should be zero too) + features = prop_fp.featurize(self.sc) + self.assertArrayAlmostEqual([13, 26.9815386, 0], features) + + # Test a structure with nonzero covariance + features = prop_fp.featurize(self.nacl) + self.assertArrayAlmostEqual([14, 29.22138464, 37.38969216], features) + + # Test soap site featurizer + soap_fp = PartialsSiteStatsFingerprint.from_preset("SOAP_formation_energy") + soap_fp.fit([self.sc, self.diamond, self.nacl]) + feats = soap_fp.featurize(self.diamond) + self.assertEqual(len(feats), 9504) + self.assertAlmostEqual(feats[0], 0.4412608, places=5) + self.assertAlmostEqual(feats[1], 0.0) + self.assertAlmostEqual(np.sum(feats), 207.88194724, places=5) + + def test_ward_prb_2017_lpd(self): + """Test the local property difference attributes from Ward 2017""" + f = PartialsSiteStatsFingerprint.from_preset("LocalPropertyDifference_ward-prb-2017") + f.fit([self.diamond]) + + # Test diamond + features = f.featurize(self.diamond) + self.assertArrayAlmostEqual(features, [0] * (22 * 5)) + features = f.featurize(self.diamond_no_oxi) + self.assertArrayAlmostEqual(features, [0] * (22 * 5)) + + # Test CsCl + big_face_area = np.sqrt(3) * 3 / 2 * (2 / 4 / 4) + small_face_area = 0.125 + big_face_diff = 55 - 17 + features = f.featurize(self.cscl) + labels = f.feature_labels() + my_label = "mean local difference in Number" + self.assertAlmostEqual( + (8 * big_face_area * big_face_diff) / (8 * big_face_area + 6 * small_face_area), + features[labels.index(my_label)], + places=3, + ) + my_label = "range local difference in Electronegativity" + self.assertAlmostEqual(0, features[labels.index(my_label)], places=3) + + def test_ward_prb_2017_efftcn(self): + """Test the effective coordination number attributes of Ward 2017""" + f = PartialsSiteStatsFingerprint.from_preset("CoordinationNumber_ward-prb-2017") + + # Test Ni3Al + features = f.featurize(self.ni3al) + labels = f.feature_labels() + my_label = "mean CN_VoronoiNN" + self.assertAlmostEqual(12, features[labels.index(my_label)]) + self.assertArrayAlmostEqual([12, 12, 0, 12, 0], features) + + if __name__ == "__main__": unittest.main() From b3e29548beb07ef2bbd9e501eb4ae4014e475ba3 Mon Sep 17 00:00:00 2001 From: Jack Sundberg Date: Mon, 20 Jun 2022 12:35:11 -0400 Subject: [PATCH 3/3] fix psitestats tests --- matminer/featurizers/structure/sites.py | 4 +-- .../featurizers/structure/tests/test_sites.py | 35 ++++++++++--------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/matminer/featurizers/structure/sites.py b/matminer/featurizers/structure/sites.py index e91332f98..b993eead4 100644 --- a/matminer/featurizers/structure/sites.py +++ b/matminer/featurizers/structure/sites.py @@ -321,7 +321,7 @@ def featurize(self, s): if not s.is_ordered: raise ValueError("Disordered structure support not built yet") - if self.elements_ is None: + if not hasattr(self, "elements_") or self.elements_ is None: raise Exception("You must run 'fit' first!") output = [] @@ -369,7 +369,7 @@ def compute_pssf(self, s, e): return stats def feature_labels(self): - if self.elements_ is None: + if not hasattr(self, "elements_") or self.elements_ is None: raise Exception("You must run 'fit' first!") labels = [] diff --git a/matminer/featurizers/structure/tests/test_sites.py b/matminer/featurizers/structure/tests/test_sites.py index 0e81f1954..8124d313b 100644 --- a/matminer/featurizers/structure/tests/test_sites.py +++ b/matminer/featurizers/structure/tests/test_sites.py @@ -118,14 +118,19 @@ class PartialStructureSitesFeaturesTest(StructureFeaturesTest): def test_partialsitestatsfingerprint(self): # Test matrix. op_struct_fp = PartialsSiteStatsFingerprint.from_preset("OPSiteFingerprint", stats=None) + op_struct_fp.fit([self.diamond]) opvals = op_struct_fp.featurize(self.diamond) _ = op_struct_fp.feature_labels() self.assertAlmostEqual(opvals[10][0], 0.9995, places=7) self.assertAlmostEqual(opvals[10][1], 0.9995, places=7) + + op_struct_fp.fit([self.nacl]) opvals = op_struct_fp.featurize(self.nacl) self.assertAlmostEqual(opvals[18][0], 0.9995, places=7) self.assertAlmostEqual(opvals[18][1], 0.9995, places=7) + + op_struct_fp.fit([self.cscl]) opvals = op_struct_fp.featurize(self.cscl) self.assertAlmostEqual(opvals[22][0], 0.9995, places=7) self.assertAlmostEqual(opvals[22][1], 0.9995, places=7) @@ -158,57 +163,52 @@ def test_partialsitestatsfingerprint(self): stats=["mean"], covariance=True, ) - prop_fp.fit([self.diamond]) # Test the feature labels + prop_fp.fit([self.diamond]) labels = prop_fp.feature_labels() self.assertEqual(3, len(labels)) # Test a structure with all the same type (cov should be zero) + prop_fp.fit([self.diamond]) features = prop_fp.featurize(self.diamond) self.assertArrayAlmostEqual(features, [6, 12.0107, 0]) # Test a structure with only one atom (cov should be zero too) + prop_fp.fit([self.sc]) features = prop_fp.featurize(self.sc) self.assertArrayAlmostEqual([13, 26.9815386, 0], features) # Test a structure with nonzero covariance + prop_fp.fit([self.nacl]) features = prop_fp.featurize(self.nacl) - self.assertArrayAlmostEqual([14, 29.22138464, 37.38969216], features) - - # Test soap site featurizer - soap_fp = PartialsSiteStatsFingerprint.from_preset("SOAP_formation_energy") - soap_fp.fit([self.sc, self.diamond, self.nacl]) - feats = soap_fp.featurize(self.diamond) - self.assertEqual(len(feats), 9504) - self.assertAlmostEqual(feats[0], 0.4412608, places=5) - self.assertAlmostEqual(feats[1], 0.0) - self.assertAlmostEqual(np.sum(feats), 207.88194724, places=5) + self.assertArrayAlmostEqual([11, 22.9897693, np.nan, 17, 35.453, np.nan], features) def test_ward_prb_2017_lpd(self): """Test the local property difference attributes from Ward 2017""" f = PartialsSiteStatsFingerprint.from_preset("LocalPropertyDifference_ward-prb-2017") - f.fit([self.diamond]) # Test diamond + f.fit([self.diamond]) features = f.featurize(self.diamond) self.assertArrayAlmostEqual(features, [0] * (22 * 5)) features = f.featurize(self.diamond_no_oxi) self.assertArrayAlmostEqual(features, [0] * (22 * 5)) # Test CsCl + f.fit([self.cscl]) big_face_area = np.sqrt(3) * 3 / 2 * (2 / 4 / 4) small_face_area = 0.125 big_face_diff = 55 - 17 features = f.featurize(self.cscl) labels = f.feature_labels() - my_label = "mean local difference in Number" + my_label = "Cs mean local difference in Number" self.assertAlmostEqual( (8 * big_face_area * big_face_diff) / (8 * big_face_area + 6 * small_face_area), features[labels.index(my_label)], places=3, ) - my_label = "range local difference in Electronegativity" + my_label = "Cs range local difference in Electronegativity" self.assertAlmostEqual(0, features[labels.index(my_label)], places=3) def test_ward_prb_2017_efftcn(self): @@ -216,11 +216,12 @@ def test_ward_prb_2017_efftcn(self): f = PartialsSiteStatsFingerprint.from_preset("CoordinationNumber_ward-prb-2017") # Test Ni3Al + f.fit([self.ni3al]) features = f.featurize(self.ni3al) labels = f.feature_labels() - my_label = "mean CN_VoronoiNN" - self.assertAlmostEqual(12, features[labels.index(my_label)]) - self.assertArrayAlmostEqual([12, 12, 0, 12, 0], features) + self.assertAlmostEqual(12, features[labels.index("Al mean CN_VoronoiNN")]) + self.assertAlmostEqual(12, features[labels.index("Ni mean CN_VoronoiNN")]) + self.assertArrayAlmostEqual([12, 12, 0, 12, 0] * 2, features) if __name__ == "__main__":