From 05b403a8aff99f8f5bed497ef3e8c79ccecf6471 Mon Sep 17 00:00:00 2001
From: Nebgen <315312@win.lanl.gov>
Date: Fri, 31 Mar 2023 11:01:06 -0600
Subject: [PATCH 1/7] Added database options to remove outlier data

    def remove_high_property(self,key,perAtom,species_key=None,cut=None,std_factor=10):
        """
        This function removes outlier data from the dataset
        Must be called before splitting
        "key": the property key in the dataset to check for high values
        "perAtom": True if the property is defined per atom in axis 1 (species_key must then be given), otherwise property is treated as full system
        "std_factor": systems with values deviating from the mean by more than this multiplier times the standard deviation of all data will be removed. None to skip this step
        "cut": systems with values deviating from the mean by more than this number are removed. None to skip this step. This step is done first.
        """
---
 hippynn/databases/database.py | 56 +++++++++++++++++++++++++++++++----
 1 file changed, 50 insertions(+), 6 deletions(-)

diff --git a/hippynn/databases/database.py b/hippynn/databases/database.py
index b454fc1b..3466d008 100644
--- a/hippynn/databases/database.py
+++ b/hippynn/databases/database.py
@@ -191,11 +191,10 @@ def make_generator(self, split_type, evaluation_mode, batch_size=None, subsample
         if not self.splitting_completed:
             raise ValueError("Database has not yet been split.")
 
-
-        if split_type not in self.splits:
-            raise ValueError(f"Split {split_type} Invalid. Current splits:{list(self.splits.keys())}")
-
-        data = [self.splits[split_type][k] for k in self.var_list]
+        if split_type in ("train", "valid", "test"):
+            data = [self.splits[split_type][k] for k in self.var_list]
+        else:
+            raise ValueError("Datatype {} Invalid. Must be one of 'train','valid','test'".format(split_type))
 
         if evaluation_mode == "train":
             if split_type != "train":
@@ -206,7 +205,7 @@ def make_generator(self, split_type, evaluation_mode, batch_size=None, subsample
         elif evaluation_mode == "eval":
             shuffle = False
         else:
-            raise ValueError(f"Evaluation_mode ({evaluation_mode}) must be one of 'train' or 'eval'")
+            raise ValueError("Evaluation_mode ({}) must be one of 'train' or 'eval'")
 
         dataset = NamedTensorDataset(self.var_list, *data)
         if subsample:
@@ -226,6 +225,51 @@ def make_generator(self, split_type, evaluation_mode, batch_size=None, subsample
         )
 
         return generator
+        
+    def trim_all_arrays(self,index):
+        """
+        To be used in conjunction with remove_high_property
+        """
+        for key in self.arr_dict:
+            self.arr_dict[key] = self.arr_dict[key][index]
+    
+    def remove_high_property(self,key,perAtom,species_key=None,cut=None,std_factor=10):
+        """
+        This function removes outlier data from the dataset
+        Must be called before splitting
+        "key": the property key in the dataset to check for high values
+        "perAtom": True if the property is defined per atom in axis 1 (species_key must then be given), otherwise property is treated as full system
+        "std_factor": systems with values deviating from the mean by more than this multiplier times the standard deviation of all data will be removed. None to skip this step
+        "cut": systems with values deviating from the mean by more than this number are removed. None to skip this step. This step is done first.
+        """
+        if perAtom:
+            if species_key==None:
+                raise RuntimeError("species_key must be defined to trim a per atom quantity")
+            atom_ind = self.arr_dict[species_key] > 0
+        ndim = len(self.arr_dict[key].shape)
+        if cut!=None:
+            if perAtom:
+                Kmean = np.mean(self.arr_dict[key][atom_ind])
+            else:
+                Kmean = np.mean(self.arr_dict[key])
+            failArr = np.abs(self.arr_dict[key]-Kmean)>cut
+            #This does nothing with ndim=1
+            trimArr = np.sum(failArr,axis=tuple(range(1,ndim)))==0
+            self.trim_all_arrays(trimArr)
+            
+        if std_factor!=None:
+            if perAtom:
+                atom_ind = self.arr_dict[species_key] > 0
+                Kmean = np.mean(self.arr_dict[key][atom_ind])
+                std_cut = np.std(self.arr_dict[key][atom_ind]) * std_factor
+            else: 
+                Kmean = np.mean(self.arr_dict[key])
+                std_cut = np.std(self.arr_dict[key]) * std_factor
+            failArr = np.abs(self.arr_dict[key]-Kmean)>std_cut
+            #This does nothing with ndim=1
+            trimArr = np.sum(failArr,axis=tuple(range(1,ndim)))==0
+            self.trim_all_arrays(trimArr)
+
 
 
 def compute_index_mask(indices, index_pool):

From c841ddd35214c5336afb37d9a3f36d6be0e6e029 Mon Sep 17 00:00:00 2001
From: Nicholas Lubbers <56895592+lubbersnick@users.noreply.github.com>
Date: Mon, 1 May 2023 11:06:51 -0600
Subject: [PATCH 2/7] unnecessary index type constraint for vectors (#31) (#32)

This was causing lammps interfaces not to build correctly.
---
 hippynn/graphs/nodes/physics.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/hippynn/graphs/nodes/physics.py b/hippynn/graphs/nodes/physics.py
index eecbadae..8851f6a3 100644
--- a/hippynn/graphs/nodes/physics.py
+++ b/hippynn/graphs/nodes/physics.py
@@ -229,7 +229,6 @@ def expansion2(self, vector, helper, *, purpose, **kwargs):
 
     _parent_expander.assertlen(1)
     _parent_expander.get_main_outputs()
-    _parent_expander.require_idx_states(IdxType.Atoms)
 
     def __init__(self, name, parents, module="auto", _helper=None, **kwargs):
         parents = self.expand_parents(parents)

From 6d571ea5aeb9c7814eeea55d55b26a426108002a Mon Sep 17 00:00:00 2001
From: Ben Nebgen <bnebgen@lanl.gov>
Date: Wed, 3 May 2023 21:58:31 -0600
Subject: [PATCH 3/7] Removed ase interface reference from calculator

This causes errors with later versions of ase
---
 hippynn/interfaces/ase_interface/calculator.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/hippynn/interfaces/ase_interface/calculator.py b/hippynn/interfaces/ase_interface/calculator.py
index 586fdf14..b4d1f671 100644
--- a/hippynn/interfaces/ase_interface/calculator.py
+++ b/hippynn/interfaces/ase_interface/calculator.py
@@ -4,7 +4,6 @@
 import warnings
 import torch
 
-from ase.calculators import interface
 from ase.calculators.calculator import compare_atoms, PropertyNotImplementedError, Calculator # Calculator is required to allow HIPNN to be used with ASE Mixing Calculators
 
 from hippynn.graphs import find_relatives, find_unique_relative, get_subgraph, copy_subgraph, replace_node, GraphModule

From d3a7d69cc997e0794bc15b3500009d99eb25c05a Mon Sep 17 00:00:00 2001
From: Ben Nebgen <bnebgen@lanl.gov>
Date: Sun, 16 Jul 2023 06:14:56 -0600
Subject: [PATCH 4/7] changed model to float 32

---
 hippynn/interfaces/lammps_interface/mliap_interface.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hippynn/interfaces/lammps_interface/mliap_interface.py b/hippynn/interfaces/lammps_interface/mliap_interface.py
index db180cef..0b7a5eec 100644
--- a/hippynn/interfaces/lammps_interface/mliap_interface.py
+++ b/hippynn/interfaces/lammps_interface/mliap_interface.py
@@ -40,7 +40,7 @@ def __init__(self, energy_node, element_types, ndescriptors=1,
         # Build the calculator
         self.rcutfac, self.species_set, self.graph = setup_LAMMPS_graph(energy_node)
         self.nparams = sum(p.nelement() for p in self.graph.parameters())
-        self.graph.to(torch.float64)
+        self.graph.to(torch.float32)
 
     def compute_gradients(self, data):
         pass
@@ -61,7 +61,7 @@ def compute_forces(self, data):
         z_vals = self.species_set[elems+1]
         pair_i = self.as_tensor(data.pair_i).type(torch.int64)
         pair_j = self.as_tensor(data.pair_j).type(torch.int64)
-        rij = self.as_tensor(data.rij).type(torch.float64)
+        rij = self.as_tensor(data.rij).type(torch.float32)
         nlocal = self.as_tensor(data.nlistatoms) 
            
         # note your sign for rij might need to be +1 or -1, depending on how your implementation works

From 83e4b4d9c734979ec5214df6d2b789a67f8e39cb Mon Sep 17 00:00:00 2001
From: Ben Nebgen <bnebgen@lanl.gov>
Date: Mon, 11 Sep 2023 15:16:32 -0600
Subject: [PATCH 5/7] Fixed device in targets.py(70)

---
 hippynn/layers/targets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hippynn/layers/targets.py b/hippynn/layers/targets.py
index 4b8128d0..98077c74 100644
--- a/hippynn/layers/targets.py
+++ b/hippynn/layers/targets.py
@@ -67,7 +67,7 @@ def forward(self, all_features, mol_index, n_molecules):
             total_hier = torch.zeros_like(total_energies)
             mol_hier = torch.zeros_like(total_energies)
             total_atom_hier = torch.zeros_like(total_atomen)
-            batch_hier = torch.zeros(1,dtype=total_energies.dtype,device=total_energies.dtype)
+            batch_hier = torch.zeros(1,dtype=total_energies.dtype,device=total_energies.device)
 
         return total_energies, total_atomen, partial_sums, total_hier, total_atom_hier, mol_hier, batch_hier
 

From 80d8bd50caf1e558e0f8fc8351afd75c06fc62f0 Mon Sep 17 00:00:00 2001
From: Ben Nebgen <bnebgen@lanl.gov>
Date: Mon, 9 Oct 2023 14:58:56 -0600
Subject: [PATCH 6/7] Fixes for lammps and ase interface

lammps: reduce float64 to float32 for speed
ase: comment out the ase.calculators.interface import
---
 hippynn/interfaces/ase_interface/calculator.py         | 2 +-
 hippynn/interfaces/lammps_interface/mliap_interface.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/hippynn/interfaces/ase_interface/calculator.py b/hippynn/interfaces/ase_interface/calculator.py
index 586fdf14..213a3011 100644
--- a/hippynn/interfaces/ase_interface/calculator.py
+++ b/hippynn/interfaces/ase_interface/calculator.py
@@ -4,7 +4,7 @@
 import warnings
 import torch
 
-from ase.calculators import interface
+#from ase.calculators import interface
 from ase.calculators.calculator import compare_atoms, PropertyNotImplementedError, Calculator # Calculator is required to allow HIPNN to be used with ASE Mixing Calculators
 
 from hippynn.graphs import find_relatives, find_unique_relative, get_subgraph, copy_subgraph, replace_node, GraphModule
diff --git a/hippynn/interfaces/lammps_interface/mliap_interface.py b/hippynn/interfaces/lammps_interface/mliap_interface.py
index acc27e32..fb2858ea 100644
--- a/hippynn/interfaces/lammps_interface/mliap_interface.py
+++ b/hippynn/interfaces/lammps_interface/mliap_interface.py
@@ -41,7 +41,7 @@ def __init__(self, energy_node, element_types, ndescriptors=1,
         # Build the calculator
         self.rcutfac, self.species_set, self.graph = setup_LAMMPS_graph(energy_node)
         self.nparams = sum(p.nelement() for p in self.graph.parameters())
-        self.graph.to(torch.float64)
+        self.graph.to(torch.float32)
 
     def compute_gradients(self, data):
         pass
@@ -62,7 +62,7 @@ def compute_forces(self, data):
         z_vals = self.species_set[elems+1]
         pair_i = self.as_tensor(data.pair_i).type(torch.int64)
         pair_j = self.as_tensor(data.pair_j).type(torch.int64)
-        rij = self.as_tensor(data.rij).type(torch.float64)
+        rij = self.as_tensor(data.rij).type(torch.float32)
         nlocal = self.as_tensor(data.nlistatoms) 
            
         # note your sign for rij might need to be +1 or -1, depending on how your implementation works

From 5bbdb6a5a4d65e665a1323d69423c737493fdefc Mon Sep 17 00:00:00 2001
From: Ben Nebgen <bnebgen@lanl.gov>
Date: Fri, 13 Oct 2023 12:43:02 -0600
Subject: [PATCH 7/7] Reverted merge mistakes with database.py

---
 hippynn/databases/database.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/hippynn/databases/database.py b/hippynn/databases/database.py
index 3466d008..e9375b49 100644
--- a/hippynn/databases/database.py
+++ b/hippynn/databases/database.py
@@ -191,10 +191,10 @@ def make_generator(self, split_type, evaluation_mode, batch_size=None, subsample
         if not self.splitting_completed:
             raise ValueError("Database has not yet been split.")
 
-        if split_type in ("train", "valid", "test"):
-            data = [self.splits[split_type][k] for k in self.var_list]
-        else:
-            raise ValueError("Datatype {} Invalid. Must be one of 'train','valid','test'".format(split_type))
+        if split_type not in self.splits:
+            raise ValueError(f"Split {split_type} Invalid. Current splits:{list(self.splits.keys())}")
+
+        data = [self.splits[split_type][k] for k in self.var_list]
 
         if evaluation_mode == "train":
             if split_type != "train":
@@ -205,7 +205,7 @@ def make_generator(self, split_type, evaluation_mode, batch_size=None, subsample
         elif evaluation_mode == "eval":
             shuffle = False
         else:
-            raise ValueError("Evaluation_mode ({}) must be one of 'train' or 'eval'")
+            raise ValueError(f"Evaluation_mode ({evaluation_mode}) must be one of 'train' or 'eval'")
 
         dataset = NamedTensorDataset(self.var_list, *data)
         if subsample: