Commit

Merge branch 'main' into add_calculator_only_output_field
lbluque authored Dec 3, 2024
2 parents 5e3ee75 + e11e78e commit f724a36
Showing 21 changed files with 1,622 additions and 47 deletions.
80 changes: 80 additions & 0 deletions .github/ISSUE_TEMPLATE/bug_report.yaml
@@ -0,0 +1,80 @@
name: Report a Bug
description: FAIR-Chem bug report
labels: bug
body:
  - type: input
    id: python-version
    attributes:
      label: Python version
      description: Use `python --version` to get Python version
      placeholder: ex. Python 3.11.5
    validations:
      required: true

  - type: input
    id: fairchem-version
    attributes:
      label: fairchem-core version
      description: Use `pip show fairchem-core | grep Version` to get fairchem-core version
      placeholder: ex. 1.2.1
    validations:
      required: true

  - type: input
    id: torch-version
    attributes:
      label: PyTorch version
      description: Use `pip show torch | grep Version` to get PyTorch version
      placeholder: ex. 2.4.0
    validations:
      required: true

  - type: input
    id: cuda-version
    attributes:
      label: CUDA version
      description: Use `python -c 'import torch; cuda=torch.cuda.is_available(); print(cuda,torch.version.cuda if cuda else None);'` to get CUDA version
      placeholder: ex. 12.1
    validations:
      required: true

  - type: input
    id: os
    attributes:
      label: Operating system version
      placeholder: ex. Ubuntu 22.04 LTS
    validations:
      required: false

  - type: textarea
    id: code-snippet
    attributes:
      label: Minimal example
      description: Please provide a minimal code snippet to reproduce this bug.
      render: Python
    validations:
      required: false

  - type: textarea
    id: current-behavior
    attributes:
      label: Current behavior
      description: What behavior do you see?
    validations:
      required: true

  - type: textarea
    id: expected-behavior
    attributes:
      label: Expected behavior
      description: What did you expect to see?
    validations:
      required: true

  - type: textarea
    id: files
    attributes:
      label: Relevant files to reproduce this bug
      description: Please upload relevant files to help reproduce this bug, or logs if helpful.
    validations:
      required: false
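
For reporters filling out this form, the version fields can be collected in one pass. A minimal sketch, assuming `torch` and `fairchem-core` are installed; the snippet is illustrative and not part of the template:

```python
# Illustrative helper for filling in the form's version fields;
# not part of the template itself.
import platform
from importlib.metadata import version

import torch

print("Python:", platform.python_version())
print("fairchem-core:", version("fairchem-core"))
print("PyTorch:", torch.__version__)
print("CUDA:", torch.version.cuda if torch.cuda.is_available() else None)
```
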
10 changes: 10 additions & 0 deletions .github/ISSUE_TEMPLATE/misc.yaml
@@ -0,0 +1,10 @@
name: Other
description: A report that is not exactly a bug

body:
  - type: textarea
    attributes:
      label: What would you like to report?
      description: A clear and concise description of what you would like to report.
    validations:
      required: true
2 changes: 1 addition & 1 deletion configs/omat24/all/eqV2_153M.yml
@@ -43,7 +43,7 @@ outputs:
loss_functions:
  - energy:
      fn: per_atom_mae
      coefficient: 20
      coefficient: 2.5
  - forces:
      fn: l2mae
      coefficient: 20
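
This hunk retunes the energy term of the loss (the same change repeats across the eqV2 configs below): `per_atom_mae` normalizes each structure's energy error by its atom count before averaging, and `coefficient` scales that term relative to the force loss. A minimal sketch of the weighting, with illustrative names rather than fairchem's actual loss code:

```python
# Sketch of a coefficient-weighted per-atom MAE energy loss; names and
# shapes are illustrative, not fairchem's implementation.
import torch

def per_atom_mae(pred_energy: torch.Tensor, target_energy: torch.Tensor,
                 natoms: torch.Tensor) -> torch.Tensor:
    # normalize each system's energy error by its number of atoms
    return ((pred_energy - target_energy) / natoms).abs().mean()

pred = torch.tensor([-3.2, -7.9])
target = torch.tensor([-3.0, -8.0])
natoms = torch.tensor([2.0, 4.0])

# energy term scaled by its coefficient, as in the configs above
loss_energy = 2.5 * per_atom_mae(pred, target, natoms)
```
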
2 changes: 1 addition & 1 deletion configs/omat24/all/eqV2_31M.yml
@@ -44,7 +44,7 @@ outputs:
loss_functions:
  - energy:
      fn: per_atom_mae
      coefficient: 20
      coefficient: 2.5
  - forces:
      fn: l2mae
      coefficient: 20
2 changes: 1 addition & 1 deletion configs/omat24/all/eqV2_86M.yml
@@ -43,7 +43,7 @@ outputs:
loss_functions:
  - energy:
      fn: per_atom_mae
      coefficient: 20
      coefficient: 2.5
  - forces:
      fn: l2mae
      coefficient: 20
2 changes: 1 addition & 1 deletion configs/omat24/finetune/eqV2_153M_ft_salexmptrj.yml
@@ -45,7 +45,7 @@ outputs:
loss_functions:
  - energy:
      fn: per_atom_mae
      coefficient: 20
      coefficient: 2.5
  - forces:
      fn: l2mae
      coefficient: 10
2 changes: 1 addition & 1 deletion configs/omat24/finetune/eqV2_31M_ft_salexmptrj.yml
@@ -45,7 +45,7 @@ outputs:
loss_functions:
  - energy:
      fn: per_atom_mae
      coefficient: 20
      coefficient: 2.5
  - forces:
      fn: l2mae
      coefficient: 10
2 changes: 1 addition & 1 deletion configs/omat24/finetune/eqV2_86M_ft_salexmptrj.yml
@@ -43,7 +43,7 @@ outputs:
loss_functions:
  - energy:
      fn: per_atom_mae
      coefficient: 20
      coefficient: 2.5
  - forces:
      fn: l2mae
      coefficient: 10
2 changes: 1 addition & 1 deletion configs/omat24/mptrj/eqV2_153M_dens_mptrj.yml
@@ -45,7 +45,7 @@ outputs:
loss_functions:
  - energy:
      fn: per_atom_mae
      coefficient: 20
      coefficient: 2.5
  - forces:
      fn: l2mae
      coefficient: 20
11 changes: 5 additions & 6 deletions configs/omat24/mptrj/eqV2_31M_dens_mptrj.yml
@@ -45,7 +45,7 @@ outputs:
loss_functions:
  - energy:
      fn: per_atom_mae
      coefficient: 20
      coefficient: 5
  - forces:
      fn: l2mae
      coefficient: 20
@@ -148,17 +148,16 @@ model:

    use_force_encoding: True
    use_noise_schedule_sigma_encoding: False
    use_denoising_energy: True
    use_denoising_stress: False


  heads:
    energy:
      module: fairchem.core.models.equiformer_v2.equiformer_v2_dens.DeNSEnergyHead
      module: fairchem.core.models.equiformer_v2.equiformer_v2_dens.DeNSScalarHead
      use_denoising: True
    forces:
      module: fairchem.core.models.equiformer_v2.equiformer_v2_dens.DeNSForceHead
      module: fairchem.core.models.equiformer_v2.equiformer_v2_dens.DeNSVectorHead
    stress:
      module: fairchem.core.models.equiformer_v2.equiformer_v2_dens.DeNSRank2Head
      output_name: stress
      use_source_target_embedding: True
      decompose: True
      use_denoising: False
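
The renamed head modules above are referenced by dotted import path, a common pattern where the trainer resolves each `module:` string to a class at runtime. A hedged, generic sketch of that resolution step (fairchem's actual hydra/registry-based loader may differ; running it assumes fairchem-core is installed):

```python
# Generic sketch of resolving `heads:` entries to classes by dotted path;
# this mirrors the pattern, not fairchem's exact loader.
import importlib
from typing import Any

heads_cfg: dict[str, dict[str, Any]] = {
    "energy": {
        "module": "fairchem.core.models.equiformer_v2.equiformer_v2_dens.DeNSScalarHead",
        "use_denoising": True,
    },
}

def resolve(dotted_path: str) -> type:
    module_path, _, class_name = dotted_path.rpartition(".")
    return getattr(importlib.import_module(module_path), class_name)

head_classes = {name: resolve(cfg["module"]) for name, cfg in heads_cfg.items()}
```
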
2 changes: 1 addition & 1 deletion configs/omat24/mptrj/eqV2_31M_mptrj.yml
@@ -45,7 +45,7 @@ outputs:
loss_functions:
  - energy:
      fn: per_atom_mae
      coefficient: 20
      coefficient: 5
  - forces:
      fn: l2mae
      coefficient: 20
2 changes: 1 addition & 1 deletion configs/omat24/mptrj/eqV2_86M_dens_mptrj.yml
@@ -45,7 +45,7 @@ outputs:
loss_functions:
  - energy:
      fn: per_atom_mae
      coefficient: 20
      coefficient: 2.5
  - forces:
      fn: l2mae
      coefficient: 20
4 changes: 2 additions & 2 deletions docs/core/model_checkpoints.md
@@ -149,15 +149,15 @@ OC22 dataset or pretrained models, as well as the original paper for each model:
| GemNet-OC-S2EF-ODAC | GemNet-OC | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231018/Gemnet-OC.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/s2ef/gemnet-oc.yml) |
| eSCN-S2EF-ODAC | eSCN | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231018/eSCN.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/s2ef/eSCN.yml) |
| EquiformerV2-S2EF-ODAC | EquiformerV2 | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231116/eqv2_31M.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/s2ef/eqv2_31M.yml) |
| EquiformerV2-Large-S2EF-ODAC | EquiformerV2 (Large) | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231018/Equiformer_V2_Large.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/s2ef/eqv2_153M.yml) |
| EquiformerV2-Large-S2EF-ODAC | EquiformerV2 (Large) | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231116/Equiformer_V2_Large.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/s2ef/eqv2_153M.yml) |

## IS2RE Direct models

| Model Name | Model |Checkpoint | Config |
|-------------------------|--------------|--- | --- |
| Gemnet-OC-IS2RE-ODAC | Gemnet-OC | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231018/Gemnet-OC_Direct.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/is2re/gemnet-oc.yml) |
| eSCN-IS2RE-ODAC | eSCN | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231018/eSCN_Direct.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/is2re/eSCN.yml) |
| EquiformerV2-IS2RE-ODAC | EquiformerV2 | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231018/Equiformer_V2_Direct.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/is2re/eqv2_31M.yml) |
| EquiformerV2-IS2RE-ODAC | EquiformerV2 | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231116/Equiformer_V2_Direct.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/is2re/eqv2_31M.yml) |

The models in the table above were trained to predict relaxed energy directly. Relaxed energies can also be predicted by running structural relaxations using the S2EF models from the previous section.

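
As the note above says, relaxed energies can also be obtained by relaxing a structure with an S2EF model. A hedged sketch using ASE, where the `OCPCalculator` import path and the checkpoint filename are assumptions that may vary across fairchem releases:

```python
# Hedged sketch: relaxed-energy prediction via an ASE relaxation driven by an
# S2EF checkpoint. Import path and checkpoint filename are assumptions.
from ase.build import fcc111, add_adsorbate
from ase.optimize import BFGS
from fairchem.core.common.relaxation.ase_utils import OCPCalculator

slab = fcc111("Cu", size=(2, 2, 3), vacuum=8.0)
add_adsorbate(slab, "O", height=1.2, position="fcc")
slab.calc = OCPCalculator(checkpoint_path="eqv2_31M.pt", cpu=True)  # illustrative path

BFGS(slab).run(fmax=0.05, steps=100)
print("relaxed energy:", slab.get_potential_energy())
```
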
3 changes: 0 additions & 3 deletions packages/fairchem-data-om/pyproject.toml
@@ -30,9 +30,6 @@ git_describe_command = 'git describe --tags --match fairchem_data_om-*'
[tool.hatch.build]
directory = "../../dist-data-om"

[tool.hatch.build]
directory = "../../dist"

[tool.hatch.build.targets.sdist]
only-include = ["src/fairchem/data/om"]

81 changes: 59 additions & 22 deletions src/fairchem/core/_cli_hydra.py
@@ -12,49 +12,71 @@
from typing import TYPE_CHECKING

import hydra
from omegaconf import OmegaConf

if TYPE_CHECKING:
    import argparse

    from omegaconf import DictConfig

    from fairchem.core.components.runner import Runner


from submitit import AutoExecutor
from submitit.helpers import Checkpointable, DelayedSubmission
from torch.distributed.launcher.api import LaunchConfig, elastic_launch

from fairchem.core.common import distutils
from fairchem.core.common.flags import flags
from fairchem.core.common.utils import get_timestamp_uid, setup_env_vars, setup_imports
from fairchem.core.components.runner import Runner

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class Submitit(Checkpointable):
    def __call__(self, dict_config: DictConfig, cli_args: argparse.Namespace) -> None:
    def __call__(self, dict_config: DictConfig) -> None:
        self.config = dict_config
        self.cli_args = cli_args
        # TODO: setup_imports is not needed if we stop instantiating models with Registry.
        setup_imports()
        setup_env_vars()
        try:
            distutils.setup(map_cli_args_to_dist_config(cli_args))
            runner: Runner = hydra.utils.instantiate(dict_config.runner)
            runner.load_state()
            runner.run()
        finally:
            distutils.cleanup()

    def checkpoint(self, *args, **kwargs):
        distutils.setup(map_cli_args_to_dist_config(dict_config.cli_args))
        self._init_logger()
        runner: Runner = hydra.utils.instantiate(dict_config.runner)
        runner.load_state()
        runner.run()
        distutils.cleanup()

    def _init_logger(self) -> None:
        # optionally instantiate a singleton wandb logger, intentionally only supporting the new wandb logger
        # don't start logger if in debug mode
        if (
            "logger" in self.config
            and distutils.is_master()
            and not self.config.cli_args.debug
        ):
            # get a partial function from the config and instantiate wandb with it
            logger_initializer = hydra.utils.instantiate(self.config.logger)
            simple_config = OmegaConf.to_container(
                self.config, resolve=True, throw_on_missing=True
            )
            logger_initializer(
                config=simple_config,
                run_id=self.config.cli_args.timestamp_id,
                run_name=self.config.cli_args.identifier,
                log_dir=self.config.cli_args.logdir,
            )

    def checkpoint(self, *args, **kwargs) -> DelayedSubmission:
        # TODO: this is yet to be tested properly
        logging.info("Submitit checkpointing callback is triggered")
        new_runner = Runner()
        new_runner.save_state()
        new_runner = Submitit()
        self.runner.save_state()
        logging.info("Submitit checkpointing callback is completed")
        return DelayedSubmission(new_runner, self.config, self.cli_args)
        return DelayedSubmission(new_runner, self.config)


def map_cli_args_to_dist_config(cli_args: argparse.Namespace) -> dict:
def map_cli_args_to_dist_config(cli_args: DictConfig) -> dict:
    return {
        "world_size": cli_args.num_nodes * cli_args.num_gpus,
        "distributed_backend": "gloo" if cli_args.cpu else "nccl",
@@ -76,8 +98,8 @@ def get_hydra_config_from_yaml(
    return hydra.compose(config_name=config_name, overrides=overrides_args)


def runner_wrapper(config: DictConfig, cli_args: argparse.Namespace):
    Submitit()(config, cli_args)
def runner_wrapper(config: DictConfig):
    Submitit()(config)


# this is meant as a future replacement for the main entrypoint
Expand All @@ -91,6 +113,11 @@ def main(
    cfg = get_hydra_config_from_yaml(args.config_yml, override_args)
    timestamp_id = get_timestamp_uid()
    log_dir = os.path.join(args.run_dir, timestamp_id, "logs")
    # override timestamp id and logdir
    args.timestamp_id = timestamp_id
    args.logdir = log_dir
    os.makedirs(log_dir)
    OmegaConf.update(cfg, "cli_args", vars(args), force_add=True)
    if args.submit:  # Run on cluster
        executor = AutoExecutor(folder=log_dir, slurm_max_num_timeout=3)
        executor.update_parameters(
@@ -105,22 +132,32 @@
            slurm_qos=args.slurm_qos,
            slurm_account=args.slurm_account,
        )
        job = executor.submit(runner_wrapper, cfg, args)
        job = executor.submit(runner_wrapper, cfg)
        logger.info(
            f"Submitted job id: {timestamp_id}, slurm id: {job.job_id}, logs: {log_dir}"
        )
    else:
        if args.num_gpus > 1:
            logger.info(f"Running in local mode with {args.num_gpus} ranks")
            logging.info(f"Running in local mode with {args.num_gpus} ranks")
            # HACK to disable multiprocess dataloading in local mode
            # there is an open issue where LMDB's environment cannot be pickled and used
            # during torch multiprocessing https://github.com/pytorch/examples/issues/526
            # this HACK only works for a training submission where the config is passed in here
            if "optim" in cfg and "num_workers" in cfg["optim"]:
                cfg["optim"]["num_workers"] = 0
                logging.info(
                    "WARNING: running in local mode, setting dataloading num_workers to 0, see https://github.com/pytorch/examples/issues/526"
                )

            launch_config = LaunchConfig(
                min_nodes=1,
                max_nodes=1,
                nproc_per_node=args.num_gpus,
                rdzv_backend="c10d",
                max_restarts=0,
            )
            elastic_launch(launch_config, runner_wrapper)(cfg, args)
            elastic_launch(launch_config, runner_wrapper)(cfg)
        else:
            logger.info("Running in local mode without elastic launch")
            distutils.setup_env_local()
            runner_wrapper(cfg, args)
            runner_wrapper(cfg)
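
The main change in this file threads the CLI arguments into the Hydra config itself via `OmegaConf.update(cfg, "cli_args", vars(args), force_add=True)`, so `Submitit.__call__` and `_init_logger` read them from `dict_config.cli_args` instead of taking a separate argument. A minimal sketch of that pattern, with placeholder config and args:

```python
# Minimal sketch of attaching CLI args to an OmegaConf config under a new
# key; the config contents and args here are placeholders.
from omegaconf import OmegaConf

cfg = OmegaConf.create({"runner": {"_target_": "example.Runner"}})
args = {"num_nodes": 1, "num_gpus": 2, "cpu": False, "debug": False}

# force_add creates the `cli_args` node even though the config lacks it
OmegaConf.update(cfg, "cli_args", args, force_add=True)

assert cfg.cli_args.num_gpus == 2
print(OmegaConf.to_yaml(cfg))
```
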
3 changes: 2 additions & 1 deletion src/fairchem/core/common/utils.py
@@ -716,7 +716,8 @@ def radius_graph_pbc(

    # Tensor of unit cells
    cells_per_dim = [
        torch.arange(-rep, rep + 1, device=device, dtype=torch.float) for rep in max_rep
        torch.arange(-rep.item(), rep.item() + 1, device=device, dtype=torch.float)
        for rep in max_rep
    ]
    unit_cell = torch.cartesian_prod(*cells_per_dim)
    num_cells = len(unit_cell)
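
The utils.py change converts each `rep` (a 0-d tensor) to a Python scalar with `.item()` before passing it to `torch.arange`. A small CPU sketch of the fixed pattern, with illustrative values:

```python
# Sketch of the fixed pattern: .item() turns each 0-d tensor bound into a
# plain Python number before torch.arange builds the offset ranges.
import torch

max_rep = [torch.tensor(1), torch.tensor(2), torch.tensor(0)]  # illustrative
cells_per_dim = [
    torch.arange(-rep.item(), rep.item() + 1, dtype=torch.float)
    for rep in max_rep
]
unit_cell = torch.cartesian_prod(*cells_per_dim)  # all periodic-image offsets
print(unit_cell.shape)  # torch.Size([15, 3]): 3 * 5 * 1 combinations
```
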
