Commit

Merge branch 'main' into add_calculator_only_output_field
lbluque authored Dec 3, 2024
2 parents 5e3ee75 + e11e78e commit f724a36
Showing 21 changed files with 1,622 additions and 47 deletions.
80 changes: 80 additions & 0 deletions .github/ISSUE_TEMPLATE/bug_report.yaml
@@ -0,0 +1,80 @@
name: Report a Bug
description: FAIR-Chem bug report
labels: bug
body:
  - type: input
    id: python-version
    attributes:
      label: Python version
      description: Use `python --version` to get Python version
      placeholder: ex. Python 3.11.5
    validations:
      required: true

  - type: input
    id: fairchem-version
    attributes:
      label: fairchem-core version
      description: Use `pip show fairchem-core | grep Version` to get fairchem-core version
      placeholder: ex. 1.2.1
    validations:
      required: true

  - type: input
    id: torch-version
    attributes:
      label: PyTorch version
      description: Use `pip show torch | grep Version` to get PyTorch version
      placeholder: ex. 2.4.0
    validations:
      required: true

  - type: input
    id: cuda-version
    attributes:
      label: CUDA version
      description: Use `python -c 'import torch; cuda=torch.cuda.is_available(); print(cuda,torch.version.cuda if cuda else None);'` to get CUDA version
      placeholder: ex. 12.1
    validations:
      required: true

  - type: input
    id: os
    attributes:
      label: Operating system version
      placeholder: ex. Ubuntu 22.04 LTS
    validations:
      required: false

  - type: textarea
    id: code-snippet
    attributes:
      label: Minimal example
      description: Please provide a minimal code snippet to reproduce this bug.
      render: Python
    validations:
      required: false

  - type: textarea
    id: current-behavior
    attributes:
      label: Current behavior
      description: What behavior do you see?
    validations:
      required: true

  - type: textarea
    id: expected-behavior
    attributes:
      label: Expected behavior
      description: What did you expect to see?
    validations:
      required: true

  - type: textarea
    id: files
    attributes:
      label: Relevant files to reproduce this bug
      description: Please upload relevant files to help reproduce this bug, or logs if helpful.
    validations:
      required: false
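
For reporters filling out this form, the version fields can be collected in one pass. A minimal sketch, assuming `torch` and `fairchem-core` are installed; the snippet is illustrative and not part of the template:

```python
# Illustrative helper for filling in the form's version fields;
# not part of the template itself.
import platform
from importlib.metadata import version

import torch

print("Python:", platform.python_version())
print("fairchem-core:", version("fairchem-core"))
print("PyTorch:", torch.__version__)
print("CUDA:", torch.version.cuda if torch.cuda.is_available() else None)
```
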
10 changes: 10 additions & 0 deletions .github/ISSUE_TEMPLATE/misc.yaml
@@ -0,0 +1,10 @@
name: Other
description: A report that is not exactly a bug

body:
  - type: textarea
    attributes:
      label: What would you like to report?
      description: A clear and concise description of what you would like to report.
    validations:
      required: true
2 changes: 1 addition & 1 deletion configs/omat24/all/eqV2_153M.yml
@@ -43,7 +43,7 @@ outputs:
loss_functions:
  - energy:
      fn: per_atom_mae
      coefficient: 20
      coefficient: 2.5
  - forces:
      fn: l2mae
      coefficient: 20
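
This hunk retunes the energy term of the loss (the same change repeats across the eqV2 configs below): `per_atom_mae` normalizes each structure's energy error by its atom count before averaging, and `coefficient` scales that term relative to the force loss. A minimal sketch of the weighting, with illustrative names rather than fairchem's actual loss code:

```python
# Sketch of a coefficient-weighted per-atom MAE energy loss; names and
# shapes are illustrative, not fairchem's implementation.
import torch

def per_atom_mae(pred_energy: torch.Tensor, target_energy: torch.Tensor,
                 natoms: torch.Tensor) -> torch.Tensor:
    # normalize each system's energy error by its number of atoms
    return ((pred_energy - target_energy) / natoms).abs().mean()

pred = torch.tensor([-3.2, -7.9])
target = torch.tensor([-3.0, -8.0])
natoms = torch.tensor([2.0, 4.0])

# energy term scaled by its coefficient, as in the configs above
loss_energy = 2.5 * per_atom_mae(pred, target, natoms)
```
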
2 changes: 1 addition & 1 deletion configs/omat24/all/eqV2_31M.yml
@@ -44,7 +44,7 @@ outputs:
loss_functions:
  - energy:
      fn: per_atom_mae
      coefficient: 20
      coefficient: 2.5
  - forces:
      fn: l2mae
      coefficient: 20
2 changes: 1 addition & 1 deletion configs/omat24/all/eqV2_86M.yml
@@ -43,7 +43,7 @@ outputs:
loss_functions:
  - energy:
      fn: per_atom_mae
      coefficient: 20
      coefficient: 2.5
  - forces:
      fn: l2mae
      coefficient: 20
2 changes: 1 addition & 1 deletion configs/omat24/finetune/eqV2_153M_ft_salexmptrj.yml
@@ -45,7 +45,7 @@ outputs:
loss_functions:
  - energy:
      fn: per_atom_mae
      coefficient: 20
      coefficient: 2.5
  - forces:
      fn: l2mae
      coefficient: 10
2 changes: 1 addition & 1 deletion configs/omat24/finetune/eqV2_31M_ft_salexmptrj.yml
@@ -45,7 +45,7 @@ outputs:
loss_functions:
  - energy:
      fn: per_atom_mae
      coefficient: 20
      coefficient: 2.5
  - forces:
      fn: l2mae
      coefficient: 10
2 changes: 1 addition & 1 deletion configs/omat24/finetune/eqV2_86M_ft_salexmptrj.yml
@@ -43,7 +43,7 @@ outputs:
loss_functions:
  - energy:
      fn: per_atom_mae
      coefficient: 20
      coefficient: 2.5
  - forces:
      fn: l2mae
      coefficient: 10
2 changes: 1 addition & 1 deletion configs/omat24/mptrj/eqV2_153M_dens_mptrj.yml
@@ -45,7 +45,7 @@ outputs:
loss_functions:
  - energy:
      fn: per_atom_mae
      coefficient: 20
      coefficient: 2.5
  - forces:
      fn: l2mae
      coefficient: 20
11 changes: 5 additions & 6 deletions configs/omat24/mptrj/eqV2_31M_dens_mptrj.yml
@@ -45,7 +45,7 @@ outputs:
loss_functions:
  - energy:
      fn: per_atom_mae
      coefficient: 20
      coefficient: 5
  - forces:
      fn: l2mae
      coefficient: 20
@@ -148,17 +148,16 @@ model:

    use_force_encoding: True
    use_noise_schedule_sigma_encoding: False
    use_denoising_energy: True
    use_denoising_stress: False


  heads:
    energy:
      module: fairchem.core.models.equiformer_v2.equiformer_v2_dens.DeNSEnergyHead
      module: fairchem.core.models.equiformer_v2.equiformer_v2_dens.DeNSScalarHead
      use_denoising: True
    forces:
      module: fairchem.core.models.equiformer_v2.equiformer_v2_dens.DeNSForceHead
      module: fairchem.core.models.equiformer_v2.equiformer_v2_dens.DeNSVectorHead
    stress:
      module: fairchem.core.models.equiformer_v2.equiformer_v2_dens.DeNSRank2Head
      output_name: stress
      use_source_target_embedding: True
      decompose: True
      use_denoising: False
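
The renamed head modules above are referenced by dotted import path, a common pattern where the trainer resolves each `module:` string to a class at runtime. A hedged, generic sketch of that resolution step (fairchem's actual hydra/registry-based loader may differ; running it assumes fairchem-core is installed):

```python
# Generic sketch of resolving `heads:` entries to classes by dotted path;
# this mirrors the pattern, not fairchem's exact loader.
import importlib
from typing import Any

heads_cfg: dict[str, dict[str, Any]] = {
    "energy": {
        "module": "fairchem.core.models.equiformer_v2.equiformer_v2_dens.DeNSScalarHead",
        "use_denoising": True,
    },
}

def resolve(dotted_path: str) -> type:
    module_path, _, class_name = dotted_path.rpartition(".")
    return getattr(importlib.import_module(module_path), class_name)

head_classes = {name: resolve(cfg["module"]) for name, cfg in heads_cfg.items()}
```
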
2 changes: 1 addition & 1 deletion configs/omat24/mptrj/eqV2_31M_mptrj.yml
@@ -45,7 +45,7 @@ outputs:
loss_functions:
  - energy:
      fn: per_atom_mae
      coefficient: 20
      coefficient: 5
  - forces:
      fn: l2mae
      coefficient: 20
2 changes: 1 addition & 1 deletion configs/omat24/mptrj/eqV2_86M_dens_mptrj.yml
@@ -45,7 +45,7 @@ outputs:
loss_functions:
  - energy:
      fn: per_atom_mae
      coefficient: 20
      coefficient: 2.5
  - forces:
      fn: l2mae
      coefficient: 20
4 changes: 2 additions & 2 deletions docs/core/model_checkpoints.md
@@ -149,15 +149,15 @@ OC22 dataset or pretrained models, as well as the original paper for each model:
| GemNet-OC-S2EF-ODAC | GemNet-OC | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231018/Gemnet-OC.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/s2ef/gemnet-oc.yml) |
| eSCN-S2EF-ODAC | eSCN | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231018/eSCN.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/s2ef/eSCN.yml) |
| EquiformerV2-S2EF-ODAC | EquiformerV2 | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231116/eqv2_31M.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/s2ef/eqv2_31M.yml) |
| EquiformerV2-Large-S2EF-ODAC | EquiformerV2 (Large) | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231018/Equiformer_V2_Large.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/s2ef/eqv2_153M.yml) |
| EquiformerV2-Large-S2EF-ODAC | EquiformerV2 (Large) | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231116/Equiformer_V2_Large.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/s2ef/eqv2_153M.yml) |

## IS2RE Direct models

| Model Name | Model |Checkpoint | Config |
|-------------------------|--------------|--- | --- |
| Gemnet-OC-IS2RE-ODAC | Gemnet-OC | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231018/Gemnet-OC_Direct.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/is2re/gemnet-oc.yml) |
| eSCN-IS2RE-ODAC | eSCN | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231018/eSCN_Direct.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/is2re/eSCN.yml) |
| EquiformerV2-IS2RE-ODAC | EquiformerV2 | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231018/Equiformer_V2_Direct.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/is2re/eqv2_31M.yml) |
| EquiformerV2-IS2RE-ODAC | EquiformerV2 | [checkpoint](https://dl.fbaipublicfiles.com/dac/checkpoints_20231116/Equiformer_V2_Direct.pt) | [config](https://github.com/FAIR-Chem/fairchem/tree/main/configs/odac/is2re/eqv2_31M.yml) |

The models in the table above were trained to predict relaxed energy directly. Relaxed energies can also be predicted by running structural relaxations using the S2EF models from the previous section.

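
As the note above says, relaxed energies can also be obtained by relaxing a structure with an S2EF model. A hedged sketch using ASE, where the `OCPCalculator` import path and the checkpoint filename are assumptions that may vary across fairchem releases:

```python
# Hedged sketch: relaxed-energy prediction via an ASE relaxation driven by an
# S2EF checkpoint. Import path and checkpoint filename are assumptions.
from ase.build import fcc111, add_adsorbate
from ase.optimize import BFGS
from fairchem.core.common.relaxation.ase_utils import OCPCalculator

slab = fcc111("Cu", size=(2, 2, 3), vacuum=8.0)
add_adsorbate(slab, "O", height=1.2, position="fcc")
slab.calc = OCPCalculator(checkpoint_path="eqv2_31M.pt", cpu=True)  # illustrative path

BFGS(slab).run(fmax=0.05, steps=100)
print("relaxed energy:", slab.get_potential_energy())
```
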
3 changes: 0 additions & 3 deletions packages/fairchem-data-om/pyproject.toml
@@ -30,9 +30,6 @@ git_describe_command = 'git describe --tags --match fairchem_data_om-*'
[tool.hatch.build]
directory = "../../dist-data-om"

[tool.hatch.build]
directory = "../../dist"

[tool.hatch.build.targets.sdist]
only-include = ["src/fairchem/data/om"]

81 changes: 59 additions & 22 deletions src/fairchem/core/_cli_hydra.py
@@ -12,49 +12,71 @@
from typing import TYPE_CHECKING

import hydra
from omegaconf import OmegaConf

if TYPE_CHECKING:
    import argparse

    from omegaconf import DictConfig

    from fairchem.core.components.runner import Runner


from submitit import AutoExecutor
from submitit.helpers import Checkpointable, DelayedSubmission
from torch.distributed.launcher.api import LaunchConfig, elastic_launch

from fairchem.core.common import distutils
from fairchem.core.common.flags import flags
from fairchem.core.common.utils import get_timestamp_uid, setup_env_vars, setup_imports
from fairchem.core.components.runner import Runner

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class Submitit(Checkpointable):
    def __call__(self, dict_config: DictConfig, cli_args: argparse.Namespace) -> None:
    def __call__(self, dict_config: DictConfig) -> None:
        self.config = dict_config
        self.cli_args = cli_args
        # TODO: setup_imports is not needed if we stop instantiating models with Registry.
        setup_imports()
        setup_env_vars()
        try:
            distutils.setup(map_cli_args_to_dist_config(cli_args))
            runner: Runner = hydra.utils.instantiate(dict_config.runner)
            runner.load_state()
            runner.run()
        finally:
            distutils.cleanup()

    def checkpoint(self, *args, **kwargs):
        distutils.setup(map_cli_args_to_dist_config(dict_config.cli_args))
        self._init_logger()
        runner: Runner = hydra.utils.instantiate(dict_config.runner)
        runner.load_state()
        runner.run()
        distutils.cleanup()

    def _init_logger(self) -> None:
        # optionally instantiate a singleton wandb logger, intentionally only supporting the new wandb logger
        # don't start logger if in debug mode
        if (
            "logger" in self.config
            and distutils.is_master()
            and not self.config.cli_args.debug
        ):
            # get a partial function from the config and instantiate wandb with it
            logger_initializer = hydra.utils.instantiate(self.config.logger)
            simple_config = OmegaConf.to_container(
                self.config, resolve=True, throw_on_missing=True
            )
            logger_initializer(
                config=simple_config,
                run_id=self.config.cli_args.timestamp_id,
                run_name=self.config.cli_args.identifier,
                log_dir=self.config.cli_args.logdir,
            )

    def checkpoint(self, *args, **kwargs) -> DelayedSubmission:
        # TODO: this is yet to be tested properly
        logging.info("Submitit checkpointing callback is triggered")
        new_runner = Runner()
        new_runner.save_state()
        new_runner = Submitit()
        self.runner.save_state()
        logging.info("Submitit checkpointing callback is completed")
        return DelayedSubmission(new_runner, self.config, self.cli_args)
        return DelayedSubmission(new_runner, self.config)


def map_cli_args_to_dist_config(cli_args: argparse.Namespace) -> dict:
def map_cli_args_to_dist_config(cli_args: DictConfig) -> dict:
    return {
        "world_size": cli_args.num_nodes * cli_args.num_gpus,
        "distributed_backend": "gloo" if cli_args.cpu else "nccl",
@@ -76,8 +98,8 @@ def get_hydra_config_from_yaml(
    return hydra.compose(config_name=config_name, overrides=overrides_args)


def runner_wrapper(config: DictConfig, cli_args: argparse.Namespace):
    Submitit()(config, cli_args)
def runner_wrapper(config: DictConfig):
    Submitit()(config)


# this is meant as a future replacement for the main entrypoint
Expand All @@ -91,6 +113,11 @@ def main(
    cfg = get_hydra_config_from_yaml(args.config_yml, override_args)
    timestamp_id = get_timestamp_uid()
    log_dir = os.path.join(args.run_dir, timestamp_id, "logs")
    # override timestamp id and logdir
    args.timestamp_id = timestamp_id
    args.logdir = log_dir
    os.makedirs(log_dir)
    OmegaConf.update(cfg, "cli_args", vars(args), force_add=True)
    if args.submit:  # Run on cluster
        executor = AutoExecutor(folder=log_dir, slurm_max_num_timeout=3)
        executor.update_parameters(
@@ -105,22 +132,32 @@
            slurm_qos=args.slurm_qos,
            slurm_account=args.slurm_account,
        )
        job = executor.submit(runner_wrapper, cfg, args)
        job = executor.submit(runner_wrapper, cfg)
        logger.info(
            f"Submitted job id: {timestamp_id}, slurm id: {job.job_id}, logs: {log_dir}"
        )
    else:
        if args.num_gpus > 1:
            logger.info(f"Running in local mode with {args.num_gpus} ranks")
            logging.info(f"Running in local mode with {args.num_gpus} ranks")
            # HACK to disable multiprocess dataloading in local mode
            # there is an open issue where LMDB's environment cannot be pickled and used
            # during torch multiprocessing https://github.com/pytorch/examples/issues/526
            # this HACK only works for a training submission where the config is passed in here
            if "optim" in cfg and "num_workers" in cfg["optim"]:
                cfg["optim"]["num_workers"] = 0
                logging.info(
                    "WARNING: running in local mode, setting dataloading num_workers to 0, see https://github.com/pytorch/examples/issues/526"
                )

            launch_config = LaunchConfig(
                min_nodes=1,
                max_nodes=1,
                nproc_per_node=args.num_gpus,
                rdzv_backend="c10d",
                max_restarts=0,
            )
            elastic_launch(launch_config, runner_wrapper)(cfg, args)
            elastic_launch(launch_config, runner_wrapper)(cfg)
        else:
            logger.info("Running in local mode without elastic launch")
            distutils.setup_env_local()
            runner_wrapper(cfg, args)
            runner_wrapper(cfg)
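
The main change in this file threads the CLI arguments into the Hydra config itself via `OmegaConf.update(cfg, "cli_args", vars(args), force_add=True)`, so `Submitit.__call__` and `_init_logger` read them from `dict_config.cli_args` instead of taking a separate argument. A minimal sketch of that pattern, with placeholder config and args:

```python
# Minimal sketch of attaching CLI args to an OmegaConf config under a new
# key; the config contents and args here are placeholders.
from omegaconf import OmegaConf

cfg = OmegaConf.create({"runner": {"_target_": "example.Runner"}})
args = {"num_nodes": 1, "num_gpus": 2, "cpu": False, "debug": False}

# force_add creates the `cli_args` node even though the config lacks it
OmegaConf.update(cfg, "cli_args", args, force_add=True)

assert cfg.cli_args.num_gpus == 2
print(OmegaConf.to_yaml(cfg))
```
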
3 changes: 2 additions & 1 deletion src/fairchem/core/common/utils.py
@@ -716,7 +716,8 @@ def radius_graph_pbc(

    # Tensor of unit cells
    cells_per_dim = [
        torch.arange(-rep, rep + 1, device=device, dtype=torch.float) for rep in max_rep
        torch.arange(-rep.item(), rep.item() + 1, device=device, dtype=torch.float)
        for rep in max_rep
    ]
    unit_cell = torch.cartesian_prod(*cells_per_dim)
    num_cells = len(unit_cell)
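
The utils.py change converts each `rep` (a 0-d tensor) to a Python scalar with `.item()` before passing it to `torch.arange`. A small CPU sketch of the fixed pattern, with illustrative values:

```python
# Sketch of the fixed pattern: .item() turns each 0-d tensor bound into a
# plain Python number before torch.arange builds the offset ranges.
import torch

max_rep = [torch.tensor(1), torch.tensor(2), torch.tensor(0)]  # illustrative
cells_per_dim = [
    torch.arange(-rep.item(), rep.item() + 1, dtype=torch.float)
    for rep in max_rep
]
unit_cell = torch.cartesian_prod(*cells_per_dim)  # all periodic-image offsets
print(unit_cell.shape)  # torch.Size([15, 3]): 3 * 5 * 1 combinations
```
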
