From 27f90beb57de62333989a5a64f47d1759b33804e Mon Sep 17 00:00:00 2001 From: James Spencer Date: Mon, 5 Oct 2020 12:24:51 +0100 Subject: [PATCH] Initial commit. PiperOrigin-RevId: 335389185 --- CONTRIBUTING.md | 29 + LICENSE | 203 +++++ README.md | 117 +++ bin/ferminet | 384 ++++++++ ferminet/hamiltonian.py | 211 +++++ ferminet/mcmc.py | 138 +++ ferminet/mean_corrected_kfac_opt.py | 52 ++ ferminet/networks.py | 1300 +++++++++++++++++++++++++++ ferminet/qmc.py | 493 ++++++++++ ferminet/tests/elements_test.py | 121 +++ ferminet/tests/hamiltonian_test.py | 285 ++++++ ferminet/tests/mcmc_test.py | 119 +++ ferminet/tests/networks_test.py | 394 ++++++++ ferminet/tests/scf_test.py | 238 +++++ ferminet/tests/system_test.py | 100 +++ ferminet/tests/train_test.py | 268 ++++++ ferminet/tests/units_test.py | 83 ++ ferminet/train.py | 542 +++++++++++ ferminet/utils/analysis_tools.py | 114 +++ ferminet/utils/elements.py | 221 +++++ ferminet/utils/scf.py | 348 +++++++ ferminet/utils/system.py | 265 ++++++ ferminet/utils/units.py | 46 + setup.py | 52 ++ 24 files changed, 6123 insertions(+) create mode 100644 CONTRIBUTING.md create mode 100644 LICENSE create mode 100644 README.md create mode 100644 bin/ferminet create mode 100644 ferminet/hamiltonian.py create mode 100644 ferminet/mcmc.py create mode 100644 ferminet/mean_corrected_kfac_opt.py create mode 100644 ferminet/networks.py create mode 100644 ferminet/qmc.py create mode 100644 ferminet/tests/elements_test.py create mode 100644 ferminet/tests/hamiltonian_test.py create mode 100644 ferminet/tests/mcmc_test.py create mode 100644 ferminet/tests/networks_test.py create mode 100644 ferminet/tests/scf_test.py create mode 100644 ferminet/tests/system_test.py create mode 100644 ferminet/tests/train_test.py create mode 100644 ferminet/tests/units_test.py create mode 100644 ferminet/train.py create mode 100644 ferminet/utils/analysis_tools.py create mode 100644 ferminet/utils/elements.py create mode 100644 ferminet/utils/scf.py create mode 100644 ferminet/utils/system.py create mode 100644 ferminet/utils/units.py create mode 100644 setup.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..22b241c --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,29 @@ +# How to Contribute + +We'd love to accept your patches and contributions to this project. There are +just a few small guidelines you need to follow. + +## Contributor License Agreement + +Contributions to this project must be accompanied by a Contributor License +Agreement (CLA). You (or your employer) retain the copyright to your +contribution; this simply gives us permission to use and redistribute your +contributions as part of the project. Head over to + to see your current agreements on file or +to sign a new one. + +You generally only need to submit a CLA once, so if you've already submitted one +(even if it was for a different project), you probably don't need to do it +again. + +## Code reviews + +All submissions, including submissions by project members, require review. We +use GitHub pull requests for this purpose. Consult +[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more +information on using pull requests. + +## Community Guidelines + +This project follows +[Google's Open Source Community Guidelines](https://opensource.google/conduct/). 
diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..6b0b127 --- /dev/null +++ b/LICENSE @@ -0,0 +1,203 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..21cb2f0
--- /dev/null
+++ b/README.md
@@ -0,0 +1,117 @@
+# FermiNet: Fermionic Neural Networks
+
+An implementation of the algorithm and experiments defined in "Ab-Initio
+Solution of the Many-Electron Schroedinger Equation with Deep Neural Networks",
+David Pfau, James S. Spencer, Alex G de G Matthews and W.M.C. Foulkes, Phys.
+Rev. Research 2, 033429 (2020). FermiNet is a neural network for learning the
+ground-state wavefunctions of atoms and molecules using a variational Monte
+Carlo approach.
+
+## Installation
+
+`pip install -e .` will install all required dependencies. This is best done
+inside a [virtual environment](https://docs.python-guide.org/dev/virtualenvs/).
+
+```
+virtualenv --python=python3.7 ~/venv/ferminet
+source ~/venv/ferminet/bin/activate
+pip install -e .
+```
+
+If you have a GPU available (highly recommended for fast training), then use
+`pip install -e '.[tensorflow-gpu]'` to install TensorFlow with GPU support.
+
+We use Python 3.7 (or earlier) because there is a TensorFlow 1.15 wheel
+available for it. TensorFlow 2 is not currently supported.
+
+The tests are most easily run using pytest:
+
+```
+pip install pytest
+python -m pytest
+```
+
+## Usage
+
+```
+ferminet --batch_size 1024 --pretrain_iterations 100
+```
+
+will train FermiNet to find the ground-state wavefunction of the LiH molecule
+with a bond length of 1.63999 angstroms using a batch size of 1024 MCMC
+configurations ("walkers" in variational Monte Carlo language), and 100
+iterations of pretraining (the default of 1000 is overkill for such a small
+system). The system and hyperparameters can be controlled by flags. Run
+
+```
+ferminet --help
+```
+
+to see the available options. Several systems used in the FermiNet paper are
+included by default. Other systems can easily be set up by setting the
+appropriate system flags to `ferminet`, modifying `ferminet.utils.system` or
+writing a custom training script. For example, to run on the H2 molecule:
+
+```
+import sys
+
+from absl import logging
+from ferminet.utils import system
+from ferminet import train
+
+# Optional, for also printing training progress to STDOUT
+logging.get_absl_handler().python_handler.stream = sys.stdout
+logging.set_verbosity(logging.INFO)
+
+# Define H2 molecule
+molecule = [system.Atom('H', (0, 0, -1)), system.Atom('H', (0, 0, 1))]
+
+train.train(
+  molecule=molecule,
+  spins=(1, 1),
+  batch_size=256,
+  pretrain_config=train.PretrainConfig(iterations=100),
+  logging_config=train.LoggingConfig(result_path='H2'),
+)
+```
+
+`train.train` is controlled by several lightweight config objects. Only
+non-default settings need to be explicitly supplied. Please see the docstrings
+for `train.train` and associated `*Config` classes for details.
+
+Note: to train on larger atoms and molecules with large batch sizes, multi-GPU
+parallelisation is essential. This is supported via TensorFlow's
+[MirroredStrategy](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy)
+and the `--multi_gpu` flag.
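+For example, the following invocation (an illustrative sketch; the batch size
+shown is arbitrary) trains the default LiH system across all available GPUs:
+
+```
+ferminet --batch_size 4096 --multi_gpu
+```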
+
+## Output
+
+The results directory contains `pretrain_stats.csv` (the pretraining loss at
+each iteration), `train_stats.csv` (the local energy and MCMC acceptance
+probability at each iteration) and the `checkpoints` directory (the
+checkpoints generated during training). If requested, there is also an HDF5
+file, `data.h5`, containing the walker configurations, per-walker local
+energies and per-walker wavefunction values for each iteration. Warning: this
+quickly becomes very large!
+
+## Giving Credit
+
+If you use this code in your work, please cite the associated paper:
+
+```
+@article{ferminet,
+  title={Ab-Initio Solution of the Many-Electron Schr{\"o}dinger Equation with Deep Neural Networks},
+  author={D. Pfau and J.S. Spencer and A.G. de G. Matthews and W.M.C. Foulkes},
+  journal={Phys. Rev. Research},
+  year={2020},
+  volume={2},
+  issue = {3},
+  pages={033429},
+  doi = {10.1103/PhysRevResearch.2.033429},
+  url = {https://link.aps.org/doi/10.1103/PhysRevResearch.2.033429}
+}
+```
+
+## Disclaimer
+
+This is not an official Google product.
diff --git a/bin/ferminet b/bin/ferminet
new file mode 100644
index 0000000..b8efc7b
--- /dev/null
+++ b/bin/ferminet
@@ -0,0 +1,384 @@
+# Lint as: python3
+# Copyright 2020 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Learn ground state wavefunctions for molecular systems using VMC."""
+
+import ast
+import datetime
+import os
+import sys
+
+from absl import app
+from absl import flags
+from absl import logging
+from ferminet import train
+from ferminet.utils import system
+import numpy as np
+import tensorflow.compat.v1 as tf
+
+FLAGS = flags.FLAGS
+
+################################################################################
+# High-level flags to configure hardware
+################################################################################
+
+flags.DEFINE_boolean(
+    'multi_gpu', False,
+    'Use all available GPUs on the machine for batch parallelism. 
' + 'Default: use single GPU.') + +################################################################################ +# High-level flags about training +################################################################################ + +flags.DEFINE_integer('batch_size', 4096, 'number of walkers') +flags.DEFINE_boolean('double_precision', False, + 'use double rather than single precision') + +################################################################################ +# Flags related to pretraining +################################################################################ + +flags.DEFINE_integer( + 'pretrain_iterations', 1000, + 'Number of iterations for which to pretrain the network ' + 'to match Hartree-Fock orbitals.') +flags.DEFINE_string('pretrain_basis', 'sto-3g', + 'Basis set used to run Hartree-Fock calculation in PySCF.') + +################################################################################ +# Flags related to optimization +################################################################################ + +flags.DEFINE_integer('iterations', 1000000, 'number of iterations') +flags.DEFINE_float('clip_el', 5.0, + 'If not none, scale at which to clip local energy') + +# Flags related to the learning rate +flags.DEFINE_float('learning_rate', 1.e-4, 'learning rate') +flags.DEFINE_float('learning_rate_decay', 1.0, + 'exponent of learning rate decay') +flags.DEFINE_float('learning_rate_delay', 10000.0, + 'set the scale of the rate decay') + +# Flags related to KFAC +flags.DEFINE_boolean('use_kfac', True, + 'If false, use ADAM, else use KFAC as optimizer') +flags.DEFINE_integer('kfac_invert_every', 1, 'See KFAC documentation') +flags.DEFINE_integer('kfac_cov_update_every', 1, 'See KFAC documentation') +flags.DEFINE_float('kfac_damping', 0.001, 'See KFAC documentation') +flags.DEFINE_float('kfac_cov_ema_decay', 0.95, 'See KFAC documentation') +flags.DEFINE_float('kfac_momentum', 0.0, 'See KFAC documentation') +flags.DEFINE_string('kfac_momentum_type', 'regular', 'See KFAC documentation') +flags.DEFINE_boolean('kfac_adapt_damping', False, 'See KFAC documentation') +flags.DEFINE_float('kfac_damping_adaptation_decay', 0.9, + 'See KFAC documentation') +flags.DEFINE_integer('kfac_damping_adaptation_interval', 5, + 'See KFAC documentation') +flags.DEFINE_float('kfac_min_damping', 1.e-4, 'See KFAC documentation') +flags.DEFINE_float('kfac_norm_constraint', 0.001, 'See KFAC documentation') + +################################################################################ +# Flags related to the system to solve for +################################################################################ + +flags.DEFINE_string('system_type', 'molecule', + 'Function to be called to create the system.') +flags.DEFINE_string( + 'system', 'LiH', 'If system_type is "molecule", the name of the molecule. ' + 'If system_type is "atom", the atomic symbol. ' + 'If system_type is "hn", the number of atoms in the ' + 'hydrogen chain.') +flags.DEFINE_integer( + 'system_charge', 0, + 'The overall charge of the system. Positive for cations ' + 'and negative for anions.') +flags.DEFINE_integer('system_dim', 3, + 'Number of dimensions of the system. Change with care.') +flags.DEFINE_string( + 'system_units', 'bohr', + 'Units of *input* coords of atoms. Either "bohr" or ' + '"angstrom". 
Internally work in a.u.; positions in '
+    'Angstroms are converted to Bohr.')
+
+# Flags related to diatomics, the hydrogen chain, and the hydrogen circle
+flags.DEFINE_float(
+    'system_separation', 0.0,
+    'For the hydrogen chain and diatomic systems, the '
+    'separation between nuclei. For the H4 circle, the radius '
+    'of the circle. For diatomics, will default to the '
+    'equilibrium bond length if set to 0.')
+
+# Flags related to the hydrogen circle
+flags.DEFINE_float('system_angle', np.pi / 4.0,
+                   'Angle from the x-axis for the H4 circle.')
+
+################################################################################
+# Flags related to the MCMC chain
+################################################################################
+
+flags.DEFINE_integer(
+    'mcmc_burn_in', 100, 'Number of burn-in steps after pretraining. '
+    'If zero, do not burn in or reinitialize walkers.')
+flags.DEFINE_integer('mcmc_steps', 10,
+                     'Number of MCMC steps to make between network updates.')
+flags.DEFINE_float(
+    'mcmc_init_width', 0.8,
+    'Width of (atom-centred) Gaussian used to generate initial '
+    'electron configurations.')
+flags.DEFINE_float('mcmc_move_width', 0.02,
+                   'Width of Gaussian used for random moves.')
+flags.DEFINE_list(
+    'mcmc_init_means', '',
+    'Iterable of 3*nelectrons giving the mean initial position '
+    'of each electron. Configurations are drawn using Gaussians '
+    'of width init_width at each 3D position. Alpha electrons '
+    'are listed before beta electrons. If empty, electrons are '
+    'assigned to atoms based upon the isolated atom spin '
+    'configuration.')
+
+################################################################################
+# Flags related to the network architecture
+################################################################################
+
+flags.DEFINE_enum(
+    'network_architecture', 'ferminet', ['ferminet', 'slater'],
+    'The choice of architecture to run the calculation with. '
+    'Either "ferminet" or "slater" for the Fermi Net and '
+    'standard Slater determinant respectively.')
+flags.DEFINE_string(
+    'hidden_units', '((256, 32), (256, 32), (256, 32), (256, 32))',
+    'Number of hidden units in each layer of the network. If '
+    'the Fermi Net with one- and two-electron streams is used, '
+    'a tuple is provided for each layer, with the first '
+    'element giving the number of hidden units in the '
+    'one-electron stream and the second element giving the '
+    'number of units in the two-electron stream.')
+flags.DEFINE_integer('determinants', 16,
+                     'Number of determinants in the Fermi Net.')
+
+flags.DEFINE_boolean(
+    'r12_en_features', True,
+    'Include r12/distance features between electrons and nuclei.')
+flags.DEFINE_boolean(
+    'r12_ee_features', True,
+    'Include r12/distance features between pairs of electrons.')
+flags.DEFINE_boolean('pos_ee_features', True,
+                     'Include electron-electron position features.')
+flags.DEFINE_boolean(
+    'use_envelope', True,
+    'Include multiplicative exponentially-decaying envelope. '
+    'Calculations will not converge if set to False.')
+flags.DEFINE_boolean(
+    'backflow', False, 'Include backflow transformation in input coordinates. '
+    'Only for use if network_architecture == "slater". '
+    'Implies --build_backflow.')
+flags.DEFINE_boolean(
+    'build_backflow', False, 'Create backflow weights but do '
+    'not include backflow coordinate transformation. 
Use to ' + 'train a Slater-Jastrow architecture and then train a ' + 'Slater-Jastrow-Backflow architecture based on it.') +flags.DEFINE_boolean('residual', True, 'Use residual connections. Recommended.') +flags.DEFINE_list( + 'after_det', '1', + 'Comma-separated configuration of neural network after the ' + 'determinants. By default, just takes a weighted sum of ' + 'determinants with no nonlinearity.') + +# Flags related to the Jastrow factor +flags.DEFINE_boolean('jastrow_en', False, + 'Include electron-nuclear Jastrow factor.') +flags.DEFINE_boolean('jastrow_ee', False, + 'Include electron-electron Jastrow factor.') +flags.DEFINE_boolean('jastrow_een', False, + 'Include electron-electron-nuclear Jastrow factor.') + +################################################################################ +# Flags related to logging, checkpointing, and restoring +################################################################################ + +flags.DEFINE_integer('stats_frequency', 1, + 'Iterations between logging of stats.') +flags.DEFINE_float('save_frequency', 10.0, + 'Minutes between saving network params.') +flags.DEFINE_string( + 'result_folder', '.', 'Path to save results and checkpoints to. A new ' + 'subdirectory will be created for every experiment. ' + 'By default, save locally.') +flags.DEFINE_string('restore_path', '', + 'Path containing checkpoint to restore network from.') +flags.DEFINE_boolean( + 'log_walkers', False, + 'Whether or not to log the values of all walkers every ' + 'iteration. Use with caution!!! Produces a lot of data ' + 'very quickly.') +flags.DEFINE_boolean( + 'log_local_energies', False, + 'Whether or not to log all local energies for each walker ' + 'at each step.') +flags.DEFINE_boolean( + 'log_wavefunction', False, + 'Whether or not to log all values of wavefunction for ' + 'each walker at each step.') + +################################################################################ +# Flags related to debugging +################################################################################ + +flags.DEFINE_boolean( + 'check_loss', False, 'Apply gradient update only if the loss is not NaN. ' + 'If true, training could be slightly slower but the ' + 'checkpoint written out when a NaN is detected will be ' + 'with the network weights which led to ' + 'the NaN.') +flags.DEFINE_boolean( + 'determinism', False, 'CPU only mode that also enforces determinism.' + 'Will run significantly slower if used.') +flags.DEFINE_integer( + 'random_seed', 1, 'Only works in determinism mode. ' + 'Set a random seed for the run.') +flags.DEFINE_string('graph_path', '', 'File to write graph to.') + + +def main(argv): + del argv + tf.enable_resource_variables() + + if FLAGS.determinism: + tf.set_random_seed(FLAGS.random_seed) + logging.info('Activating determinism mode. Expect slow performance.') + + # Create folders for logging + result_path = os.path.join( + FLAGS.result_folder, 'ferminet_results_' + + datetime.datetime.ctime(datetime.datetime.now()).replace(' ', '_')) + if not os.path.isdir(FLAGS.result_folder): + os.mkdir(FLAGS.result_folder) + os.mkdir(result_path) + + # Save the command line arguments for reproducibility + with open(os.path.join(result_path, 'flags.txt'), 'w') as f: + f.write(' '.join(sys.argv[1:]) + '\n') + + # Run function to create the system from flags. 
+ logging.info('System Type: %s', FLAGS.system_type) + logging.info('System: %s', FLAGS.system) + if FLAGS.system_type == 'molecule': + molecule, spins = system.molecule( + FLAGS.system, + bond_length=FLAGS.system_separation, + units=FLAGS.system_units) + elif FLAGS.system_type == 'atom': + molecule, spins = system.atom(FLAGS.system, charge=FLAGS.system_charge) + elif FLAGS.system_type == 'hn': + molecule, spins = system.hn( + int(FLAGS.system), + FLAGS.system_separation, + charge=FLAGS.system_charge, + units=FLAGS.system_units) + elif FLAGS.system_type == 'h4_circle': + molecule, spins = system.h4_circle( + FLAGS.system_separation, FLAGS.system_angle, units=FLAGS.system_units) + else: + raise ValueError('Not a recognized system type: %s' % FLAGS.system_type) + + network_config = train.NetworkConfig( + architecture=FLAGS.network_architecture, + hidden_units=ast.literal_eval(FLAGS.hidden_units), + determinants=FLAGS.determinants, + r12_en_features=FLAGS.r12_en_features, + r12_ee_features=FLAGS.r12_ee_features, + pos_ee_features=FLAGS.pos_ee_features, + use_envelope=FLAGS.use_envelope, + backflow=FLAGS.backflow, + build_backflow=FLAGS.build_backflow, + residual=FLAGS.residual, + after_det=tuple(int(x) for x in FLAGS.after_det), + jastrow_en=FLAGS.jastrow_en, + jastrow_ee=FLAGS.jastrow_ee, + jastrow_een=FLAGS.jastrow_een, + ) + + pretrain_config = train.PretrainConfig( + iterations=FLAGS.pretrain_iterations, + basis=FLAGS.pretrain_basis, + ) + + optim_config = train.OptimConfig( + iterations=FLAGS.iterations, + learning_rate=FLAGS.learning_rate, + learning_rate_decay=FLAGS.learning_rate_decay, + learning_rate_delay=FLAGS.learning_rate_delay, + clip_el=FLAGS.clip_el, + use_kfac=FLAGS.use_kfac, + check_loss=FLAGS.check_loss, + deterministic=FLAGS.determinism, + ) + + kfac_config = train.KfacConfig( + invert_every=FLAGS.kfac_invert_every, + cov_update_every=FLAGS.kfac_cov_update_every, + damping=FLAGS.kfac_damping, + cov_ema_decay=FLAGS.kfac_cov_ema_decay, + momentum=FLAGS.kfac_momentum, + momentum_type=FLAGS.kfac_momentum_type, + adapt_damping=FLAGS.kfac_adapt_damping, + damping_adaptation_decay=FLAGS.kfac_damping_adaptation_decay, + damping_adaptation_interval=FLAGS.kfac_damping_adaptation_interval, + min_damping=FLAGS.kfac_min_damping, + ) + + mcmc_config = train.MCMCConfig( + burn_in=FLAGS.mcmc_burn_in, + steps=FLAGS.mcmc_steps, + init_width=FLAGS.mcmc_init_width, + move_width=FLAGS.mcmc_move_width, + init_means=tuple(float(x) for x in FLAGS.mcmc_init_means), + ) + + logging_config = train.LoggingConfig( + result_path=result_path, + save_frequency=FLAGS.save_frequency, + restore_path=FLAGS.restore_path, + stats_frequency=FLAGS.stats_frequency, + walkers=FLAGS.log_walkers, + wavefunction=FLAGS.log_wavefunction, + local_energy=FLAGS.log_local_energies, + config={ + 'system_type': FLAGS.system_type, + 'system': FLAGS.system, + 'system_units': FLAGS.system_units, + 'system_separation': FLAGS.system_separation, + 'system_charge': FLAGS.system_charge, + }, + ) + + train.train( + molecule=molecule, + spins=spins, + batch_size=FLAGS.batch_size, + network_config=network_config, + pretrain_config=pretrain_config, + optim_config=optim_config, + kfac_config=kfac_config, + mcmc_config=mcmc_config, + logging_config=logging_config, + multi_gpu=FLAGS.multi_gpu, + double_precision=FLAGS.double_precision, + graph_path=FLAGS.graph_path) + logging.info('Fermi Net training run completed successfully.') + + +if __name__ == '__main__': + app.run(main) diff --git a/ferminet/hamiltonian.py b/ferminet/hamiltonian.py 
new file mode 100644
index 0000000..c2141a3
--- /dev/null
+++ b/ferminet/hamiltonian.py
@@ -0,0 +1,211 @@
+# Lint as: python3
+# Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utilities for constructing Hamiltonians and setting up VMC calculations."""
+
+import numpy as np
+import tensorflow.compat.v1 as tf
+
+
+def kinetic_from_log(f, x):
+  r"""Compute -1/2 \nabla^2 \psi / \psi from log|psi|."""
+  with tf.name_scope('kinetic_from_log'):
+    df = tf.gradients(f, x)[0]
+    lapl_elem = tf.map_fn(
+        lambda i: tf.gradients(tf.expand_dims(df[..., i], -1), x)[0][..., i],
+        tf.range(x.shape[1]),
+        dtype=x.dtype.base_dtype,
+        back_prop=False,
+        parallel_iterations=20)
+    # For f = log|psi|, \nabla^2 \psi / \psi = \nabla^2 f + |\nabla f|^2.
+    lapl = (tf.reduce_sum(lapl_elem, axis=0) +
+            tf.reduce_sum(df**2, axis=-1))
+    return -0.5*tf.expand_dims(lapl, -1)
+
+
+def operators(atoms,
+              nelectrons,
+              potential_epsilon=0.0):
+  """Creates kinetic and potential operators of Hamiltonian in atomic units.
+
+  Args:
+    atoms: list of Atom objects for each atom in the system.
+    nelectrons: number of electrons
+    potential_epsilon: Epsilon used to smooth the divergence of the 1/r
+      potential near the origin for algorithms with numerical stability issues.
+
+  Returns:
+    Functions that generate the kinetic and potential energy as TF ops.
+  """
+  vnn = 0.0
+  for i, atom_i in enumerate(atoms):
+    for atom_j in atoms[i+1:]:
+      qij = float(atom_i.charge * atom_j.charge)
+      vnn += qij / np.linalg.norm(atom_i.coords_array - atom_j.coords_array)
+
+  def smooth_norm(x):
+    with tf.name_scope('smooth_norm'):
+      if potential_epsilon == 0.0:
+        return tf.norm(x, axis=1, keepdims=True)
+      else:
+        return tf.sqrt(tf.reduce_sum(x**2 + potential_epsilon**2,
+                                     axis=1,
+                                     keepdims=True))
+
+  def nuclear_potential(xs):
+    """Calculates nuclear potential for set of electron positions."""
+    with tf.name_scope('Vne'):
+      v = []
+      for atom in atoms:
+        charge = tf.constant(atom.charge, dtype=xs[0].dtype)
+        coords = tf.constant(atom.coords, dtype=xs[0].dtype)
+        v.extend([-charge / smooth_norm(coords - x) for x in xs])
+      return tf.add_n(v)
+
+  def electronic_potential(xs):
+    """Calculates electronic potential for set of electron positions."""
+    with tf.name_scope('Vee'):
+      if len(xs) > 1:
+        v = []
+        for (i, ri) in enumerate(xs):
+          v.extend([
+              1.0 / smooth_norm(ri - rj) for rj in xs[i + 1:]
+          ])
+        return tf.add_n(v)
+      else:
+        return 0.0
+
+  def nuclear_nuclear(dtype):
+    """Calculates the nuclear-nuclear interaction contribution."""
+    with tf.name_scope('Vnn'):
+      return tf.constant(vnn, dtype=dtype)
+
+  def potential(x):
+    """Calculates the total potential energy at each electron position."""
+    xs = tf.split(x, nelectrons, axis=1)
+    return (nuclear_potential(xs) + electronic_potential(xs) +
+            nuclear_nuclear(xs[0].dtype))
+
+  return kinetic_from_log, potential
+
+
+def exact_hamiltonian(atoms,
+                      nelectrons,
+                      potential_epsilon=0.0):
+  """Construct function that evaluates exact Hamiltonian of a system.
+
+  Args:
+    atoms: list of Atom objects for each atom in the system.
+    nelectrons: number of electrons
+    potential_epsilon: Epsilon used to smooth the divergence of the 1/r
+      potential near the origin for algorithms with numerical stability issues.
+
+  Returns:
+    A function that generates the wavefunction and Hamiltonian ops.
+  """
+  k_fn, v_fn = operators(atoms, nelectrons, potential_epsilon)
+  def _hamiltonian(f, x):
+    # kinetic_from_log computes -1/2 \nabla^2 \psi / \psi, so multiply the sum
+    # of the kinetic and potential terms by psi to obtain H psi.
+    logpsi, signpsi = f(x)
+    psi = tf.exp(logpsi) * signpsi
+    hpsi = psi * (k_fn(logpsi, x) + v_fn(x))
+    return psi, hpsi
+  return _hamiltonian
+
+
+def r12_features(x, atoms, nelectrons, keep_pos=True, flatten=False,
+                 atomic_coords=False):
+  """Adds physically-motivated features depending upon electron distances.
+
+  The tensor of electron positions is extended to include the distance of each
+  electron to each nucleus and the distance between each pair of electrons.
+
+  Args:
+    x: electron positions. Tensor either of shape (batch_size,
+      nelectrons*ndim), or (batch_size, nelectrons, ndim), where ndim is the
+      dimensionality of the system (usually 3).
+    atoms: list of Atom objects for each atom in the system.
+    nelectrons: number of electrons.
+    keep_pos: If true, includes the original electron positions in the output.
+    flatten: If true, return the distances as a flat vector for each element of
+      the batch. If false, return the atom-electron distances and
+      electron-electron distances each as 3D arrays.
+    atomic_coords: If true, replace the original position of the electrons with
+      the position of the electrons relative to all atoms.
+
+  Returns:
+    If flatten is true, keep_pos is true and atomic_coords is false:
+      tensor of shape (batch_size, ndim*Ne + Ne*Na + Ne(Ne-1)/2), where Ne (Na)
+      is the number of electrons (atoms). The first ndim*Ne terms are the
+      original x, the next Ne terms are |x_i - R_1|, where R_1 is the position
+      of the first nucleus (and so on for each atom), and the remaining terms
+      are |x_i - x_j| for each (i,j) pair, where i and j run over all
+      electrons, with i varied slowest.
+    If flatten is true and keep_pos is false: the same, but without the first
+      ndim*Ne features.
+    If flatten is false and keep_pos is false: tensors of shape
+      (batch_size, Ne, Na) and (batch_size, Ne, Ne).
+    If flatten is false and keep_pos is true: same as above, and also a tensor
+      of size (batch_size, Ne, ndim).
+    If atomic_coords is true: the same as if keep_pos is true, except the
+      ndim*Ne coordinates corresponding to the original positions are replaced
+      by ndim*Ne*Na coordinates corresponding to the differences from each
+      electron position to each atomic position.
+ """ + with tf.name_scope('r12_features'): + if len(x.shape) == 2: + xs = tf.reshape(x, [x.shape[0], nelectrons, -1]) + else: + xs = x + coords = tf.stack([ + tf.constant(atom.coords, dtype=x.dtype.base_dtype) for atom in atoms + ]) + coords = tf.expand_dims(tf.expand_dims(coords, 0), 0) + xsa = tf.expand_dims(xs, 2) - coords # xs in atomic coordinates + + r_ae = tf.norm(xsa, axis=-1) + r_ee = np.zeros((nelectrons, nelectrons), dtype=object) + for i in range(nelectrons): + for j in range(i+1, nelectrons): + r_ee[i, j] = tf.norm(xs[:, i, :] - xs[:, j, :], axis=1, keepdims=True) + + if flatten: + r_ae = tf.reshape(r_ae, [r_ae.shape[0], -1]) + if nelectrons > 1: + r_ee = tf.concat( + r_ee[np.triu_indices(nelectrons, k=1)].tolist(), axis=1) + else: + r_ee = tf.zeros([r_ae.shape[0], 0]) + if keep_pos: + if atomic_coords: + xsa = tf.reshape(xsa, [xsa.shape[0], -1]) + return tf.concat([r_ae, r_ee, xsa], axis=1) + else: + return tf.concat([r_ae, r_ee, x], axis=1) + else: + return tf.concat([r_ae, r_ee], axis=1) + else: + zeros_like = tf.zeros((xs.shape[0], 1), dtype=x.dtype.base_dtype) + for i in range(nelectrons): + r_ee[i, i] = zeros_like + for j in range(i): + r_ee[i, j] = r_ee[j, i] + r_ee = tf.transpose(tf.stack(r_ee.tolist()), [2, 0, 1, 3]) + if keep_pos: + if atomic_coords: + return r_ae, r_ee, xsa + else: + return r_ae, r_ee, xs + else: + return r_ae, r_ee diff --git a/ferminet/mcmc.py b/ferminet/mcmc.py new file mode 100644 index 0000000..723e0c0 --- /dev/null +++ b/ferminet/mcmc.py @@ -0,0 +1,138 @@ +# Lint as: python3 +# Copyright 2020 DeepMind Technologies Limited. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Simple Markov Chain Monte Carlo data generator.""" + +import functools +import tensorflow.compat.v1 as tf + + +class MCMC: + """Simple Markov Chain Monte Carlo implementation for sampling from |f|^2.""" + + def __init__(self, + network, + batch_size, + init_mu, + init_sigma, + move_sigma, + dtype=tf.float32): + """Constructs MCMC object. + + Args: + network: network to sample from according to the square of the output. + batch_size: batch size - number of configurations (walkers) to generate. + init_mu: mean of a truncated normal distribution to draw from to generate + initial configurations for each coordinate (i.e. of length 3 x number of + electrons). + distribution. + init_sigma: standard deviation of a truncated normal distribution to draw + from to generate initial configurations. + move_sigma: standard deviation of random normal distribution from which to + draw random moves. + dtype: the data type of the configurations. 
+ """ + + strategy = tf.distribute.get_strategy() + nreplicas = strategy.num_replicas_in_sync + self._dtype = dtype.base_dtype + self.network = network + self._init_mu = init_mu + self._init_sigma = init_sigma + self._move_sigma = move_sigma + self.walkers = tf.get_variable( + 'walkers', + initializer=self._rand_init((nreplicas, batch_size)), + trainable=False, + use_resource=True, + dtype=self._dtype) + self.psi = self.update_psi() + self.move_acc = tf.constant(0.0) + + def _rand_init(self, batch_dims): + """Generate a random set of samples from a uniform distribution.""" + return tf.concat( + [ + tf.random.truncated_normal( # pylint: disable=g-complex-comprehension + shape=(*batch_dims, 1), + mean=mu, + stddev=self._init_sigma, + dtype=self._dtype) for mu in self._init_mu + ], + axis=-1) + + def reinitialize_walkers(self): + walker_reset = self.walkers.assign( + self._rand_init(self.walkers.shape.as_list()[:-1])) + with tf.control_dependencies([walker_reset]): + psi_reset = self.update_psi() + return walker_reset, psi_reset + + def update_psi(self): + strategy = tf.distribute.get_strategy() + psi_per_gpu = strategy.experimental_run( + lambda: self.network(self.walkers_per_gpu)[0]) + self.psi = tf.stack( + strategy.experimental_local_results(psi_per_gpu), axis=0) + return self.psi + + @property + def walkers_per_gpu(self): + replica_id = tf.distribute.get_replica_context().replica_id_in_sync_group + return self.walkers[replica_id] + + @property + def psi_per_gpu(self): + replica_id = tf.distribute.get_replica_context().replica_id_in_sync_group + return self.psi[replica_id] + + def step(self): + """Returns ops for one Metropolis-Hastings step across all replicas.""" + strategy = tf.distribute.get_strategy() + walkers, psi, move_acc = strategy.experimental_run( + functools.partial(self._mh_step, step_size=self._move_sigma)) + update_walkers = self.walkers.assign( + tf.stack(strategy.experimental_local_results(walkers), axis=0)) + self.psi = tf.stack(strategy.experimental_local_results(psi), axis=0) + self.move_acc = tf.reduce_mean( + strategy.experimental_local_results(move_acc)) + return update_walkers, self.psi, self.move_acc + + def _mh_step(self, step_size): + """Returns ops for one Metropolis-Hastings step in a replica context.""" + walkers, psi = self.walkers_per_gpu, self.psi_per_gpu + new_walkers = walkers + tf.random.normal( + shape=walkers.shape, stddev=step_size, dtype=self._dtype) + new_psi = self.network(new_walkers)[0] + pmove = tf.squeeze(2 * (new_psi - psi)) + pacc = tf.log( + tf.random_uniform(shape=walkers.shape.as_list()[:1], dtype=self._dtype)) + decision = tf.less(pacc, pmove) + with tf.control_dependencies([decision]): + new_walkers = tf.where(decision, new_walkers, walkers) + new_psi = tf.where(decision, new_psi, psi) + move_acc = tf.reduce_mean(tf.cast(decision, tf.float32)) + return new_walkers, new_psi, move_acc + + def stats_ops(self, log_walkers=False): + """Returns op for evaluating the move acceptance probability. + + Args: + log_walkers: also include the complete walker configurations (slow, lots + of data). + """ + stats = {'pmove': self.move_acc} + if log_walkers: + stats['walkers'] = self.walkers + return stats diff --git a/ferminet/mean_corrected_kfac_opt.py b/ferminet/mean_corrected_kfac_opt.py new file mode 100644 index 0000000..dfb9401 --- /dev/null +++ b/ferminet/mean_corrected_kfac_opt.py @@ -0,0 +1,52 @@ +# Lint as: python3 +# Copyright 2019 DeepMind Technologies Limited. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Extension to KFAC that adds correction term for non-normalized densities."""
+
+import kfac
+
+
+ip_p = kfac.utils.ip_p
+sum_p = kfac.utils.sum_p
+sprod_p = kfac.utils.sprod_p
+
+
+class MeanCorrectedKfacOpt(kfac.PeriodicInvCovUpdateKfacOpt):
+  """Like KFAC, but with all moments mean-centered."""
+
+  def __init__(self, *args, **kwargs):
+    """batch_size is mandatory, estimation_mode='exact' or 'exact_GGN'."""
+
+    kwargs["compute_params_stats"] = True
+
+    if not kwargs["estimation_mode"].startswith("exact"):  # pytype: disable=attribute-error
+      raise ValueError("estimation_mode must be 'exact' or 'exact_GGN'.")
+
+    super(MeanCorrectedKfacOpt, self).__init__(*args, **kwargs)
+
+  def _multiply_preconditioner(self, vecs_and_vars):
+
+    # Create a w list with the same order as vecs_and_vars
+    w_map = {var: ps for (ps, var) in zip(self.params_stats,
+                                          self.registered_variables)}
+    w = tuple((w_map[var], var) for (_, var) in vecs_and_vars)
+
+    r = super(MeanCorrectedKfacOpt, self)._multiply_preconditioner(
+        vecs_and_vars)
+    u = super(MeanCorrectedKfacOpt, self)._multiply_preconditioner(w)
+
+    scalar = ip_p(w, r) / (1. + ip_p(w, w))
+
+    return sum_p(r, sprod_p(scalar, u))
diff --git a/ferminet/networks.py b/ferminet/networks.py
new file mode 100644
index 0000000..09d3cda
--- /dev/null
+++ b/ferminet/networks.py
@@ -0,0 +1,1300 @@
+# Lint as: python3
+# Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Network building for quantum Monte Carlo calculations."""
+
+# Using networks with MCMC and optimisation
+#
+# We create different instances of the same model, all using the same
+# parameters via Sonnet's variable sharing: one to run pre-training, one to
+# sample electron configurations using MCMC and one for use with the
+# optimiser. This requires care when using K-FAC: normally one would only
+# have multiple registrations of network instances if using multi-tower mode,
+# which we aren't. K-FAC detects variable re-use from the variable scope's
+# reuse flag, which Sonnet sets to true when we create additional instances
+# of the model, so layer registration results in errors because we're not
+# using multi-tower. Thus we override the defaults by explicitly setting
+# reuse=False in all layer registration calls.
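+#
+# As an illustrative sketch only (layer sizes and tensor names here are
+# hypothetical, not part of this module), variable sharing means one set of
+# weights backs several graph instances:
+#
+#   net = TensorMlp([64, 64])   # weights created on first call
+#   psi_mcmc = net(x_mcmc)      # instance used for MCMC sampling
+#   psi_optim = net(x_optim)    # second instance, same weights
+#
+# Each such call would register its layers with K-FAC again (when a
+# layer_collection is passed), which is why the registration calls below pass
+# reuse=False explicitly.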
+
+# pylint: disable=g-long-lambda
+
+import functools
+import typing
+
+from ferminet import hamiltonian
+import sonnet as snt
+import tensorflow.compat.v1 as tf
+
+
+def extend(x, y):
+  """Add extra dimensions so x can be broadcast with y."""
+  if isinstance(x, float):
+    return x
+  rank = y.shape.rank
+  rank_ = x.shape.rank
+  assert rank_ <= rank
+  for _ in range(rank - rank_):
+    x = tf.expand_dims(x, -1)
+  return x
+
+
+def flatten(seq):
+  """Recursively flattens a sequence."""
+  lst = []
+  for el in seq:
+    if isinstance(el, (list, tuple)):
+      lst.extend(flatten(el))
+    else:
+      lst.append(el)
+  return lst
+
+
+@tf.custom_gradient
+def sign_det(x):
+  """Sign of determinant op whose gradient doesn't crash when the sign is 0."""
+  with tf.name_scope('sign_det'):
+    sign = tf.linalg.slogdet(x)[0]
+
+    def grad(_):
+      return tf.zeros_like(x)
+
+    return sign, grad
+
+
+@tf.custom_gradient
+def determinant(x):
+  """Determinant op with well-defined gradient for singular matrices."""
+  with tf.name_scope('determinant'):
+    with tf.device('/cpu:0'):
+      s, u, v = tf.linalg.svd(x)
+
+    sign = sign_det(x)
+
+    def grad(dx):
+      ss = tf.tile(s[..., None], len(s.shape) * [1] + [s.shape[-1]])
+      ss = tf.linalg.set_diag(ss, tf.ones_like(s))
+      z = tf.reduce_prod(ss, axis=-2)
+      adj = tf.matmul(u, tf.matmul(tf.linalg.diag(z), v, transpose_b=True))
+      signdx = tf.expand_dims(tf.expand_dims(sign * dx, -1), -1)
+      return signdx * adj
+
+    return sign * tf.reduce_prod(s, axis=-1), grad
+
+
+@tf.custom_gradient
+def log_determinant(x):
+  """Log-determinant op with well-defined gradient for singular matrices."""
+  with tf.name_scope('log_determinant'):
+    with tf.device('/cpu:0'):
+      s, u, v = tf.linalg.svd(x)
+
+    sign = sign_det(x)
+
+    def grad(dsign, dlog):
+      del dsign
+      # d ln|det(X)|/dX = transpose(X^{-1}). This uses pseudo-inverse via SVD
+      # and so is not numerically stable...
+      adj = tf.matmul(u,
+                      tf.matmul(tf.linalg.diag(1.0 / s), v, transpose_b=True))
+      return tf.expand_dims(tf.expand_dims(dlog, -1), -1) * adj
+
+    return (sign, tf.reduce_sum(tf.log(s), axis=-1)), grad
+
+
+def cofactor(u, s, v, shift=0.0):
+  r"""Calculates the cofactor matrix from singular value decomposition.
+
+  Given M = U S V*, where * indicates the conjugate transpose, the cofactor
+  of M, M^C = det(M) M^{-T}, is given by M^C = det(U) det(V) U \Gamma V^T,
+  where \Gamma_{ij} = \delta_{ij} \prod_{k!=i} s_k, s_k is the k-th singular
+  value of M. The latter equation for M^C is well behaved even for singular
+  matrices.
+
+  Note:
+
+  det(U), det(V) \in {-1, +1} are calculated as part of the determinant
+  evaluation and so not repeated here.
+
+  Args:
+    u: unitary matrix, U, from SVD.
+    s: singular values, S, from SVD.
+    v: unitary matrix, V, from SVD.
+    shift (optional): constant to subtract in log domain for stability.
+
+  Returns:
+    cofactor matrix up to the sign given by det(U) det(V), i.e.
+    M^C / (det(U) det(V)) = U \Gamma V^T, divided by exp(shift).
+  """
+  return tf.matmul(
+      u, tf.matmul(tf.linalg.diag(gamma(s, shift)), v, transpose_b=True))
+
+
+def gamma(s, shift=0.0):
+  r"""Calculates diagonal of \Gamma_{ij} = \delta_{ij} \prod_{k!=i} s_k.
+
+  Args:
+    s: singular values of matrix.
+    shift (optional): constant to subtract in log domain for stability.
+
+  Returns:
+    Diagonal of \Gamma over the outer dimension of s, divided by exp(shift).
+ """ + ls = tf.log(s) + lower = tf.cumsum(ls, axis=-1, exclusive=True) + upper = tf.cumsum(ls, axis=-1, exclusive=True, reverse=True) + return tf.exp(lower + upper - shift) + + +def rho(s, shift=0.0): + r"""Calculates \rho_ij = \prod_{k!=i,j} s_k, \rho_ii = 0. + + Args: + s: singular values of matrix. + shift (optional): constant to subtract in log domain for stability. + + Returns: + \rho, the first derivative of \Gamma (see `gamma`), divided by exp(shift). + """ + # Product of singular elements before and after two indices, i, j, + # i.e. v_{ij} = \Prod_{kj} s_i s_j -- outer product of forward and + # reverse cumulative products. Valid only for j>i. + # Computed in log domain for numerical stability. + ls = tf.log(s) + lower_cumsum = tf.cumsum(ls, axis=-1, exclusive=True) + upper_cumsum = tf.cumsum(ls, axis=-1, reverse=True, exclusive=True) + v = tf.expand_dims(lower_cumsum, -1) + tf.expand_dims(upper_cumsum, -2) + + # Product of singular elements between two indices, \Prod_{i 1: + s1, u1, v1 = tf.linalg.svd(x1) + if x2.shape[-1] > 1: + s2, u2, v2 = tf.linalg.svd(x2) + + # Computing the sign of the determinant u and v instead of x means this + # works even with singular matrices. + if x1.shape[-1] > 1: + sign1 = sign_det(u1) * sign_det(v1) + logdet1 = tf.reduce_sum(tf.log(s1), axis=-1) + else: + sign1 = tf.sign(x1[..., 0, 0]) + logdet1 = tf.log(tf.abs(x1[..., 0, 0])) + + if x2.shape[-1] > 1: + sign2 = sign_det(u2) * sign_det(v2) + logdet2 = tf.reduce_sum(tf.log(s2), axis=-1) + else: + sign2 = tf.sign(x2[..., 0, 0]) + logdet2 = tf.log(tf.abs(x2[..., 0, 0])) + + sign = sign1 * sign2 + logdet = logdet1 + logdet2 + + logdet_max1 = tf.reduce_max(logdet1, axis=-1, keepdims=True) + logdet_max2 = tf.reduce_max(logdet2, axis=-1, keepdims=True) + det = tf.exp(logdet - logdet_max1 - logdet_max2) * sign + output = tf.matmul(det, w) + sign_out = tf.sign(output) + log_out = tf.log(tf.abs(output)) + logdet_max1 + logdet_max2 + + @tf.custom_gradient + def logdet_matmul_grad(x1, x2, w, grad_log): + """Numerically stable gradient of log(abs(sum_i w_i |x1_i| * |x2_i|)). + + Args: + x1: Tensor of shape [batch size, dets in, n, n]. + x2: Tensor of shape [batch size, dets in, m, m]. + w: Weight matrix of shape [dets in, dets out]. + grad_log: Tensor of shape [batch_size, dets_out] by which to + left-multiply the Jacobian (aka the reverse sensitivity). + + Returns: + Ops that compute the gradient of log(abs(matmul(det(x1) * det(x2), w))). + The ops give the gradients with respect to the inputs x1, x2, w. + """ + # This is missing a factor of exp(-logdet_max1-logdet_max2), which is + # instead picked up by including it in the terms adj1*det2 and adj2*det1 + glog_out = grad_log / output + dout = tf.matmul(glog_out, w, transpose_b=True) + + if x1.shape[-1] > 1: + adj1 = cofactor(u1, s1, v1, extend(logdet_max1, s1)) * sign1[..., None, + None] + else: + adj1 = tf.ones_like(x1) * extend(tf.exp(-logdet_max1), x1) + + if x2.shape[-1] > 1: + adj2 = cofactor(u2, s2, v2, extend(logdet_max2, s2)) * sign2[..., None, + None] + else: + adj2 = tf.ones_like(x2) * extend(tf.exp(-logdet_max2), x2) + + det1 = tf.exp(logdet1 - logdet_max1) * sign1 + det2 = tf.exp(logdet2 - logdet_max2) * sign2 + + dx1 = adj1 * (det2 * dout)[..., None, None] + dx2 = adj2 * (det1 * dout)[..., None, None] + dw = tf.matmul(det, glog_out, transpose_a=True) + + def grad(grad_dx1, grad_dx2, grad_dw): + """Stable gradient of gradient of log(abs(sum_i w_i |x1_i| * |x2_i|)). + + Args: + grad_dx1: Tensor of shape [batch size, dets in, n, n]. 
+ grad_dx2: Tensor of shape [batch size, dets in, m, m]. + grad_dw: Tensor of shape [dets in, dets out]. + + Returns: + Ops that compute the gradient of the gradient of + log(abs(matmul(det(x1) * det(x2), w))). The ops give the gradients + with respect to the outputs of the gradient op dx1, dx2, dw and the + reverse sensitivity grad_log. + """ + # Terms that appear repeatedly in different gradients. + det_grad_dw = tf.matmul(det, grad_dw) + # Missing a factor of exp(-2logdet_max1-2logdet_max2) which is included\ + # via terms involving r1, r2, det1, det2, adj1, adj2 + glog_out2 = grad_log / (output**2) + + adj_dx1 = tf.reduce_sum(adj1 * grad_dx1, axis=[-1, -2]) + adj_dx2 = tf.reduce_sum(adj2 * grad_dx2, axis=[-1, -2]) + + adj_dx1_det2 = det2 * adj_dx1 + adj_dx2_det1 = det1 * adj_dx2 + adj_dx = adj_dx2_det1 + adj_dx1_det2 + + if x1.shape[-1] > 1: + r1 = rho(s1, logdet_max1) + dadj1 = (grad_cofactor(u1, v1, r1, grad_dx1) * sign1[..., None, None]) + else: + dadj1 = tf.zeros_like(x1) + + if x2.shape[-1] > 1: + r2 = rho(s2, logdet_max2) + dadj2 = (grad_cofactor(u2, v2, r2, grad_dx2) * sign2[..., None, None]) + else: + dadj2 = tf.zeros_like(x2) + + # Computes gradients wrt x1 and x2. + ddout_w = ( + tf.matmul(glog_out, grad_dw, transpose_b=True) - + tf.matmul(glog_out2 * det_grad_dw, w, transpose_b=True)) + ddout_x = tf.matmul( + glog_out2 * tf.matmul(adj_dx, w), w, transpose_b=True) + + grad_x1 = ( + adj1 * (det2 * + (ddout_w - ddout_x) + adj_dx2 * dout)[..., None, None] + + dadj1 * (dout * det2)[..., None, None]) + grad_x2 = ( + adj2 * (det1 * + (ddout_w - ddout_x) + adj_dx1 * dout)[..., None, None] + + dadj2 * (dout * det1)[..., None, None]) + + adj_dx_w = tf.matmul(adj_dx, w) + + # Computes gradient wrt w. + grad_w = ( + tf.matmul(adj_dx, glog_out, transpose_a=True) - tf.matmul( + det, glog_out2 * (det_grad_dw + adj_dx_w), transpose_a=True)) + + # Computes gradient wrt grad_log. + grad_grad_log = (det_grad_dw + adj_dx_w) / output + return grad_x1, grad_x2, grad_w, grad_grad_log + + return (dx1, dx2, dw), grad + + def grad(grad_log, grad_sign): + """Computes gradient wrt log(w*det(x1)*det(x2)).""" + del grad_sign + return logdet_matmul_grad(x1, x2, w, grad_log) + + return (log_out, sign_out), grad + + +class TensorMlp(snt.AbstractModule): + """Regular MLP, but works with rank-2, -3, or -4 tensors as input.""" + + def __init__(self, + layers, + rank=0, + use_bias=True, + activate_final=True, + residual=False, + gain=1.0, + stddev=1.0, + name='tensor_mlp'): + """Create an MLP with different input ranks. + + Args: + layers: list of ints, with number of units in each layer + rank: Rank of the input, not counting batch or feature dimension. If 0, + creates a normal MLP. If 1, uses 1D convolutions with kernel size 1. If + 2, uses 2D convolutions with 1x1 kernel. + use_bias: If false, leave bias out of linear layers. + activate_final: Boolean. If true, add nonlinearity to last layer. + residual: use residual connections in each layer. + gain: gain used in orthogonal initializer for weights in each linear + layer. + stddev: standard deviation used in random normal initializer for biases in + each linear layer. + name: String. Sonnet default. + + Raises: + ValueError: if the rank is not 0, 1 or 2. 
+ """ + super(TensorMlp, self).__init__(name=name) + initializers = {'w': tf.initializers.orthogonal(gain=gain)} + if use_bias: + initializers['b'] = tf.initializers.random_normal(stddev=stddev) + if rank == 0: + self._linear = lambda x: snt.Linear( + x, use_bias=use_bias, initializers=initializers) + elif rank == 1: + self._linear = lambda x: snt.Conv1D( + output_channels=x, + kernel_shape=1, + use_bias=use_bias, + initializers=initializers) + elif rank == 2: + self._linear = lambda x: snt.Conv2D( + output_channels=x, + kernel_shape=1, + use_bias=use_bias, + initializers=initializers) + else: + raise ValueError('Rank for TensorMlp must be 0, 1 or 2.') + self._rank = rank + self._nlayers = layers + self._use_bias = use_bias + self._activate_final = activate_final + self._use_residual = residual + + def _build(self, inputs, layer_collection=None): + if self._nlayers: + x = inputs + self._layers = [] + for i, h in enumerate(self._nlayers): + self._layers.append(self._linear(h)) + y = self._layers[-1](x) + if layer_collection: + if self._use_bias: + params = (self._layers[-1].w, self._layers[-1].b) + else: + params = self._layers[-1].w + + if self._rank == 0: + layer_collection.register_fully_connected(params, x, y, reuse=False) + elif self._rank == 1: + layer_collection.register_conv1d( + params, [1, 1, 1], 'SAME', x, y, reuse=False) + else: + layer_collection.register_conv2d( + params, [1, 1, 1, 1], 'SAME', x, y, reuse=False) + + if i < len(self._nlayers) - 1 or self._activate_final: + y = tf.nn.tanh(y) + if self._use_residual and x.shape[-1].value == h: + residual = x + else: + residual = 0.0 + x = y + residual + return x + return inputs # if layers is an empty list, just act as the identity + + +class ParallelNet(snt.AbstractModule): + """A symmetric network with parallel streams for rank 1 and 2 features. + + Rank 2 features propagate with parallel independent channels. + Rank 1 features are able to input from other channels and rank 2 stream. + + Attributes: + shape: number of alpha and beta electrons. + layers: iterable containing integers pairs containing description of layers- + i.e tuples containing (num_rank_one_features, num_rank_two_features) + use_last_layer: bool. whether or not return rank-2 features at end of net. + residual: bool. whether or not to use residual connections. + name: for Sonnet namespacing. + """ + + def __init__(self, + shape, + layers, + use_last_layer=True, + residual=False, + name='parallel_net'): + super(ParallelNet, self).__init__(name=name) + self._shape = shape # tuple of input shapes + self._n = sum(shape) + self._layers = layers + self._use_last_layer = use_last_layer + self._use_residual = residual + + def _build(self, x1, x2, r_ee, layer_collection=None): + """Construct the parallel network from rank 1 and rank 2 inputs. + + Args: + x1: rank 1 input features. tensor of shape (batch, n_electrons, + n_rank_one_inputs) + x2: rank 2 input features. tensor of shape (batch, n_electrons, + n_electrons, n_rank_two_inputs) + r_ee: distances between electrons. tensor of shape (batch, n_electrons, + n_electrons) + layer_collection: KFAC layer collection or None. + + Returns: + x1: rank 1 output features. + tensor of shape (batch, n_electrons, n_rank_one_outputs) + x2: rank 2 output features or None if use_last_layer is False. 
+ if not None tensor is of shape (batch, n_electrons, n_rank_two_outputs) + """ + self._w1 = [] + self._w2 = [] + + initializers = { + 'w': tf.initializers.orthogonal(), + 'b': tf.initializers.random_normal() + } + for i, (nc1, nc2) in enumerate(self._layers): + xs1 = tf.split(x1, self._shape, axis=1) + xs2 = tf.split(x2, self._shape, axis=1) + + # mean field features + mf1 = [ + tf.tile( + tf.reduce_mean(x, axis=1, keepdims=True), [1, x1.shape[1], 1]) + for x in xs1 + ] + mf2 = [tf.reduce_mean(x, axis=1) for x in xs2] + y1 = tf.concat([x1] + mf1 + mf2, axis=2) + + self._w1.append( + snt.Conv1D( + output_channels=nc1, kernel_shape=1, initializers=initializers)) + out1 = self._w1[-1](y1) + + if layer_collection: + layer_collection.register_conv1d((self._w1[-1].w, self._w1[-1].b), + [1, 1, 1], + 'SAME', + y1, + out1, + reuse=False) + + # Implements residual connection + if self._use_residual and x1.shape[-1].value == nc1: + res = x1 + else: + res = 0.0 + + x1 = tf.nn.tanh(out1) + res + + # If we don't need the last layer of rank-2 features, set nc2 to None + if i < len(self._layers) - 1 or self._use_last_layer: + self._w2.append( + snt.Conv2D( + output_channels=nc2, kernel_shape=1, initializers=initializers)) + out2 = self._w2[-1](x2) + + if layer_collection: + layer_collection.register_conv2d((self._w2[-1].w, self._w2[-1].b), + [1, 1, 1, 1], + 'SAME', + x2, + out2, + reuse=False) + + x2 = tf.nn.tanh(out2) + else: + x2 = None + + return x1, x2 + + +class EquivariantNet(snt.AbstractModule): + """A symmetric network like ParallelNet, but without the rank 2 stream.""" + + def __init__(self, shape, layers, residual=False, name='parallel_net'): + super(EquivariantNet, self).__init__(name=name) + self._shape = shape # tuple of input shapes + self._n = sum(shape) + self._layers = layers + self._use_residual = residual + + def _build(self, x1, layer_collection=None): + self._w1 = [] + initializers = { + 'w': tf.initializers.orthogonal(), + 'b': tf.initializers.random_normal() + } + for nc1 in self._layers: + xs1 = tf.split(x1, self._shape, axis=1) + + # mean field features + mf1 = [ + tf.tile( + tf.reduce_mean(x, axis=1, keepdims=True), [1, x1.shape[1], 1]) + for x in xs1 + ] + y1 = tf.concat([x1] + mf1, axis=2) + + self._w1.append( + snt.Conv1D( + output_channels=nc1, kernel_shape=1, initializers=initializers)) + out1 = self._w1[-1](y1) + + if layer_collection: + layer_collection.register_conv1d((self._w1[-1].w, self._w1[-1].b), + [1, 1, 1], + 'SAME', + y1, + out1, + reuse=False) + + # Implements residual connection + if self._use_residual and x1.shape[-1].value == nc1: + res = x1 + else: + res = 0.0 + + x1 = tf.nn.tanh(out1) + res + + return x1 + + +class LogMatmul(snt.AbstractModule): + """Takes log-domain vector and performs numerically stable matmul.""" + + def __init__(self, outputs, use_bias=False, name='log_matmul'): + super(LogMatmul, self).__init__(name=name) + self._outputs = outputs + self._use_bias = use_bias + + def _build(self, log_input, sign_input, layer_collection=None): + rank = log_input.shape.ndims + if rank == 2: + self._layer = snt.Linear(self._outputs, use_bias=self._use_bias) + elif rank == 3: + self._layer = snt.Conv1D( + output_channels=self._outputs, + kernel_shape=1, + use_bias=self._use_bias) + else: + raise RuntimeError('LogMatmul only supports rank 2 and 3 tensors.') + + log_max = tf.reduce_max(log_input, axis=-1, keepdims=True) + inputs = tf.exp(log_input - log_max) * sign_input + output = self._layer(inputs) + sign_output = tf.sign(output) + log_output = 
tf.log(tf.abs(output)) + log_max
+    if layer_collection:
+      if self._use_bias:
+        params = (self._layer.w, self._layer.b)
+      else:
+        params = self._layer.w
+      if rank == 2:
+        layer_collection.register_fully_connected(
+            params, inputs, output, reuse=False)
+      elif rank == 3:
+        layer_collection.register_conv1d(
+            params, [1, 1, 1], 'SAME', inputs, output, reuse=False)
+    return log_output, sign_output
+
+
+class Envelope(snt.AbstractModule):
+  """Compute exponentially-decaying envelope to enforce boundary conditions."""
+
+  def __init__(self, atoms, shape, determinants, soft=False, name='envelope'):
+    r"""Compute exponentially-decaying envelope to enforce boundary conditions.
+
+    Given a set of electron positions r_i and atom positions z_j, this module
+    learns a set of Mahalanobis distance metrics A_jk and mixing proportions
+    \pi_jk and computes a set of exponentially-decaying envelopes of the form:
+
+    f_k(r_i) =
+      \sum_j \pi_jk exp(-sqrt((r_i - z_j)^T * A_jk^T * A_jk * (r_i - z_j)))
+
+    where the index `k` is over all orbitals, both spin up and spin down.
+    Each of these envelopes is multiplied by one of the orbitals in a Fermi Net
+    to enforce the boundary condition that the wavefunction goes to zero at
+    infinity.
+
+    Args:
+      atoms: list of atom positions
+      shape: tuple with number of spin up and spin down electrons
+      determinants: number of determinants, used to compute number of envelopes
+      soft: If true, use sqrt(1 + sum(x**2)) instead of norm(x)
+      name: Sonnet boilerplate
+
+    Returns:
+      list of envelope ops, one for spin up and one for spin down orbitals
+    """
+    super(Envelope, self).__init__(name=name)
+    self._atoms = atoms
+    self._shape = shape
+    self._na = len(atoms)
+    self._nd = determinants
+    self._soft = soft  # remove cusp from envelope
+
+  def _build(self, inputs, layer_collection=None):
+    """Create ops to compute envelope for every electron/orbital pair.
+
+    Args:
+      inputs: tensor of shape (batch size, nelectrons, 3) with position of every
+        electron
+      layer_collection (optional): if using KFAC, the layer collection to
+        register ops with.
+
+    Returns:
+      List of the value of the envelope for spin up and spin down electrons.
+      Each element of the list is a tensorflow op with size
+      (batch, shape[i], norbitals) where shape[i] is the number of electrons of
+      spin "i", and norbitals is shape[i] * the number of determinants.
+    """
+    # shape of each (atom/orbital) envelope
+    self._envelope_sigma = []
+    # mixing proportion for all atoms for a given orbital
+    self._envelope_pi = []
+    self._envelopes = []
+    norms = [[] for _ in self._shape]
+
+    pos = tf.stack([
+        tf.constant(atom.coords, dtype=inputs.dtype.base_dtype)
+        for atom in self._atoms
+    ])
+    pos = tf.expand_dims(tf.expand_dims(pos, 0), 0)
+    diff_ae = tf.expand_dims(inputs, 2) - pos
+
+    # Split difference from electron to atom into list, one per atom, then
+    # further split each element into a list, one for spin up electrons, one
+    # for spin down.
+    diff_aes = [
+        tf.split(x, self._shape, axis=1)
+        for x in tf.split(diff_ae, self._na, axis=2)
+    ]
+
+    # Compute the weighted distances from electron to atom
+    # We swap the normal convention of `i` for the outer for loop and `j` for
+    # the inner for loop to conform to the notation in the docstring.
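+    # Shape walkthrough for the ops above (illustrative only; b = batch size,
+    # n = sum(self._shape), na = self._na):
+    #   inputs:          (b, n, 3)
+    #   pos:             (1, 1, na, 3)
+    #   diff_ae:         (b, n, na, 3)
+    #   diff_aes[j][i]:  (b, self._shape[i], 1, 3)
+    # i.e. diff_aes[j][i] holds the displacement of every spin-i electron
+    # from atom j, ready to be weighted by one A_jk metric per orbital.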
+    for j in range(self._na):  # iterate over atoms
+      self._envelope_sigma.append([])
+      for i in range(len(self._shape)):  # iterate over spin up/down electrons
+        norbital = self._shape[i] * self._nd
+        eyes = tf.transpose(
+            tf.reshape(
+                tf.eye(
+                    3, batch_shape=[norbital], dtype=inputs.dtype.base_dtype),
+                [1, 1, norbital * 3, 3]), (0, 1, 3, 2))
+        self._envelope_sigma[j].append(
+            tf.get_variable(
+                'envelope_sigma_%d_%d' % (j, i),
+                initializer=eyes,
+                use_resource=True,
+                trainable=True,
+                dtype=inputs.dtype.base_dtype,
+            ))
+
+        # multiply the difference from the electron to the atom by an
+        # anisotropic weight - one 3x3 matrix per orbital function
+
+        # From the notation in the docstring, `rout` is
+        # equivalent to (A_j1; ...; A_jk) * (r_i - z_j)
+
+        # Note, snt.Conv2D won't take bare TF ops as initializers, so we just
+        # use tf.nn.conv2d here instead.
+        rout = tf.nn.conv2d(diff_aes[j][i], self._envelope_sigma[j][i],
+                            [1, 1, 1, 1], 'SAME')
+        if layer_collection:
+          layer_collection.register_conv2d(
+              self._envelope_sigma[j][i], [1, 1, 1, 1],
+              'SAME',
+              diff_aes[j][i],
+              rout,
+              reuse=False)
+
+        # Reshape `rout` so that rout[:, i, k, :] represents
+        # A_jk * (r_i - z_j) for all r_i in the batch
+        rout = tf.reshape(rout, [rout.shape[0], rout.shape[1], norbital, 3])
+        if self._soft:
+          norms[i].append(
+              tf.sqrt(tf.reduce_sum(1 + rout**2, axis=3, keepdims=True)))
+        else:
+          norms[i].append(tf.norm(rout, axis=3, keepdims=True))
+
+    # Compute the mixing proportions for different atoms
+    for i in range(len(self._shape)):
+      norbital = self._shape[i] * self._nd
+      self._envelope_pi.append(
+          tf.get_variable(
+              'envelope_pi_%d' % i,
+              initializer=tf.ones([norbital, self._na],
+                                  dtype=inputs.dtype.base_dtype),
+              use_resource=True,
+              trainable=True,
+              dtype=inputs.dtype.base_dtype,
+          ))
+      if layer_collection:
+        layer_collection.register_generic(
+            self._envelope_pi[i], inputs.shape[0].value, reuse=False)
+
+      # Take the exponential of the Mahalanobis distance and multiply each
+      # term by the weight for that atom.
+      norm = tf.concat(norms[i], axis=3)
+      mixed_norms = tf.exp(-norm) * self._envelope_pi[i]
+
+      # Add the weighted terms for each atom together to form the envelope.
+      # Each term in this list has shape (batch size, shape[i], norbitals)
+      # where shape[i] is the number of electrons of spin `i`. This computes
+      # the value of the envelope for every combination of electrons and
+      # orbitals.
+      self._envelopes.append(tf.reduce_sum(mixed_norms, axis=3))
+
+    return self._envelopes
+
+
+class BackFlow(snt.AbstractModule):
+  """Introduce correlation in Slater network via coordinate transformations.
+
+  Electron-electron, electron-nuclear and electron-electron-nuclear
+  contributions are computed using MLPs for the eta, mu, theta and phi
+  functions.
+ """ + + def _build(self, xs, xsa, r_en, r_ee, layer_collection=None): + # Notation is explained in the CASINO manual, section 23.1 + # (https://casinoqmc.net/casino_manual_dir/casino_manual.pdf) + backflow_ee = TensorMlp([64, 64, 64, 1], + rank=2, + activate_final=False, + residual=True, + gain=1.e-3, + stddev=1.e-3) + + backflow_en = TensorMlp([64, 64, 64, 1], + rank=2, + activate_final=False, + residual=True, + gain=1.e-3, + stddev=1.e-3) + + backflow_een = TensorMlp([64, 64, 64, 2], + rank=2, + activate_final=False, + residual=True, + gain=1.e-3, + stddev=1.e-3) + + # electron-electron backflow term + eta = backflow_ee(r_ee, layer_collection=layer_collection) + + # electron-nuclear backflow term + r_en = tf.expand_dims(r_en, -1) + mu = backflow_en(r_en, layer_collection=layer_collection) + + # The most complicated term: the electron-electron-nuclear backflow term + shape = [r_ee.shape[0], r_ee.shape[1], r_ee.shape[2], r_en.shape[2], 1] + r_een = tf.concat((tf.broadcast_to(tf.expand_dims( + r_ee, -2), shape), tf.broadcast_to(tf.expand_dims(r_en, -3), shape), + tf.broadcast_to(tf.expand_dims(r_en, -4), shape)), + axis=-1) + # Reshape so we can squeeze this into a rank-2 network. + r_een_flat = tf.reshape(r_een, [ + r_een.shape[0], r_een.shape[1], r_een.shape[2] * r_een.shape[3], + r_een.shape[4] + ]) + phi_theta = backflow_een(r_een_flat, layer_collection=layer_collection) + phi = tf.reshape(phi_theta[..., 0], r_een.shape[:-1] + [1]) + theta = tf.reshape(phi_theta[..., 1], r_een.shape[:-1] + [1]) + + diffs = tf.expand_dims(xs, 1) - tf.expand_dims(xs, 2) + backflow = ( + tf.reduce_sum(eta * diffs, axis=2) + tf.reduce_sum(mu * xsa, axis=2) + + tf.reduce_sum(phi * tf.expand_dims(diffs, axis=3), axis=[2, 3]) + + tf.reduce_sum(theta * tf.expand_dims(xsa, axis=2), axis=[2, 3])) + return backflow + + +class FermiNet(snt.AbstractModule): + """Neural network with a determinant layer to antisymmetrize output.""" + + def __init__(self, + *, + atoms, + nelectrons, + slater_dets, + hidden_units, + after_det, + dim=3, + architecture='ferminet', + envelope=False, + residual=False, + r12_ee_features=True, + r12_en_features=True, + pos_ee_features=True, + build_backflow=False, + use_backflow=False, + jastrow_en=False, + jastrow_ee=False, + jastrow_een=False, + logdet=False, + pretrain_iterations=2, + name='det_net'): + super(FermiNet, self).__init__(name=name) + self._atoms = atoms + self._dim = dim + # tuple of number of (spin up, spin down) e's or a tuple of (e's) for + # spin-polarised systems. 
+ self._shape = [nelec for nelec in nelectrons if nelec > 0] + self._ne = sum(nelectrons) # total number of electrons + self._na = len(atoms) # number of atoms + self._nd = slater_dets # number of determinants + + self._architecture = architecture + self._hidden_units = hidden_units + self._after_det = after_det + self._add_envelope = envelope + self._r12_ee = r12_ee_features + self._r12_en = r12_en_features + self._pos_ee = pos_ee_features + self._use_residual = residual + self._build_backflow = build_backflow + self._use_backflow = use_backflow + self._use_jastrow_en = jastrow_en + self._use_jastrow_ee = jastrow_ee + self._use_jastrow_een = jastrow_een + self._output_log = logdet # Whether to represent wavef'n or log-wavef'n + self.pretrain_iterations = pretrain_iterations + + def _features(self, xs, r_ee, r_en, xsa): + x1 = [] + x2 = [] + if self._r12_en: + x1.append(r_en) + if self._r12_ee: + x2.append(r_ee) + xsa = tf.reshape(xsa, [xsa.shape[0], xsa.shape[1], -1]) + x1.append(xsa) + if self._pos_ee: + diffs = tf.expand_dims(xs, 1) - tf.expand_dims(xs, 2) + x2.append(diffs) + x1 = tf.concat(x1, axis=2) if x1 else None + x2 = tf.concat(x2, axis=3) if x2 else None + return x1, x2 + + def _build(self, inputs, layer_collection=None, back_prop=True): + self.inputs = inputs + xs = tf.reshape(inputs, [inputs.shape[0], self._ne, -1]) + r_en, r_ee, xsa = hamiltonian.r12_features( + inputs, + self._atoms, + self._ne, + keep_pos=True, + flatten=False, + atomic_coords=True) + + if self._use_backflow or self._build_backflow: + if self._architecture != 'slater': + raise ValueError('Backflow should only be used with Hartree-Fock/' + 'Slater-Jastrow network') + backflow_net = BackFlow() + backflow = backflow_net(xs, xsa, r_en, r_ee, layer_collection) + else: + backflow = None + + if backflow is not None: + # Warning: Backflow transformed coordinates should *only* be fed into the + # orbitals and not into the Jastrow factor. Denote functions of the + # backflow as q* accordingly. + q_inputs = inputs + tf.reshape(backflow, inputs.shape) + qs = tf.reshape(q_inputs, [q_inputs.shape[0], self._ne, -1]) + q_en, q_ee, qsa = hamiltonian.r12_features( + q_inputs, + self._atoms, + self._ne, + keep_pos=True, + flatten=False, + atomic_coords=True) + x1, x2 = self._features(qs, q_ee, q_en, qsa) + else: + x1, x2 = self._features(xs, r_ee, r_en, xsa) + + # Fermi Net or Slater-Jastrow net? + if self._architecture == 'ferminet': + if isinstance(self._hidden_units[0], typing.Iterable): + self._parallel_net = ParallelNet( + self._shape, + self._hidden_units, + residual=self._use_residual, + use_last_layer=False) + self._before_det, self._before_jastrow = self._parallel_net( + x1, x2, r_ee, layer_collection=layer_collection) + else: + self._parallel_net = EquivariantNet( + self._shape, self._hidden_units, residual=self._use_residual) + self._before_det = self._parallel_net( + x1, layer_collection=layer_collection) + self._before_jastrow = None + elif self._architecture == 'slater': + self._before_det_mlp = TensorMlp( + self._hidden_units, rank=1, residual=True) + self._before_det = self._before_det_mlp( + x1, layer_collection=layer_collection) + else: + raise ValueError('Not a recognized architecture: {}'.format( + self._architecture)) + + # Split the features into spin up and down electrons, and shape into + # orbitals and determinants. 
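+    # Illustrative shapes for the block below: each spin stream of shape
+    # (batch, shape[i], nfeatures) is mapped by a kernel-size-1 Conv1D to
+    # (batch, shape[i], shape[i] * ndeterminants), then reshaped and
+    # transposed to (batch, ndeterminants, shape[i], shape[i]) -- one square
+    # orbital matrix per determinant per walker.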
+ before_det = tf.split(self._before_det, self._shape, axis=1) + self._orbital_layers = [ + snt.Conv1D(x.shape[1] * self._nd, 1, name='orbital_%d' % i) + for i, x in enumerate(before_det) + ] + orbitals = [] + for i, x in enumerate(before_det): + layer = self._orbital_layers[i] + y = layer(x) + orbitals.append(y) + if layer_collection: + layer_collection.register_conv1d((layer.w, layer.b), [1, 1, 1], + 'SAME', + x, + y, + reuse=False) + + # Create an exponentially-decaying envelope around every atom + if self._add_envelope: + envelope = Envelope(self._atoms, self._shape, self._nd) + self._envelopes = envelope(xs, layer_collection=layer_collection) + for i in range(len(orbitals)): + orbitals[i] = orbitals[i] * self._envelopes[i] + + self._orbitals = [ + tf.transpose( + tf.reshape(x, [x.shape[0], x.shape[1], -1, x.shape[1]]), + [0, 2, 1, 3]) for x in orbitals + ] + + slog1d = lambda x: (tf.sign(x[..., 0, 0]), tf.log(tf.abs(x[..., 0, 0]))) + slog = lambda x: log_determinant(x) if x.shape[-1].value > 1 else slog1d(x) + if self._output_log: + if self._nd == 1: + self._slogdets = [ + slog(x) for x in self._orbitals + ] + self._logdet = tf.add_n([x[1] for x in self._slogdets]) + self._signs = functools.reduce( + tf.Tensor.__mul__, # pytype: disable=unsupported-operands + [x[0] for x in self._slogdets]) + output = (self._logdet, self._signs) + else: + self._after_det_weights = tf.get_variable( + 'after_det_weights', + shape=[self._nd, self._after_det[0]], + initializer=tf.initializers.ones(dtype=inputs.dtype.base_dtype), + use_resource=True, + dtype=inputs.dtype.base_dtype, + ) + if back_prop: + if len(self._orbitals) == 1: + output = list(slog(x) for x in self._orbitals) + signdet, logdet = output[0] + log_max = tf.reduce_max(logdet, axis=-1, keepdims=True) + inputs = tf.exp(logdet - log_max) * signdet + output = tf.matmul(inputs, self._after_det_weights) + output = tf.log(tf.abs(output)) + log_max, tf.sign(output) + else: + output = logdet_matmul(self._orbitals[0], self._orbitals[1], + self._after_det_weights) + else: + # Compute logdet with tf.linalg.slogdet, as gradients are not needed + # We keep this outside logdet_matmul, despite the code duplication, + # because tf.custom_gradient decorator does not like keyword args + if len(self._orbitals) == 1: + signdet, logdet = tf.linalg.slogdet(self._orbitals[0]) + else: + slogdet0 = tf.linalg.slogdet(self._orbitals[0]) + slogdet1 = tf.linalg.slogdet(self._orbitals[1]) + signdet = slogdet0[0] * slogdet1[0] + logdet = slogdet0[1] + slogdet1[1] + + log_max = tf.reduce_max(logdet, axis=-1, keepdims=True) + inputs = tf.exp(logdet - log_max) * signdet + output = tf.matmul(inputs, self._after_det_weights) + output = tf.log(tf.abs(output)) + log_max, tf.sign(output) + + self._logdet = output[0] + if layer_collection: + layer_collection.register_generic( + self._after_det_weights, + self._orbitals[0].shape[0].value, + reuse=False) + + if len(self._after_det) > 1: + log_out, sign_out = output + # Constructing x and -x to feed through network to make odd function + output = (tf.stack((log_out, log_out), + axis=1), tf.stack((sign_out, -1 * sign_out), + axis=1)) + for nout in self._after_det[1:]: + layer = LogMatmul(nout, use_bias=True) + # apply ReLU in log domain + output = (tf.where( + tf.greater(output[1], 0.0), output[0], + -100 * tf.ones_like(output[0])), output[1]) + output = layer(*output, layer_collection=layer_collection) + + # Make output an odd function + log_out, sign_out = output + logs_out = [ + tf.squeeze(out, axis=1) for out in 
tf.split(log_out, 2, axis=1) + ] + signs_out = [ + tf.squeeze(out, axis=1) for out in tf.split(sign_out, 2, axis=1) + ] + log_max = tf.maximum( + tf.reduce_max(logs_out[0], axis=1, keepdims=True), + tf.reduce_max(logs_out[1], axis=1, keepdims=True)) + real_out = ( + tf.exp(logs_out[0] - log_max) * signs_out[0] - + tf.exp(logs_out[1] - log_max) * signs_out[1]) + output = (tf.log(tf.abs(real_out)) + log_max, tf.sign(real_out)) + else: + self._dets = [ + determinant(x) if x.shape[-1].value > 1 else x[..., 0, 0] + for x in self._orbitals + ] + self._det = functools.reduce(tf.Tensor.__mul__, self._dets) # pytype: disable=unsupported-operands + after_det_input = self._det + + self._after_det_mlp = TensorMlp(self._after_det, use_bias=True) + + combined_input = tf.concat([after_det_input, -after_det_input], axis=0) + combined_output = self._after_det_mlp( + combined_input, layer_collection=layer_collection) + output_0, output_1 = tf.split(combined_output, 2, axis=0) + output = output_0 - output_1 + + total_jastrow = 0.0 + if self._use_jastrow_ee: + self._jastrow_ee_net = TensorMlp( + [64, 64, 64, len(self._shape)**2], + rank=2, + activate_final=False, + residual=True) + # To prevent NaNs, rescale factor by number of electrons squared. + self._jastrow_ee = self._jastrow_ee_net( + r_ee, layer_collection=layer_collection) / ( + self._ne**2) + + # split output into channels for different spin interactions + jastrow_ee = flatten([ + tf.split(x, self._shape, axis=2) + for x in tf.split(self._jastrow_ee, self._shape, axis=1) + ]) + # For each spin interaction (alpha/alpha, alpha/beta, beta/alpha, + # beta/beta), get just the Jastrow term for that interaction + jastrow_ee = [x[..., i:i + 1] for i, x in enumerate(jastrow_ee)] + # Add up all Jastrow terms for all pairs of electrons over all spins + jastrow_ee = tf.add_n([tf.reduce_sum(x, axis=[1, 2]) for x in jastrow_ee]) + total_jastrow += jastrow_ee + + if self._use_jastrow_en: + self._jastrow_en_net = TensorMlp([64, 64, 64, self._na], + rank=2, + activate_final=False, + residual=True) + self._jastrow_en = ( + self._jastrow_en_net( + r_en[..., None], layer_collection=layer_collection) / self._ne / + self._na) + # We have one output per atom/electron distance. Rather than having one + # network for every atom, we have one network with many outputs, and take + # the diagonal of the outputs. + jastrow_en = tf.reduce_sum(tf.matrix_diag_part(self._jastrow_en), axis=2) + jastrow_en = tf.reduce_sum(jastrow_en, axis=1, keepdims=True) + total_jastrow += jastrow_en + + if self._use_jastrow_een: + self._jastrow_een_nets = [] + self._jastrow_een = [] + for i in range(self._na): + # Create a separate network for every atom. Not efficient, but works + # well enough for the experiments we need this for. This is only for + # comparison against benchmarks in the literature and is not meant to + # be a default component of the Fermionic Neural Network. 
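+        # Illustrative shapes for the concat below: r_ee is
+        # (batch, ne, ne, 1) and each tiled r_en slice is (batch, ne, ne, 1),
+        # so `batch` is (batch, ne, ne, 3), i.e. the electron-electron
+        # distance plus both electron-nucleus distances for atom i.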
+ self._jastrow_een_nets.append( + TensorMlp([64, 64, 64, len(self._shape)**2], + rank=2, + activate_final=False, + residual=True)) + + batch = tf.concat((r_ee, + tf.tile( + tf.expand_dims(r_en[..., i:i + 1], axis=1), + [1, self._ne, 1, 1]), + tf.tile( + tf.expand_dims(r_en[..., i:i + 1], axis=2), + [1, 1, self._ne, 1])), + axis=-1) + self._jastrow_een.append(self._jastrow_een_nets[i]( + batch, layer_collection=layer_collection) / (self._ne**2)) + + # split output into channels for different spin interactions + jastrow_een = flatten([ + tf.split(x, self._shape, axis=2) + for x in tf.split(self._jastrow_een[i], self._shape, axis=1) + ]) + # For each spin interaction (alpha/alpha, alpha/beta, beta/alpha, + # beta/beta), get just the Jastrow term for that interaction + jastrow_een = [x[..., i:i + 1] for i, x in enumerate(jastrow_een)] + # Add up all Jastrow terms for all pairs of electrons over all spins + jastrow_een = tf.add_n( + [tf.reduce_sum(x, axis=[1, 2]) for x in jastrow_een]) + total_jastrow += jastrow_een + + if self._output_log: + output = (output[0] - total_jastrow, output[1]) + else: + output = output * tf.exp(-total_jastrow) + + self.output = output + return self.output + + +def pretrain_hartree_fock(network, data, strategy, hf_approx): + """Pretrain network so orbitals are nearly uncorrelated. + + Args: + network: network to be pretrained. Note: only the network._orbitals tensor + is pretrained. + data: (nreplicas, batch, spatial_dim * nelectrons) Tensor. Input to network. + strategy: DistributionStrategy instance. + hf_approx: Instance of Scf class containing Hartree-Fock approximation. + + Returns: + hf_loss: Distance between orbitals and Hartree-Fock orbitals. Minimised each + iteration by tf.AdamOptimizer. + """ + + with strategy.scope(): + optim = tf.train.AdamOptimizer(name='Adam_pretrain') + + def pretrain_step_fn(walkers): + """Step function for pretraining.""" + replica_id = tf.distribute.get_replica_context().replica_id_in_sync_group + data = walkers[replica_id] + network(data) + xs = tf.reshape(data, [int(data.shape[0]), sum(hf_approx.nelectrons), -1]) + hf_mats = hf_approx.tf_eval_hf(xs, deriv=True) + + # pylint: disable=protected-access + diffs = [ + tf.expand_dims(hf, 1) - nn + for (hf, nn) in zip(hf_mats, network._orbitals) + ] + # pylint: enable=protected-access + mses = [tf.reduce_mean(tf.square(diff)) for diff in diffs] + if len(mses) > 1: + mse_loss = 0.5 * tf.add(*mses) + else: + mse_loss = 0.5 * mses[0] + + pretrain_op = optim.minimize(mse_loss) + with tf.control_dependencies([pretrain_op]): + mse_loss = tf.identity(mse_loss) + return mse_loss + + mse_loss = strategy.experimental_run( + functools.partial(pretrain_step_fn, walkers=data)) + return strategy.reduce(tf.distribute.ReduceOp.MEAN, mse_loss) diff --git a/ferminet/qmc.py b/ferminet/qmc.py new file mode 100644 index 0000000..55deac5 --- /dev/null +++ b/ferminet/qmc.py @@ -0,0 +1,493 @@ +# Lint as: python3 +# Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Top-level QMC training."""
+
+import contextlib
+import functools
+import os
+import shutil
+from typing import Iterable, Mapping, Optional, Sequence, Text
+
+from absl import flags
+from absl import logging
+from ferminet import networks
+import kfac
+import numpy as np
+import tables
+import tensorflow.compat.v1 as tf
+import tensorflow_probability as tfp
+from tensorflow.core.protobuf import rewriter_config_pb2
+
+FLAGS = flags.FLAGS
+
+
+class Writer(contextlib.AbstractContextManager):
+  """Write data to CSV, as well as logging data to stdout if desired."""
+
+  def __init__(self,
+               name: str,
+               schema: Sequence[str],
+               directory: str = 'logs/',
+               iteration_key: Optional[str] = 't',
+               log: bool = True):
+    """Initialise Writer.
+
+    Args:
+      name: file name for CSV.
+      schema: sequence of keys, corresponding to each data item.
+      directory: directory path to write file to.
+      iteration_key: if not None or a null string, also include the iteration
+        index as the first column in the CSV output with the given key.
+      log: Also log each entry to stdout.
+    """
+    self._schema = schema
+    if not os.path.isdir(directory):
+      os.mkdir(directory)
+    self._filename = os.path.join(directory, name + '.csv')
+    self._iteration_key = iteration_key
+    self._log = log
+
+  def __enter__(self):
+    self._file = open(self._filename, 'w')
+    # write top row of csv
+    if self._iteration_key:
+      self._file.write(f'{self._iteration_key},')
+    self._file.write(','.join(self._schema) + '\n')
+    return self
+
+  def write(self, t: int, **data):
+    """Writes to file and stdout.
+
+    Args:
+      t: iteration index.
+      **data: data items with keys as given in schema.
+    """
+    row = [str(data.get(key, '')) for key in self._schema]
+    if self._iteration_key:
+      row.insert(0, str(t))
+    for key in data:
+      if key not in self._schema:
+        raise ValueError('Not a recognized key for writer: %s' % key)
+
+    # write the data to csv
+    self._file.write(','.join(row) + '\n')
+
+    # write the data to abseil logs
+    if self._log:
+      logging.info('Iteration %s: %s', t, data)
+
+  def __exit__(self, exc_type, exc_val, exc_tb):
+    self._file.close()
+
+
+class H5Writer(contextlib.AbstractContextManager):
+  """Write data to HDF5 files."""
+
+  def __init__(self,
+               name: str,
+               schema: Mapping[str, Sequence[int]],
+               directory: str = '',
+               index_key: str = 't',
+               compression_level: int = 5):
+    """Initialise H5Writer.
+
+    Args:
+      name: file name for the HDF5 file.
+      schema: dict of keys, corresponding to each data item. All data is
+        assumed to be 32-bit floats.
+      directory: directory path to write file to.
+      index_key: name of (integer) key used to index each entry.
+      compression_level: compression level (0-9) used to compress HDF5 file.
+ """ + self._path = os.path.join(directory, name) + self._schema = schema + self._index_key = index_key + self._description = {} + self._file = None + self._complevel = compression_level + + def __enter__(self): + if not self._schema: + return self + pos = 1 + self._description[self._index_key] = tables.Int32Col(pos=pos) + for key, shape in self._schema.items(): + pos += 1 + self._description[key] = tables.Float32Col(pos=pos, shape=shape) + if not os.path.isdir(os.path.dirname(self._path)): + os.mkdir(os.path.dirname(self._path)) + self._file = tables.open_file( + self._path, + mode='w', + title='Fermi Net Data', + filters=tables.Filters(complevel=self._complevel)) + self._table = self._file.create_table( + where=self._file.root, name='data', description=self._description) + return self + + def write(self, index: int, data): + """Write data to HDF5 file. + + Args: + index: iteration index. + data: dict of arrays to write to file. Only elements with keys in the + schema are written. + """ + if self._file: + h5_data = (index,) + for key in self._description: + if key != self._index_key: + h5_data += (data[key],) + self._table.append([h5_data]) + self._table.flush() + + def __exit__(self, exc_type, exc_val, exc_tb): + if self._file: + self._file.close() + self._file = None + + +class QMC: + """Quantum Monte Carlo on a Fermi Net.""" + + def __init__( + self, + hamiltonian, + network, + data_gen, + hf_data_gen, + clip_el: Optional[float] = None, + check_loss: bool = False, + ): + """Constructs a QMC object for training a Fermi Net. + + Args: + hamiltonian: Hamiltonian operators. See hamiltonian.py + network: network to represent the wavefunction, psi. + data_gen: mcmc.MCMC object that samples configurations from + psi^2, where psi is the output of the network. + hf_data_gen: mcmc.MCMC object that samples configurations from + phi^2, where phi is a Hartree-Fock state of the Hamiltonian. + clip_el: If not None, sets the scale at which to clip the local energy. + Note the local energy is only clipped in the grad_loss passed back to + the gradient, not in the local_energy op itself. The width of the window + in which the local energy is not clipped is a multiple of the total + variation from the median, which is the equivalent of standard deviation + for the l1 norm instead of l2 norm. This makes the clipping robuts to + outliers, which is the entire point. + check_loss: If True, check the loss for NaNs and raise an error if found. + """ + self.hamiltonian = hamiltonian + self.data_gen = data_gen + self.hf_data_gen = hf_data_gen + self._clip_el = clip_el + self._check_loss = check_loss + self.network = network + + def _qmc_step_fn(self, optimizer_fn, using_kfac, global_step): + """Training step for network given the MCMC state. + + Args: + optimizer_fn: A function which takes as argument a LayerCollection object + (None) if using_kfac is True (False) and returns the optimizer. + using_kfac: True if optimizer_fn creates a instance of kfac.KfacOptimizer + and False otherwise. + global_step: tensorflow op for global step index. + + Returns: + loss: per-GPU loss tensor with control dependencies for updating network. + local_energy: local energy for each walker + features: network output for each walker. + + Raises: + RuntimeError: If using_kfac is True and optimizer_fn does not create a + kfac.KfacOptimizer instance or the converse. + """ + + # Note layer_collection cannot be modified after the KFac optimizer has been + # constructed. 
+    if using_kfac:
+      layer_collection = kfac.LayerCollection()
+    else:
+      layer_collection = None
+
+    walkers = self.data_gen.walkers_per_gpu
+    features, features_sign = self.network(walkers, layer_collection)
+    optimizer = optimizer_fn(layer_collection)
+
+    if bool(using_kfac) != isinstance(optimizer, kfac.KfacOptimizer):
+      raise RuntimeError(
+          'using_kfac is inconsistent with the optimizer created by '
+          'optimizer_fn.')
+
+    if layer_collection:
+      layer_collection.register_squared_error_loss(features, reuse=False)
+
+    with tf.name_scope('local_energy'):
+      kinetic_fn, potential_fn = self.hamiltonian
+      kinetic = kinetic_fn(features, walkers)
+      potential = potential_fn(walkers)
+      local_energy = kinetic + potential
+      loss = tf.reduce_mean(local_energy)
+      replica_context = tf.distribute.get_replica_context()
+      mean_op = tf.distribute.ReduceOp.MEAN
+      mean_loss = replica_context.merge_call(
+          lambda strategy, val: strategy.reduce(mean_op, val), args=(loss,))
+      grad_loss = local_energy - mean_loss
+
+      if self._clip_el is not None:
+        # clip_el should be much larger than 1, to avoid bias
+        median = tfp.stats.percentile(grad_loss, 50.0)
+        diff = tf.reduce_mean(tf.abs(grad_loss - median))
+        grad_loss_clipped = tf.clip_by_value(grad_loss,
+                                             median - self._clip_el * diff,
+                                             median + self._clip_el * diff)
+      else:
+        grad_loss_clipped = grad_loss
+
+    with tf.name_scope('step'):
+      # Create functions which take no arguments and return the ops for applying
+      # an optimisation step.
+      if not optimizer:
+        optimize_step = tf.no_op
+      else:
+        optimize_step = functools.partial(
+            optimizer.minimize,
+            features,
+            global_step=global_step,
+            var_list=self.network.trainable_variables,
+            grad_loss=grad_loss_clipped)
+
+      if self._check_loss:
+        # Apply optimisation step only if all local energies are well-defined.
+        step = tf.cond(
+            tf.reduce_any(tf.math.is_nan(mean_loss)), tf.no_op, optimize_step)
+      else:
+        # Faster, but less safe: always apply optimisation step. If the
+        # gradients are not well-defined (e.g. loss contains a NaN), then the
+        # network will also be set to NaN.
+        step = optimize_step()
+
+      # A strategy step function must return tensors, not ops, so apply a
+      # control dependency to a dummy op to ensure they're executed.
+      with tf.control_dependencies([step]):
+        loss = tf.identity(loss)
+
+    return {
+        'loss': loss,
+        'local_energies': local_energy,
+        'features': features,
+        'features_sign': features_sign
+    }
+
+  def train(self,
+            optim_fn,
+            iterations: int,
+            logging_config,
+            scf_approx,
+            strategy,
+            using_kfac: bool,
+            prefix: Text = 'fermi_net',
+            global_step=None,
+            profile_iterations: Optional[Iterable[int]] = None,
+            determinism_mode: bool = False,
+            cached_data_op=None,
+            write_graph: Optional[Text] = None,
+            burn_in: int = 0,
+            mcmc_steps: int = 10):
+    """Training loop for VMC with hooks for logging and plotting.
+
+    Args:
+      optim_fn: function with zero arguments (thunk) that returns a class
+        implementing optimizer (specifically apply_gradients method).
+      iterations: number of iterations to perform for training.
+      logging_config: LoggingConfig object defining what and how to log data.
+      scf_approx: For hf pretraining, object of class Scf, otherwise None.
+      strategy: DistributionStrategy instance to use for parallelisation. Must
+        match the strategy used to build the model.
+      using_kfac: True if optim_fn creates a kfac.KfacOptimizer object, False
+        otherwise.
+      prefix: prefix to use for dataframes in bigtable. Appended with _sparse
+        and _detailed for configuration and training dataframes respectively.
+      global_step: tensorflow op for global step number. If None set to 0 and
+        incremented each iteration. Can be used to provide a learning rate
+        schedule.
+      profile_iterations: iterable of iteration numbers at which the training
+        session is to be profiled. Note: data generation step (e.g. MCMC) is not
+        included in the profile and must be added separately if required.
+      determinism_mode: activates deterministic computation when in CPU mode.
+      cached_data_op: Op which updates a variable to cache the current training
+        batch. Only necessary if using KFAC with adaptive damping.
+      write_graph: if given, write graph to given file. Must be a full path.
+      burn_in: Number of burn in steps after pretraining. If zero do not burn in
+        or reinitialize walkers.
+      mcmc_steps: Number of MCMC steps to perform between network updates.
+
+    Raises:
+      RuntimeError: if a NaN value for the loss or another accumulated statistic
+        is detected.
+    """
+    if global_step is None:
+      with strategy.scope():
+        global_step = tf.train.get_or_create_global_step()
+    if profile_iterations is None:
+      profile_iterations = []
+
+    if hasattr(self.network, 'pretrain_iterations'):
+      stats_len = max(iterations, self.network.pretrain_iterations)
+    else:
+      stats_len = iterations
+
+    with strategy.scope():
+      mcmc_step = self.data_gen.step()
+      mcmc_hf_step = self.hf_data_gen.step()
+      mcmc_reset_walkers = self.data_gen.reinitialize_walkers()
+      mcmc_update_psi = self.data_gen.update_psi()
+
+      # Pretraining
+      concat_data = tf.concat([self.hf_data_gen.walkers, self.data_gen.walkers],
+                              axis=0)
+      pretrain_ops = networks.pretrain_hartree_fock(self.network, concat_data,
+                                                    strategy, scf_approx)
+
+    qmc_step_fn = functools.partial(self._qmc_step_fn, optim_fn, using_kfac,
+                                    global_step)
+    with strategy.scope():
+      qmc_out_per_replica = strategy.experimental_run(qmc_step_fn)
+      qmc_out_per_replica = {
+          key: tf.stack(strategy.experimental_local_results(value))
+          for key, value in qmc_out_per_replica.items()
+      }
+    loss = tf.reduce_mean(qmc_out_per_replica['loss'])
+
+    stats_ops = self.data_gen.stats_ops(log_walkers=logging_config.walkers)
+    stats_ops['loss_per_replica'] = qmc_out_per_replica['loss']
+    stats_loss_ = np.zeros(shape=qmc_out_per_replica['loss'].shape.as_list())
+
+    if logging_config.wavefunction:
+      stats_ops['log_wavefunction'] = qmc_out_per_replica['features']
+      stats_ops['wavefunction_sign'] = qmc_out_per_replica['features_sign']
+    if logging_config.local_energy:
+      stats_ops['local_energy'] = qmc_out_per_replica['local_energies']
+
+    train_schema = ['energy', 'pmove']
+    if logging_config.replicas > 1:
+      train_schema += [
+          'energy_replica:{}'.format(i) for i in range(logging_config.replicas)
+      ]
+
+    # As these data are much larger, we create a separate H5 file to avoid
+    # polluting the CSV.
+    h5_data = ('walkers', 'log_wavefunction', 'wavefunction_sign',
+               'local_energy')
+    h5_schema = {
+        key: stats_ops[key].shape.as_list()
+        for key in h5_data
+        if key in stats_ops
+    }
+
+    if write_graph:
+      tf.train.write_graph(tf.get_default_graph(), os.path.dirname(write_graph),
+                           os.path.basename(write_graph))
+
+    if determinism_mode:
+      config_proto = tf.ConfigProto(
+          inter_op_parallelism_threads=1,
+          device_count={'GPU': 0},
+          allow_soft_placement=True)
+    else:
+      config_proto = tf.ConfigProto(allow_soft_placement=True)
+    # See https://github.com/tensorflow/tensorflow/issues/23780.
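+    # (Our reading of the linked issue: grappler's memory-optimization pass
+    # can rewrite graphs that use tf.custom_gradient incorrectly, so the pass
+    # is disabled outright below.)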
+    config_proto.graph_options.rewrite_options.memory_optimization = (
+        rewriter_config_pb2.RewriterConfig.OFF)
+
+    saver = tf.train.Saver(max_to_keep=10, save_relative_paths=True)
+    scaffold = tf.train.Scaffold(saver=saver)
+    checkpoint_path = os.path.join(logging_config.result_path, 'checkpoints')
+    if logging_config.restore_path:
+      logging.info('Copying %s to %s and attempting to restore',
+                   logging_config.restore_path, checkpoint_path)
+      shutil.copytree(logging_config.restore_path, checkpoint_path)
+
+    with tf.train.MonitoredTrainingSession(
+        scaffold=scaffold,
+        checkpoint_dir=checkpoint_path,
+        save_checkpoint_secs=logging_config.save_frequency * 60,
+        save_checkpoint_steps=None,
+        save_summaries_steps=None,
+        log_step_count_steps=None,
+        config=config_proto,
+    ) as sess:
+      logging.info('Initialized variables')
+
+      if self.network.pretrain_iterations > 0:
+        logging.info('Pretraining network parameters')
+        with Writer(
+            name='pretrain_stats',
+            schema=['loss'],
+            directory=logging_config.result_path) as pretrain_writer:
+          for t in range(self.network.pretrain_iterations):
+            for _ in range(mcmc_steps):
+              sess.run(mcmc_step)
+            for _ in range(mcmc_steps):
+              sess.run(mcmc_hf_step)
+
+            pretrain_loss = sess.run(pretrain_ops)
+            pretrain_writer.write(t, loss=pretrain_loss)
+        logging.info('Pretrained network parameters.')
+
+      if burn_in > 0:
+        logging.info('Resetting MCMC state')
+        sess.run(mcmc_reset_walkers)
+        for _ in range(burn_in):
+          for _ in range(mcmc_steps):
+            sess.run(mcmc_step)
+        logging.info('MCMC burn in complete')
+
+      with Writer(name='train_stats', schema=train_schema, directory=logging_config.result_path) as training_writer, \
+          H5Writer(name='data.h5', schema=h5_schema, directory=logging_config.result_path) as h5_writer:
+        for t in range(iterations):
+          if cached_data_op is not None:
+            sess.run(cached_data_op)
+          sess.run(mcmc_update_psi)
+          for _ in range(mcmc_steps):
+            sess.run(mcmc_step)
+          loss_, stats_ops_ = sess.run([loss, stats_ops])
+          stats_values = [loss_] + list(stats_ops_.values())
+          # Update the contents of (but don't reassign) stats_loss_, so that
+          # the conditional checkpoint check sees the current per-replica loss
+          # via its earlier local-scope capture of stats_loss_.
+          stats_loss_[:] = stats_ops_['loss_per_replica']
+          if any(np.isnan(stat).any() for stat in stats_values):
+            logging.info('NaN value detected. Loss: %s. Stats: %s.', loss_,
+                         stats_ops_)
+            if self._check_loss:
+              # Re-run the mcmc_reset (setting psi) as the checkpoint hook is
+              # evaluated at the end of a session run call.
+              sess.run(mcmc_update_psi)
+              raise RuntimeError(
+                  'NaN value detected. Loss: {}. Stats: {}.'.format(
+                      loss_, stats_ops_))
+
+          if t % logging_config.stats_frequency == 0:
+            writer_kwargs = {
+                'energy': loss_,
+                'pmove': stats_ops_['pmove'],
+            }
+            if logging_config.replicas > 1:
+              for i in range(logging_config.replicas):
+                lpr = stats_ops_['loss_per_replica'][i]
+                writer_kwargs['energy_replica:%d' % i] = lpr
+            training_writer.write(t, **writer_kwargs)
+          h5_writer.write(t, stats_ops_)
diff --git a/ferminet/tests/elements_test.py b/ferminet/tests/elements_test.py
new file mode 100644
index 0000000..c9bf705
--- /dev/null
+++ b/ferminet/tests/elements_test.py
@@ -0,0 +1,121 @@
+# Lint as: python3
+# Copyright 2019 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for ferminet.utils.elements.""" + +from absl.testing import absltest +from absl.testing import parameterized + +from ferminet.utils import elements + + +class ElementsTest(parameterized.TestCase): + + def test_elements(self): + for n, element in elements.ATOMIC_NUMS.items(): + self.assertEqual(n, element.atomic_number) + self.assertEqual(elements.SYMBOLS[element.symbol], element) + if element.symbol in ['Li', 'Na', 'K', 'Rb', 'Cs', 'Fr']: + self.assertEqual(element.period, elements.ATOMIC_NUMS[n - 1].period + 1) + elif element.symbol != 'H': + self.assertEqual(element.period, elements.ATOMIC_NUMS[n - 1].period) + self.assertCountEqual( + (element.symbol for element in elements.ATOMIC_NUMS.values()), + elements.SYMBOLS.keys()) + self.assertCountEqual( + (element.atomic_number for element in elements.SYMBOLS.values()), + elements.ATOMIC_NUMS.keys()) + + @parameterized.parameters( + (elements.SYMBOLS['H'], 1, 1, 1), + (elements.SYMBOLS['He'], 1, 18, 0), + (elements.SYMBOLS['Li'], 2, 1, 1), + (elements.SYMBOLS['Be'], 2, 2, 0), + (elements.SYMBOLS['C'], 2, 14, 2), + (elements.SYMBOLS['N'], 2, 15, 3), + (elements.SYMBOLS['Al'], 3, 13, 1), + (elements.SYMBOLS['Zn'], 4, 12, -1), + (elements.SYMBOLS['Ga'], 4, 13, 1), + (elements.SYMBOLS['Kr'], 4, 18, 0), + (elements.SYMBOLS['Ce'], 6, -1, -1), + (elements.SYMBOLS['Ac'], 7, 3, -1), + ) + def test_element_group_period(self, element, period, group, spin_config): + # Validate subset of elements. See below for more thorough tests using + # properties of the periodic table. + self.assertEqual(element.period, period) + self.assertEqual(element.group, group) + if element.group == -1 or 3 <= element.group <= 12: + with self.assertRaises(NotImplementedError): + _ = element.spin_config + else: + self.assertEqual(element.spin_config, spin_config) + + def test_periods(self): + self.assertLen(elements.ATOMIC_NUMS, + sum(len(period) for period in elements.PERIODS.values())) + period_length = {1: 2, 2: 8, 3: 8, 4: 18, 5: 18, 6: 32, 7: 32} + for p, es in elements.PERIODS.items(): + self.assertLen(es, period_length[p]) + + def test_groups(self): + # Atomic numbers of first element in each period. + period_starts = sorted([ + period_elements[0].atomic_number + for period_elements in elements.PERIODS.values() + ]) + # Iterate over all elements in order of atomic number. Group should + # increment monotonically (except for accommodating absence of d block and + # presence of f block) and reset to 1 on the first element in each period. + for i in range(1, len(elements.ATOMIC_NUMS)+1): + element = elements.ATOMIC_NUMS[i] + if element.atomic_number in period_starts: + prev_group = 0 + fblock = 0 + if element.symbol == 'He': + # Full shell, not just full s subshell. + self.assertEqual(element.group, 18) + elif element.group == -1: + # Found a lanthanide (period 6) or actinide (period 7). + self.assertIn(element.period, [6, 7]) + fblock += 1 + elif element.atomic_number == 5 or element.atomic_number == 13: + # No d block (10 elements, groups 3-12) in periods 2 and 3. 
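+        # e.g. B (Z=5) follows Be (group 2), so its group is 2 + 11 = 13;
+        # likewise Al (Z=13) follows Mg (group 2).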
+ self.assertEqual(element.group, prev_group + 11) + else: + # Group should increment monotonically. + self.assertEqual(element.group, prev_group + 1) + if element.group != -1: + prev_group = element.group + self.assertGreaterEqual(prev_group, 1) + self.assertLessEqual(prev_group, 18) + if element.group == 4 and element.period > 6: + # Should have seen 14 lanthanides (period 6) or 14 actinides (period 7). + self.assertEqual(fblock, 14) + + # The periodic table (up to element 118) contains 7 periods. + # Hydrogen and Helium are placed in groups 1 and 18 respectively. + # Groups 1-2 (s-block) and 13-18 (p-block) are present in the second + # period onwards, groups 3-12 (d-block) the fourth period onwards. + # Check each group contains the expected number of elements. + nelements_in_group = [0]*18 + for element in elements.ATOMIC_NUMS.values(): + if element.group != -1: + nelements_in_group[element.group-1] += 1 + self.assertListEqual(nelements_in_group, [7, 6] + [4]*10 + [6]*5 + [7]) + + +if __name__ == '__main__': + absltest.main() diff --git a/ferminet/tests/hamiltonian_test.py b/ferminet/tests/hamiltonian_test.py new file mode 100644 index 0000000..b7e2106 --- /dev/null +++ b/ferminet/tests/hamiltonian_test.py @@ -0,0 +1,285 @@ +# Lint as: python3 +# Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for ferminet.hamiltonian.""" + +from ferminet import hamiltonian +from ferminet.utils import system +import numpy as np +import tensorflow.compat.v1 as tf + + +def _hydrogen(xs): + """Hydrogen (3D) atom ground state (1s) wavefunction. + + Energy: -1/2 hartrees. + + Args: + xs: tensor (batch_size, 3) of electron positions. + + Returns: + tensor (batch_size, 1) of the (unnormalised) wavefunction at each position. + """ + with tf.name_scope('Psi_H'): + return tf.exp(-tf.norm(xs, axis=1, keepdims=True)) + + +def _helium(xs): + """Compact parametrized Helium wavefunction. + + See https://opencommons.uconn.edu/chem_educ/30/ and Phys Rev A 74, 014501 + (2006). + + Energy: -2.901188 hartrees (compared to -2.9037243770 hartrees for the exact + ground state). + + Args: + xs: tensor (batch_size, 6) of electron positions. + + Returns: + tensor (batch_size, 1) of the (unnormalised) wavefunction at each pair of + positions. 
+ """ + with tf.name_scope('Psi_He'): + x1, x2 = tf.split(xs, 2, axis=1) + x1n = tf.norm(x1, axis=1, keepdims=True) + x2n = tf.norm(x2, axis=1, keepdims=True) + s = x1n + x2n + t = x1n - x2n + u = tf.norm(x1 - x2, axis=1, keepdims=True) + return (tf.exp(-2*s) + * (1 + 0.5*u*tf.exp(-1.013*u)) + * (1 + 0.2119*s*u + 0.1406*t*t - 0.003*u*u)) + + +def _grid_points(start, stop, npoints, ndim): + x1 = np.linspace(start, stop, npoints) + x2 = np.zeros(npoints) + g1 = np.array([x1] + [x2 for _ in range(ndim-1)]).T + g2 = np.array([x1, x1] + [x2 for _ in range(ndim-2)]).T + return np.ascontiguousarray(g1), np.ascontiguousarray(g2) + + +def _wfn_to_log(f): + def _logf(x): + psi = f(x) + return tf.log(tf.abs(psi)), tf.sign(psi) + return _logf + + +class WavefunctionTest(tf.test.TestCase): + """Basic tests for the toy wavefunctions defined above.""" + + def test_hydrogen(self): + grid1, grid2 = _grid_points(-1.1, 1.1, 21, 3) + psi_np = np.exp(-np.abs(grid1[:, 0])) + + psi = _hydrogen(tf.constant(grid1)) + with tf.train.MonitoredSession() as session: + psi_ = np.squeeze(session.run(psi)) + np.testing.assert_allclose(psi_, psi_np) + np.testing.assert_allclose(psi_, psi_[::-1]) + + psi = _hydrogen(tf.constant(grid2)) + with tf.train.MonitoredSession() as session: + psi_ = np.squeeze(session.run(psi)) + np.testing.assert_allclose(psi_, psi_np**np.sqrt(2)) + np.testing.assert_allclose(psi_, psi_[::-1]) + + def test_helium(self): + grid1, grid2 = _grid_points(-1.1, 1.0, 5, 6) + grid1[:, 3] += 0.5 # Avoid nuclear singularity for second electron + grid2[:, 3] += 0.5 # Avoid nuclear singularity for second electron + + # Exact results calculated by hand... + psi1 = np.array([0.07484748391, + 0.17087261364, + 0.42062736263, + 0.14476421949, + 0.06836260905]) + psi2 = np.array([0.037059842334, + 0.114854125198, + 0.403704643707, + 0.123475237250, + 0.040216273342]) + + psi = _helium(tf.constant(grid1)) + with tf.train.MonitoredSession() as session: + psi_ = np.squeeze(session.run(psi)) + np.testing.assert_allclose(psi_, psi1) + + # Check for symmetry: swap x and z and electron 1 and electron 2. + grid1 = np.flip(grid1, axis=1) + psi = _helium(tf.constant(grid1)) + with tf.train.MonitoredSession() as session: + psiz_ = np.squeeze(session.run(psi)) + np.testing.assert_allclose(psi_, psiz_) + + psi = _helium(tf.constant(grid2)) + with tf.train.MonitoredSession() as session: + psi_ = np.squeeze(session.run(psi)) + np.testing.assert_allclose(psi_, psi2) + + +class HamiltonianAtomTest(tf.test.TestCase): + + def test_hydrogen_atom(self): + xs = tf.random_uniform((6, 3), minval=-1.0, maxval=1.0) + atoms = [system.Atom(symbol='H', coords=(0, 0, 0))] + kin, pot = hamiltonian.operators(atoms, 1, 0.0) + logpsi = tf.log(tf.abs(_hydrogen(xs))) + e_loc = kin(logpsi, xs) + pot(xs) + with tf.train.MonitoredSession() as session: + e_loc_ = session.run(e_loc) + # Wavefunction is the ground state eigenfunction. Hence H\Psi = -1/2 \Psi. 
+ np.testing.assert_allclose(e_loc_, -0.5, rtol=1.e-6, atol=1.e-7) + + def test_helium_atom(self): + x1 = np.linspace(-1, 1, 6, dtype=np.float32) + x2 = np.zeros(6, dtype=np.float32) + 0.25 + x12 = np.array([x1] + [x2 for _ in range(5)]).T + atoms = [system.Atom(symbol='He', coords=(0, 0, 0))] + hpsi_test = np.array([ + -0.25170374, -0.43359983, -0.61270618, -1.56245542, -0.39977553, + -0.2300467 + ]) + + log_helium = _wfn_to_log(_helium) + h = hamiltonian.exact_hamiltonian(atoms, 2) + xs = tf.constant(x12) + _, hpsi = h(log_helium, xs) + with tf.train.MonitoredSession() as session: + hpsi_ = session.run(hpsi) + + xs = tf.reverse(xs, axis=[1]) + _, hpsi2 = h(log_helium, xs) + with tf.train.MonitoredSession() as session: + hpsi2_ = session.run(hpsi2) + + np.testing.assert_allclose(hpsi_[:, 0], hpsi_test, rtol=1.e-6) + np.testing.assert_allclose(hpsi_, hpsi2_, rtol=1.e-6) + + +class HamiltonianMoleculeTest(tf.test.TestCase): + + def setUp(self): + super(HamiltonianMoleculeTest, self).setUp() + self.rh1 = np.array([0, 0, 0], dtype=np.float32) + self.rh2 = np.array([0, 0, 200], dtype=np.float32) + self.rhh = self.rh2 - self.rh1 + self.dhh = 1.0 / np.sqrt(np.dot(self.rhh, self.rhh)) + self.xs = (2 * np.random.random((6, 3)) - 1).astype(np.float32) + self.xs = [self.xs + self.rh1, self.xs + self.rh2] + self.atoms = [ + system.Atom(symbol='H', coords=self.rh1), + system.Atom(symbol='H', coords=self.rh2) + ] + + def test_hydrogen_molecular_ion(self): + + def h2_psi(xs): + # Trial wavefunction as linear combination of H 1s wavefunctions centered + # on each atom. + return _hydrogen(xs - self.rh1) + _hydrogen(xs - self.rh2) + + h = hamiltonian.exact_hamiltonian(self.atoms, 1, 0.0) + # H2+; 1 electron. Use points around both nuclei. + xs = tf.constant(np.concatenate(self.xs, axis=0)) + psi, hpsi = h(_wfn_to_log(h2_psi), xs) + with tf.train.MonitoredSession() as session: + psi_, hpsi_ = session.run([psi, hpsi]) + # Leading order correction to energy is nuclear interaction with the far + # nucleus. + np.testing.assert_allclose( + hpsi_, -(0.5 + self.dhh) * psi_, rtol=self.dhh, atol=self.dhh) + + def test_hydrogen_molecule(self): + + def h2_psi(xs): + x1, x2 = tf.split(xs, 2, axis=1) + # Essentially a non-interacting system. + return _hydrogen(x1 - self.rh1) * _hydrogen(x2 - self.rh2) + + h = hamiltonian.exact_hamiltonian(self.atoms, 2, 0.0) + # H2; 2 electrons. Place one electron around each nucleus. + xs = tf.constant(np.concatenate(self.xs, axis=1)) + psi, hpsi = h(_wfn_to_log(h2_psi), xs) + with tf.train.MonitoredSession() as session: + psi_, hpsi_ = session.run([psi, hpsi]) + np.testing.assert_allclose( + hpsi_, -(1 + self.dhh) * psi_, rtol=self.dhh, atol=self.dhh) + + +class R12FeaturesTest(tf.test.TestCase): + + def test_r12_features_atom1(self): + atoms = [system.Atom(symbol='H', coords=(0, 0, 0))] + one = np.ones((1, 3)) + xs = tf.constant(one, dtype=tf.float32) + xs12 = hamiltonian.r12_features(xs, atoms, 1, flatten=True) + with tf.train.MonitoredSession() as session: + xs12_ = session.run(xs12) + # Should add |x|. 
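+    # With flatten=True the features are the scalar distance(s) followed by
+    # the raw positions, so one electron at (1, 1, 1) gives
+    # [sqrt(3), 1, 1, 1].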
+ x_test = np.concatenate([[[np.linalg.norm(one[0])]], one], axis=1) + np.testing.assert_allclose(xs12_, x_test) + + def test_r12_features_atom2(self): + atoms = [system.Atom(symbol='He', coords=(0, 0, 0))] + one = np.concatenate([np.ones((1, 3)), 0.5 * np.ones((1, 3))], axis=1) + xs = tf.constant(one, dtype=tf.float32) + xs12 = hamiltonian.r12_features(xs, atoms, 2, flatten=True) + with tf.train.MonitoredSession() as session: + xs12_ = session.run(xs12) + # Should add |x_1|, |x_2|, |x1-x2| + norm = np.linalg.norm + x_test = np.concatenate([ + [[ + norm(one[0, :3]), + norm(one[0, 3:]), + norm(one[0, :3] - one[0, 3:]), + ]], + one + ], + axis=1) + np.testing.assert_allclose(xs12_, x_test) + + def test_r12_features_molecule(self): + atoms = [ + system.Atom(symbol='H', coords=(0, 0, 0)), + system.Atom(symbol='H', coords=(1, 0, 0)) + ] + one = np.concatenate([np.ones((1, 3)), 0.5 * np.ones((1, 3))], axis=1) + xs = tf.constant(one, dtype=tf.float32) + xs12 = hamiltonian.r12_features(xs, atoms, 2, flatten=True) + with tf.train.MonitoredSession() as session: + xs12_ = session.run(xs12) + # Should add |x_11|, |x_21|, |x_12|, |x_22|, |x1-x2| + norm = np.linalg.norm + x_test = np.concatenate([ + [[ + norm(one[0, :3]), + norm(one[0, :3] - atoms[1].coords), + norm(one[0, 3:]), + norm(one[0, 3:] - atoms[1].coords), + norm(one[0, :3] - one[0, 3:]), + ]], + one + ], + axis=1) + np.testing.assert_allclose(xs12_, x_test, rtol=1e-6) + + +if __name__ == '__main__': + tf.test.main() diff --git a/ferminet/tests/mcmc_test.py b/ferminet/tests/mcmc_test.py new file mode 100644 index 0000000..aa87fb1 --- /dev/null +++ b/ferminet/tests/mcmc_test.py @@ -0,0 +1,119 @@ +# Lint as: python3 +# Copyright 2020 DeepMind Technologies Limited. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for ferminet.mcmc.""" + +from absl.testing import parameterized + +from ferminet import hamiltonian +from ferminet import mcmc +from ferminet.utils import system +import numpy as np +import tensorflow.compat.v1 as tf + + +def _hydrogen(xs): + """Hydrogen (3D) atom ground state (1s) wavefunction. + + Energy: -1/2 hartrees. + + Args: + xs: tensor (batch_size, 3) of electron positions. + + Returns: + tensor (batch_size, 1) of the (unnormalised) wavefunction at each position. + """ + with tf.name_scope('Psi_H'): + return tf.exp(-tf.norm(xs, axis=1, keepdims=True)) + + +def _helium(xs): + """Compact parametrized Helium wavefunction. + + See https://opencommons.uconn.edu/chem_educ/30/ and Phys Rev A 74, 014501 + (2006). + + Energy: -2.901188 hartrees (compared to -2.9037243770 hartrees for the exact + ground state). + + Args: + xs: tensor (batch_size, 6) of electron positions. + + Returns: + tensor (batch_size, 1) of the (unnormalised) wavefunction at each pair of + positions. 
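+
+  The (unnormalised) functional form, in the Hylleraas coordinates
+  s = r1 + r2, t = r1 - r2 and u = |x1 - x2|, is
+
+    psi = exp(-2s) * (1 + 0.5*u*exp(-1.013*u))
+          * (1 + 0.2119*s*u + 0.1406*t^2 - 0.003*u^2)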
+  """
+  with tf.name_scope('Psi_He'):
+    x1, x2 = tf.split(xs, 2, axis=1)
+    x1n = tf.norm(x1, axis=1, keepdims=True)
+    x2n = tf.norm(x2, axis=1, keepdims=True)
+    s = x1n + x2n
+    t = x1n - x2n
+    u = tf.norm(x1 - x2, axis=1, keepdims=True)
+    return (tf.exp(-2*s)
+            * (1 + 0.5*u*tf.exp(-1.013*u))
+            * (1 + 0.2119*s*u + 0.1406*t*t - 0.003*u*u))
+
+
+def _run_mcmc(atoms,
+              nelectrons,
+              net,
+              batch_size=1024,
+              steps=10,
+              dtype=tf.float32):
+  gen = mcmc.MCMC(
+      net,
+      batch_size,
+      [0] * 3 * nelectrons,
+      1.0,
+      0.1,
+      dtype=dtype)
+  kin, pot = hamiltonian.operators(atoms, nelectrons, 0.0)
+  walkers = tf.squeeze(gen.walkers)
+  psi, _ = net(walkers)
+  e_loc = tf.reduce_sum(kin(psi, walkers) + pot(walkers)) / batch_size
+  e = []
+  mcmc_step = gen.step()
+  with tf.train.MonitoredSession() as session:
+    for _ in range(steps):
+      session.run(mcmc_step)
+      e.append(session.run(e_loc))
+  return np.array(e)
+
+
+class McmcTest(tf.test.TestCase, parameterized.TestCase):
+
+  @parameterized.parameters(
+      {'dtype': tf.float32},
+      {'dtype': tf.float64},
+  )
+  def test_hydrogen_atom(self, dtype):
+    atoms = [system.Atom(symbol='H', coords=(0, 0, 0))]
+    def net(x):
+      psi = _hydrogen(x)
+      return (tf.log(tf.abs(psi)), tf.sign(psi))
+    e = _run_mcmc(atoms, 1, net, dtype=dtype)
+    np.testing.assert_allclose(e, -np.ones_like(e) / 2)
+
+  def test_helium_atom(self):
+    atoms = [system.Atom(symbol='He', coords=(0, 0, 0))]
+    def net(x):
+      psi = _helium(x)
+      return (tf.log(tf.abs(psi)), tf.sign(psi))
+    e = _run_mcmc(atoms, 2, net, steps=500)
+    np.testing.assert_allclose(e[100:].mean(), -2.901188, atol=5.e-3)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/ferminet/tests/networks_test.py b/ferminet/tests/networks_test.py
new file mode 100644
index 0000000..a7d45fd
--- /dev/null
+++ b/ferminet/tests/networks_test.py
@@ -0,0 +1,394 @@
+# Lint as: python3
+# Copyright 2019 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for ferminet.networks."""
+
+from absl.testing import parameterized
+from ferminet import networks
+from ferminet.utils import scf
+from ferminet.utils import system
+import numpy as np
+import pyscf
+import tensorflow.compat.v1 as tf
+
+
+class NetworksTest(tf.test.TestCase, parameterized.TestCase):
+
+  def setUp(self):
+    super(NetworksTest, self).setUp()
+    # disable use of temp directory in pyscf.
+    # Test calculations are small enough to fit in RAM and we don't need
+    # checkpoint files.
+ pyscf.lib.param.TMPDIR = None + + @parameterized.parameters( + { + 'hidden_units': [8, 8], + 'after_det': [8, 8, 2], + }, + { + 'hidden_units': [[8, 8], [8, 8]], + 'after_det': [8, 8, 2], + }, + ) + def test_ferminet(self, hidden_units, after_det): + """Check that FermiNet is actually antisymmetric.""" + atoms = [ + system.Atom(symbol='H', coords=(0, 0, -1.0)), + system.Atom(symbol='H', coords=(0, 0, 1.0)), + ] + ne = (3, 2) + x1 = tf.random_normal([3, 3 * sum(ne)]) + xs = tf.split(x1, sum(ne), axis=1) + + # swap indices to test antisymmetry + x2 = tf.concat([xs[1], xs[0]] + xs[2:], axis=1) + x3 = tf.concat( + xs[:ne[0]] + [xs[ne[0] + 1], xs[ne[0]]] + xs[ne[0] + 2:], axis=1) + + ferminet = networks.FermiNet( + atoms=atoms, + nelectrons=ne, + slater_dets=4, + hidden_units=hidden_units, + after_det=after_det) + y1 = ferminet(x1) + y2 = ferminet(x2) + y3 = ferminet(x3) + with tf.train.MonitoredSession() as session: + out1, out2, out3 = session.run([y1, y2, y3]) + np.testing.assert_allclose(out1, -out2, rtol=4.e-5, atol=1.e-6) + np.testing.assert_allclose(out1, -out3, rtol=4.e-5, atol=1.e-6) + + def test_ferminet_mask(self): + """Check that FermiNet with a decaying mask on the output works.""" + atoms = [ + system.Atom(symbol='H', coords=(0, 0, -1.0)), + system.Atom(symbol='H', coords=(0, 0, 1.0)), + ] + ne = (3, 2) + hidden_units = [[8, 8], [8, 8]] + after_det = [8, 8, 2] + x = tf.random_normal([3, 3 * sum(ne)]) + + ferminet = networks.FermiNet( + atoms=atoms, + nelectrons=ne, + slater_dets=4, + hidden_units=hidden_units, + after_det=after_det, + envelope=True) + y = ferminet(x) + with tf.train.MonitoredSession() as session: + session.run(y) + + @parameterized.parameters(1, 3) + def test_ferminet_size(self, size): + atoms = [ + system.Atom(symbol='H', coords=(0, 0, -1.0)), + system.Atom(symbol='H', coords=(0, 0, 1.0)), + ] + ne = (size, size) + nout = 1 + batch_size = 3 + ndet = 4 + hidden_units = [[8, 8], [8, 8]] + after_det = [8, 8, nout] + x = tf.random_normal([batch_size, 3 * sum(ne)]) + + ferminet = networks.FermiNet( + atoms=atoms, + nelectrons=ne, + slater_dets=ndet, + hidden_units=hidden_units, + after_det=after_det) + y = ferminet(x) + for i in range(len(ne)): + self.assertEqual(ferminet._dets[i].shape.as_list(), [batch_size, ndet]) + self.assertEqual(ferminet._orbitals[i].shape.as_list(), + [batch_size, ndet, size, size]) + self.assertEqual(y.shape.as_list(), [batch_size, nout]) + + @parameterized.parameters( + { + 'hidden_units': [8, 8], + 'after_det': [8, 8, 2] + }, + { + 'hidden_units': [[8, 8], [8, 8]], + 'after_det': [8, 8, 2] + }, + ) + def test_ferminet_pretrain(self, hidden_units, after_det): + """Check that FermiNet pretraining runs.""" + atoms = [ + system.Atom(symbol='Li', coords=(0, 0, -1.0)), + system.Atom(symbol='Li', coords=(0, 0, 1.0)), + ] + ne = (4, 2) + x = tf.random_normal([1, 10, 3 * sum(ne)]) + + strategy = tf.distribute.get_strategy() + + with strategy.scope(): + ferminet = networks.FermiNet( + atoms=atoms, + nelectrons=ne, + slater_dets=4, + hidden_units=hidden_units, + after_det=after_det, + pretrain_iterations=10) + + # Test Hartree fock pretraining - no change of position. 
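+    # Pretraining minimises the difference between the network orbitals and
+    # the (fixed) Hartree-Fock orbitals evaluated at the sampled positions x.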
+ hf_approx = scf.Scf(atoms, nelectrons=ne) + hf_approx.run() + pretrain_op_hf = networks.pretrain_hartree_fock(ferminet, x, strategy, + hf_approx) + self.assertEqual(ferminet.pretrain_iterations, 10) + with tf.train.MonitoredSession() as session: + for _ in range(ferminet.pretrain_iterations): + session.run(pretrain_op_hf) + + +class LogDetMatmulTest(tf.test.TestCase, parameterized.TestCase): + + def _make_mats(self, n, m, l1, l2, k, singular=False): + x1 = np.random.randn(n, m, l1, l1).astype(np.float32) + if singular: + # Make one matrix singular + x1[0, 0, 0] = 2.0 * x1[0, 0, 1] + x2 = np.random.randn(n, m, l2, l2).astype(np.float32) + w = np.random.randn(m, k).astype(np.float32) + + x1_tf = tf.constant(x1) + x2_tf = tf.constant(x2) + w_tf = tf.constant(w) + + output = networks.logdet_matmul(x1_tf, x2_tf, w_tf) + return x1, x2, w, x1_tf, x2_tf, w_tf, output + + def _cofactor(self, x): + u, s, v = np.linalg.svd(x) + ss = np.tile(s[..., None], [1, s.shape[0]]) + np.fill_diagonal(ss, 1.0) + z = np.prod(ss, axis=0) + return np.dot(u, np.dot(np.diag(z), v)).transpose() + + @parameterized.parameters(True, False) + def test_logdet_matmul(self, singular): + n = 4 + m = 5 + l1 = 3 + l2 = 4 + k = 2 + x1, x2, w, _, _, _, output = self._make_mats( + n, m, l1, l2, k, singular=singular) + + with tf.Session() as session: + logout, signout = session.run(output) + tf_result = np.exp(logout) * signout + det1 = np.zeros([n, m], dtype=np.float32) + det2 = np.zeros([n, m], dtype=np.float32) + for i in range(n): + for j in range(m): + det1[i, j] = np.linalg.det(x1[i, j]) + det2[i, j] = np.linalg.det(x2[i, j]) + np_result = np.dot(det1 * det2, w) + np.testing.assert_allclose(np_result, tf_result, atol=1e-5, rtol=1e-5) + + @parameterized.parameters(True, False) + def test_logdet_matmul_grad(self, singular): + n = 4 + m = 5 + l1 = 3 + l2 = 4 + k = 2 + result = self._make_mats(n, m, l1, l2, k, singular=singular) + x1, x2, w, x1_tf, x2_tf, w_tf, output = result + + with tf.Session(): + theor_x1, numer_x1 = tf.test.compute_gradient( + x1_tf, [n, m, l1, l1], output[0], [n, k], x_init_value=x1) + np.testing.assert_allclose(theor_x1, numer_x1, atol=1e-2, rtol=1e-3) + + theor_x2, numer_x2 = tf.test.compute_gradient( + x2_tf, [n, m, l2, l2], output[0], [n, k], x_init_value=x2) + np.testing.assert_allclose(theor_x2, numer_x2, atol=1e-2, rtol=1e-3) + + theor_w, numer_w = tf.test.compute_gradient( + w_tf, [m, k], output[0], [n, k], x_init_value=w) + np.testing.assert_allclose(theor_w, numer_w, atol=1e-2, rtol=1e-3) + + @parameterized.parameters(True, False) + def test_logdet_matmul_grad_grad(self, singular): + n = 2 + m = 3 + l1 = 2 + l2 = 3 + k = 2 + result = self._make_mats(n, m, l1, l2, k, singular=singular) + x1, x2, w, x1_tf, x2_tf, w_tf, output = result + glog = np.random.randn(*(output[0].shape.as_list())).astype(np.float32) + glog_tf = tf.constant(glog) + grad_op = tf.gradients(output[0], [x1_tf, x2_tf, w_tf], grad_ys=glog_tf) + inp_op = [x1_tf, x2_tf, w_tf, glog_tf] + inp_np = [x1, x2, w, glog] + + with tf.Session(): + for i in range(len(inp_op)): + for j in range(len(grad_op)): + theor, numer = tf.test.compute_gradient( + inp_op[i], + inp_op[i].shape, + grad_op[j], + grad_op[j].shape, + x_init_value=inp_np[i]) + np.testing.assert_allclose(theor, numer, atol=1e-2, rtol=1.3e-3) + + +class AdjugateGradTest(tf.test.TestCase, parameterized.TestCase): + + @parameterized.parameters( + (2, False), + (2, True), + (8, False), + (8, True), + ) + def test_grad_adj(self, dim, singular): + + def cofactor_tf(xs): + s, u, v = 
tf.linalg.svd(xs) + return networks.cofactor(u, s, v) + + @tf.custom_gradient + def cofactor_tf_custom(xs): + s, u, v = tf.linalg.svd(xs) + + def grad(dy): + r = networks.rho(s, 0.0) + return networks.grad_cofactor(u, v, r, dy) + + return networks.cofactor(u, s, v), grad + + n, m = 6, 3 + x = np.random.randn(n, m, dim, dim).astype(np.float32) + if singular: + # Make one matrix singular + x[0, 0, 0] = 2.0 * x[0, 0, 1] + x_tf = tf.constant(x) + adj1 = cofactor_tf(x_tf) + grad_adj_backprop = tf.gradients(adj1, x_tf) + adj2 = cofactor_tf_custom(x_tf) + grad_adj_closed = tf.gradients(adj2, x_tf) + + with tf.train.MonitoredSession() as session: + backprop, closed = session.run([grad_adj_backprop, grad_adj_closed]) + + np.testing.assert_allclose(backprop, closed, atol=1.e-5, rtol=1.e-3) + + +class GammaTest(tf.test.TestCase, parameterized.TestCase): + + @parameterized.parameters( + ((5,), 0), + ((5,), 2), + (( + 12, + 6, + 5, + ), 0), + (( + 12, + 1, + 8, + ), 2), + (( + 12, + 8, + 8, + ), 2), + ) + def test_gamma(self, shape, nsingular): + + s = tf.Variable(tf.random_uniform(shape)) + if nsingular > 0: + zeros = list(shape) + zeros[-1] = nsingular + s = s[..., shape[-1] - nsingular:].assign(tf.zeros(zeros)) + + g = networks.gamma(s) + + with tf.train.MonitoredSession() as session: + s_, g_ = session.run([s, g]) + + s_flat = np.reshape(s_, (-1, shape[-1])) + gamma_exact = np.ones_like(s_flat) + for i, row in enumerate(s_flat): + for j in range(shape[-1]): + row[j], r_j = 1.0, row[j] + gamma_exact[i, j] = np.prod(row) + row[j] = r_j + gamma_exact = np.reshape(gamma_exact, g_.shape) + np.testing.assert_allclose(g_, gamma_exact, atol=1.e-7) + + +class RhoTest(tf.test.TestCase, parameterized.TestCase): + + @parameterized.parameters( + (None, 2, 0), + (None, 5, 0), + (None, 5, 2), + ((8, 4), 5, 2), + ) + def test_rho(self, tile, dim, nsingular): + + assert dim > 0 + assert dim > nsingular + s = tf.Variable(tf.random_uniform((dim,))) + s = s[dim - nsingular:].assign(tf.zeros(nsingular)) + + s_batch = s + if tile: + for _ in range(len(tile)): + s_batch = tf.expand_dims(s_batch, 0) + s_batch = tf.tile(s_batch, list(tile) + [1]) + + r = networks.rho(s) + r_batch = networks.rho(s_batch) + + with tf.train.MonitoredSession() as session: + s_, r_, r_batch_ = session.run([s, r, r_batch]) + + rho_exact = np.zeros((dim, dim), dtype=np.float32) + for i in range(dim - 1): + s_[i], s_i = 1.0, s_[i] + for j in range(i + 1, dim): + s_[j], s_j = 1.0, s_[j] + rho_exact[i, j] = np.prod(s_) + s_[j] = s_j + s_[i] = s_i + rho_exact = rho_exact + np.transpose(rho_exact) + + atol = 1e-5 + rtol = 1e-5 + np.testing.assert_allclose(r_, rho_exact, atol=atol, rtol=rtol) + + if tile: + r_batch_ = np.reshape(r_batch_, (-1, dim, dim)) + for i in range(r_batch_[0].shape[0]): + np.testing.assert_allclose(r_batch_[i], rho_exact, atol=atol, rtol=rtol) + + +if __name__ == '__main__': + tf.test.main() diff --git a/ferminet/tests/scf_test.py b/ferminet/tests/scf_test.py new file mode 100644 index 0000000..1bf7f37 --- /dev/null +++ b/ferminet/tests/scf_test.py @@ -0,0 +1,238 @@ +# Lint as: python3 +# Copyright 2019 DeepMind Technologies Limited. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for ferminet.scf."""
+
+from typing import List, Tuple
+from absl.testing import parameterized
+from ferminet.utils import scf
+from ferminet.utils import system
+import numpy as np
+import pyscf
+import tensorflow.compat.v1 as tf
+
+
+def create_o2_hf(bond_length):
+  molecule = [
+      system.Atom('O', (0, 0, 0)),
+      system.Atom('O', (0, 0, bond_length))
+  ]
+  spin = 2
+  oxygen_atomic_number = 8
+  nelectrons = [oxygen_atomic_number + spin, oxygen_atomic_number - spin]
+  hf = scf.Scf(molecule=molecule, nelectrons=nelectrons, restricted=False)
+  return hf
+
+
+class ScfTest(tf.test.TestCase, parameterized.TestCase):
+
+  def setUp(self):
+    super(ScfTest, self).setUp()
+    # disable use of temp directory in pyscf.
+    # Test calculations are small enough to fit in RAM and we don't need
+    # checkpoint files.
+    pyscf.lib.param.TMPDIR = None
+
+  @parameterized.parameters(
+      {
+          'molecule': [system.Atom('He', (0, 0, 0))],
+          'nelectrons': (1, 1)
+      },
+      {
+          'molecule': [system.Atom('N', (0, 0, 0))],
+          'nelectrons': (5, 2)
+      },
+      {
+          'molecule': [system.Atom('N', (0, 0, 0))],
+          'nelectrons': (5, 3)
+      },
+      {
+          'molecule': [system.Atom('N', (0, 0, 0))],
+          'nelectrons': (4, 2)
+      },
+      {
+          'molecule': [system.Atom('O', (0, 0, 0))],
+          'nelectrons': (5, 3),
+          'restricted': False,
+      },
+      {
+          'molecule': [
+              system.Atom('N', (0, 0, 0)),
+              system.Atom('N', (0, 0, 1.4))
+          ],
+          'nelectrons': (7, 7)
+      },
+      {
+          'molecule': [
+              system.Atom('O', (0, 0, 0)),
+              system.Atom('O', (0, 0, 1.4))
+          ],
+          'nelectrons': (9, 7),
+          'restricted': False,
+      },
+  )
+  def test_scf_interface(self,
+                         molecule: List[system.Atom],
+                         nelectrons: Tuple[int, int],
+                         restricted: bool = True):
+    """Tests SCF interface to a pyscf calculation.
+
+    pyscf has its own tests so only check that we can run calculations over
+    atoms and simple diatomics using the interface in ferminet.scf.
+
+    Args:
+      molecule: List of system.Atom objects giving atoms in the molecule.
+      nelectrons: Tuple containing number of alpha and beta electrons.
+      restricted: If true, run a restricted Hartree-Fock calculation, otherwise
+        run an unrestricted Hartree-Fock calculation.
+    """
+    npts = 100
+    xs = np.random.randn(npts, 3)
+    hf = scf.Scf(molecule=molecule,
+                 nelectrons=nelectrons,
+                 restricted=restricted)
+    hf.run()
+    mo_vals = hf.eval_mos(xs)
+    self.assertLen(mo_vals, 2)  # alpha-spin orbitals and beta-spin orbitals.
+    for spin_mo_vals in mo_vals:
+      # Evaluate npts points on M orbitals/functions - (npts, M) array.
+      self.assertEqual(spin_mo_vals.shape, (npts, hf._mol.nao_nr()))
+
+  @parameterized.parameters(np.float32, np.float64)
+  def test_tf_eval_mos(self, dtype):
+    """Tests tensorflow op for evaluating Hartree-Fock orbitals.
+
+    Args:
+      dtype: numpy type to use for position vectors.
+    """
+    xs = np.random.randn(100, 3).astype(dtype)
+    hf = create_o2_hf(1.4)
+    hf.run()
+
+    # Evaluate MOs at positions xs directly in pyscf.
+    # hf.eval_mos always returns np.float64 arrays. Cast to dtype for a
+    # like-for-like comparison.
+    mo_vals = [mo_val.astype(dtype) for mo_val in hf.eval_mos(xs, deriv=True)]
+
+    # Evaluate MOs as a tensorflow op.
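+    # Differentiating the sum of squared orbitals gives sum_i 2 phi_i dphi_i/dx
+    # via backprop, which is compared against the analytic pyscf derivatives
+    # below.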
+ tf_xs = tf.constant(xs) + tf_mo_vals = hf.tf_eval_mos(tf_xs, deriv=True) + tf_dmo_vals = tf.gradients(tf.square(tf_mo_vals), tf_xs)[0] + tf_dmo_vals_shape = tf.shape(tf_dmo_vals) + with tf.train.MonitoredSession() as session: + tf_mo_vals_, tf_dmo_vals_ = session.run([tf_mo_vals, tf_dmo_vals]) + tf_dmo_vals_shape_ = session.run(tf_dmo_vals_shape) + tf_mo_vals_ = np.split(tf_mo_vals_, 2, axis=-1) + + for np_val, tf_val in zip(mo_vals, tf_mo_vals_): + self.assertEqual(np_val.dtype, tf_val.dtype) + np.testing.assert_allclose(np_val[0], tf_val) + + np.testing.assert_array_equal(tf_dmo_vals_shape_, xs.shape) + np.testing.assert_array_equal(tf_dmo_vals_shape_, tf_dmo_vals_.shape) + # Compare analytic derivative of orbital^2 with tensorflow backprop. + # Sum over the orbital index and spin channel because of definition of + # tensorflow gradients. + np_deriv = sum( + np.sum(2 * np_val[1:] * np.expand_dims(np_val[0], 0), axis=-1) + for np_val in mo_vals) + # Tensorflow returns (N,3) but pyscf returns (3, N, M). + np_deriv = np.transpose(np_deriv) + self.assertEqual(np_deriv.dtype, tf_dmo_vals_.dtype) + if dtype == np.float32: + atol = 1e-6 + rtol = 1e-6 + else: + atol = 0 + rtol = 1.e-7 + np.testing.assert_allclose(np_deriv, tf_dmo_vals_, atol=atol, rtol=rtol) + + def test_tf_eval_mos_deriv(self): + + hf = create_o2_hf(1.4) + hf.run() + xs = tf.random_normal((100, 3)) + tf_mos_derivs = hf.tf_eval_mos(xs, deriv=True) + tf_mos = hf.tf_eval_mos(xs, deriv=False) + with tf.train.MonitoredSession() as session: + tf_mos_, tf_mos_derivs_ = session.run([tf_mos, tf_mos_derivs]) + np.testing.assert_allclose(tf_mos_, tf_mos_derivs_) + + def test_tf_eval_hf(self): + + # Check we get consistent answers between multiple calls to eval_mos and + # a single tf_eval_hf call. + molecule = [system.Atom('O', (0, 0, 0))] + nelectrons = (5, 3) + hf = scf.Scf(molecule=molecule, + nelectrons=nelectrons, + restricted=False) + hf.run() + + batch = 100 + xs = [np.random.randn(batch, 3) for _ in range(sum(nelectrons))] + mos = [] + for i, x in enumerate(xs): + ispin = 0 if i < nelectrons[0] else 1 + orbitals = hf.eval_mos(x)[ispin] + # Select occupied orbitals via Aufbau. + mos.append(orbitals[:, :nelectrons[ispin]]) + np_mos = (np.stack(mos[:nelectrons[0]], axis=1), + np.stack(mos[nelectrons[0]:], axis=1)) + + tf_xs = tf.constant(np.stack(xs, axis=1)) + tf_mos = hf.tf_eval_hf(tf_xs, deriv=True) + with tf.train.MonitoredSession() as session: + tf_mos_ = session.run(tf_mos) + + for i, (np_mos_mat, tf_mos_mat) in enumerate(zip(np_mos, tf_mos_)): + self.assertEqual(np_mos_mat.shape, tf_mos_mat.shape) + self.assertEqual(np_mos_mat.shape, (batch, nelectrons[i], nelectrons[i])) + np.testing.assert_allclose(np_mos_mat, tf_mos_mat) + + def test_tf_eval_slog_wavefuncs(self): + + # Check TensorFlow evaluation runs and gives correct shapes. 
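+    # Each slog method returns the pair (log|psi|, sign(psi)); the Hartree
+    # product check below verifies that exp(log|psi|) * sign recovers the
+    # plain product wavefunction.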
+ molecule = [system.Atom('O', (0, 0, 0))] + nelectrons = (5, 3) + total_electrons = sum(nelectrons) + num_spatial_dim = 3 + hf = scf.Scf(molecule=molecule, + nelectrons=nelectrons, + restricted=False) + hf.run() + + batch = 100 + rng = np.random.RandomState(1) + flat_positions_np = rng.randn(batch, + total_electrons * num_spatial_dim) + + flat_positions_tf = tf.constant(flat_positions_np) + for method in [hf.tf_eval_slog_slater_determinant, + hf.tf_eval_slog_hartree_product]: + slog_wavefunc, signs = method(flat_positions_tf) + with tf.train.MonitoredSession() as session: + slog_wavefunc_, signs_ = session.run([slog_wavefunc, signs]) + self.assertEqual(slog_wavefunc_.shape, (batch, 1)) + self.assertEqual(signs_.shape, (batch, 1)) + hartree_product = hf.tf_eval_hartree_product(flat_positions_tf) + with tf.train.MonitoredSession() as session: + hartree_product_ = session.run(hartree_product) + np.testing.assert_allclose(hartree_product_, + np.exp(slog_wavefunc_) * signs_) + + +if __name__ == '__main__': + tf.test.main() diff --git a/ferminet/tests/system_test.py b/ferminet/tests/system_test.py new file mode 100644 index 0000000..cb28f98 --- /dev/null +++ b/ferminet/tests/system_test.py @@ -0,0 +1,100 @@ +# Lint as: python3 +# Copyright 2019 DeepMind Technologies Limited. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for ferminet.utils.system.""" + +from absl.testing import absltest +from absl.testing import parameterized +from ferminet.utils import system +from ferminet.utils import units +import numpy as np + + +class SystemAtomCoordsTest(absltest.TestCase): + + def test_atom_coords(self): + xs = np.random.uniform(size=3) + atom = system.Atom(symbol='H', coords=xs, units='angstrom') + np.testing.assert_allclose(atom.coords / xs, [units.BOHR_ANGSTROM]*3) + np.testing.assert_allclose(atom.coords_angstrom, xs) + + def test_atom_units(self): + system.Atom(symbol='H', coords=[1, 2, 3], units='bohr') + system.Atom(symbol='H', coords=[1, 2, 3], units='angstrom') + with self.assertRaises(ValueError): + system.Atom(symbol='H', coords=[1, 2, 3], units='dummy') + + +class SystemCreationsTest(parameterized.TestCase): + + @parameterized.parameters( + {'symbol': 'He', 'charge': 0}, + {'symbol': 'C', 'charge': 0}, + {'symbol': 'Ne', 'charge': 0}, + {'symbol': 'Ne', 'charge': 1}, + {'symbol': 'Ne', 'charge': -1}, + ) + def test_create_atom(self, symbol, charge): + mol, spins = system.atom(symbol, charge=charge) + self.assertLen(mol, 1) + self.assertEqual(mol[0].symbol, symbol) + self.assertEqual(sum(spins), mol[0].atomic_number - charge) + np.testing.assert_allclose(np.asarray(mol[0].coords), np.zeros(3)) + + @parameterized.parameters( + {'symbol': 'LiH'}, + {'symbol': 'Li2'}, + {'symbol': 'N2'}, + {'symbol': 'CO'}, + {'symbol': 'CH4'}, + {'symbol': 'NH3'}, + {'symbol': 'C2H4'}, + {'symbol': 'C4H6'}, + ) + def test_create_molecule(self, symbol): + _, _ = system.molecule(symbol) + + @parameterized.parameters( + {'n': 10, 'r': 1.0}, + {'n': 11, 'r': 1.0}, + {'n': 20, 'r': 2.0}, + ) + def test_create_hydrogen_chain(self, n, r): + mol, spins = system.hn(n, r) + self.assertLen(mol, n) + for atom in mol: + self.assertAlmostEqual(atom.coords[0], 0) + self.assertAlmostEqual(atom.coords[1], 0) + for atom1, atom2 in zip(mol[:-1], mol[1:]): + self.assertAlmostEqual(atom2.coords[2] - atom1.coords[2], r) + self.assertEqual(spins, (n - n // 2, n // 2)) + + @parameterized.parameters( + {'r': 1.0, 'angle': np.pi/4.0}, + {'r': 1.0, 'angle': np.pi/6.0}, + {'r': 2.0, 'angle': np.pi/4.0}, + ) + def test_create_hydrogen_circle(self, r, angle): + mol, spins = system.h4_circle(r, angle) + self.assertEqual(spins, (2, 2)) + self.assertLen(mol, 4) + for atom in mol: + self.assertAlmostEqual(atom.coords[2], 0) + theta = np.abs(np.arctan(atom.coords[1] / atom.coords[0])) + self.assertAlmostEqual(theta, angle) + + +if __name__ == '__main__': + absltest.main() diff --git a/ferminet/tests/train_test.py b/ferminet/tests/train_test.py new file mode 100644 index 0000000..7daf988 --- /dev/null +++ b/ferminet/tests/train_test.py @@ -0,0 +1,268 @@ +# Lint as: python3 +# Copyright 2018 DeepMind Technologies Limited. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+"""Tests for ferminet.train."""
+
+import os
+
+from absl import flags
+from absl.testing import parameterized
+from ferminet import train
+from ferminet.utils import system
+import numpy as np
+import pyscf
+import tensorflow.compat.v1 as tf
+
+FLAGS = flags.FLAGS
+# Default flags are sufficient so mark FLAGS as parsed so we can run the tests
+# with py.test, which imports this file rather than runs it.
+FLAGS.mark_as_parsed()
+
+
+class MolVmcTest(tf.test.TestCase, parameterized.TestCase):
+
+  def setUp(self):
+    super(MolVmcTest, self).setUp()
+    # disable use of temp directory in pyscf.
+    # Test calculations are small enough to fit in RAM and we don't need
+    # checkpoint files.
+    pyscf.lib.param.TMPDIR = None
+
+  def _get_atoms(self, name, kwargs=None):
+    configs = {
+        'h2': lambda _: system.diatomic('H', 'H', bond_length=1.4),
+        'helium': lambda _: system.atom('He'),
+        'helium_triplet': lambda _: system.atom('He', spins=(2, 0)),
+        # helium_triplet checks different numbers of electrons of each spin;
+        # hydrogen checks entirely spin-polarised systems.
+        'hydrogen': lambda _: system.atom('H'),
+        'lithium': lambda _: system.atom('Li'),
+        'hn': lambda kwargs: system.hn(r=1.4, **kwargs),
+    }
+    return configs[name](kwargs)
+
+  @parameterized.parameters(
+      {
+          'name': 'h2',
+          'use_kfac': False
+      },
+      {
+          'name': 'helium',
+          'use_kfac': False
+      },
+      {
+          'name': 'lithium',
+          'use_kfac': False
+      },
+      {
+          'name': 'helium_triplet',
+          'use_kfac': False
+      },
+      {
+          'name': 'hn',
+          'args': {'n': 3},
+          'use_kfac': False
+      },
+      {
+          'name': 'hn',
+          'args': {'n': 3},
+          'use_kfac': True
+      },
+      {
+          'name': 'hn',
+          'args': {'n': 5},
+          'use_kfac': True
+      },
+  )
+  def test_system(self,
+                  name,
+                  args=None,
+                  use_kfac=True):
+    atoms, electrons = self._get_atoms(name, args)
+    train.train(
+        atoms,
+        electrons,
+        batch_size=8,
+        network_config=train.NetworkConfig(determinants=2),
+        pretrain_config=train.PretrainConfig(iterations=20),
+        optim_config=train.OptimConfig(iterations=2, use_kfac=use_kfac),
+        mcmc_config=train.MCMCConfig(steps=2),
+        logging_config=train.LoggingConfig(
+            result_path=self.create_tempdir().full_path),
+        multi_gpu=False)
+
+  def test_restart(self):
+    atoms, electrons = self._get_atoms('lithium')
+
+    batch_size = 8
+    network_config = train.NetworkConfig(
+        determinants=2, hidden_units=((64, 16),) * 3)
+    optim_config = train.OptimConfig(iterations=10, use_kfac=False)
+    result_directory = self.create_tempdir().full_path
+
+    # First run
+    train.train(
+        atoms,
+        electrons,
+        batch_size=batch_size,
+        network_config=network_config,
+        pretrain_config=train.PretrainConfig(iterations=20),
+        optim_config=optim_config,
+        mcmc_config=train.MCMCConfig(steps=10),
+        logging_config=train.LoggingConfig(
+            result_path=os.path.join(result_directory, 'run1')),
+        multi_gpu=False)
+
+    # Restart from first run
+    # Already done pretraining and burn in, so just want to resume training
+    # immediately.
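+    # Restoring is controlled by LoggingConfig.restore_path, which points at
+    # the checkpoint directory written by the first run.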
+    tf.reset_default_graph()
+    train.train(
+        atoms,
+        electrons,
+        batch_size=batch_size,
+        network_config=network_config,
+        pretrain_config=train.PretrainConfig(iterations=0),
+        optim_config=optim_config,
+        mcmc_config=train.MCMCConfig(burn_in=0, steps=10),
+        logging_config=train.LoggingConfig(
+            result_path=os.path.join(result_directory, 'run2'),
+            restore_path=os.path.join(result_directory, 'run1', 'checkpoints')
+        ),
+        multi_gpu=False)
+
+
+class AssignElectronsTest(parameterized.TestCase):

+  def _expected_result(self, molecule, e_per_atom):
+    nelectrons = [sum(spin_electrons) for spin_electrons in zip(*e_per_atom)]
+    if nelectrons[0] == nelectrons[1]:
+      # Default to no flip if every atom has equal numbers of each spin.
+      flip = False
+      for electrons in e_per_atom:
+        if electrons[0] < electrons[1]:
+          flip = True
+          break
+        elif electrons[0] > electrons[1]:
+          flip = False
+          break
+      if flip:
+        e_per_atom = [electrons[::-1] for electrons in e_per_atom]
+    return np.concatenate(
+        [np.tile(atom.coords, e[0]) for atom, e in zip(molecule, e_per_atom)] +
+        [np.tile(atom.coords, e[1]) for atom, e in zip(molecule, e_per_atom)])
+
+  @parameterized.parameters(
+      {
+          'molecule': [('O', 0, 0, 0)],
+          'electrons': (5, 3),
+          'electrons_per_atom': ((5, 3),),
+      },
+      {
+          'molecule': [('O', 0, 0, 0), ('N', 1.2, 0, 0)],
+          'electrons': (8, 7),
+          'electrons_per_atom': ((3, 5), (5, 2)),
+      },
+      {
+          'molecule': [('O', 0, 0, 0), ('N', 1.2, 0, 0)],
+          'electrons': (7, 8),
+          'electrons_per_atom': ((5, 3), (2, 5)),
+      },
+      {
+          'molecule': [('O', 0, 0, 0), ('N', 1.2, 0, 0)],
+          'electrons': (9, 7),
+          'electrons_per_atom': ((4, 5), (5, 2)),
+      },
+      {
+          'molecule': [('O', 0, 0, 0), ('N', 1.2, 0, 0)],
+          'electrons': (10, 7),
+          'electrons_per_atom': ((5, 5), (5, 2)),
+      },
+      {
+          'molecule': [('O', 0, 0, 0), ('N', 1.2, 0, 0)],
+          'electrons': (7, 7),
+          'electrons_per_atom': ((5, 3), (2, 4)),
+      },
+      {
+          'molecule': [('O', 0, 0, 0), ('O', 1.2, 0, 0)],
+          'electrons': (9, 7),
+          'electrons_per_atom': ((4, 4), (5, 3)),
+      },
+      {
+          'molecule': [('O', 0, 0, 0), ('O', 20, 0, 0)],
+          'electrons': (10, 6),
+          'electrons_per_atom': ((5, 3), (5, 3)),
+      },
+      {
+          'molecule': [('H', 0, 0, 0), ('H', 1.2, 0, 0)],
+          'electrons': (1, 1),
+          'electrons_per_atom': ((1, 0), (0, 1)),
+      },
+      {
+          'molecule': [('B', 0, 0, 0), ('H', 1.2, 0, 0), ('H', -0.6, 0.6, 0),
+                       ('H', -0.6, -0.6, 0)],
+          'electrons': (4, 4),
+          'electrons_per_atom': ((3, 2), (0, 1), (1, 0), (0, 1)),
+      },
+      {
+          'molecule': [('B', 0, 0, 0), ('H', 1.2, 0, 0)],
+          'electrons': (3, 2),
+          'electrons_per_atom': ((3, 2), (0, 0)),
+      },
+      {
+          'molecule': [('B', 0, 0, 0), ('H', 1.2, 0, 0), ('H', -0.6, 0.6, 0)],
+          'electrons': (3, 2),
+          'electrons_per_atom': ((3, 2), (0, 0), (0, 0)),
+      },
+  )
+  def test_assign_electrons(self, molecule, electrons, electrons_per_atom):
+    molecule = [system.Atom(v[0], v[1:]) for v in molecule]
+    e_means = train.assign_electrons(molecule, electrons)
+    expected_e_means = self._expected_result(molecule, electrons_per_atom)
+    np.testing.assert_allclose(e_means, expected_e_means)
+
+    e_means = train.assign_electrons(molecule[::-1], electrons)
+    if any(atom.symbol != molecule[0].symbol for atom in molecule):
+      expected_e_means = self._expected_result(molecule[::-1],
+                                               electrons_per_atom[::-1])
+    else:
+      expected_e_means = self._expected_result(molecule[::-1],
+                                               electrons_per_atom)
+    np.testing.assert_allclose(e_means, expected_e_means)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/ferminet/tests/units_test.py b/ferminet/tests/units_test.py
new file mode 100644
index 0000000..40abbde
--- /dev/null
+++ b/ferminet/tests/units_test.py
@@ -0,0 +1,83 
@@
+# Lint as: python3
+# Copyright 2019 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for ferminet.utils.units."""
+
+from absl.testing import absltest
+from ferminet.utils import units
+import numpy as np
+
+
+class UnitsTest(absltest.TestCase):
+
+  def test_angstrom2bohr(self):
+    self.assertAlmostEqual(units.angstrom2bohr(2), 3.77945225091, places=10)
+
+  def test_angstrom2bohr_numpy(self):
+    x = np.random.uniform(size=(3,))
+    x1 = units.angstrom2bohr(x)
+    x2 = np.array([units.angstrom2bohr(v) for v in x])
+    np.testing.assert_allclose(x1, x2)
+
+  def test_bohr2angstrom(self):
+    self.assertAlmostEqual(units.bohr2angstrom(2), 1.05835442134, places=10)
+
+  def test_bohr2angstrom_numpy(self):
+    x = np.random.uniform(size=(3,))
+    x1 = units.bohr2angstrom(x)
+    x2 = np.array([units.bohr2angstrom(v) for v in x])
+    np.testing.assert_allclose(x1, x2)
+
+  def test_angstrom_bohr_round_trip(self):
+    x = np.random.uniform()
+    x1 = units.bohr2angstrom(units.angstrom2bohr(x))
+    self.assertAlmostEqual(x, x1, places=10)
+
+  def test_bohr_angstrom_round_trip(self):
+    x = np.random.uniform()
+    x1 = units.angstrom2bohr(units.bohr2angstrom(x))
+    self.assertAlmostEqual(x, x1, places=10)
+
+  def test_hartree2kcal(self):
+    self.assertAlmostEqual(units.hartree2kcal(2), 1255.018948, places=10)
+
+  def test_hartree2kcal_numpy(self):
+    x = np.random.uniform(size=(3,))
+    x1 = units.hartree2kcal(x)
+    x2 = np.array([units.hartree2kcal(v) for v in x])
+    np.testing.assert_allclose(x1, x2)
+
+  def test_kcal2hartree(self):
+    self.assertAlmostEqual(units.kcal2hartree(2), 0.00318720287, places=10)
+
+  def test_kcal2hartree_numpy(self):
+    x = np.random.uniform(size=(3,))
+    x1 = units.kcal2hartree(x)
+    x2 = np.array([units.kcal2hartree(v) for v in x])
+    np.testing.assert_allclose(x1, x2)
+
+  def test_hartree_kcal_round_trip(self):
+    x = np.random.uniform()
+    x1 = units.kcal2hartree(units.hartree2kcal(x))
+    self.assertAlmostEqual(x, x1, places=10)
+
+  def test_kcal_hartree_round_trip(self):
+    x = np.random.uniform()
+    x1 = units.hartree2kcal(units.kcal2hartree(x))
+    self.assertAlmostEqual(x, x1, places=10)
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/ferminet/train.py b/ferminet/train.py
new file mode 100644
index 0000000..75772d2
--- /dev/null
+++ b/ferminet/train.py
@@ -0,0 +1,542 @@
+# Lint as: python3
+# Copyright 2018 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Learn ground state wavefunctions for molecular systems using VMC.""" + +import copy +import os +from typing import Any, Mapping, Optional, Sequence, Tuple + +from absl import logging +import attr + +from ferminet import hamiltonian +from ferminet import mcmc +from ferminet import mean_corrected_kfac_opt +from ferminet import networks +from ferminet import qmc +from ferminet.utils import elements +from ferminet.utils import scf +from ferminet.utils import system + +import numpy as np +import tensorflow.compat.v1 as tf + + +def _validate_directory(obj, attribute, value): + """Validates value is a directory.""" + del obj + if value and not os.path.isdir(value): + raise ValueError(f'{attribute.name} is not a directory') + + +@attr.s(auto_attribs=True) +class LoggingConfig: + """Logging information for Fermi Nets. + + Attributes: + result_path: directory to use for saving model parameters and calculations + results. Created if does not exist. + save_frequency: frequency (in minutes) at which parameters are saved. + restore_path: directory to use for restoring model parameters. + stats_frequency: frequency (in iterations) at which statistics (via stats + hooks) are updated and stored. + replicas: the number of replicas used during training. Will be set + automatically. + walkers: If true, log walkers at every step. + wavefunction: If true, log wavefunction at every step. + local_energy: If true, log local energy at every step. + config: dictionary of additional information about the calculation setup. + Reported along with calculation statistics. + """ + result_path: str = '.' + save_frequency: float = 10 + restore_path: str = attr.ib(default=None, validator=_validate_directory) + stats_frequency: int = 1 + replicas: int = 1 + walkers: bool = False + wavefunction: bool = False + local_energy: bool = False + config: Mapping[str, Any] = attr.ib(converter=dict, + default=attr.Factory(dict)) + + +@attr.s(auto_attribs=True) +class MCMCConfig: + """Markov Chain Monte Carlo configuration for Fermi Nets. + + Attributes: + burn_in: Number of burn in steps after pretraining. + steps: 'Number of MCMC steps to make between network updates. + init_width: Width of (atom-centred) Gaussians used to generate initial + electron configurations. + move_width: Width of Gaussian used for random moves. + init_means: Iterable of 3*nelectrons giving the mean initial position of + each electron. Configurations are drawn using Gaussians of width + init_width at each 3D position. Alpha electrons are listed before beta + electrons. If empty, electrons are assigned to atoms based upon the + isolated atom spin configuration. Expert use only. + """ + burn_in: int = 100 + steps: int = 10 + init_width: float = 0.8 + move_width: float = 0.02 + init_means: Optional[Sequence[float]] = None + + +@attr.s(auto_attribs=True) +class PretrainConfig: + """Hartree-Fock pretraining algorithm configuration for Fermi Nets. + + Attributes: + iterations: Number of iterations for which to pretrain the network to match + Hartree-Fock orbitals. + basis: Basis set used to run Hartree-Fock calculation in PySCF. + """ + iterations: int = 1000 + basis: str = 'sto-3g' + + +@attr.s(auto_attribs=True) +class OptimConfig: + """Optimization configuration for Fermi Nets. + + Attributes: + iterations: Number of iterations') + clip_el: If not none, scale at which to clip local energy. + learning_rate: learning rate. + learning_rate_decay: exponent of learning rate decay. + learning_rate_delay: scale of the rate decay. 
+    use_kfac: Use the K-FAC optimizer if true, ADAM optimizer otherwise.
+    check_loss: Apply gradient update only if the loss is not NaN. If true,
+      training could be slightly slower but the checkpoint written out when a
+      NaN is detected will be with the network weights which led to the NaN.
+    deterministic: CPU only mode that also enforces determinism. Will run
+      *significantly* slower if used.
+  """
+  iterations: int = 1000000
+  learning_rate: float = 1.e-4
+  learning_rate_decay: float = 1.0
+  learning_rate_delay: float = 10000.0
+  clip_el: float = 5.0
+  use_kfac: bool = True
+  check_loss: bool = False
+  deterministic: bool = False
+
+
+@attr.s(auto_attribs=True)
+class KfacConfig:
+  """K-FAC configuration - see docs at https://github.com/tensorflow/kfac/."""
+  invert_every: int = 1
+  cov_update_every: int = 1
+  damping: float = 0.001
+  cov_ema_decay: float = 0.95
+  momentum: float = 0.0
+  momentum_type: str = attr.ib(
+      default='regular',
+      validator=attr.validators.in_(
+          ['regular', 'adam', 'qmodel', 'qmodel_fixedmu']))
+  adapt_damping: bool = False
+  damping_adaptation_decay: float = 0.9
+  damping_adaptation_interval: int = 5
+  min_damping: float = 1.e-5
+  norm_constraint: float = 0.001
+
+
+@attr.s(auto_attribs=True)
+class NetworkConfig:
+  """Network configuration for Fermi Net.
+
+  Attributes:
+    architecture: The choice of architecture to run the calculation with. Either
+      "ferminet" or "slater" for the Fermi Net and standard Slater determinant
+      respectively.
+    hidden_units: Number of hidden units in each layer of the network. If
+      the Fermi Net with one- and two-electron streams is used, a tuple is
+      provided for each layer, with the first element giving the number of
+      hidden units in the one-electron stream and the second element giving the
+      number of units in the two-electron stream. Otherwise, each layer is
+      represented by a single integer.
+    determinants: Number of determinants to use.
+    r12_en_features: Include r12/distance features between electrons and nuclei.
+      Highly recommended.
+    r12_ee_features: Include r12/distance features between pairs of electrons.
+      Highly recommended.
+    pos_ee_features: Include electron-electron position features. Highly
+      recommended.
+    use_envelope: Include multiplicative exponentially-decaying envelopes on
+      each orbital. Calculations will not converge if set to False.
+    backflow: Include backflow transformation in input coordinates.
+      Only for use if architecture == "slater". Implies build_backflow
+      is also True.
+    build_backflow: Create backflow weights but do not include backflow
+      coordinate transformation in the network. Use to train a Slater-Jastrow
+      architecture and then train a Slater-Jastrow-Backflow architecture
+      based on it in a two-stage optimization process.
+    residual: Use residual connections in network. Recommended.
+    after_det: Number of hidden units in each layer of the neural network after
+      the determinants. By default, just takes a weighted sum of
+      determinants with no nonlinearity.
+    jastrow_en: Include electron-nuclear Jastrow factor. Only relevant with
+      Slater-Jastrow-Backflow architectures.
+    jastrow_ee: Include electron-electron Jastrow factor. Only relevant with
+      Slater-Jastrow-Backflow architectures.
+    jastrow_een: Include electron-electron-nuclear Jastrow factor. Only
+      relevant with Slater-Jastrow-Backflow architectures.
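+
+    For example, the default hidden_units value, ((256, 32),) * 4, gives four
+    layers, each with 256 units in the one-electron stream and 32 units in
+    the two-electron stream.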
+ """ + architecture: str = attr.ib( + default='ferminet', validator=attr.validators.in_(['ferminet', 'slater'])) + hidden_units: Sequence[Tuple[int, int]] = ((256, 32),) * 4 + determinants: int = 16 + r12_en_features: bool = True + r12_ee_features: bool = True + pos_ee_features: bool = True + use_envelope: bool = True + backflow: bool = False + build_backflow: bool = False + residual: bool = True + after_det: Sequence[int] = (1,) + jastrow_en: bool = False + jastrow_ee: bool = False + jastrow_een: bool = False + + +def assign_electrons(molecule, electrons): + """Assigns electrons to atoms using non-interacting spin configurations. + + Args: + molecule: List of Hamiltonian.Atom objects for each atom in the system. + electrons: Pair of ints giving number of alpha (spin-up) and beta + (spin-down) electrons. + + Returns: + 1D np.ndarray of length 3N containing initial mean positions of each + electron based upon the atom positions, where N is the total number of + electrons. The first 3*electrons[0] positions correspond to the alpha + (spin-up) electrons and the next 3*electrons[1] to the beta (spin-down) + electrons. + + Raises: + RuntimeError: if a different number of electrons or different spin + polarisation is generated. + """ + # Assign electrons based upon unperturbed atoms and ignore impact of + # fractional nuclear charge. + nuclei = [int(round(atom.charge)) for atom in molecule] + total_charge = sum(nuclei) - sum(electrons) + # Construct a dummy iso-electronic neutral system. + neutral_molecule = [copy.copy(atom) for atom in molecule] + if total_charge != 0: + logging.warning( + 'Charged system. Using heuristics to set initial electron positions') + charge = 1 if total_charge > 0 else -1 + while total_charge != 0: + # Poor proxy for electronegativity. + atom_index = nuclei.index(max(nuclei) if total_charge < 0 else min(nuclei)) + atom = neutral_molecule[atom_index] + atom.charge -= charge + atom.atomic_number = int(round(atom.charge)) + if int(round(atom.charge)) == 0: + neutral_molecule.pop(atom_index) + else: + atom.symbol = elements.ATOMIC_NUMS[atom.atomic_number].symbol + total_charge -= charge + nuclei = [int(round(atom.charge)) for atom in neutral_molecule] + + spin_pol = lambda electrons: electrons[0] - electrons[1] + abs_spin_pol = abs(spin_pol(electrons)) + if len(neutral_molecule) == 1: + elecs_atom = [electrons] + else: + elecs_atom = [] + spin_pol_assigned = 0 + for ion in neutral_molecule: + # Greedily assign up and down electrons based upon the ground state spin + # configuration of an isolated atom. + atom_spin_pol = elements.ATOMIC_NUMS[ion.atomic_number].spin_config + nelec = ion.atomic_number + na = (nelec + atom_spin_pol) // 2 + nb = nelec - na + # Attempt to keep spin polarisation as close to 0 as possible. + if (spin_pol_assigned > 0 and + spin_pol_assigned + atom_spin_pol > abs_spin_pol): + elec_atom = [nb, na] + else: + elec_atom = [na, nb] + spin_pol_assigned += spin_pol(elec_atom) + elecs_atom.append(elec_atom) + + electrons_assigned = [sum(e) for e in zip(*elecs_atom)] + spin_pol_assigned = spin_pol(electrons_assigned) + if np.sign(spin_pol_assigned) == -np.sign(abs_spin_pol): + # Started with the wrong guess for spin-up vs spin-down. + elecs_atom = [e[::-1] for e in elecs_atom] + spin_pol_assigned = -spin_pol_assigned + + if spin_pol_assigned != abs_spin_pol: + logging.info('Spin polarisation does not match isolated atoms. 
'
+                 'Using heuristics to set initial electron positions.')
+    while spin_pol_assigned != abs_spin_pol:
+      atom_spin_pols = [abs(spin_pol(e)) for e in elecs_atom]
+      atom_index = atom_spin_pols.index(max(atom_spin_pols))
+      elec_atom = elecs_atom[atom_index]
+      if spin_pol_assigned < abs_spin_pol and elec_atom[0] <= elec_atom[1]:
+        elec_atom[0] += 1
+        elec_atom[1] -= 1
+        spin_pol_assigned += 2
+      elif spin_pol_assigned < abs_spin_pol and elec_atom[0] > elec_atom[1]:
+        elec_atom[0] -= 1
+        elec_atom[1] += 1
+        spin_pol_assigned += 2
+      elif spin_pol_assigned > abs_spin_pol and elec_atom[0] > elec_atom[1]:
+        elec_atom[0] -= 1
+        elec_atom[1] += 1
+        spin_pol_assigned -= 2
+      else:
+        elec_atom[0] += 1
+        elec_atom[1] -= 1
+        spin_pol_assigned -= 2
+
+  electrons_assigned = [sum(e) for e in zip(*elecs_atom)]
+  if spin_pol(electrons_assigned) == -spin_pol(electrons):
+    elecs_atom = [e[::-1] for e in elecs_atom]
+    electrons_assigned = electrons_assigned[::-1]
+
+  logging.info(
+      'Electrons assigned %s.', ', '.join([
+          '{}: {}'.format(atom.symbol, elec_atom)
+          for atom, elec_atom in zip(molecule, elecs_atom)
+      ]))
+  if any(e != e_assign for e, e_assign in zip(electrons, electrons_assigned)):
+    raise RuntimeError(
+        'Assigned incorrect number of electrons (%s instead of %s).' %
+        (electrons_assigned, electrons))
+  if any(min(ne) < 0 for ne in zip(*elecs_atom)):
+    raise RuntimeError('Assigned negative number of electrons!')
+  electron_positions = np.concatenate([
+      np.tile(atom.coords, e[0])
+      for atom, e in zip(neutral_molecule, elecs_atom)
+  ] + [
+      np.tile(atom.coords, e[1])
+      for atom, e in zip(neutral_molecule, elecs_atom)
+  ])
+  return electron_positions
+
+
+def train(molecule: Sequence[system.Atom],
+          spins: Tuple[int, int],
+          batch_size: int,
+          network_config: Optional[NetworkConfig] = None,
+          pretrain_config: Optional[PretrainConfig] = None,
+          optim_config: Optional[OptimConfig] = None,
+          kfac_config: Optional[KfacConfig] = None,
+          mcmc_config: Optional[MCMCConfig] = None,
+          logging_config: Optional[LoggingConfig] = None,
+          multi_gpu: bool = False,
+          double_precision: bool = False,
+          graph_path: Optional[str] = None):
+  """Configures and runs training loop.
+
+  Args:
+    molecule: molecule description.
+    spins: pair of ints specifying number of spin-up and spin-down electrons
+      respectively.
+    batch_size: batch size. Also referred to as the number of Markov Chain
+      Monte Carlo configurations/walkers.
+    network_config: network configuration. Default settings in NetworkConfig
+      are used if not specified.
+    pretrain_config: pretraining configuration. Default settings in
+      PretrainConfig are used if not specified.
+    optim_config: optimization configuration. Default settings in OptimConfig
+      are used if not specified.
+    kfac_config: K-FAC configuration. Default settings in KfacConfig are used
+      if not specified.
+    mcmc_config: Markov Chain Monte Carlo configuration. Default settings in
+      MCMCConfig are used if not specified.
+    logging_config: logging and checkpoint configuration. Default settings in
+      LoggingConfig are used if not specified.
+    multi_gpu: Use all available GPUs. Default: use only a single GPU.
+    double_precision: use tf.float64 instead of tf.float32 for all operations.
+      Warning - double precision is not currently functional with K-FAC.
+    graph_path: directory to save a representation of the TF graph to. Not
+      saved if not supplied.
+
+  Raises:
+    RuntimeError: if mcmc_config.init_means is supplied but is of the incorrect
+      length.
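+
+  Example, using the system-building helpers from ferminet.utils.system (a
+  minimal sketch; all other settings take their default configs):
+
+    molecule, spins = system.atom('He')
+    train(molecule, spins, batch_size=256)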
+ """ + + if not mcmc_config: + mcmc_config = MCMCConfig() + if not logging_config: + logging_config = LoggingConfig() + if not pretrain_config: + pretrain_config = PretrainConfig() + if not optim_config: + optim_config = OptimConfig() + if not kfac_config: + kfac_config = KfacConfig() + if not network_config: + network_config = NetworkConfig() + + nelectrons = sum(spins) + precision = tf.float64 if double_precision else tf.float32 + + if multi_gpu: + strategy = tf.distribute.MirroredStrategy() + else: + # Get the default (single-device) strategy. + strategy = tf.distribute.get_strategy() + if multi_gpu: + batch_size = batch_size // strategy.num_replicas_in_sync + logging.info('Setting per-GPU batch size to %s.', batch_size) + logging_config.replicas = strategy.num_replicas_in_sync + logging.info('Running on %s replicas.', strategy.num_replicas_in_sync) + + # Create a re-entrant variable scope for network. + with tf.variable_scope('model') as model: + pass + + with strategy.scope(): + with tf.variable_scope(model, auxiliary_name_scope=False) as model1: + with tf.name_scope(model1.original_name_scope): + fermi_net = networks.FermiNet( + atoms=molecule, + nelectrons=spins, + slater_dets=network_config.determinants, + hidden_units=network_config.hidden_units, + after_det=network_config.after_det, + architecture=network_config.architecture, + r12_ee_features=network_config.r12_ee_features, + r12_en_features=network_config.r12_en_features, + pos_ee_features=network_config.pos_ee_features, + build_backflow=network_config.build_backflow, + use_backflow=network_config.backflow, + jastrow_en=network_config.jastrow_en, + jastrow_ee=network_config.jastrow_ee, + jastrow_een=network_config.jastrow_een, + logdet=True, + envelope=network_config.use_envelope, + residual=network_config.residual, + pretrain_iterations=pretrain_config.iterations) + + scf_approx = scf.Scf( + molecule, + nelectrons=spins, + restricted=False, + basis=pretrain_config.basis) + if pretrain_config.iterations > 0: + scf_approx.run() + + hamiltonian_ops = hamiltonian.operators(molecule, nelectrons) + if mcmc_config.init_means: + if len(mcmc_config.init_means) != 3 * nelectrons: + raise RuntimeError('Initial electron positions of incorrect shape. ' + '({} not {})'.format( + len(mcmc_config.init_means), 3 * nelectrons)) + init_means = [float(x) for x in mcmc_config.init_means] + else: + init_means = assign_electrons(molecule, spins) + + # Build the MCMC state inside the same variable scope as the network. + with tf.variable_scope(model, auxiliary_name_scope=False) as model1: + with tf.name_scope(model1.original_name_scope): + data_gen = mcmc.MCMC( + fermi_net, + batch_size, + init_mu=init_means, + init_sigma=mcmc_config.init_width, + move_sigma=mcmc_config.move_width, + dtype=precision) + with tf.variable_scope('HF_data_gen'): + hf_data_gen = mcmc.MCMC( + scf_approx.tf_eval_slog_hartree_product, + batch_size, + init_mu=init_means, + init_sigma=mcmc_config.init_width, + move_sigma=mcmc_config.move_width, + dtype=precision) + + with tf.name_scope('learning_rate_schedule'): + global_step = tf.train.get_or_create_global_step() + lr = optim_config.learning_rate * tf.pow( + (1.0 / (1.0 + (tf.cast(global_step, tf.float32) / + optim_config.learning_rate_delay))), + optim_config.learning_rate_decay) + + if optim_config.learning_rate < 1.e-10: + logging.warning('Learning rate less than 10^-10. 
Not using an optimiser.')
+    optim_fn = lambda _: None
+    update_cached_data = None
+  elif optim_config.use_kfac:
+    cached_data = tf.get_variable(
+        'MCMC_cache',
+        initializer=tf.zeros(shape=data_gen.walkers.shape, dtype=precision),
+        use_resource=True,
+        trainable=False,
+        dtype=precision,
+    )
+    if kfac_config.adapt_damping:
+      update_cached_data = tf.assign(cached_data, data_gen.walkers)
+    else:
+      update_cached_data = None
+    optim_fn = lambda layer_collection: mean_corrected_kfac_opt.MeanCorrectedKfacOpt(  # pylint: disable=g-long-lambda
+        invert_every=kfac_config.invert_every,
+        cov_update_every=kfac_config.cov_update_every,
+        learning_rate=lr,
+        norm_constraint=kfac_config.norm_constraint,
+        damping=kfac_config.damping,
+        cov_ema_decay=kfac_config.cov_ema_decay,
+        momentum=kfac_config.momentum,
+        momentum_type=kfac_config.momentum_type,
+        loss_fn=lambda x: tf.nn.l2_loss(fermi_net(x)[0]),
+        train_batch=data_gen.walkers,
+        prev_train_batch=cached_data,
+        layer_collection=layer_collection,
+        batch_size=batch_size,
+        adapt_damping=kfac_config.adapt_damping,
+        is_chief=True,
+        damping_adaptation_decay=kfac_config.damping_adaptation_decay,
+        damping_adaptation_interval=kfac_config.damping_adaptation_interval,
+        min_damping=kfac_config.min_damping,
+        use_passed_loss=False,
+        estimation_mode='exact',
+    )
+  else:
+    adam = tf.train.AdamOptimizer(lr)
+    optim_fn = lambda _: adam
+    update_cached_data = None
+
+  qmc_net = qmc.QMC(
+      hamiltonian_ops,
+      fermi_net,
+      data_gen,
+      hf_data_gen,
+      clip_el=optim_config.clip_el,
+      check_loss=optim_config.check_loss,
+  )
+
+  qmc_net.train(
+      optim_fn,
+      optim_config.iterations,
+      logging_config,
+      using_kfac=optim_config.use_kfac,
+      strategy=strategy,
+      scf_approx=scf_approx,
+      global_step=global_step,
+      determinism_mode=optim_config.deterministic,
+      cached_data_op=update_cached_data,
+      write_graph=os.path.abspath(graph_path) if graph_path else None,
+      burn_in=mcmc_config.burn_in,
+      mcmc_steps=mcmc_config.steps,
+  )
diff --git a/ferminet/utils/analysis_tools.py b/ferminet/utils/analysis_tools.py
new file mode 100644
index 0000000..dbe318a
--- /dev/null
+++ b/ferminet/utils/analysis_tools.py
@@ -0,0 +1,114 @@
+# Lint as: python3
+# Copyright 2019 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tools for reading and analysing QMC data."""
+
+from typing import Iterable, Optional, Text, Union
+
+from absl import logging
+import numpy as np
+import pandas as pd
+
+from pyblock import pd_utils as blocking
+
+
+def _format_network(network_option: Union[int, Iterable[int]]) -> Text:
+  """Formats a network configuration to a (short) string.
+
+  Args:
+    network_option: an integer or iterable of integers.
+
+  Returns:
+    String representation of the network option. If the network option is an
+    iterable of the form [V, V, ...], return NxV, where N is the length of the
+    iterable.
+  """
+  try:
+    # pytype doesn't handle try...except TypeError gracefully.
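+    # An integer option (e.g. 16) raises TypeError on indexing and falls
+    # through to the except branch; an iterable with identical entries, such
+    # as [32, 32, 32], is shortened to '3x32'.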
+ if all(xi == network_option[0] for xi in network_option[1:]): # pytype: disable=unsupported-operands + return '{}x{}'.format(len(network_option), network_option[0]) # pytype: disable=unsupported-operands,wrong-arg-types + else: + return '{}'.format(network_option) + except TypeError: + return '{}'.format(network_option) + + +def estimate_stats(df: pd.DataFrame, + burn_in: int, + groups: Optional[Iterable[Text]] = None, + group_by_work_unit: bool = True) -> pd.DataFrame: + """Estimates statistics for the (local) energy. + + Args: + df: pd.DataFrame containing local energy data in the 'eigenvalues' column. + burn_in: number of data points to discard before accumulating statistics to + allow for learning and equilibration time. + groups: list of column names in df to group by. The statistics for each + group are returned, along with the corresponding data for the group. The + group columns should be sufficient to distinguish between separate work + units/calculations in df. + group_by_work_unit: add 'work_unit_id' to the list of groups if not already + present and 'work_unit_id' is a column in df. This is usually helpful + for safety, when each work unit is a separate calculation and should be + treated separately statistically. + + Returns: + pandas DataFrame containing estimates of the mean, standard error and error + in the standard error from a blocking analysis of the local energy for each + group in df. + + Raises: + RuntimeError: If groups is empty or None and group_by_work_unit is False. If + df does not contain a key to group over, insert a dummy column with + identical values or use pyblock directly. + """ + wid = 'work_unit_id' + if groups is None: + groups = [] + else: + groups = list(groups) + if group_by_work_unit and wid not in groups and wid in df.columns: + groups.append(wid) + if not groups: + raise RuntimeError( + 'Must group by at least one key or set group_by_work_unit to True.') + if len(groups) == 1: + index_dict = {'index': groups[0]} + else: + index_dict = {'level_{}'.format(i): group for i, group in enumerate(groups)} + stats_dict = { + 'mean': 'energy', + 'standard error': 'stderr', + 'standard error error': 'stderrerr' + } + def block(key, values): + blocked = blocking.reblock_summary(blocking.reblock(values)[1]) + if not blocked.empty: + return blocked.iloc[0] + else: + logging.warning('Reblocking failed to estimate statistics for %s.', key) + return pd.Series({statistic: np.nan for statistic in stats_dict}) + stats = ( + pd.DataFrame.from_dict({ + n: block(n, d.eigenvalues[burn_in:]) + for n, d in df.groupby(groups) if not d[burn_in:].eigenvalues.empty + }, orient='index') + .reset_index() + .rename(index_dict, axis=1) + .rename(stats_dict, axis=1) + ) + stats = stats.sort_values(by=groups).reset_index(drop=True) + stats['burn_in'] = burn_in + return stats diff --git a/ferminet/utils/elements.py b/ferminet/utils/elements.py new file mode 100644 index 0000000..0faa914 --- /dev/null +++ b/ferminet/utils/elements.py @@ -0,0 +1,221 @@ +# Lint as: python3 +# Copyright 2019 DeepMind Technologies Limited. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Basic data on chemical elements."""
+
+import collections
+
+import attr
+
+
+@attr.s
+class Element:
+  """Chemical element.
+
+  Attributes:
+    symbol: official symbol of element.
+    atomic_number: atomic number of element.
+    period: period to which the element belongs.
+  """
+  symbol = attr.ib()
+  atomic_number = attr.ib()
+  period = attr.ib()
+
+  @property
+  def group(self):
+    """Group to which element belongs. Set to -1 for lanthanides and actinides."""
+    is_lanthanide = (58 <= self.atomic_number <= 71)
+    is_actinide = (90 <= self.atomic_number <= 103)
+    if is_lanthanide or is_actinide:
+      return -1
+    if self.symbol == 'He':
+      # n=1 shell only has s orbital -> He is a noble gas.
+      return 18
+    period_starts = (1, 3, 11, 19, 37, 55, 87)
+    period_start = period_starts[self.period - 1]
+    group_ = self.atomic_number - period_start + 1
+    # Adjust for absence of d block in periods 2 and 3.
+    if self.period < 4 and group_ > 2:
+      group_ += 10
+    # Adjust for Lanthanides and Actinides in periods 6 and 7.
+    if self.period >= 6 and group_ > 3:
+      group_ -= 14
+    return group_
+
+  @property
+  def spin_config(self):
+    """Canonical spin configuration (via Hund's rules) of neutral atom.
+
+    Returns:
+      Number of unpaired electrons (as required by PySCF) in the neutral atom's
+      ground state.
+    """
+    unpaired = {1: 1, 2: 0, 13: 1, 14: 2, 15: 3, 16: 2, 17: 1, 18: 0}
+    if self.group in unpaired:
+      return unpaired[self.group]
+    else:
+      raise NotImplementedError(
+          'Spin configuration for transition metals not set.')
+
+
+# Atomic symbols for all known elements.
+# Generated using
+# def _element(symbol, atomic_number):
+#   # period_start[n] = atomic number of group 1 element in (n+1)-th period.
+#   period_start = (1, 3, 11, 19, 37, 55, 87)
+#   for p, group1_no in enumerate(period_start):
+#     if atomic_number < group1_no:
+#       # In previous period but n is 0-based.
+#       period = p
+#       break
+#   else:
+#     period = p + 1
+#   return Element(symbol=symbol, atomic_number=atomic_number, period=period)
+# [_element(s, n+1) for n, s in enumerate(symbols)]
+# where symbols is the list of chemical symbols of all elements.
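+#
+# Example (illustrative): oxygen has atomic_number=8 and period=2, so the
+# `group` property above yields group 16, and `spin_config` then reports the
+# two unpaired 2p electrons of the neutral atom.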
+_ELEMENTS = ( + Element(symbol='H', atomic_number=1, period=1), + Element(symbol='He', atomic_number=2, period=1), + Element(symbol='Li', atomic_number=3, period=2), + Element(symbol='Be', atomic_number=4, period=2), + Element(symbol='B', atomic_number=5, period=2), + Element(symbol='C', atomic_number=6, period=2), + Element(symbol='N', atomic_number=7, period=2), + Element(symbol='O', atomic_number=8, period=2), + Element(symbol='F', atomic_number=9, period=2), + Element(symbol='Ne', atomic_number=10, period=2), + Element(symbol='Na', atomic_number=11, period=3), + Element(symbol='Mg', atomic_number=12, period=3), + Element(symbol='Al', atomic_number=13, period=3), + Element(symbol='Si', atomic_number=14, period=3), + Element(symbol='P', atomic_number=15, period=3), + Element(symbol='S', atomic_number=16, period=3), + Element(symbol='Cl', atomic_number=17, period=3), + Element(symbol='Ar', atomic_number=18, period=3), + Element(symbol='K', atomic_number=19, period=4), + Element(symbol='Ca', atomic_number=20, period=4), + Element(symbol='Sc', atomic_number=21, period=4), + Element(symbol='Ti', atomic_number=22, period=4), + Element(symbol='V', atomic_number=23, period=4), + Element(symbol='Cr', atomic_number=24, period=4), + Element(symbol='Mn', atomic_number=25, period=4), + Element(symbol='Fe', atomic_number=26, period=4), + Element(symbol='Co', atomic_number=27, period=4), + Element(symbol='Ni', atomic_number=28, period=4), + Element(symbol='Cu', atomic_number=29, period=4), + Element(symbol='Zn', atomic_number=30, period=4), + Element(symbol='Ga', atomic_number=31, period=4), + Element(symbol='Ge', atomic_number=32, period=4), + Element(symbol='As', atomic_number=33, period=4), + Element(symbol='Se', atomic_number=34, period=4), + Element(symbol='Br', atomic_number=35, period=4), + Element(symbol='Kr', atomic_number=36, period=4), + Element(symbol='Rb', atomic_number=37, period=5), + Element(symbol='Sr', atomic_number=38, period=5), + Element(symbol='Y', atomic_number=39, period=5), + Element(symbol='Zr', atomic_number=40, period=5), + Element(symbol='Nb', atomic_number=41, period=5), + Element(symbol='Mo', atomic_number=42, period=5), + Element(symbol='Tc', atomic_number=43, period=5), + Element(symbol='Ru', atomic_number=44, period=5), + Element(symbol='Rh', atomic_number=45, period=5), + Element(symbol='Pd', atomic_number=46, period=5), + Element(symbol='Ag', atomic_number=47, period=5), + Element(symbol='Cd', atomic_number=48, period=5), + Element(symbol='In', atomic_number=49, period=5), + Element(symbol='Sn', atomic_number=50, period=5), + Element(symbol='Sb', atomic_number=51, period=5), + Element(symbol='Te', atomic_number=52, period=5), + Element(symbol='I', atomic_number=53, period=5), + Element(symbol='Xe', atomic_number=54, period=5), + Element(symbol='Cs', atomic_number=55, period=6), + Element(symbol='Ba', atomic_number=56, period=6), + Element(symbol='La', atomic_number=57, period=6), + Element(symbol='Ce', atomic_number=58, period=6), + Element(symbol='Pr', atomic_number=59, period=6), + Element(symbol='Nd', atomic_number=60, period=6), + Element(symbol='Pm', atomic_number=61, period=6), + Element(symbol='Sm', atomic_number=62, period=6), + Element(symbol='Eu', atomic_number=63, period=6), + Element(symbol='Gd', atomic_number=64, period=6), + Element(symbol='Tb', atomic_number=65, period=6), + Element(symbol='Dy', atomic_number=66, period=6), + Element(symbol='Ho', atomic_number=67, period=6), + Element(symbol='Er', atomic_number=68, period=6), + Element(symbol='Tm', 
atomic_number=69, period=6), + Element(symbol='Yb', atomic_number=70, period=6), + Element(symbol='Lu', atomic_number=71, period=6), + Element(symbol='Hf', atomic_number=72, period=6), + Element(symbol='Ta', atomic_number=73, period=6), + Element(symbol='W', atomic_number=74, period=6), + Element(symbol='Re', atomic_number=75, period=6), + Element(symbol='Os', atomic_number=76, period=6), + Element(symbol='Ir', atomic_number=77, period=6), + Element(symbol='Pt', atomic_number=78, period=6), + Element(symbol='Au', atomic_number=79, period=6), + Element(symbol='Hg', atomic_number=80, period=6), + Element(symbol='Tl', atomic_number=81, period=6), + Element(symbol='Pb', atomic_number=82, period=6), + Element(symbol='Bi', atomic_number=83, period=6), + Element(symbol='Po', atomic_number=84, period=6), + Element(symbol='At', atomic_number=85, period=6), + Element(symbol='Rn', atomic_number=86, period=6), + Element(symbol='Fr', atomic_number=87, period=7), + Element(symbol='Ra', atomic_number=88, period=7), + Element(symbol='Ac', atomic_number=89, period=7), + Element(symbol='Th', atomic_number=90, period=7), + Element(symbol='Pa', atomic_number=91, period=7), + Element(symbol='U', atomic_number=92, period=7), + Element(symbol='Np', atomic_number=93, period=7), + Element(symbol='Pu', atomic_number=94, period=7), + Element(symbol='Am', atomic_number=95, period=7), + Element(symbol='Cm', atomic_number=96, period=7), + Element(symbol='Bk', atomic_number=97, period=7), + Element(symbol='Cf', atomic_number=98, period=7), + Element(symbol='Es', atomic_number=99, period=7), + Element(symbol='Fm', atomic_number=100, period=7), + Element(symbol='Md', atomic_number=101, period=7), + Element(symbol='No', atomic_number=102, period=7), + Element(symbol='Lr', atomic_number=103, period=7), + Element(symbol='Rf', atomic_number=104, period=7), + Element(symbol='Db', atomic_number=105, period=7), + Element(symbol='Sg', atomic_number=106, period=7), + Element(symbol='Bh', atomic_number=107, period=7), + Element(symbol='Hs', atomic_number=108, period=7), + Element(symbol='Mt', atomic_number=109, period=7), + Element(symbol='Ds', atomic_number=110, period=7), + Element(symbol='Rg', atomic_number=111, period=7), + Element(symbol='Cn', atomic_number=112, period=7), + Element(symbol='Nh', atomic_number=113, period=7), + Element(symbol='Fl', atomic_number=114, period=7), + Element(symbol='Mc', atomic_number=115, period=7), + Element(symbol='Lv', atomic_number=116, period=7), + Element(symbol='Ts', atomic_number=117, period=7), + Element(symbol='Og', atomic_number=118, period=7), +) + + +ATOMIC_NUMS = {element.atomic_number: element for element in _ELEMENTS} + + +# Lookup by symbol instead of atomic number. +SYMBOLS = {element.symbol: element for element in _ELEMENTS} + + +# Lookup by period. +PERIODS = collections.defaultdict(list) +for element in _ELEMENTS: + PERIODS[element.period].append(element) +PERIODS = {period: tuple(elements) for period, elements in PERIODS.items()} diff --git a/ferminet/utils/scf.py b/ferminet/utils/scf.py new file mode 100644 index 0000000..887332d --- /dev/null +++ b/ferminet/utils/scf.py @@ -0,0 +1,348 @@ +# Lint as: python3 +# Copyright 2019 DeepMind Technologies Limited. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Interaction with Hartree-Fock solver in pyscf."""
+
+# Abbreviations used:
+# SCF: self-consistent field (method). Another name for Hartree-Fock.
+# HF: Hartree-Fock method.
+# RHF: restricted Hartree-Fock. Requires the molecular orbitals for the i-th
+# alpha-spin and i-th beta-spin electrons to have the same spatial component.
+# ROHF: restricted open-shell Hartree-Fock. Same as RHF except it allows the
+# number of alpha and beta electrons to differ.
+# UHF: unrestricted Hartree-Fock. Permits breaking of spin symmetry and hence
+# alpha and beta electrons to have different spatial components.
+# AO: Atomic orbital. Underlying basis set (typically Gaussian-type orbitals and
+# built into pyscf).
+# MO: molecular orbitals/Hartree-Fock orbitals. Single-particle orbitals which
+# are solutions to the Hartree-Fock equations.
+
+
+from typing import Callable, Sequence, Text, Tuple
+
+from absl import logging
+from ferminet.utils import system
+import numpy as np
+import pyscf
+import tensorflow.compat.v1 as tf
+
+
+class Scf:
+  """Helper class for running Hartree-Fock (self-consistent field) with pyscf.
+
+  Attributes:
+    molecule: list of system.Atom objects giving the atoms in the molecule
+      and their positions.
+    nelectrons: Tuple with number of alpha electrons and beta electrons.
+    basis: Basis set to use, best specified with the relevant string for a
+      built-in basis set in pyscf. A user-defined basis set can be used
+      (advanced). See https://sunqm.github.io/pyscf/gto.html#input-basis for
+      more details.
+    restricted: If true, use the restricted Hartree-Fock method; otherwise use
+      the unrestricted Hartree-Fock method.
+  """
+
+  def __init__(self,
+               molecule: Sequence[system.Atom],
+               nelectrons: Tuple[int, int],
+               basis: Text = 'cc-pVTZ',
+               restricted: bool = True):
+    self.molecule = molecule
+    self.nelectrons = nelectrons
+    self.basis = basis
+    self.restricted = restricted
+    self._mean_field = None
+    self._mol = None
+
+    self._spin = nelectrons[0] - nelectrons[1]
+    pyscf.lib.param.TMPDIR = None
+
+  def run(self):
+    """Runs the Hartree-Fock calculation.
+
+    Returns:
+      A pyscf scf object (i.e. pyscf.scf.rhf.RHF, pyscf.scf.uhf.UHF or
+      pyscf.scf.rohf.ROHF depending on the spin and restricted settings).
+
+    Raises:
+      RuntimeError: If the number of electrons in the PySCF molecule is not
+        consistent with self.nelectrons.
+    """
+    if any(atom.atomic_number - atom.charge > 1.e-8 for atom in self.molecule):
+      logging.info(
+          'Fractional nuclear charge detected. Running SCF on atoms with integer charge.'
+      )
+    nuclear_charge = sum(atom.atomic_number for atom in self.molecule)
+    charge = nuclear_charge - sum(self.nelectrons)
+    self._mol = pyscf.gto.Mole(
+        atom=[[atom.symbol, atom.coords] for atom in self.molecule],
+        unit='bohr')
+    self._mol.basis = self.basis
+    self._mol.spin = self._spin
+    self._mol.charge = charge
+    self._mol.build()
+    if self._mol.nelectron != sum(self.nelectrons):
+      raise RuntimeError('PySCF molecule not consistent with QMC molecule.')
+    if self.restricted:
+      self._mean_field = pyscf.scf.RHF(self._mol)
+    else:
+      self._mean_field = pyscf.scf.UHF(self._mol)
+    self._mean_field.init_guess = 'atom'
+    self._mean_field.kernel()
+    return self._mean_field
+
+  def eval_mos(self, positions: np.ndarray,
+               deriv: bool = False) -> Tuple[np.ndarray, np.ndarray]:
+    """Evaluates the Hartree-Fock single-particle orbitals at a set of points.
+
+    Args:
+      positions: numpy array of shape (N, 3) of the positions in space at which
+        to evaluate the Hartree-Fock orbitals.
+      deriv: If True, also calculate the first derivatives of the
+        single-particle orbitals.
+
+    Returns:
+      Pair of numpy float64 arrays of shape (N, M) (deriv=False) or (4, N, M)
+      (deriv=True), where 2M is the number of Hartree-Fock orbitals. The (i-th,
+      j-th) element in the first (second) array gives the value of the j-th
+      alpha (beta) Hartree-Fock orbital at the i-th electron position in
+      positions. For restricted (RHF, ROHF) calculations, the two arrays are
+      identical.
+      If deriv=True, the first index contains [value, x derivative, y
+      derivative, z derivative].
+
+    Raises:
+      RuntimeError: If the Hartree-Fock calculation has not been performed
+        using `run`.
+      NotImplementedError: If the Hartree-Fock calculation used Cartesian
+        Gaussian-type orbitals as the underlying basis set.
+    """
+    if self._mean_field is None:
+      raise RuntimeError('Mean-field calculation has not been run.')
+    if self.restricted:
+      coeffs = (self._mean_field.mo_coeff,)
+    else:
+      coeffs = self._mean_field.mo_coeff
+    # Assumes self._mol.cart (use of Cartesian Gaussian-type orbitals and
+    # integrals) is False (default behaviour of pyscf).
+    if self._mol.cart:
+      raise NotImplementedError(
+          'Evaluation of molecular orbitals using cartesian GTOs.')
+    # Note sph refers to the use of spherical GTO basis sets rather than
+    # Cartesian GTO basis sets. The coordinate system used for the electron
+    # positions is Cartesian in both cases.
+    gto_op = 'GTOval_sph_deriv1' if deriv else 'GTOval_sph'
+    ao_values = self._mol.eval_gto(gto_op, positions)
+    mo_values = tuple(np.matmul(ao_values, coeff) for coeff in coeffs)
+    if self.restricted:
+      # Duplicate for beta electrons.
+      mo_values *= 2
+    return mo_values
+
+  def tf_eval_mos(self, positions: tf.Tensor,
+                  deriv: bool = False) -> tf.Tensor:
+    """Evaluates the Hartree-Fock single-particle orbitals as a tensorflow op.
+
+    See: `eval_mos` for evaluating the orbitals given positions as numpy arrays.
+
+    Args:
+      positions: Tensor of shape (N, 3) containing N 3D position vectors at
+        which the Hartree-Fock orbitals are evaluated.
+      deriv: If True, also evaluates the first derivatives of the orbitals with
+        respect to positions and makes them available via tf.gradients.
+
+    Returns:
+      Tensor of shape (N, 2M) and the same dtype as positions. The first
+      (second) M elements along the last axis give the value of the alpha-spin
+      (beta-spin) Hartree-Fock orbitals at the desired positions.
+
+    Raises:
+      RuntimeError: If the first derivatives are requested (via tf.gradients)
+        but deriv is False.
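+
+    Example (illustrative; assumes `run` has already been called on an Scf
+    instance named scf_approx, as in train.py above):
+      positions = tf.placeholder(tf.float64, shape=(None, 3))
+      mo_values = scf_approx.tf_eval_mos(positions)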
+    """
+
+    # Evaluate MOs in a nested function to avoid issues with custom_gradients
+    # and self or with the deriv flag argument.
+
+    @tf.custom_gradient
+    def _eval_mos(positions: tf.Tensor
+                 ) -> Tuple[tf.Tensor, Callable[[tf.Tensor], tf.Tensor]]:
+      """Computes Hartree-Fock orbitals at given positions."""
+      mo_values_derivs = tf.py_func(self.eval_mos, [positions, deriv],
+                                    (tf.float64, tf.float64))
+      mo_values_derivs = [
+          tf.cast(vals, positions.dtype) for vals in mo_values_derivs
+      ]
+      if deriv:
+        mo_values = tf.concat([vals[0] for vals in mo_values_derivs], axis=-1)
+        mo_derivs = tf.concat([vals[1:] for vals in mo_values_derivs], axis=-1)
+      else:
+        mo_values = tf.concat(mo_values_derivs, axis=-1)
+        mo_derivs = None
+      def grad(dmo: tf.Tensor) -> tf.Tensor:
+        """Computes gradients of orbitals with respect to positions."""
+        # dmo, the reverse sensitivity, is a (N, 2M) tensor.
+        # mo_derivs is the full Jacobian, (3, N, 2M), where mo_derivs[i,j,k] is
+        # d\phi_{k} / dr^j_i (derivative of the k-th orbital with respect to the
+        # i-th component of the j-th position vector).
+        # positions is a (N, 3) tensor => tf.gradients(mo, positions) is (N, 3).
+        # tensorflow gradients use summation over free indices (i.e. orbitals).
+        if mo_derivs is None:
+          raise RuntimeError(
+              'Gradients not computed in forward pass. Set deriv=True.')
+        g = tf.reduce_sum(mo_derivs * tf.expand_dims(dmo, 0), axis=-1)
+        return tf.linalg.transpose(g)
+      return mo_values, grad
+
+    return _eval_mos(positions)
+
+  def tf_eval_hf(self, positions: tf.Tensor,
+                 deriv: bool = False) -> Tuple[tf.Tensor, tf.Tensor]:
+    """Evaluates Hartree-Fock occupied orbitals at a set of electron positions.
+
+    Note: pyscf evaluates all orbitals at each position, which is somewhat
+    wasteful for this use-case, where we only need occupied orbitals and further
+    factorise the wavefunction into alpha and beta parts.
+
+    Args:
+      positions: (N, nelectrons, 3) tensor. Batch of positions for each
+        electron. The first self.nelectrons[0] elements in the second axis refer
+        to alpha electrons, the next self.nelectrons[1] positions refer to the
+        beta electrons.
+      deriv: If True, also evaluates the first derivatives of the orbitals with
+        respect to positions and makes them available via tf.gradients.
+
+    Returns:
+      Tuple of tensors for alpha and beta electrons respectively of shape
+      (N, nspin, nspin), where nspin is the corresponding number of electrons of
+      that spin. The (i, j, k) element gives the value of the k-th orbital for
+      the j-th electron in batch i. The product of the determinants of the
+      tensors hence gives the Hartree-Fock determinant at the set of input
+      positions.
+
+    Raises:
+      RuntimeError: If the number of electron positions does not match
+        sum(self.nelectrons).
+    """
+    if sum(self.nelectrons) != positions.shape.as_list()[1]:
+      raise RuntimeError(
+          'positions does not contain correct number of electrons.')
+    # Evaluate MOs at all electron positions.
+    # pyscf evaluates all 2M MOs at each 3D position vector. Hence reshape into
+    # (N*nelectron, 3) tensor for pyscf input.
+    one_e_positions = tf.reshape(positions, (-1, positions.shape[-1]))
+    mos = self.tf_eval_mos(one_e_positions, deriv)
+    nbasis = tf.shape(mos)[-1] // 2
+    # Undo reshaping of electron positions to give (N, nelectrons, 2M), where
+    # the (i, j, k) element gives the value of the k-th MO at the j-th electron
+    # position in the i-th batch.
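+    # Shape illustration (hypothetical sizes): with nelectrons=(3, 2) and 10
+    # MOs per spin channel, mos above has shape (5N, 20) and is reshaped below
+    # to (N, 5, 20).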
+    mos = tf.reshape(mos, (-1, sum(self.nelectrons), 2*nbasis))
+    # Return (using the Aufbau principle) the matrices for the occupied alpha
+    # and beta orbitals. Number of alpha electrons given by self.nelectrons[0].
+    alpha_spin = mos[:, :self.nelectrons[0], :self.nelectrons[0]]
+    beta_spin = mos[:, self.nelectrons[0]:, nbasis:nbasis+self.nelectrons[1]]
+    return alpha_spin, beta_spin
+
+  def tf_eval_slog_slater_determinant(self, flat_positions: tf.Tensor
+                                     ) -> Tuple[tf.Tensor, tf.Tensor]:
+    """Evaluates the signed log of the HF Slater determinant at flat_positions.
+
+    Interface is chosen to be similar to that used for QMC networks.
+
+    Args:
+      flat_positions: (N, total_electrons * 3) tensor. Batch of flattened
+        positions for each electron, reshapeable to (N, total_electrons, 3)
+        where the first self.nelectrons[0] elements in the second axis refer to
+        alpha electrons, the next self.nelectrons[1] positions refer to the beta
+        electrons.
+
+    Returns:
+      log_abs_slater_determinant: (N, 1) tensor. Log of the absolute value of
+        the Slater determinant.
+      sign: (N, 1) tensor. Sign of the wavefunction.
+
+    Raises:
+      RuntimeError: on incorrect shape of flat_positions.
+    """
+    xs = tf.reshape(flat_positions,
+                    [flat_positions.shape[0],
+                     sum(self.nelectrons), -1])
+    if xs.shape[2] != 3:
+      msg = 'flat_positions must be of shape (N, total_electrons*3)'
+      raise RuntimeError(msg)
+    matrices = self.tf_eval_hf(xs)
+    slogdets = [tf.linalg.slogdet(elem) for elem in matrices]
+    sign_alpha, sign_beta = [elem[0] for elem in slogdets]
+    log_abs_wf_alpha, log_abs_wf_beta = [elem[1] for elem in slogdets]
+    log_abs_slater_determinant = tf.expand_dims(tf.math.add(log_abs_wf_alpha,
+                                                            log_abs_wf_beta), 1)
+    sign = tf.expand_dims(tf.math.multiply(sign_alpha, sign_beta), 1)
+    return log_abs_slater_determinant, sign
+
+  def tf_eval_slog_hartree_product(self, flat_positions: tf.Tensor,
+                                   deriv: bool = False
+                                  ) -> Tuple[tf.Tensor, tf.Tensor]:
+    """Evaluates the signed log Hartree product without anti-symmetrization.
+
+    Interface is chosen to be similar to that used for QMC networks.
+
+    Args:
+      flat_positions: (N, total_electrons * 3) tensor. Batch of flattened
+        positions for each electron, reshapeable to (N, total_electrons, 3)
+        where the first self.nelectrons[0] elements in the second axis refer to
+        alpha electrons, the next self.nelectrons[1] positions refer to the beta
+        electrons.
+      deriv: specifies if we will need derivatives.
+
+    Returns:
+      log_abs_hartree_product: (N, 1) tensor. Log of the absolute value of the
+        Hartree product.
+      sign: (N, 1) tensor. Sign of the Hartree product.
+    """
+    xs = tf.reshape(flat_positions,
+                    [int(flat_positions.shape[0]),
+                     sum(self.nelectrons), -1])
+    matrices = self.tf_eval_hf(xs, deriv)
+    log_abs_alpha, log_abs_beta = [
+        tf.linalg.trace(tf.log(tf.abs(elem))) for elem in matrices
+    ]
+    sign_alpha, sign_beta = [
+        tf.reduce_prod(tf.sign(tf.linalg.diag_part(elem)), axis=1)
+        for elem in matrices
+    ]
+    log_abs_hartree_product = tf.expand_dims(
+        tf.math.add(log_abs_alpha, log_abs_beta), axis=1)
+    sign = tf.expand_dims(tf.math.multiply(sign_alpha, sign_beta), axis=1)
+    return log_abs_hartree_product, sign
+
+  def tf_eval_hartree_product(self, flat_positions: tf.Tensor,
+                              deriv: bool = False) -> tf.Tensor:
+    """Evaluates the Hartree product without anti-symmetrization.
+
+    Interface is chosen to be similar to that used for QMC networks. This is
+    a convenience wrapper around tf_eval_slog_hartree_product and is hence not
+    optimised.
+
+    Args:
+      flat_positions: (N, total_electrons * 3) tensor. 
Batch of flattened + positions for each electron, reshapeable to (N, total_electrons, 3) + where the first self.nelectrons[0] elements in the second axis refer to + alpha electrons, the next self.nelectrons[1] positions refer to the beta + electrons. + deriv: specifies if we will need derivatives. + + Returns: + (N,1) tensor containing the Hartree product. + """ + log_abs, sign = self.tf_eval_slog_hartree_product(flat_positions, deriv) + return tf.exp(log_abs) * sign diff --git a/ferminet/utils/system.py b/ferminet/utils/system.py new file mode 100644 index 0000000..1ea512a --- /dev/null +++ b/ferminet/utils/system.py @@ -0,0 +1,265 @@ +# Lint as: python3 +# Copyright 2020 DeepMind Technologies Limited. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functions to create different kinds of systems.""" + +from typing import Sequence +import attr +from ferminet.utils import elements +from ferminet.utils import units as unit_conversion +import numpy as np + + +# Default bond lengths in angstrom for some diatomics. +# Bond lengths from either the G3 dataset: +# 1. http://www.cse.anl.gov/OldCHMwebsiteContent/compmat/comptherm.htm +# 2. L. A. Curtiss, P. C. Redfern, K. Raghavachari, and J. A. Pople, +# J. Chem. Phys, 109, 42 (1998). +# or from NIST (https://cccbdb.nist.gov/diatomicexpbondx.asp). +diatomic_bond_lengths = { + 'BeH': 1.348263, + 'CN': 1.134797, + 'ClF': 1.659091, + 'F2': 1.420604, + 'H2': 0.737164, + 'HCl': 1.2799799, + 'Li2': 2.77306, + 'LiH': 1.639999, + 'N2': 1.129978, + 'NH': 1.039428, + 'CO': 1.150338, + 'BH': 1.2324, + 'PN': 1.491, + 'AlH': 1.648, + 'AlN': 1.786, +} + + +# Default spin polarisation for a few diatomics of interest. +# Otherwise default to either singlet (doublet) for even (odd) numbers of +# electrons. Units: number of unpaired electrons. +diatomic_spin_polarisation = { + 'B2': 2, + 'O2': 2, + 'NH': 2, + 'AlN': 2, +} + + +@attr.s +class Atom: # pytype: disable=invalid-function-definition + """Atom information for Hamiltonians. + + The nuclear charge is inferred from the symbol if not given, in which case the + symbol must be the IUPAC symbol of the desired element. + + Attributes: + symbol: Element symbol. + coords: An iterable of atomic coordinates. Always a list of floats and in + bohr after initialisation. Default: place atom at origin. + charge: Nuclear charge. Default: nuclear charge (atomic number) of atom of + the given name. + atomic_number: Atomic number associated with element. Default: atomic number + of element of the given symbol. Should match charge unless fractional + nuclear charges are being used. + units: String giving units of coords. Either bohr or angstrom. Default: + bohr. If angstrom, coords are converted to be in bohr and units to the + string 'bohr'. + coords_angstrom: list of atomic coordinates in angstrom. + coords_array: Numpy array of atomic coordinates in bohr. 
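+
+  Illustrative example: Atom(symbol='H', coords=(0.0, 0.0, 0.7),
+  units='angstrom') stores its coordinates as approximately
+  (0.0, 0.0, 1.3228) and reports units as 'bohr' after initialisation.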
+  """
+  symbol = attr.ib()
+  coords = attr.ib(
+      converter=lambda xs: tuple(float(x) for x in xs),
+      default=(0.0, 0.0, 0.0))  # type: Sequence[float]
+  charge = attr.ib(converter=float)
+  atomic_number = attr.ib(converter=int)
+  units = attr.ib(
+      default='bohr', validator=attr.validators.in_(['bohr', 'angstrom']))
+
+  @charge.default
+  def _set_default_charge(self):
+    return elements.SYMBOLS[self.symbol].atomic_number
+
+  @atomic_number.default
+  def _set_default_atomic_number(self):
+    return elements.SYMBOLS[self.symbol].atomic_number
+
+  def __attrs_post_init__(self):
+    if self.units == 'angstrom':
+      self.coords = [unit_conversion.angstrom2bohr(x) for x in self.coords]
+      self.units = 'bohr'
+
+  @property
+  def coords_angstrom(self):
+    return [unit_conversion.bohr2angstrom(x) for x in self.coords]
+
+  @property
+  def coords_array(self):
+    if not hasattr(self, '_coords_arr'):
+      self._coords_arr = np.array(self.coords)
+    return self._coords_arr
+
+
+def atom(symbol, spins=None, charge=0):
+  """Return configuration for a single atom.
+
+  Args:
+    symbol: The atomic symbol from the periodic table.
+    spins (optional): A tuple with the number of spin-up and spin-down
+      electrons.
+    charge (optional): If zero (default), create a neutral atom, otherwise
+      create an anion if charge is negative or a cation if charge is positive.
+
+  Returns:
+    A list with a single Atom object located at zero, and a tuple with the spin
+    configuration of the electrons.
+  """
+  atomic_number = elements.SYMBOLS[symbol].atomic_number
+  if charge > atomic_number:
+    raise ValueError('Cannot have a cation with charge larger than the '
+                     'atomic number. Charge: {}, Atomic Number: {}'.format(
+                         charge, atomic_number))
+  if spins is None:
+    nelec = atomic_number - charge
+    spin_polarisation = elements.ATOMIC_NUMS[nelec].spin_config
+    # Distribute the ion's nelec electrons over the two spin channels so that
+    # nalpha - nbeta = spin_polarisation.
+    nalpha = (nelec + spin_polarisation) // 2
+    spins = (nalpha, nelec - nalpha)
+  return [Atom(symbol=symbol, coords=(0.0, 0.0, 0.0))], spins
+
+
+def diatomic(symbol1, symbol2, bond_length, spins=None, charge=0, units='bohr'):
+  """Return configuration for a diatomic molecule."""
+  if spins is None:
+    atomic_number_1 = elements.SYMBOLS[symbol1].atomic_number
+    atomic_number_2 = elements.SYMBOLS[symbol2].atomic_number
+    total_charge = atomic_number_1 + atomic_number_2 - charge
+    if total_charge % 2 == 0:
+      spins = (total_charge // 2, total_charge // 2)
+    else:
+      spins = ((total_charge + 1) // 2, (total_charge - 1) // 2)
+
+  return [
+      Atom(symbol=symbol1, coords=(0.0, 0.0, bond_length/2.0), units=units),
+      Atom(symbol=symbol2, coords=(0.0, 0.0, -bond_length/2.0), units=units)
+  ], spins
+
+
+def molecule(symbol, bond_length=0.0, units='bohr'):
+  """Hardcoded molecular geometries from the original Fermi Net paper."""
+
+  if symbol in diatomic_bond_lengths:
+    if symbol[-1] == '2':
+      symbs = [symbol[:-1], symbol[:-1]]
+    else:  # Split a camel-case string on the second capital letter.
+      split_idx = None
+      for i in range(1, len(symbol)):
+        if split_idx is None and symbol[i].isupper():
+          split_idx = i
+      if split_idx is None:
+        raise ValueError('Cannot find second atomic symbol: {}'.format(symbol))
+      symbs = [symbol[:split_idx], symbol[split_idx:]]
+
+    atomic_number_1 = elements.SYMBOLS[symbs[0]].atomic_number
+    atomic_number_2 = elements.SYMBOLS[symbs[1]].atomic_number
+    total_charge = atomic_number_1 + atomic_number_2
+    if symbol in diatomic_spin_polarisation:
+      spin_pol = diatomic_spin_polarisation[symbol]
+      # Split the polarisation across the spin channels so the electron count
+      # is preserved and nalpha - nbeta = spin_pol.
+      spins = ((total_charge + spin_pol) // 2, (total_charge - spin_pol) // 2)
+    elif total_charge % 2 == 
0: + spins = (total_charge // 2, total_charge // 2) + else: + spins = ((total_charge + 1)// 2, (total_charge - 1) // 2) + + if bond_length == 0.0: + bond_length = diatomic_bond_lengths[symbol] + units = 'angstrom' + return diatomic(symbs[0], symbs[1], + bond_length, + units=units, + spins=spins) + + if bond_length != 0.0: + raise ValueError('Bond length argument only appropriate for diatomics.') + + if symbol == 'CH4': + return [ + Atom(symbol='C', coords=(0.0, 0.0, 0.0), units='bohr'), + Atom(symbol='H', coords=(1.18886, 1.18886, 1.18886), units='bohr'), + Atom(symbol='H', coords=(-1.18886, -1.18886, 1.18886), units='bohr'), + Atom(symbol='H', coords=(1.18886, -1.18886, -1.18886), units='bohr'), + Atom(symbol='H', coords=(-1.18886, 1.18886, -1.18886), units='bohr'), + ], (5, 5) + + if symbol == 'NH3': + return [ + Atom(symbol='N', coords=(0.0, 0.0, 0.22013), units='bohr'), + Atom(symbol='H', coords=(0.0, 1.77583, -0.51364), units='bohr'), + Atom(symbol='H', coords=(1.53791, -0.88791, -0.51364), units='bohr'), + Atom(symbol='H', coords=(-1.53791, -0.88791, -0.51364), units='bohr'), + ], (5, 5) + + if symbol == 'C2H4' or symbol == 'ethene' or symbol == 'ethylene': + return [ + Atom(symbol='C', coords=(0.0, 0.0, 1.26135), units='bohr'), + Atom(symbol='C', coords=(0.0, 0.0, -1.26135), units='bohr'), + Atom(symbol='H', coords=(0.0, 1.74390, 2.33889), units='bohr'), + Atom(symbol='H', coords=(0.0, -1.74390, 2.33889), units='bohr'), + Atom(symbol='H', coords=(0.0, 1.74390, -2.33889), units='bohr'), + Atom(symbol='H', coords=(0.0, -1.74390, -2.33889), units='bohr'), + ], (8, 8) + + if symbol == 'C4H6' or symbol == 'bicyclobutane': + return [ + Atom(symbol='C', coords=(0.0, 2.13792, 0.58661), units='bohr'), + Atom(symbol='C', coords=(0.0, -2.13792, 0.58661), units='bohr'), + Atom(symbol='C', coords=(1.41342, 0.0, -0.58924), units='bohr'), + Atom(symbol='C', coords=(-1.41342, 0.0, -0.58924), units='bohr'), + Atom(symbol='H', coords=(0.0, 2.33765, 2.64110), units='bohr'), + Atom(symbol='H', coords=(0.0, 3.92566, -0.43023), units='bohr'), + Atom(symbol='H', coords=(0.0, -2.33765, 2.64110), units='bohr'), + Atom(symbol='H', coords=(0.0, -3.92566, -0.43023), units='bohr'), + Atom(symbol='H', coords=(2.67285, 0.0, -2.19514), units='bohr'), + Atom(symbol='H', coords=(-2.67285, 0.0, -2.19514), units='bohr'), + ], (15, 15) + + raise ValueError('Not a recognized molecule: {}'.format(symbol)) + + +def hn(n, r, charge=0, units='bohr'): + """Return a hydrogen chain with n atoms and separation r.""" + m = n - charge # number of electrons + if m % 2 == 0: + spins = (m//2, m//2) + else: + spins = ((m+1)//2, (m-1)//2) + lim = r * (n-1) / 2.0 + return [Atom(symbol='H', coords=(0.0, 0.0, z), units=units) + for z in np.linspace(-lim, lim, n)], spins + + +def h4_circle(r, theta, units='bohr'): + """Return 4 hydrogen atoms arranged in a circle, a failure case of CCSD(T).""" + return [ + Atom(symbol='H', + coords=(r*np.cos(theta), r*np.sin(theta), 0.0), + units=units), + Atom(symbol='H', + coords=(-r*np.cos(theta), r*np.sin(theta), 0.0), + units=units), + Atom(symbol='H', + coords=(r*np.cos(theta), -r*np.sin(theta), 0.0), + units=units), + Atom(symbol='H', + coords=(-r*np.cos(theta), -r*np.sin(theta), 0.0), + units=units) + ], (2, 2) diff --git a/ferminet/utils/units.py b/ferminet/utils/units.py new file mode 100644 index 0000000..8d441f2 --- /dev/null +++ b/ferminet/utils/units.py @@ -0,0 +1,46 @@ +# Lint as: python3 +# Copyright 2019 DeepMind Technologies Limited. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Basic definition of units and converters useful for chemistry.""" + +from typing import Union +import numpy as np + +# 1 Bohr = 0.52917721067 (12) x 10^{-10} m +# https://physics.nist.gov/cgi-bin/cuu/Value?bohrrada0 +# Note: pyscf uses a slightly older definition of 0.52917721092 angstrom. +ANGSTROM_BOHR = 0.52917721067 +BOHR_ANGSTROM = 1. / ANGSTROM_BOHR + +# 1 Hartree = 627.509474 kcal/mol +# https://en.wikipedia.org/wiki/Hartree +KCAL_HARTREE = 627.509474 +HARTREE_KCAL = 1. / KCAL_HARTREE + + +def bohr2angstrom(x_b: Union[float, np.ndarray]) -> Union[float, np.ndarray]: + return x_b * ANGSTROM_BOHR + + +def angstrom2bohr(x_a: Union[float, np.ndarray]) -> Union[float, np.ndarray]: + return x_a * BOHR_ANGSTROM + + +def hartree2kcal(x_b: Union[float, np.ndarray]) -> Union[float, np.ndarray]: + return x_b * KCAL_HARTREE + + +def kcal2hartree(x_a: Union[float, np.ndarray]) -> Union[float, np.ndarray]: + return x_a * HARTREE_KCAL diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..da8ef52 --- /dev/null +++ b/setup.py @@ -0,0 +1,52 @@ +# Copyright 2018-2019 DeepMind Technologies Limited and Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Setup for pip package.""" + +import unittest +from setuptools import find_packages +from setuptools import setup + +REQUIRED_PACKAGES = [ + 'absl-py', 'kfac>=0.2.3', 'numpy<1.19.0', 'pandas', 'pyscf', 'pyblock', + 'dm-sonnet<2.0', 'tables', 'tensorflow>=1.14,<2.0', + 'tensorflow_probability==0.8' +] +EXTRA_PACKAGES = { + 'tensorflow-gpu': ['tensorflow-gpu>=1.14,<2.0'], +} + + +def ferminet_test_suite(): + test_loader = unittest.TestLoader() + test_suite = test_loader.discover('ferminet/tests', pattern='*_test.py') + return test_suite + + +setup( + name='ferminet', + version='0.1', + description='A library to train networks to represent ground state wavefunctions of fermionic systems', + url='https://github.com/deepmind/ferminet', + author='DeepMind', + author_email='no-reply@google.com', + # Contained modules and scripts. + scripts=['bin/ferminet'], + packages=find_packages(), + install_requires=REQUIRED_PACKAGES, + extras_require=EXTRA_PACKAGES, + platforms=['any'], + license='Apache 2.0', + test_suite='setup.ferminet_test_suite', +)
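
A minimal usage sketch of the utilities added in this patch, assuming the
package and the TF1-era dependencies listed in setup.py are installed (the
`system.molecule` and unit-conversion helpers are as defined above):

    from ferminet.utils import system, units

    # N2 at its default bond length (1.129978 angstrom, stored in bohr).
    atoms, spins = system.molecule('N2')
    assert spins == (7, 7)  # 14 electrons, spin-unpolarised
    print(atoms[0].coords_angstrom)

    # The unit helpers round-trip consistently.
    assert abs(units.bohr2angstrom(units.angstrom2bohr(1.0)) - 1.0) < 1e-12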