diff --git a/doc/docs/Python_User_Interface.md b/doc/docs/Python_User_Interface.md
index c539ea90b..f4696fdaf 100644
--- a/doc/docs/Python_User_Interface.md
+++ b/doc/docs/Python_User_Interface.md
@@ -114,7 +114,6 @@ def __init__(self,
              filename_prefix=None,
              output_volume=None,
              output_single_precision=False,
-             load_structure='',
              geometry_center=Vector3<0.0, 0.0, 0.0>,
              force_all_components=False,
              split_chunks_evenly=True,
@@ -314,11 +313,6 @@ Python. `Vector3` is a `meep` class.
   that are specified by geometric objects. You should list any materials other
   than scalar dielectrics that are returned by `material_function` here.
 
-+ **`load_structure` [`string`]** — If not empty, Meep will load the structure
-  file specified by this string. The file must have been created by
-  `mp.dump_structure`. Defaults to an empty string. See [Load and Dump
-  Structure](#load-and-dump-structure) for more information.
-
 + **`chunk_layout` [`string` or `Simulation` instance or `BinaryPartition` class]** —
   This will cause the `Simulation` to use the chunk layout described by either
   (1) an `.h5` file (created using `Simulation.dump_chunk_layout`), (2) another
@@ -2305,13 +2299,15 @@ is a 1d array of `nfreq` dimensions.
 
 These functions dump the raw ε and μ data to disk and load it back for doing multiple simulations with the same materials but different sources etc. The only prerequisite is that the dump/load simulations have the same [chunks](Chunks_and_Symmetry.md) (i.e. the same grid, number of processors, symmetries, and PML). When using `split_chunks_evenly=False`, you must also dump the original chunk layout using `dump_chunk_layout` and load it into the new `Simulation` using the `chunk_layout` parameter. Currently only stores dispersive and non-dispersive $\varepsilon$ and $\mu$ but not nonlinearities. Note that loading data from a file in this way overwrites any `geometry` data passed to the `Simulation` constructor.
 
+For dump_structure and load_structure, when `single_parallel_file=True` (the default) - all chunks write to the same/single file after computing their respective offsets into this file. When set to 'False', each chunk writes writes its data to a separate file that is appropriately named by its chunk index.
+
 
 <a id="Simulation.dump_structure"></a>
 
 <div class="class_members" markdown="1">
 
 ```python
-def dump_structure(self, fname):
+def dump_structure(self, fname, single_parallel_file=True):
 ```
 
 <div class="method_docstring" markdown="1">
@@ -2327,7 +2323,7 @@ Dumps the structure to the file `fname`.
 <div class="class_members" markdown="1">
 
 ```python
-def load_structure(self, fname):
+def load_structure(self, fname, single_parallel_file=True):
 ```
 
 <div class="method_docstring" markdown="1">
diff --git a/doc/docs/Python_User_Interface.md.in b/doc/docs/Python_User_Interface.md.in
index a1c7eec14..4fa2065a9 100644
--- a/doc/docs/Python_User_Interface.md.in
+++ b/doc/docs/Python_User_Interface.md.in
@@ -351,6 +351,8 @@ And this [`DftNear2Far`](#DftNear2Far) method:
 
 These functions dump the raw ε and μ data to disk and load it back for doing multiple simulations with the same materials but different sources etc. The only prerequisite is that the dump/load simulations have the same [chunks](Chunks_and_Symmetry.md) (i.e. the same grid, number of processors, symmetries, and PML). When using `split_chunks_evenly=False`, you must also dump the original chunk layout using `dump_chunk_layout` and load it into the new `Simulation` using the `chunk_layout` parameter. Currently only stores dispersive and non-dispersive $\varepsilon$ and $\mu$ but not nonlinearities. Note that loading data from a file in this way overwrites any `geometry` data passed to the `Simulation` constructor.
 
+For dump_structure and load_structure, when `single_parallel_file=True` (the default) - all chunks write to the same/single file after computing their respective offsets into this file. When set to 'False', each chunk writes writes its data to a separate file that is appropriately named by its chunk index.
+
 @@ Simulation.dump_structure @@
 @@ Simulation.load_structure @@
 @@ Simulation.dump_chunk_layout @@
diff --git a/python/simulation.py b/python/simulation.py
index 62a8b54a4..aa059198b 100644
--- a/python/simulation.py
+++ b/python/simulation.py
@@ -961,7 +961,6 @@ def __init__(self,
                  filename_prefix=None,
                  output_volume=None,
                  output_single_precision=False,
-                 load_structure='',
                  geometry_center=mp.Vector3(),
                  force_all_components=False,
                  split_chunks_evenly=True,
@@ -1158,11 +1157,6 @@ def __init__(self,
           that are specified by geometric objects. You should list any materials other
           than scalar dielectrics that are returned by `material_function` here.
 
-        + **`load_structure` [`string`]** — If not empty, Meep will load the structure
-          file specified by this string. The file must have been created by
-          `mp.dump_structure`. Defaults to an empty string. See [Load and Dump
-          Structure](#load-and-dump-structure) for more information.
-
         + **`chunk_layout` [`string` or `Simulation` instance or `BinaryPartition` class]** —
           This will cause the `Simulation` to use the chunk layout described by either
           (1) an `.h5` file (created using `Simulation.dump_chunk_layout`), (2) another
@@ -1235,7 +1229,6 @@ def __init__(self,
         self.is_cylindrical = False
         self.material_function = material_function
         self.epsilon_func = epsilon_func
-        self.load_structure_file = load_structure
         self.dft_objects = []
         self._is_initialized = False
         self.force_all_components = force_all_components
@@ -1246,6 +1239,9 @@ def __init__(self,
         self.fragment_stats = None
         self._output_stats = os.environ.get('MEEP_STATS', None)
 
+        self.load_single_parallel_file = True
+        self.load_structure_file = None
+
         self.special_kz = False
         if self.cell_size.z == 0 and self.k_point and self.k_point.z != 0:
             if kz_2d == "complex":
@@ -1717,7 +1713,8 @@ def _init_structure(self, k=False):
             self.num_chunks = self.chunk_layout.numchunks()
 
         if self.load_structure_file:
-            self.load_structure(self.load_structure_file)
+            self.load_structure(
+                self.load_structure_file, self.load_single_parallel_file)
 
     def _is_outer_boundary(self, vol, direction, side):
 
@@ -1861,22 +1858,22 @@ def set_materials(self, geometry=None, default_material=None):
             None
         )
 
-    def dump_structure(self, fname):
+    def dump_structure(self, fname, single_parallel_file=True):
         """
         Dumps the structure to the file `fname`.
         """
         if self.structure is None:
             raise ValueError("Fields must be initialized before calling dump_structure")
-        self.structure.dump(fname)
+        self.structure.dump(fname, single_parallel_file)
 
-    def load_structure(self, fname):
+    def load_structure(self, fname, single_parallel_file=True):
         """
         Loads a structure from the file `fname`. A file name to load can also be passed to
         the `Simulation` constructor via the `load_structure` keyword argument.
         """
         if self.structure is None:
             raise ValueError("Fields must be initialized before calling load_structure")
-        self.structure.load(fname)
+        self.structure.load(fname, single_parallel_file)
 
     def dump_chunk_layout(self, fname):
         """
@@ -1898,6 +1895,41 @@ def load_chunk_layout(self, br, source):
             ## source is either filename (string)
             self.structure.load_chunk_layout(source, br)
 
+    def _get_load_dump_dirname(self, dirname, single_parallel_file):
+        """
+        Get the dirname to dump simulation state to.
+        """
+        if single_parallel_file:
+            dump_dirname = dirname
+        else:
+            # When doing a sharded dump (each process to its own file), use
+            # the process rank to get a unique name.
+            dump_dirname = os.path.join(dirname, 'rank%02d' % mp.my_rank())
+        return dump_dirname
+
+    def dump(self, dirname, structure=True, single_parallel_file=True):
+        """
+        Dumps simulation state.
+        """
+        dump_dirname = self._get_load_dump_dirname(dirname, single_parallel_file)
+        os.makedirs(dump_dirname, exist_ok=True)
+
+        if structure:
+            structure_dump_filename = os.path.join(dump_dirname, 'structure.h5')
+            self.dump_structure(structure_dump_filename, single_parallel_file)
+
+    def load(self, dirname, structure=True, single_parallel_file=True):
+        """
+        Loads simulation state.
+
+        This should called right after the Simulation object has been created
+        but before 'init_sim' is called.
+        """
+        dump_dirname = self._get_load_dump_dirname(dirname, single_parallel_file)
+        self.load_single_parallel_file = single_parallel_file
+        if structure:
+          self.load_structure_file = os.path.join(dump_dirname, 'structure.h5')
+
     def init_sim(self):
         if self._is_initialized:
             return
diff --git a/python/tests/test_simulation.py b/python/tests/test_simulation.py
index ed751082e..f85afeb7f 100644
--- a/python/tests/test_simulation.py
+++ b/python/tests/test_simulation.py
@@ -302,7 +302,7 @@ def test_in_box_volumes(self):
         sim.field_energy_in_box(tv)
         sim.field_energy_in_box(v)
 
-    def _load_dump_structure(self, chunk_file=False, chunk_sim=False):
+    def _load_dump_structure(self, chunk_file=False, chunk_sim=False, single_parallel_file=True):
         from meep.materials import Al
         resolution = 50
         cell = mp.Vector3(5, 5)
@@ -329,12 +329,14 @@ def get_ref_field_point(sim):
             ref_field_points.append(p.real)
 
         sim1.run(mp.at_every(5, get_ref_field_point), until=50)
-        dump_fn = os.path.join(self.temp_dir, 'test_load_dump_structure.h5')
+
+        dump_dirname = os.path.join(self.temp_dir, 'test_load_dump')
+        sim1.dump(dump_dirname, structure=True, single_parallel_file=single_parallel_file)
+
         dump_chunk_fname = None
         chunk_layout = None
-        sim1.dump_structure(dump_fn)
         if chunk_file:
-            dump_chunk_fname = os.path.join(self.temp_dir, 'test_load_dump_structure_chunks.h5')
+            dump_chunk_fname = os.path.join(dump_dirname, 'chunk_layout.h5')
             sim1.dump_chunk_layout(dump_chunk_fname)
             chunk_layout = dump_chunk_fname
         if chunk_sim:
@@ -345,9 +347,8 @@ def get_ref_field_point(sim):
                             boundary_layers=pml_layers,
                             sources=[sources],
                             symmetries=symmetries,
-                            chunk_layout=chunk_layout,
-                            load_structure=dump_fn)
-
+                            chunk_layout=chunk_layout)
+        sim.load(dump_dirname, structure=True, single_parallel_file=single_parallel_file)
         field_points = []
 
         def get_field_point(sim):
@@ -362,6 +363,10 @@ def get_field_point(sim):
     def test_load_dump_structure(self):
         self._load_dump_structure()
 
+    @unittest.skipIf(not mp.with_mpi(), "MPI specific test")
+    def test_load_dump_structure_sharded(self):
+        self._load_dump_structure(single_parallel_file=False)
+
     def test_load_dump_chunk_layout_file(self):
         self._load_dump_structure(chunk_file=True)
 
diff --git a/src/h5file.cpp b/src/h5file.cpp
index 484640942..366e5d63c 100644
--- a/src/h5file.cpp
+++ b/src/h5file.cpp
@@ -150,7 +150,7 @@ void h5file::close_id() {
 
 /* note: if parallel is true, then *all* processes must call this,
    and all processes will use I/O. */
-h5file::h5file(const char *filename_, access_mode m, bool parallel_) {
+h5file::h5file(const char *filename_, access_mode m, bool parallel_, bool local_) {
   cur_dataname = NULL;
   id = (void *)malloc(sizeof(hid_t));
   cur_id = (void *)malloc(sizeof(hid_t));
@@ -160,7 +160,12 @@ h5file::h5file(const char *filename_, access_mode m, bool parallel_) {
   filename = new char[strlen(filename_) + 1];
   strcpy(filename, filename_);
   mode = m;
+
+  if (parallel_ && local_) {
+    meep::abort("Can not open h5file (%s) in both parallel and local mode.", filename);
+  }
   parallel = parallel_;
+  local = local_;
 }
 
 h5file::~h5file() {
@@ -191,7 +196,9 @@ void h5file::remove() {
   extending = 0;
 
   IF_EXCLUSIVE(if (parallel) all_wait(), (void)0);
-  if (am_master() && std::remove(filename)) meep::abort("error removing file %s", filename);
+  if (am_master() || local) {
+    if (std::remove(filename)) meep::abort("error removing file %s", filename);
+  }
 }
 
 h5file::extending_s *h5file::get_extending(const char *dataname) const {
@@ -226,7 +233,7 @@ void h5file::set_cur(const char *dataname, void *data_id) {
 
 void h5file::read_size(const char *dataname, int *rank, size_t *dims, int maxrank) {
 #ifdef HAVE_HDF5
-  if (parallel || am_master()) {
+  if (parallel || am_master() || local) {
     hid_t file_id = HID(get_id()), space_id, data_id;
 
     CHECK(file_id >= 0, "error opening HDF5 input file");
@@ -253,7 +260,7 @@ void h5file::read_size(const char *dataname, int *rank, size_t *dims, int maxran
     H5Sclose(space_id);
   }
 
-  if (!parallel) {
+  if (!parallel && !local) {
     *rank = broadcast(0, *rank);
     broadcast(0, dims, *rank);
 
@@ -291,7 +298,7 @@ void *h5file::read(const char *dataname, int *rank, size_t *dims, int maxrank,
                    bool single_precision) {
 #ifdef HAVE_HDF5
   void *data = 0;
-  if (parallel || am_master()) {
+  if (parallel || am_master() || local) {
     int i, N;
     hid_t file_id = HID(get_id()), space_id, data_id;
 
@@ -345,7 +352,7 @@ void *h5file::read(const char *dataname, int *rank, size_t *dims, int maxrank,
     if (close_data_id) H5Dclose(data_id);
   }
 
-  if (!parallel) {
+  if (!parallel && !local) {
     *rank = broadcast(0, *rank);
     broadcast(0, dims, *rank);
     size_t N = 1;
@@ -375,7 +382,7 @@ char *h5file::read(const char *dataname) {
 #ifdef HAVE_HDF5
   char *data = 0;
   int len = 0;
-  if (parallel || am_master()) {
+  if (parallel || am_master() || local) {
     hid_t file_id = HID(get_id()), space_id, data_id, type_id;
 
     CHECK(file_id >= 0, "error opening HDF5 input file");
@@ -404,7 +411,7 @@ char *h5file::read(const char *dataname) {
     H5Dclose(data_id);
   }
 
-  if (!parallel) {
+  if (!parallel && !local) {
     len = broadcast(0, len);
     if (!am_master()) data = new char[len];
     broadcast(0, data, len);
@@ -442,7 +449,7 @@ void h5file::remove_data(const char *dataname) {
   if (dataset_exists(dataname)) {
     /* this is hackish ...need to pester HDF5 developers to make
        H5Gunlink a collective operation for parallel mode */
-    if (!parallel || am_master()) {
+    if (!parallel || am_master() || local) {
       H5Gunlink(file_id, dataname); /* delete it */
       H5Fflush(file_id, H5F_SCOPE_GLOBAL);
     }
@@ -471,7 +478,7 @@ void h5file::create_data(const char *dataname, int rank, const size_t *dims, boo
   unset_cur();
   remove_data(dataname); // HDF5 gives error if we H5Dcreate existing dataset
 
-  if (IF_EXCLUSIVE(!parallel || am_master(), 1)) {
+  if (local || IF_EXCLUSIVE(!parallel || am_master(), 1)) {
     hsize_t *dims_copy = new hsize_t[rank1 + append_data];
     hsize_t *maxdims = new hsize_t[rank1 + append_data];
     hsize_t N = 1;
@@ -708,12 +715,12 @@ void h5file::done_writing_chunks() {
      I'm assuming(?) that non-extensible datasets will use different
      files, etcetera, for different timesteps.  All of this hackery
      goes away if we just use an MPI-compiled version of HDF5. */
-  if (parallel && cur_dataname && get_extending(cur_dataname)) prevent_deadlock(); // closes id
+  if (parallel && !local && cur_dataname && get_extending(cur_dataname)) prevent_deadlock(); // closes id
 }
 
 void h5file::write(const char *dataname, int rank, const size_t *dims, void *data,
                    bool single_precision) {
-  if (parallel || am_master()) {
+  if (parallel || am_master() || local) {
     size_t *start = new size_t[rank + 1];
     for (int i = 0; i < rank; i++)
       start[i] = 0;
@@ -732,7 +739,7 @@ void h5file::write(const char *dataname, int rank, const size_t *dims, void *dat
 
 void h5file::write(const char *dataname, const char *data) {
 #ifdef HAVE_HDF5
-  if (IF_EXCLUSIVE(am_master(), parallel || am_master())) {
+  if (local || IF_EXCLUSIVE(am_master(), parallel || am_master())) {
     hid_t file_id = HID(get_id()), type_id, data_id, space_id;
 
     CHECK(file_id >= 0, "error opening HDF5 output file");
diff --git a/src/meep.hpp b/src/meep.hpp
index aeb28211e..7f336280f 100644
--- a/src/meep.hpp
+++ b/src/meep.hpp
@@ -384,7 +384,13 @@ class h5file {
 public:
   typedef enum { READONLY, READWRITE, WRITE } access_mode;
 
-  h5file(const char *filename_, access_mode m = READWRITE, bool parallel_ = true);
+  // If 'parallel_' is true, then we assume that all processes will be doing
+  // I/O, else we assume that *only* the master is doing I/O and all other
+  // processes will send/receive data to/from the master.
+  // If 'local_' is true, then 'parallel_' *must* be false and assumes that
+  // each process is writing to a local non-shared file and the filename is
+  // unique to the process.
+  h5file(const char *filename_, access_mode m = READWRITE, bool parallel_ = true, bool local_ = false);
   ~h5file(); // closes the files (and any open dataset)
 
   bool ok();
@@ -423,6 +429,7 @@ class h5file {
   access_mode mode;
   char *filename;
   bool parallel;
+  bool local;
 
   bool is_cur(const char *dataname);
   void unset_cur();
@@ -849,9 +856,15 @@ class structure {
   std::vector<int> get_chunk_owners() const;
 
   // structure_dump.cpp
-  void dump(const char *filename);
+  // Dump structure to specified file. If 'single_parallel_file'
+  // is 'true' (the default) - then all processes write to the same/single file
+  // file after computing their respective offsets into this file. When set to
+  // 'false', each process writes data for the chunks it owns to a separate
+  // (process unique) file.
+  void dump(const char *filename, bool single_parallel_file=true);
+  void load(const char *filename, bool single_parallel_file=true);
+
   void dump_chunk_layout(const char *filename);
-  void load(const char *filename);
   void load_chunk_layout(const char *filename, boundary_region &br);
   void load_chunk_layout(const std::vector<grid_volume> &gvs,
                          const std::vector<int> &ids,
@@ -889,7 +902,7 @@ class structure {
   void changing_chunks();
   // Helper methods for dumping and loading susceptibilities
   void set_chiP_from_file(h5file *file, const char *dataset, field_type ft);
-  void write_susceptibility_params(h5file *file, const char *dname, int EorH);
+  void write_susceptibility_params(h5file *file, bool single_parallel_file, const char *dname, int EorH);
 
   std::unique_ptr<binary_partition> bp;
 };
diff --git a/src/structure_dump.cpp b/src/structure_dump.cpp
index ac57a7c99..d0fcb91a5 100644
--- a/src/structure_dump.cpp
+++ b/src/structure_dump.cpp
@@ -33,7 +33,9 @@ namespace meep {
 
 // Write the parameters required to reconstruct the susceptibility (id, noise_amp (for noisy),
 // omega_0, gamma, no_omega_0_denominator)
-void structure::write_susceptibility_params(h5file *file, const char *dname, int EorH) {
+void structure::write_susceptibility_params(h5file *file,
+                                            bool single_parallel_file,
+                                            const char *dname, int EorH) {
   // Get number of susceptibility params from first chunk, since all chunks will have
   // the same susceptibility list.
   size_t params_ntotal = 0;
@@ -47,7 +49,7 @@ void structure::write_susceptibility_params(h5file *file, const char *dname, int
   size_t params_start = 0;
   file->create_data(dname, 1, &params_ntotal, false /* append_data */,
                     sizeof(realnum) == sizeof(float) /* single_precision */);
-  if (am_master()) {
+  if (am_master() || !single_parallel_file) {
     susceptibility *sus = chunks[0]->chiP[EorH];
     while (sus) {
       sus->dump_params(file, &params_start);
@@ -85,36 +87,60 @@ void structure::dump_chunk_layout(const char *filename) {
   delete[] nums;
 }
 
-void structure::dump(const char *filename) {
-  if (verbosity > 0) master_printf("creating epsilon output file \"%s\"...\n", filename);
-
-  // make/save a num_chunks x NUM_FIELD_COMPONENTS x 5 array counting
-  // the number of entries in the chi1inv array for each chunk.
-  size_t *num_chi1inv_ = new size_t[num_chunks * NUM_FIELD_COMPONENTS * 5];
-  memset(num_chi1inv_, 0, sizeof(size_t) * size_t(num_chunks * NUM_FIELD_COMPONENTS * 5));
+void structure::dump(const char *filename, bool single_parallel_file) {
+  if (verbosity > 0) printf("creating epsilon from file \"%s\" (%d)...\n", filename, single_parallel_file);
+
+  /*
+   * make/save a num_chunks x NUM_FIELD_COMPONENTS x 5 array counting
+   * the number of entries in the chi1inv array for each chunk.
+   *
+   * When 'single_parallel_file' is true, we are creating a single block of data
+   * for ALL chunks (that are merged using MPI). Otherwise, we are just
+   * making a copy of just the chunks that are ours.
+   */
+  int my_num_chunks = 0;
+  for (int i = 0; i < num_chunks; i++) {
+    my_num_chunks += (single_parallel_file || chunks[i]->is_mine());
+  }
+  size_t num_chi1inv_size = my_num_chunks * NUM_FIELD_COMPONENTS * 5;
+  std::vector<size_t> num_chi1inv_(num_chi1inv_size);
   size_t my_ntot = 0;
-  for (int i = 0; i < num_chunks; i++)
+  for (int i = 0, chunk_i = 0; i < num_chunks; i++) {
     if (chunks[i]->is_mine()) {
       size_t ntot = chunks[i]->gv.ntot();
-      for (int c = 0; c < NUM_FIELD_COMPONENTS; ++c)
-        for (int d = 0; d < 5; ++d)
+      for (int c = 0; c < NUM_FIELD_COMPONENTS; ++c) {
+        for (int d = 0; d < 5; ++d) {
           if (chunks[i]->chi1inv[c][d])
-            my_ntot += (num_chi1inv_[(i * NUM_FIELD_COMPONENTS + c) * 5 + d] = ntot);
+            my_ntot += (num_chi1inv_[(chunk_i * NUM_FIELD_COMPONENTS + c) * 5 + d] = ntot);
+        }
+      }
     }
-  size_t *num_chi1inv = new size_t[num_chunks * NUM_FIELD_COMPONENTS * 5];
-  sum_to_master(num_chi1inv_, num_chi1inv, num_chunks * NUM_FIELD_COMPONENTS * 5);
-  delete[] num_chi1inv_;
+    chunk_i += (chunks[i]->is_mine() || single_parallel_file);
+  }
+
+  std::vector<size_t> num_chi1inv;
+  if (single_parallel_file) {
+    num_chi1inv.resize(num_chi1inv_size);
+    sum_to_master(num_chi1inv_.data(), num_chi1inv.data(), num_chi1inv_size);
+  } else {
+    num_chi1inv = std::move(num_chi1inv_);
+  }
 
   // determine total dataset size and offset of this process's data
-  size_t my_start = partial_sum_to_all(my_ntot) - my_ntot;
-  size_t ntotal = sum_to_all(my_ntot);
+  size_t my_start = 0;
+  size_t ntotal = my_ntot;
+  if (single_parallel_file) {
+    my_start = partial_sum_to_all(my_ntot) - my_ntot;
+    ntotal = sum_to_all(my_ntot);
+  }
 
-  h5file file(filename, h5file::WRITE, true);
-  size_t dims[3] = {(size_t)num_chunks, NUM_FIELD_COMPONENTS, 5};
+  h5file file(filename, h5file::WRITE, single_parallel_file, !single_parallel_file);
+  size_t dims[3] = {(size_t)my_num_chunks, NUM_FIELD_COMPONENTS, 5};
   size_t start[3] = {0, 0, 0};
   file.create_data("num_chi1inv", 3, dims);
-  if (am_master()) file.write_chunk(3, start, dims, num_chi1inv);
-  delete[] num_chi1inv;
+  if (am_master() || !single_parallel_file) {
+    file.write_chunk(3, start, dims, num_chi1inv.data());
+  }
 
   // write the data
   file.create_data("chi1inv", 1, &ntotal, false /* append_data */, false /* single_precision */);
@@ -147,7 +173,7 @@ void structure::dump(const char *filename) {
     // Write the number of susceptibilites
     size_t len = 2;
     file.create_data("num_sus", 1, &len);
-    if (am_master()) {
+    if (am_master() || !single_parallel_file) {
       size_t start = 0;
       size_t ntot = 2;
       file.write_chunk(1, &start, &ntot, num_sus);
@@ -157,13 +183,11 @@ void structure::dump(const char *filename) {
   // Get number of non-null sigma entries for each chiP in each chunk.
   // Assumes each susceptibility in the chiP[E_stuff] list has the
   // same number of non-null sigma elements. Likewise for chiP[H_stuff]
-  size_t *my_num_sigmas[2];
-  size_t *num_sigmas[2];
+  std::vector<size_t> my_num_sigmas[2];
+  std::vector<size_t> num_sigmas[2];
   for (int ft = 0; ft < 2; ++ft) {
-    my_num_sigmas[ft] = new size_t[num_chunks];
-    num_sigmas[ft] = new size_t[num_chunks];
-    memset(my_num_sigmas[ft], 0, sizeof(size_t) * num_chunks);
-    memset(num_sigmas[ft], 0, sizeof(size_t) * num_chunks);
+    my_num_sigmas[ft].resize(num_chunks);
+    num_sigmas[ft].resize(num_chunks);
   }
 
   for (int i = 0; i < num_chunks; ++i) {
@@ -199,7 +223,7 @@ void structure::dump(const char *filename) {
 
   file.prevent_deadlock();
   for (int ft = 0; ft < 2; ++ft) {
-    sum_to_all(my_num_sigmas[ft], num_sigmas[ft], num_chunks);
+    sum_to_all(my_num_sigmas[ft].data(), num_sigmas[ft].data(), num_chunks);
   }
 
   size_t num_E_sigmas = 0;
@@ -210,17 +234,13 @@ void structure::dump(const char *filename) {
   }
 
   // Allocate space for component and direction of non-null sigmas
-  size_t *my_sigma_cd[2] = {NULL, NULL};
-  my_sigma_cd[E_stuff] = new size_t[num_E_sigmas * 2];
-  memset(my_sigma_cd[E_stuff], 0, sizeof(size_t) * num_E_sigmas * 2);
-  my_sigma_cd[H_stuff] = new size_t[num_H_sigmas * 2];
-  memset(my_sigma_cd[H_stuff], 0, sizeof(size_t) * num_H_sigmas * 2);
-
-  size_t *sigma_cd[2] = {NULL, NULL};
-  sigma_cd[E_stuff] = new size_t[num_E_sigmas * 2];
-  memset(sigma_cd[E_stuff], 0, sizeof(size_t) * num_E_sigmas * 2);
-  sigma_cd[H_stuff] = new size_t[num_H_sigmas * 2];
-  memset(sigma_cd[H_stuff], 0, sizeof(size_t) * num_H_sigmas * 2);
+  std::vector<size_t> my_sigma_cd[2];
+  my_sigma_cd[E_stuff].resize(num_E_sigmas * 2);
+  my_sigma_cd[H_stuff].resize(num_H_sigmas * 2);
+
+  std::vector<size_t> sigma_cd[2];
+  sigma_cd[E_stuff].resize(num_E_sigmas * 2);
+  sigma_cd[H_stuff].resize(num_H_sigmas * 2);
 
   // Find component and direction of non-null sigmas
   {
@@ -246,8 +266,8 @@ void structure::dump(const char *filename) {
         }
       }
     }
-    bw_or_to_all(my_sigma_cd[E_stuff], sigma_cd[E_stuff], num_E_sigmas * 2);
-    bw_or_to_all(my_sigma_cd[H_stuff], sigma_cd[H_stuff], num_H_sigmas * 2);
+    bw_or_to_all(my_sigma_cd[E_stuff].data(), sigma_cd[E_stuff].data(), num_E_sigmas * 2);
+    bw_or_to_all(my_sigma_cd[H_stuff].data(), sigma_cd[H_stuff].data(), num_H_sigmas * 2);
   }
 
   // Write location (component and direction) data of non-null sigmas (sigma[c][d])
@@ -256,9 +276,9 @@ void structure::dump(const char *filename) {
     file.create_data("sigma_cd", 1, &len);
     size_t start = 0;
     for (int ft = 0; ft < 2; ++ft) {
-      if (am_master()) {
-        size_t count = (ft == 0 ? num_E_sigmas : num_H_sigmas) * 2;
-        file.write_chunk(1, &start, &count, sigma_cd[ft]);
+      if (am_master() || !single_parallel_file) {
+        size_t count = sigma_cd[ft].size();
+        file.write_chunk(1, &start, &count, sigma_cd[ft].data());
         start += count;
       }
     }
@@ -292,15 +312,8 @@ void structure::dump(const char *filename) {
     }
   }
 
-  write_susceptibility_params(&file, "E_params", E_stuff);
-  write_susceptibility_params(&file, "H_params", H_stuff);
-
-  for (int ft = 0; ft < 2; ++ft) {
-    delete[] sigma_cd[ft];
-    delete[] my_sigma_cd[ft];
-    delete[] num_sigmas[ft];
-    delete[] my_num_sigmas[ft];
-  }
+  write_susceptibility_params(&file, single_parallel_file, "E_params", E_stuff);
+  write_susceptibility_params(&file, single_parallel_file, "H_params", H_stuff);
 }
 
 // Reconstruct the chiP lists of susceptibilities from the params hdf5 data
@@ -406,7 +419,7 @@ void structure::set_chiP_from_file(h5file *file, const char *dataset, field_type
   size_t dims[3] = {0, 0, 0};
 
   file->read_size(dataset, &rank, dims, 1);
-  if (rank != 1) meep::abort("inconsistent data size in structure::load");
+  if (rank != 1) meep::abort("inconsistent data size in structure::set_chiP_from_file");
 
   if (dims[0] != 0) {
     for (int i = 0; i < num_chunks; ++i) {
@@ -542,35 +555,49 @@ void structure::load_chunk_layout(const std::vector<grid_volume> &gvs,
   check_chunks();
 }
 
-void structure::load(const char *filename) {
-  h5file file(filename, h5file::READONLY, true);
+void structure::load(const char *filename, bool single_parallel_file) {
+  h5file file(filename, h5file::READONLY, single_parallel_file, !single_parallel_file);
 
-  if (verbosity > 0) master_printf("reading epsilon from file \"%s\"...\n", filename);
+  if (verbosity > 0) printf("reading epsilon from file \"%s\" (%d)...\n", filename, single_parallel_file);
+
+  /*
+   * make/save a num_chunks x NUM_FIELD_COMPONENTS x 5 array counting
+   * the number of entries in the chi1inv array for each chunk.
+   *
+   * When 'single_parallel_file' is true, we are creating a single block of data
+   * for ALL chunks (that are merged using MPI). Otherwise, we are just
+   * making a copy of just the chunks that are ours.
+   */
+  int my_num_chunks = 0;
+  for (int i = 0; i < num_chunks; i++) {
+    my_num_chunks += (single_parallel_file || chunks[i]->is_mine());
+  }
+  size_t num_chi1inv_size = my_num_chunks * NUM_FIELD_COMPONENTS * 5;
+  std::vector<size_t> num_chi1inv(num_chi1inv_size);
 
-  // make/save a num_chunks x NUM_FIELD_COMPONENTS x 5 array counting
-  // the number of entries in the chi1inv array for each chunk.
-  size_t *num_chi1inv = new size_t[num_chunks * NUM_FIELD_COMPONENTS * 5];
   int rank;
-  size_t dims[3], _dims[3] = {(size_t)num_chunks, NUM_FIELD_COMPONENTS, 5};
+  size_t dims[3], _dims[3] = {(size_t)my_num_chunks, NUM_FIELD_COMPONENTS, 5};
   size_t start[3] = {0, 0, 0};
   file.read_size("num_chi1inv", &rank, dims, 3);
   if (rank != 3 || _dims[0] != dims[0] || _dims[1] != dims[1] || _dims[2] != dims[2])
     meep::abort("chunk mismatch in structure::load");
-  if (am_master()) file.read_chunk(3, start, dims, num_chi1inv);
+  if (am_master() || !single_parallel_file) file.read_chunk(3, start, dims, num_chi1inv.data());
 
-  file.prevent_deadlock();
-  broadcast(0, num_chi1inv, dims[0] * dims[1] * dims[2]);
+  if (single_parallel_file) {
+    file.prevent_deadlock();
+    broadcast(0, num_chi1inv.data(), dims[0] * dims[1] * dims[2]);
+  }
 
   changing_chunks();
 
   // allocate data as needed and check sizes
   size_t my_ntot = 0;
-  for (int i = 0; i < num_chunks; i++)
+  for (int i = 0, chunk_i = 0; i < num_chunks; i++) {
     if (chunks[i]->is_mine()) {
       size_t ntot = chunks[i]->gv.ntot();
       for (int c = 0; c < NUM_FIELD_COMPONENTS; ++c)
         for (int d = 0; d < 5; ++d) {
-          size_t n = num_chi1inv[(i * NUM_FIELD_COMPONENTS + c) * 5 + d];
+          size_t n = num_chi1inv[(chunk_i * NUM_FIELD_COMPONENTS + c) * 5 + d];
           if (n == 0) {
             delete[] chunks[i]->chi1inv[c][d];
             chunks[i]->chi1inv[c][d] = NULL;
@@ -582,14 +609,25 @@ void structure::load(const char *filename) {
           }
         }
     }
+    chunk_i += (chunks[i]->is_mine() || single_parallel_file);
+  }
 
   // determine total dataset size and offset of this process's data
-  size_t my_start = partial_sum_to_all(my_ntot) - my_ntot;
-  size_t ntotal = sum_to_all(my_ntot);
+  size_t my_start = 0;
+  size_t ntotal = my_ntot;
+  if (single_parallel_file) {
+    my_start = partial_sum_to_all(my_ntot) - my_ntot;
+    ntotal = sum_to_all(my_ntot);
+  }
 
   // read the data
   file.read_size("chi1inv", &rank, dims, 1);
-  if (rank != 1 || dims[0] != ntotal) meep::abort("inconsistent data size in structure::load");
+  if (rank != 1 || dims[0] != ntotal) {
+    meep::abort(
+        "inconsistent data size for chi1inv in structure::load (rank, dims[0]): "
+        "(%d, %zu) != (1, %zu)",
+        rank, dims[0], ntotal);
+  }
   for (int i = 0; i < num_chunks; i++)
     if (chunks[i]->is_mine()) {
       size_t ntot = chunks[i]->gv.ntot();
@@ -611,44 +649,44 @@ void structure::load(const char *filename) {
     size_t dims[] = {0, 0, 0};
     file.read_size("num_sus", &rank, dims, 1);
     if (dims[0] > 0) {
-      if (am_master()) {
+      if (am_master() || !single_parallel_file) {
         size_t start = 0;
         size_t count = 2;
         file.read_chunk(rank, &start, &count, num_sus);
       }
     }
-    file.prevent_deadlock();
-    broadcast(0, num_sus, 2);
+    if (single_parallel_file) {
+      file.prevent_deadlock();
+      broadcast(0, num_sus, 2);
+    }
   }
 
   // Allocate and read non-null sigma entry data
-  size_t *num_sigmas[2] = {NULL, NULL};
-  num_sigmas[E_stuff] = new size_t[num_chunks];
-  memset(num_sigmas[E_stuff], 0, sizeof(size_t) * num_chunks);
-  num_sigmas[H_stuff] = new size_t[num_chunks];
-  memset(num_sigmas[H_stuff], 0, sizeof(size_t) * num_chunks);
+  std::vector<size_t> num_sigmas[2];
+  num_sigmas[E_stuff].resize(num_chunks);
+  num_sigmas[H_stuff].resize(num_chunks);
 
   // Read num_sigmas data
   {
     int rank = 0;
     size_t dims[] = {0, 0, 0};
     file.read_size("num_sigmas", &rank, dims, 1);
-    if (dims[0] != (size_t)num_chunks * 2) { meep::abort("inconsistent data size in structure::load"); }
-    if (am_master()) {
+    if (dims[0] != (size_t)num_chunks * 2) { meep::abort("inconsistent data size for num_sigmas in structure::load"); }
+    if (am_master() || !single_parallel_file) {
       size_t start = 0;
       size_t count = num_chunks;
-      file.read_chunk(rank, &start, &count, num_sigmas[E_stuff]);
+      file.read_chunk(rank, &start, &count, num_sigmas[E_stuff].data());
       start += count;
-      file.read_chunk(rank, &start, &count, num_sigmas[H_stuff]);
+      file.read_chunk(rank, &start, &count, num_sigmas[H_stuff].data());
+    }
+    if (single_parallel_file) {
+      file.prevent_deadlock();
+      broadcast(0, num_sigmas[E_stuff].data(), num_chunks);
+      broadcast(0, num_sigmas[H_stuff].data(), num_chunks);
     }
-    file.prevent_deadlock();
-    broadcast(0, num_sigmas[E_stuff], num_chunks);
-    broadcast(0, num_sigmas[H_stuff], num_chunks);
   }
 
   // Allocate space for component and direction data of the non-null susceptibilities
-  size_t *sigma_cd[2] = {NULL, NULL};
-
   size_t nsig[2] = {0, 0};
   for (int ft = 0; ft < 2; ++ft) {
     for (int i = 0; i < num_chunks; ++i) {
@@ -656,10 +694,9 @@ void structure::load(const char *filename) {
     }
   }
 
-  sigma_cd[E_stuff] = new size_t[nsig[E_stuff] * 2];
-  memset(sigma_cd[E_stuff], 0, sizeof(size_t) * nsig[E_stuff] * 2);
-  sigma_cd[H_stuff] = new size_t[nsig[H_stuff] * 2];
-  memset(sigma_cd[H_stuff], 0, sizeof(size_t) * nsig[H_stuff] * 2);
+  std::vector<size_t> sigma_cd[2];
+  sigma_cd[E_stuff].resize(nsig[E_stuff] * 2);
+  sigma_cd[H_stuff].resize(nsig[H_stuff] * 2);
 
   // Read the component/direction data
   {
@@ -667,20 +704,22 @@ void structure::load(const char *filename) {
     size_t dims[] = {0, 0, 0};
     file.read_size("sigma_cd", &rank, dims, 1);
     if (dims[0] != 2 * (nsig[E_stuff] + nsig[H_stuff])) {
-      meep::abort("inconsistent data size in structure::load");
+      meep::abort("inconsistent data size for sigma_cd in structure::load");
     }
 
-    if (am_master()) {
+    if (am_master() || !single_parallel_file) {
       size_t start = 0;
       for (int ft = 0; ft < 2; ++ft) {
-        size_t count = nsig[ft] * 2;
-        file.read_chunk(rank, &start, &count, sigma_cd[ft]);
+        size_t count = sigma_cd[ft].size();
+        file.read_chunk(rank, &start, &count, sigma_cd[ft].data());
         start += count;
       }
     }
-    file.prevent_deadlock();
-    broadcast(0, sigma_cd[E_stuff], nsig[E_stuff] * 2);
-    broadcast(0, sigma_cd[H_stuff], nsig[H_stuff] * 2);
+    if (single_parallel_file) {
+      file.prevent_deadlock();
+      broadcast(0, sigma_cd[E_stuff].data(), nsig[E_stuff] * 2);
+      broadcast(0, sigma_cd[H_stuff].data(), nsig[H_stuff] * 2);
+    }
   }
 
   for (int ft = 0; ft < 2; ++ft) {
@@ -730,10 +769,5 @@ void structure::load(const char *filename) {
       }
     }
   }
-
-  for (int ft = 0; ft < 2; ++ft) {
-    delete[] num_sigmas[ft];
-    delete[] sigma_cd[ft];
-  }
 }
 } // namespace meep