diff --git a/python/simulation.py b/python/simulation.py index 9eca33bda..fcb32827f 100644 --- a/python/simulation.py +++ b/python/simulation.py @@ -953,6 +953,7 @@ def __init__(self, progress_interval=4, subpixel_tol=1e-4, subpixel_maxeval=100000, + loop_tile_base=0, ensure_periodicity=True, num_chunks=0, Courant=0.5, @@ -1206,6 +1207,7 @@ def __init__(self, self.eps_averaging = eps_averaging self.subpixel_tol = subpixel_tol self.subpixel_maxeval = subpixel_maxeval + self.loop_tile_base = loop_tile_base self.ensure_periodicity = ensure_periodicity self.extra_materials = extra_materials self.default_material = default_material @@ -1921,7 +1923,8 @@ def init_sim(self): self.structure, self.m if self.is_cylindrical else 0, self.k_point.z if self.special_kz and self.k_point else 0, - not self.accurate_fields_near_cylorigin + not self.accurate_fields_near_cylorigin, + self.loop_tile_base ) if self.force_all_components and self.dimensions != 1: diff --git a/src/fields.cpp b/src/fields.cpp index 84e5d898b..f5d0d9767 100644 --- a/src/fields.cpp +++ b/src/fields.cpp @@ -29,7 +29,8 @@ using namespace std; namespace meep { -fields::fields(structure *s, double m, double beta, bool zero_fields_near_cylorigin) +fields::fields(structure *s, double m, double beta, bool zero_fields_near_cylorigin, + int loop_tile_base) : S(s->S), gv(s->gv), user_volume(s->user_volume), v(s->v), m(m), beta(beta), working_on(×_spent) { shared_chunks = s->shared_chunks; @@ -57,7 +58,8 @@ fields::fields(structure *s, double m, double beta, bool zero_fields_near_cylori typedef fields_chunk *fields_chunk_ptr; chunks = new fields_chunk_ptr[num_chunks]; for (int i = 0; i < num_chunks; i++) - chunks[i] = new fields_chunk(s->chunks[i], outdir, m, beta, zero_fields_near_cylorigin, i); + chunks[i] = new fields_chunk(s->chunks[i], outdir, m, beta, + zero_fields_near_cylorigin, i, loop_tile_base); FOR_FIELD_TYPES(ft) { typedef realnum *realnum_ptr; comm_blocks[ft] = new realnum_ptr[num_chunks * num_chunks]; @@ -196,8 +198,40 @@ fields_chunk::~fields_chunk() { if (new_s && new_s->refcount-- <= 1) delete new_s; // delete if not shared } +static void split_into_tiles(grid_volume gvol, std::vector *result, + const size_t loop_tile_base) { + if (gvol.nowned_min() < loop_tile_base) { + result->push_back(gvol); + return; + } + + int best_split_point; + direction best_split_direction; + gvol.tile_split(best_split_point, best_split_direction); + grid_volume left_gvol = gvol.split_at_fraction(false, best_split_point, best_split_direction); + split_into_tiles(left_gvol, result, loop_tile_base); + grid_volume right_gvol = gvol.split_at_fraction(true, best_split_point, best_split_direction); + split_into_tiles(right_gvol, result, loop_tile_base); + return; +} + +// First check that the tile volumes gvs do not intersect and that they add +// up to the chunk's total grid_volume gv +static void check_tiles(grid_volume gv, const std::vector &gvs) { + grid_volume vol_intersection; + for (size_t i = 0; i < gvs.size(); i++) + for (size_t j = i + 1; j < gvs.size(); j++) + if (gvs[i].intersect_with(gvs[j], &vol_intersection)) + meep::abort("gvs[%zu] intersects with gvs[%zu]\n", i, j); + size_t sum = 0; + for (const auto& sub_gv : gvs) { sum += sub_gv.nowned_min(); } + size_t v_grid_points = 1; + LOOP_OVER_DIRECTIONS(gv.dim, d) { v_grid_points *= gv.num_direction(d); } + if (sum != v_grid_points) meep::abort("v_grid_points = %zu, sum(tiles) = %zu\n", v_grid_points, sum); +} + fields_chunk::fields_chunk(structure_chunk *the_s, const char *od, double m, double beta, - bool zero_fields_near_cylorigin, int chunkidx) + bool zero_fields_near_cylorigin, int chunkidx, int loop_tile_base) : gv(the_s->gv), v(the_s->v), m(m), zero_fields_near_cylorigin(zero_fields_near_cylorigin), beta(beta) { s = the_s; @@ -210,6 +244,12 @@ fields_chunk::fields_chunk(structure_chunk *the_s, const char *od, double m, dou Courant = s->Courant; dt = s->dt; dft_chunks = NULL; + if (loop_tile_base > 0) { + split_into_tiles(gv, &gvs, loop_tile_base); + check_tiles(gv, gvs); + } else { + gvs.push_back(gv); + } FOR_FIELD_TYPES(ft) { polarization_state *cur = NULL; pol[ft] = NULL; @@ -265,6 +305,7 @@ fields_chunk::fields_chunk(const fields_chunk &thef, int chunkidx) : gv(thef.gv) Courant = thef.Courant; dt = thef.dt; dft_chunks = NULL; + gvs = thef.gvs; FOR_FIELD_TYPES(ft) { polarization_state *cur = NULL; for (polarization_state *ocur = thef.pol[ft]; ocur; ocur = ocur->next) { diff --git a/src/meep.hpp b/src/meep.hpp index d92872852..aeb28211e 100644 --- a/src/meep.hpp +++ b/src/meep.hpp @@ -1448,6 +1448,7 @@ class fields_chunk { double a, Courant, dt; // resolution a, Courant number, and timestep dt=Courant/a grid_volume gv; + std::vector gvs; // subdomains for cache-tiled execution of step_curl volume v; double m; // angular dependence in cyl. coords bool zero_fields_near_cylorigin; // fields=0 m pixels near r=0 for stability @@ -1460,7 +1461,7 @@ class fields_chunk { int chunk_idx; fields_chunk(structure_chunk *, const char *outdir, double m, double beta, - bool zero_fields_near_cylorigin, int chunkidx); + bool zero_fields_near_cylorigin, int chunkidx, int loop_tile_base); fields_chunk(const fields_chunk &, int chunkidx); ~fields_chunk(); @@ -1693,7 +1694,8 @@ class fields { bool components_allocated; // fields.cpp methods: - fields(structure *, double m = 0, double beta = 0, bool zero_fields_near_cylorigin = true); + fields(structure *, double m = 0, double beta = 0, bool zero_fields_near_cylorigin = true, + int loop_tile_base = 0); fields(const fields &); ~fields(); bool equal_layout(const fields &f) const; diff --git a/src/meep/vec.hpp b/src/meep/vec.hpp index 6f1c5ae04..553d40faa 100644 --- a/src/meep/vec.hpp +++ b/src/meep/vec.hpp @@ -1027,6 +1027,8 @@ class grid_volume { std::complex get_split_costs(direction d, int split_point, bool frag_cost) const; + void tile_split(int &best_split_point, + direction &best_split_direction) const; void find_best_split(int desired_chunks, bool frag_cost, int &best_split_point, direction &best_split_direction, double &left_effort_fraction) const; diff --git a/src/step_db.cpp b/src/step_db.cpp index 5a31f93bc..7fc6568a1 100644 --- a/src/step_db.cpp +++ b/src/step_db.cpp @@ -44,83 +44,85 @@ bool fields_chunk::step_db(field_type ft) { if (ft != B_stuff && ft != D_stuff) meep::abort("bug - step_db should only be called for B or D"); - DOCMP FOR_FT_COMPONENTS(ft, cc) { - if (f[cc][cmp]) { - const component c_p = plus_component[cc], c_m = minus_component[cc]; - const direction d_deriv_p = plus_deriv_direction[cc]; - const direction d_deriv_m = minus_deriv_direction[cc]; - const direction d_c = component_direction(cc); - const bool have_p = have_plus_deriv[cc]; - const bool have_m = have_minus_deriv[cc]; - const direction dsig0 = cycle_direction(gv.dim, d_c, 1); - const direction dsig = s->sigsize[dsig0] > 1 ? dsig0 : NO_DIRECTION; - const direction dsigu0 = cycle_direction(gv.dim, d_c, 2); - const direction dsigu = s->sigsize[dsigu0] > 1 ? dsigu0 : NO_DIRECTION; - ptrdiff_t stride_p = have_p ? gv.stride(d_deriv_p) : 0; - ptrdiff_t stride_m = have_m ? gv.stride(d_deriv_m) : 0; - realnum *f_p = have_p ? f[c_p][cmp] : NULL; - realnum *f_m = have_m ? f[c_m][cmp] : NULL; - realnum *the_f = f[cc][cmp]; + for (const auto& sub_gv : gvs) { + DOCMP FOR_FT_COMPONENTS(ft, cc) { + if (f[cc][cmp]) { + const component c_p = plus_component[cc], c_m = minus_component[cc]; + const direction d_deriv_p = plus_deriv_direction[cc]; + const direction d_deriv_m = minus_deriv_direction[cc]; + const direction d_c = component_direction(cc); + const bool have_p = have_plus_deriv[cc]; + const bool have_m = have_minus_deriv[cc]; + const direction dsig0 = cycle_direction(gv.dim, d_c, 1); + const direction dsig = s->sigsize[dsig0] > 1 ? dsig0 : NO_DIRECTION; + const direction dsigu0 = cycle_direction(gv.dim, d_c, 2); + const direction dsigu = s->sigsize[dsigu0] > 1 ? dsigu0 : NO_DIRECTION; + ptrdiff_t stride_p = have_p ? gv.stride(d_deriv_p) : 0; + ptrdiff_t stride_m = have_m ? gv.stride(d_deriv_m) : 0; + realnum *f_p = have_p ? f[c_p][cmp] : NULL; + realnum *f_m = have_m ? f[c_m][cmp] : NULL; + realnum *the_f = f[cc][cmp]; - if (dsig != NO_DIRECTION && s->conductivity[cc][d_c] && !f_cond[cc][cmp]) { - f_cond[cc][cmp] = new realnum[gv.ntot()]; - memset(f_cond[cc][cmp], 0, sizeof(realnum) * gv.ntot()); - } - if (dsigu != NO_DIRECTION && !f_u[cc][cmp]) { - f_u[cc][cmp] = new realnum[gv.ntot()]; - memcpy(f_u[cc][cmp], the_f, gv.ntot() * sizeof(realnum)); - allocated_u = true; - } + if (dsig != NO_DIRECTION && s->conductivity[cc][d_c] && !f_cond[cc][cmp]) { + f_cond[cc][cmp] = new realnum[gv.ntot()]; + memset(f_cond[cc][cmp], 0, sizeof(realnum) * gv.ntot()); + } + if (dsigu != NO_DIRECTION && !f_u[cc][cmp]) { + f_u[cc][cmp] = new realnum[gv.ntot()]; + memcpy(f_u[cc][cmp], the_f, gv.ntot() * sizeof(realnum)); + allocated_u = true; + } - if (ft == D_stuff) { // strides are opposite sign for H curl - stride_p = -stride_p; - stride_m = -stride_m; - } + if (ft == D_stuff) { // strides are opposite sign for H curl + stride_p = -stride_p; + stride_m = -stride_m; + } - if (gv.dim == Dcyl) switch (d_c) { - case R: - f_p = NULL; // im/r Fz term will be handled separately - break; - case P: break; // curl works normally for phi component - case Z: { - f_m = NULL; // im/r Fr term will be handled separately + if (gv.dim == Dcyl) switch (d_c) { + case R: + f_p = NULL; // im/r Fz term will be handled separately + break; + case P: break; // curl works normally for phi component + case Z: { + f_m = NULL; // im/r Fr term will be handled separately - /* Here we do a somewhat cool hack: the update of the z - component gives a 1/r d(r Fp)/dr term, rather than - just the derivative dg/dr expected in step_curl. - Rather than duplicating all of step_curl to handle - this bloody derivative, however, we define a new - array f_rderiv_int which is the integral of 1/r d(r Fp)/dr, - so that we can pass it to the unmodified step_curl - and get the correct derivative. (More precisely, - the derivative and integral are replaced by differences - and sums, but you get the idea). */ - if (!f_rderiv_int) f_rderiv_int = new realnum[gv.ntot()]; - realnum ir0 = gv.origin_r() * gv.a + 0.5 * gv.iyee_shift(c_p).in_direction(R); - for (int iz = 0; iz <= gv.nz(); ++iz) - f_rderiv_int[iz] = 0; - int sr = gv.nz() + 1; - for (int ir = 1; ir <= gv.nr(); ++ir) { - realnum rinv = 1.0 / ((ir + ir0) - 0.5); - for (int iz = 0; iz <= gv.nz(); ++iz) { - ptrdiff_t idx = ir * sr + iz; - f_rderiv_int[idx] = - f_rderiv_int[idx - sr] + - rinv * (f_p[idx] * (ir + ir0) - f_p[idx - sr] * ((ir - 1) + ir0)); + /* Here we do a somewhat cool hack: the update of the z + component gives a 1/r d(r Fp)/dr term, rather than + just the derivative dg/dr expected in step_curl. + Rather than duplicating all of step_curl to handle + this bloody derivative, however, we define a new + array f_rderiv_int which is the integral of 1/r d(r Fp)/dr, + so that we can pass it to the unmodified step_curl + and get the correct derivative. (More precisely, + the derivative and integral are replaced by differences + and sums, but you get the idea). */ + if (!f_rderiv_int) f_rderiv_int = new realnum[gv.ntot()]; + realnum ir0 = gv.origin_r() * gv.a + 0.5 * gv.iyee_shift(c_p).in_direction(R); + for (int iz = 0; iz <= gv.nz(); ++iz) + f_rderiv_int[iz] = 0; + int sr = gv.nz() + 1; + for (int ir = 1; ir <= gv.nr(); ++ir) { + realnum rinv = 1.0 / ((ir + ir0) - 0.5); + for (int iz = 0; iz <= gv.nz(); ++iz) { + ptrdiff_t idx = ir * sr + iz; + f_rderiv_int[idx] = + f_rderiv_int[idx - sr] + + rinv * (f_p[idx] * (ir + ir0) - f_p[idx - sr] * ((ir - 1) + ir0)); + } } + f_p = f_rderiv_int; + break; } - f_p = f_rderiv_int; - break; + default: meep::abort("bug - non-cylindrical field component in Dcyl"); } - default: meep::abort("bug - non-cylindrical field component in Dcyl"); - } - STEP_CURL(the_f, cc, f_p, f_m, stride_p, stride_m, - gv, gv.little_owned_corner0(cc), gv.big_corner(), - Courant, dsig, s->sig[dsig], - s->kap[dsig], s->siginv[dsig], f_u[cc][cmp], dsigu, s->sig[dsigu], s->kap[dsigu], - s->siginv[dsigu], dt, s->conductivity[cc][d_c], s->condinv[cc][d_c], - f_cond[cc][cmp]); + STEP_CURL(the_f, cc, f_p, f_m, stride_p, stride_m, + gv, sub_gv.little_owned_corner0(cc), sub_gv.big_corner(), + Courant, dsig, s->sig[dsig], + s->kap[dsig], s->siginv[dsig], f_u[cc][cmp], dsigu, s->sig[dsigu], s->kap[dsigu], + s->siginv[dsigu], dt, s->conductivity[cc][d_c], s->condinv[cc][d_c], + f_cond[cc][cmp]); + } } } diff --git a/src/vec.cpp b/src/vec.cpp index c704749dd..69e809e02 100644 --- a/src/vec.cpp +++ b/src/vec.cpp @@ -978,6 +978,24 @@ static double cost_diff(int desired_chunks, std::complex costs) { return right_cost - left_cost; } +void grid_volume::tile_split(int &best_split_point, + direction &best_split_direction) const { + const size_t ntot_thresh = 10; + if (ntot() < ntot_thresh) { + best_split_point = 0; + best_split_direction = NO_DIRECTION; + } else if (nx() > 1) { + best_split_point = nx() / 2; + best_split_direction = X; + } else if (ny() > 1) { + best_split_point = ny() / 2; + best_split_direction = Y; + } else { + best_split_point = nz() / 2; + best_split_direction = Z; + } +} + void grid_volume::find_best_split(int desired_chunks, bool fragment_cost, int &best_split_point, direction &best_split_direction,