Skip to content

Commit

Permalink
loop tiling for step_curl using cache-oblivious recursive algorithm (N…
Browse files Browse the repository at this point in the history
…anoComp#1655)

* loop tiling for step_curl using cache-oblivious recursive algorithm

* minor fixes

* check that the sum of all the tiles adds up to the chunk volume

* switch from subdomain to gv when calling STEP_CURL

* replace manual computation of pixels in grid_volume with private function call

* lower mininum tile volume

* move loop over tiles outside of loop over components and use new tile_split function

* add undocumented parameter for specifying base case of recursion to Simulation constructor
  • Loading branch information
oskooi authored and mawc2019 committed Nov 3, 2021
1 parent fbda172 commit 61599ec
Show file tree
Hide file tree
Showing 6 changed files with 143 additions and 75 deletions.
5 changes: 4 additions & 1 deletion python/simulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -953,6 +953,7 @@ def __init__(self,
progress_interval=4,
subpixel_tol=1e-4,
subpixel_maxeval=100000,
loop_tile_base=0,
ensure_periodicity=True,
num_chunks=0,
Courant=0.5,
Expand Down Expand Up @@ -1206,6 +1207,7 @@ def __init__(self,
self.eps_averaging = eps_averaging
self.subpixel_tol = subpixel_tol
self.subpixel_maxeval = subpixel_maxeval
self.loop_tile_base = loop_tile_base
self.ensure_periodicity = ensure_periodicity
self.extra_materials = extra_materials
self.default_material = default_material
Expand Down Expand Up @@ -1921,7 +1923,8 @@ def init_sim(self):
self.structure,
self.m if self.is_cylindrical else 0,
self.k_point.z if self.special_kz and self.k_point else 0,
not self.accurate_fields_near_cylorigin
not self.accurate_fields_near_cylorigin,
self.loop_tile_base
)

if self.force_all_components and self.dimensions != 1:
Expand Down
47 changes: 44 additions & 3 deletions src/fields.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ using namespace std;

namespace meep {

fields::fields(structure *s, double m, double beta, bool zero_fields_near_cylorigin)
fields::fields(structure *s, double m, double beta, bool zero_fields_near_cylorigin,
int loop_tile_base)
: S(s->S), gv(s->gv), user_volume(s->user_volume), v(s->v), m(m), beta(beta),
working_on(&times_spent) {
shared_chunks = s->shared_chunks;
Expand Down Expand Up @@ -57,7 +58,8 @@ fields::fields(structure *s, double m, double beta, bool zero_fields_near_cylori
typedef fields_chunk *fields_chunk_ptr;
chunks = new fields_chunk_ptr[num_chunks];
for (int i = 0; i < num_chunks; i++)
chunks[i] = new fields_chunk(s->chunks[i], outdir, m, beta, zero_fields_near_cylorigin, i);
chunks[i] = new fields_chunk(s->chunks[i], outdir, m, beta,
zero_fields_near_cylorigin, i, loop_tile_base);
FOR_FIELD_TYPES(ft) {
typedef realnum *realnum_ptr;
comm_blocks[ft] = new realnum_ptr[num_chunks * num_chunks];
Expand Down Expand Up @@ -196,8 +198,40 @@ fields_chunk::~fields_chunk() {
if (new_s && new_s->refcount-- <= 1) delete new_s; // delete if not shared
}

static void split_into_tiles(grid_volume gvol, std::vector<grid_volume> *result,
const size_t loop_tile_base) {
if (gvol.nowned_min() < loop_tile_base) {
result->push_back(gvol);
return;
}

int best_split_point;
direction best_split_direction;
gvol.tile_split(best_split_point, best_split_direction);
grid_volume left_gvol = gvol.split_at_fraction(false, best_split_point, best_split_direction);
split_into_tiles(left_gvol, result, loop_tile_base);
grid_volume right_gvol = gvol.split_at_fraction(true, best_split_point, best_split_direction);
split_into_tiles(right_gvol, result, loop_tile_base);
return;
}

// First check that the tile volumes gvs do not intersect and that they add
// up to the chunk's total grid_volume gv
static void check_tiles(grid_volume gv, const std::vector<grid_volume> &gvs) {
grid_volume vol_intersection;
for (size_t i = 0; i < gvs.size(); i++)
for (size_t j = i + 1; j < gvs.size(); j++)
if (gvs[i].intersect_with(gvs[j], &vol_intersection))
meep::abort("gvs[%zu] intersects with gvs[%zu]\n", i, j);
size_t sum = 0;
for (const auto& sub_gv : gvs) { sum += sub_gv.nowned_min(); }
size_t v_grid_points = 1;
LOOP_OVER_DIRECTIONS(gv.dim, d) { v_grid_points *= gv.num_direction(d); }
if (sum != v_grid_points) meep::abort("v_grid_points = %zu, sum(tiles) = %zu\n", v_grid_points, sum);
}

fields_chunk::fields_chunk(structure_chunk *the_s, const char *od, double m, double beta,
bool zero_fields_near_cylorigin, int chunkidx)
bool zero_fields_near_cylorigin, int chunkidx, int loop_tile_base)
: gv(the_s->gv), v(the_s->v), m(m), zero_fields_near_cylorigin(zero_fields_near_cylorigin),
beta(beta) {
s = the_s;
Expand All @@ -210,6 +244,12 @@ fields_chunk::fields_chunk(structure_chunk *the_s, const char *od, double m, dou
Courant = s->Courant;
dt = s->dt;
dft_chunks = NULL;
if (loop_tile_base > 0) {
split_into_tiles(gv, &gvs, loop_tile_base);
check_tiles(gv, gvs);
} else {
gvs.push_back(gv);
}
FOR_FIELD_TYPES(ft) {
polarization_state *cur = NULL;
pol[ft] = NULL;
Expand Down Expand Up @@ -265,6 +305,7 @@ fields_chunk::fields_chunk(const fields_chunk &thef, int chunkidx) : gv(thef.gv)
Courant = thef.Courant;
dt = thef.dt;
dft_chunks = NULL;
gvs = thef.gvs;
FOR_FIELD_TYPES(ft) {
polarization_state *cur = NULL;
for (polarization_state *ocur = thef.pol[ft]; ocur; ocur = ocur->next) {
Expand Down
6 changes: 4 additions & 2 deletions src/meep.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1448,6 +1448,7 @@ class fields_chunk {

double a, Courant, dt; // resolution a, Courant number, and timestep dt=Courant/a
grid_volume gv;
std::vector<grid_volume> gvs; // subdomains for cache-tiled execution of step_curl
volume v;
double m; // angular dependence in cyl. coords
bool zero_fields_near_cylorigin; // fields=0 m pixels near r=0 for stability
Expand All @@ -1460,7 +1461,7 @@ class fields_chunk {
int chunk_idx;

fields_chunk(structure_chunk *, const char *outdir, double m, double beta,
bool zero_fields_near_cylorigin, int chunkidx);
bool zero_fields_near_cylorigin, int chunkidx, int loop_tile_base);

fields_chunk(const fields_chunk &, int chunkidx);
~fields_chunk();
Expand Down Expand Up @@ -1693,7 +1694,8 @@ class fields {
bool components_allocated;

// fields.cpp methods:
fields(structure *, double m = 0, double beta = 0, bool zero_fields_near_cylorigin = true);
fields(structure *, double m = 0, double beta = 0, bool zero_fields_near_cylorigin = true,
int loop_tile_base = 0);
fields(const fields &);
~fields();
bool equal_layout(const fields &f) const;
Expand Down
2 changes: 2 additions & 0 deletions src/meep/vec.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1027,6 +1027,8 @@ class grid_volume {

std::complex<double> get_split_costs(direction d, int split_point,
bool frag_cost) const;
void tile_split(int &best_split_point,
direction &best_split_direction) const;
void find_best_split(int desired_chunks, bool frag_cost,
int &best_split_point, direction &best_split_direction,
double &left_effort_fraction) const;
Expand Down
140 changes: 71 additions & 69 deletions src/step_db.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,83 +44,85 @@ bool fields_chunk::step_db(field_type ft) {

if (ft != B_stuff && ft != D_stuff) meep::abort("bug - step_db should only be called for B or D");

DOCMP FOR_FT_COMPONENTS(ft, cc) {
if (f[cc][cmp]) {
const component c_p = plus_component[cc], c_m = minus_component[cc];
const direction d_deriv_p = plus_deriv_direction[cc];
const direction d_deriv_m = minus_deriv_direction[cc];
const direction d_c = component_direction(cc);
const bool have_p = have_plus_deriv[cc];
const bool have_m = have_minus_deriv[cc];
const direction dsig0 = cycle_direction(gv.dim, d_c, 1);
const direction dsig = s->sigsize[dsig0] > 1 ? dsig0 : NO_DIRECTION;
const direction dsigu0 = cycle_direction(gv.dim, d_c, 2);
const direction dsigu = s->sigsize[dsigu0] > 1 ? dsigu0 : NO_DIRECTION;
ptrdiff_t stride_p = have_p ? gv.stride(d_deriv_p) : 0;
ptrdiff_t stride_m = have_m ? gv.stride(d_deriv_m) : 0;
realnum *f_p = have_p ? f[c_p][cmp] : NULL;
realnum *f_m = have_m ? f[c_m][cmp] : NULL;
realnum *the_f = f[cc][cmp];
for (const auto& sub_gv : gvs) {
DOCMP FOR_FT_COMPONENTS(ft, cc) {
if (f[cc][cmp]) {
const component c_p = plus_component[cc], c_m = minus_component[cc];
const direction d_deriv_p = plus_deriv_direction[cc];
const direction d_deriv_m = minus_deriv_direction[cc];
const direction d_c = component_direction(cc);
const bool have_p = have_plus_deriv[cc];
const bool have_m = have_minus_deriv[cc];
const direction dsig0 = cycle_direction(gv.dim, d_c, 1);
const direction dsig = s->sigsize[dsig0] > 1 ? dsig0 : NO_DIRECTION;
const direction dsigu0 = cycle_direction(gv.dim, d_c, 2);
const direction dsigu = s->sigsize[dsigu0] > 1 ? dsigu0 : NO_DIRECTION;
ptrdiff_t stride_p = have_p ? gv.stride(d_deriv_p) : 0;
ptrdiff_t stride_m = have_m ? gv.stride(d_deriv_m) : 0;
realnum *f_p = have_p ? f[c_p][cmp] : NULL;
realnum *f_m = have_m ? f[c_m][cmp] : NULL;
realnum *the_f = f[cc][cmp];

if (dsig != NO_DIRECTION && s->conductivity[cc][d_c] && !f_cond[cc][cmp]) {
f_cond[cc][cmp] = new realnum[gv.ntot()];
memset(f_cond[cc][cmp], 0, sizeof(realnum) * gv.ntot());
}
if (dsigu != NO_DIRECTION && !f_u[cc][cmp]) {
f_u[cc][cmp] = new realnum[gv.ntot()];
memcpy(f_u[cc][cmp], the_f, gv.ntot() * sizeof(realnum));
allocated_u = true;
}
if (dsig != NO_DIRECTION && s->conductivity[cc][d_c] && !f_cond[cc][cmp]) {
f_cond[cc][cmp] = new realnum[gv.ntot()];
memset(f_cond[cc][cmp], 0, sizeof(realnum) * gv.ntot());
}
if (dsigu != NO_DIRECTION && !f_u[cc][cmp]) {
f_u[cc][cmp] = new realnum[gv.ntot()];
memcpy(f_u[cc][cmp], the_f, gv.ntot() * sizeof(realnum));
allocated_u = true;
}

if (ft == D_stuff) { // strides are opposite sign for H curl
stride_p = -stride_p;
stride_m = -stride_m;
}
if (ft == D_stuff) { // strides are opposite sign for H curl
stride_p = -stride_p;
stride_m = -stride_m;
}

if (gv.dim == Dcyl) switch (d_c) {
case R:
f_p = NULL; // im/r Fz term will be handled separately
break;
case P: break; // curl works normally for phi component
case Z: {
f_m = NULL; // im/r Fr term will be handled separately
if (gv.dim == Dcyl) switch (d_c) {
case R:
f_p = NULL; // im/r Fz term will be handled separately
break;
case P: break; // curl works normally for phi component
case Z: {
f_m = NULL; // im/r Fr term will be handled separately

/* Here we do a somewhat cool hack: the update of the z
component gives a 1/r d(r Fp)/dr term, rather than
just the derivative dg/dr expected in step_curl.
Rather than duplicating all of step_curl to handle
this bloody derivative, however, we define a new
array f_rderiv_int which is the integral of 1/r d(r Fp)/dr,
so that we can pass it to the unmodified step_curl
and get the correct derivative. (More precisely,
the derivative and integral are replaced by differences
and sums, but you get the idea). */
if (!f_rderiv_int) f_rderiv_int = new realnum[gv.ntot()];
realnum ir0 = gv.origin_r() * gv.a + 0.5 * gv.iyee_shift(c_p).in_direction(R);
for (int iz = 0; iz <= gv.nz(); ++iz)
f_rderiv_int[iz] = 0;
int sr = gv.nz() + 1;
for (int ir = 1; ir <= gv.nr(); ++ir) {
realnum rinv = 1.0 / ((ir + ir0) - 0.5);
for (int iz = 0; iz <= gv.nz(); ++iz) {
ptrdiff_t idx = ir * sr + iz;
f_rderiv_int[idx] =
f_rderiv_int[idx - sr] +
rinv * (f_p[idx] * (ir + ir0) - f_p[idx - sr] * ((ir - 1) + ir0));
/* Here we do a somewhat cool hack: the update of the z
component gives a 1/r d(r Fp)/dr term, rather than
just the derivative dg/dr expected in step_curl.
Rather than duplicating all of step_curl to handle
this bloody derivative, however, we define a new
array f_rderiv_int which is the integral of 1/r d(r Fp)/dr,
so that we can pass it to the unmodified step_curl
and get the correct derivative. (More precisely,
the derivative and integral are replaced by differences
and sums, but you get the idea). */
if (!f_rderiv_int) f_rderiv_int = new realnum[gv.ntot()];
realnum ir0 = gv.origin_r() * gv.a + 0.5 * gv.iyee_shift(c_p).in_direction(R);
for (int iz = 0; iz <= gv.nz(); ++iz)
f_rderiv_int[iz] = 0;
int sr = gv.nz() + 1;
for (int ir = 1; ir <= gv.nr(); ++ir) {
realnum rinv = 1.0 / ((ir + ir0) - 0.5);
for (int iz = 0; iz <= gv.nz(); ++iz) {
ptrdiff_t idx = ir * sr + iz;
f_rderiv_int[idx] =
f_rderiv_int[idx - sr] +
rinv * (f_p[idx] * (ir + ir0) - f_p[idx - sr] * ((ir - 1) + ir0));
}
}
f_p = f_rderiv_int;
break;
}
f_p = f_rderiv_int;
break;
default: meep::abort("bug - non-cylindrical field component in Dcyl");
}
default: meep::abort("bug - non-cylindrical field component in Dcyl");
}

STEP_CURL(the_f, cc, f_p, f_m, stride_p, stride_m,
gv, gv.little_owned_corner0(cc), gv.big_corner(),
Courant, dsig, s->sig[dsig],
s->kap[dsig], s->siginv[dsig], f_u[cc][cmp], dsigu, s->sig[dsigu], s->kap[dsigu],
s->siginv[dsigu], dt, s->conductivity[cc][d_c], s->condinv[cc][d_c],
f_cond[cc][cmp]);
STEP_CURL(the_f, cc, f_p, f_m, stride_p, stride_m,
gv, sub_gv.little_owned_corner0(cc), sub_gv.big_corner(),
Courant, dsig, s->sig[dsig],
s->kap[dsig], s->siginv[dsig], f_u[cc][cmp], dsigu, s->sig[dsigu], s->kap[dsigu],
s->siginv[dsigu], dt, s->conductivity[cc][d_c], s->condinv[cc][d_c],
f_cond[cc][cmp]);
}
}
}

Expand Down
18 changes: 18 additions & 0 deletions src/vec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -978,6 +978,24 @@ static double cost_diff(int desired_chunks, std::complex<double> costs) {
return right_cost - left_cost;
}

void grid_volume::tile_split(int &best_split_point,
direction &best_split_direction) const {
const size_t ntot_thresh = 10;
if (ntot() < ntot_thresh) {
best_split_point = 0;
best_split_direction = NO_DIRECTION;
} else if (nx() > 1) {
best_split_point = nx() / 2;
best_split_direction = X;
} else if (ny() > 1) {
best_split_point = ny() / 2;
best_split_direction = Y;
} else {
best_split_point = nz() / 2;
best_split_direction = Z;
}
}

void grid_volume::find_best_split(int desired_chunks, bool fragment_cost,
int &best_split_point,
direction &best_split_direction,
Expand Down

0 comments on commit 61599ec

Please sign in to comment.