Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

loop tiling for step_curl using cache-oblivious recursive algorithm #1655

Merged
merged 8 commits into from
Aug 11, 2021
5 changes: 4 additions & 1 deletion python/simulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -953,6 +953,7 @@ def __init__(self,
progress_interval=4,
subpixel_tol=1e-4,
subpixel_maxeval=100000,
loop_tile_base=0,
ensure_periodicity=True,
num_chunks=0,
Courant=0.5,
Expand Down Expand Up @@ -1206,6 +1207,7 @@ def __init__(self,
self.eps_averaging = eps_averaging
self.subpixel_tol = subpixel_tol
self.subpixel_maxeval = subpixel_maxeval
self.loop_tile_base = loop_tile_base
self.ensure_periodicity = ensure_periodicity
self.extra_materials = extra_materials
self.default_material = default_material
Expand Down Expand Up @@ -1921,7 +1923,8 @@ def init_sim(self):
self.structure,
self.m if self.is_cylindrical else 0,
self.k_point.z if self.special_kz and self.k_point else 0,
not self.accurate_fields_near_cylorigin
not self.accurate_fields_near_cylorigin,
self.loop_tile_base
)

if self.force_all_components and self.dimensions != 1:
Expand Down
47 changes: 44 additions & 3 deletions src/fields.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ using namespace std;

namespace meep {

fields::fields(structure *s, double m, double beta, bool zero_fields_near_cylorigin)
fields::fields(structure *s, double m, double beta, bool zero_fields_near_cylorigin,
int loop_tile_base)
: S(s->S), gv(s->gv), user_volume(s->user_volume), v(s->v), m(m), beta(beta),
working_on(&times_spent) {
shared_chunks = s->shared_chunks;
Expand Down Expand Up @@ -57,7 +58,8 @@ fields::fields(structure *s, double m, double beta, bool zero_fields_near_cylori
typedef fields_chunk *fields_chunk_ptr;
chunks = new fields_chunk_ptr[num_chunks];
for (int i = 0; i < num_chunks; i++)
chunks[i] = new fields_chunk(s->chunks[i], outdir, m, beta, zero_fields_near_cylorigin, i);
chunks[i] = new fields_chunk(s->chunks[i], outdir, m, beta,
zero_fields_near_cylorigin, i, loop_tile_base);
FOR_FIELD_TYPES(ft) {
typedef realnum *realnum_ptr;
comm_blocks[ft] = new realnum_ptr[num_chunks * num_chunks];
Expand Down Expand Up @@ -202,8 +204,40 @@ fields_chunk::~fields_chunk() {
if (new_s && new_s->refcount-- <= 1) delete new_s; // delete if not shared
}

static void split_into_tiles(grid_volume gvol, std::vector<grid_volume> *result,
const size_t loop_tile_base) {
if (gvol.nowned_min() < loop_tile_base) {
result->push_back(gvol);
return;
}

int best_split_point;
direction best_split_direction;
gvol.tile_split(best_split_point, best_split_direction);
grid_volume left_gvol = gvol.split_at_fraction(false, best_split_point, best_split_direction);
split_into_tiles(left_gvol, result, loop_tile_base);
grid_volume right_gvol = gvol.split_at_fraction(true, best_split_point, best_split_direction);
split_into_tiles(right_gvol, result, loop_tile_base);
return;
oskooi marked this conversation as resolved.
Show resolved Hide resolved
}

// First check that the tile volumes gvs do not intersect and that they add
// up to the chunk's total grid_volume gv
static void check_tiles(grid_volume gv, const std::vector<grid_volume> &gvs) {
grid_volume vol_intersection;
for (size_t i = 0; i < gvs.size(); i++)
for (size_t j = i + 1; j < gvs.size(); j++)
if (gvs[i].intersect_with(gvs[j], &vol_intersection))
meep::abort("gvs[%zu] intersects with gvs[%zu]\n", i, j);
size_t sum = 0;
for (const auto& sub_gv : gvs) { sum += sub_gv.nowned_min(); }
size_t v_grid_points = 1;
LOOP_OVER_DIRECTIONS(gv.dim, d) { v_grid_points *= gv.num_direction(d); }
if (sum != v_grid_points) meep::abort("v_grid_points = %zu, sum(tiles) = %zu\n", v_grid_points, sum);
}

fields_chunk::fields_chunk(structure_chunk *the_s, const char *od, double m, double beta,
bool zero_fields_near_cylorigin, int chunkidx)
bool zero_fields_near_cylorigin, int chunkidx, int loop_tile_base)
: gv(the_s->gv), v(the_s->v), m(m), zero_fields_near_cylorigin(zero_fields_near_cylorigin),
beta(beta) {
s = the_s;
Expand All @@ -216,6 +250,12 @@ fields_chunk::fields_chunk(structure_chunk *the_s, const char *od, double m, dou
Courant = s->Courant;
dt = s->dt;
dft_chunks = NULL;
if (loop_tile_base > 0) {
split_into_tiles(gv, &gvs, loop_tile_base);
check_tiles(gv, gvs);
} else {
gvs.push_back(gv);
}
FOR_FIELD_TYPES(ft) {
polarization_state *cur = NULL;
pol[ft] = NULL;
Expand Down Expand Up @@ -277,6 +317,7 @@ fields_chunk::fields_chunk(const fields_chunk &thef, int chunkidx) : gv(thef.gv)
Courant = thef.Courant;
dt = thef.dt;
dft_chunks = NULL;
gvs = thef.gvs;
FOR_FIELD_TYPES(ft) {
polarization_state *cur = NULL;
for (polarization_state *ocur = thef.pol[ft]; ocur; ocur = ocur->next) {
Expand Down
6 changes: 4 additions & 2 deletions src/meep.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1447,6 +1447,7 @@ class fields_chunk {

double a, Courant, dt; // resolution a, Courant number, and timestep dt=Courant/a
grid_volume gv;
std::vector<grid_volume> gvs; // subdomains for cache-tiled execution of step_curl
volume v;
double m; // angular dependence in cyl. coords
bool zero_fields_near_cylorigin; // fields=0 m pixels near r=0 for stability
Expand All @@ -1459,7 +1460,7 @@ class fields_chunk {
int chunk_idx;

fields_chunk(structure_chunk *, const char *outdir, double m, double beta,
bool zero_fields_near_cylorigin, int chunkidx);
bool zero_fields_near_cylorigin, int chunkidx, int loop_tile_base);

fields_chunk(const fields_chunk &, int chunkidx);
~fields_chunk();
Expand Down Expand Up @@ -1694,7 +1695,8 @@ class fields {
bool components_allocated;

// fields.cpp methods:
fields(structure *, double m = 0, double beta = 0, bool zero_fields_near_cylorigin = true);
fields(structure *, double m = 0, double beta = 0, bool zero_fields_near_cylorigin = true,
int loop_tile_base = 0);
fields(const fields &);
~fields();
bool equal_layout(const fields &f) const;
Expand Down
2 changes: 2 additions & 0 deletions src/meep/vec.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1027,6 +1027,8 @@ class grid_volume {

std::complex<double> get_split_costs(direction d, int split_point,
bool frag_cost) const;
void tile_split(int &best_split_point,
direction &best_split_direction) const;
void find_best_split(int desired_chunks, bool frag_cost,
int &best_split_point, direction &best_split_direction,
double &left_effort_fraction) const;
Expand Down
140 changes: 71 additions & 69 deletions src/step_db.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,83 +44,85 @@ bool fields_chunk::step_db(field_type ft) {

if (ft != B_stuff && ft != D_stuff) meep::abort("bug - step_db should only be called for B or D");

DOCMP FOR_FT_COMPONENTS(ft, cc) {
if (f[cc][cmp]) {
const component c_p = plus_component[cc], c_m = minus_component[cc];
const direction d_deriv_p = plus_deriv_direction[cc];
const direction d_deriv_m = minus_deriv_direction[cc];
const direction d_c = component_direction(cc);
const bool have_p = have_plus_deriv[cc];
const bool have_m = have_minus_deriv[cc];
const direction dsig0 = cycle_direction(gv.dim, d_c, 1);
const direction dsig = s->sigsize[dsig0] > 1 ? dsig0 : NO_DIRECTION;
const direction dsigu0 = cycle_direction(gv.dim, d_c, 2);
const direction dsigu = s->sigsize[dsigu0] > 1 ? dsigu0 : NO_DIRECTION;
ptrdiff_t stride_p = have_p ? gv.stride(d_deriv_p) : 0;
ptrdiff_t stride_m = have_m ? gv.stride(d_deriv_m) : 0;
realnum *f_p = have_p ? f[c_p][cmp] : NULL;
realnum *f_m = have_m ? f[c_m][cmp] : NULL;
realnum *the_f = f[cc][cmp];
for (const auto& sub_gv : gvs) {
DOCMP FOR_FT_COMPONENTS(ft, cc) {
if (f[cc][cmp]) {
const component c_p = plus_component[cc], c_m = minus_component[cc];
const direction d_deriv_p = plus_deriv_direction[cc];
const direction d_deriv_m = minus_deriv_direction[cc];
const direction d_c = component_direction(cc);
const bool have_p = have_plus_deriv[cc];
const bool have_m = have_minus_deriv[cc];
const direction dsig0 = cycle_direction(gv.dim, d_c, 1);
const direction dsig = s->sigsize[dsig0] > 1 ? dsig0 : NO_DIRECTION;
const direction dsigu0 = cycle_direction(gv.dim, d_c, 2);
const direction dsigu = s->sigsize[dsigu0] > 1 ? dsigu0 : NO_DIRECTION;
ptrdiff_t stride_p = have_p ? gv.stride(d_deriv_p) : 0;
ptrdiff_t stride_m = have_m ? gv.stride(d_deriv_m) : 0;
realnum *f_p = have_p ? f[c_p][cmp] : NULL;
realnum *f_m = have_m ? f[c_m][cmp] : NULL;
realnum *the_f = f[cc][cmp];

if (dsig != NO_DIRECTION && s->conductivity[cc][d_c] && !f_cond[cc][cmp]) {
f_cond[cc][cmp] = new realnum[gv.ntot()];
memset(f_cond[cc][cmp], 0, sizeof(realnum) * gv.ntot());
}
if (dsigu != NO_DIRECTION && !f_u[cc][cmp]) {
f_u[cc][cmp] = new realnum[gv.ntot()];
memcpy(f_u[cc][cmp], the_f, gv.ntot() * sizeof(realnum));
allocated_u = true;
}
if (dsig != NO_DIRECTION && s->conductivity[cc][d_c] && !f_cond[cc][cmp]) {
f_cond[cc][cmp] = new realnum[gv.ntot()];
memset(f_cond[cc][cmp], 0, sizeof(realnum) * gv.ntot());
}
if (dsigu != NO_DIRECTION && !f_u[cc][cmp]) {
f_u[cc][cmp] = new realnum[gv.ntot()];
memcpy(f_u[cc][cmp], the_f, gv.ntot() * sizeof(realnum));
allocated_u = true;
}

if (ft == D_stuff) { // strides are opposite sign for H curl
stride_p = -stride_p;
stride_m = -stride_m;
}
if (ft == D_stuff) { // strides are opposite sign for H curl
stride_p = -stride_p;
stride_m = -stride_m;
}

if (gv.dim == Dcyl) switch (d_c) {
case R:
f_p = NULL; // im/r Fz term will be handled separately
break;
case P: break; // curl works normally for phi component
case Z: {
f_m = NULL; // im/r Fr term will be handled separately
if (gv.dim == Dcyl) switch (d_c) {
case R:
f_p = NULL; // im/r Fz term will be handled separately
break;
case P: break; // curl works normally for phi component
case Z: {
f_m = NULL; // im/r Fr term will be handled separately

/* Here we do a somewhat cool hack: the update of the z
component gives a 1/r d(r Fp)/dr term, rather than
just the derivative dg/dr expected in step_curl.
Rather than duplicating all of step_curl to handle
this bloody derivative, however, we define a new
array f_rderiv_int which is the integral of 1/r d(r Fp)/dr,
so that we can pass it to the unmodified step_curl
and get the correct derivative. (More precisely,
the derivative and integral are replaced by differences
and sums, but you get the idea). */
if (!f_rderiv_int) f_rderiv_int = new realnum[gv.ntot()];
realnum ir0 = gv.origin_r() * gv.a + 0.5 * gv.iyee_shift(c_p).in_direction(R);
for (int iz = 0; iz <= gv.nz(); ++iz)
f_rderiv_int[iz] = 0;
int sr = gv.nz() + 1;
for (int ir = 1; ir <= gv.nr(); ++ir) {
realnum rinv = 1.0 / ((ir + ir0) - 0.5);
for (int iz = 0; iz <= gv.nz(); ++iz) {
ptrdiff_t idx = ir * sr + iz;
f_rderiv_int[idx] =
f_rderiv_int[idx - sr] +
rinv * (f_p[idx] * (ir + ir0) - f_p[idx - sr] * ((ir - 1) + ir0));
/* Here we do a somewhat cool hack: the update of the z
component gives a 1/r d(r Fp)/dr term, rather than
just the derivative dg/dr expected in step_curl.
Rather than duplicating all of step_curl to handle
this bloody derivative, however, we define a new
array f_rderiv_int which is the integral of 1/r d(r Fp)/dr,
so that we can pass it to the unmodified step_curl
and get the correct derivative. (More precisely,
the derivative and integral are replaced by differences
and sums, but you get the idea). */
if (!f_rderiv_int) f_rderiv_int = new realnum[gv.ntot()];
realnum ir0 = gv.origin_r() * gv.a + 0.5 * gv.iyee_shift(c_p).in_direction(R);
for (int iz = 0; iz <= gv.nz(); ++iz)
f_rderiv_int[iz] = 0;
int sr = gv.nz() + 1;
for (int ir = 1; ir <= gv.nr(); ++ir) {
realnum rinv = 1.0 / ((ir + ir0) - 0.5);
for (int iz = 0; iz <= gv.nz(); ++iz) {
ptrdiff_t idx = ir * sr + iz;
f_rderiv_int[idx] =
f_rderiv_int[idx - sr] +
rinv * (f_p[idx] * (ir + ir0) - f_p[idx - sr] * ((ir - 1) + ir0));
}
}
f_p = f_rderiv_int;
break;
}
f_p = f_rderiv_int;
break;
default: meep::abort("bug - non-cylindrical field component in Dcyl");
}
default: meep::abort("bug - non-cylindrical field component in Dcyl");
}

STEP_CURL(the_f, cc, f_p, f_m, stride_p, stride_m,
gv, gv.little_owned_corner0(cc), gv.big_corner(),
Courant, dsig, s->sig[dsig],
s->kap[dsig], s->siginv[dsig], f_u[cc][cmp], dsigu, s->sig[dsigu], s->kap[dsigu],
s->siginv[dsigu], dt, s->conductivity[cc][d_c], s->condinv[cc][d_c],
f_cond[cc][cmp]);
STEP_CURL(the_f, cc, f_p, f_m, stride_p, stride_m,
gv, sub_gv.little_owned_corner0(cc), sub_gv.big_corner(),
Courant, dsig, s->sig[dsig],
s->kap[dsig], s->siginv[dsig], f_u[cc][cmp], dsigu, s->sig[dsigu], s->kap[dsigu],
s->siginv[dsigu], dt, s->conductivity[cc][d_c], s->condinv[cc][d_c],
f_cond[cc][cmp]);
}
}
}

Expand Down
18 changes: 18 additions & 0 deletions src/vec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -978,6 +978,24 @@ static double cost_diff(int desired_chunks, std::complex<double> costs) {
return right_cost - left_cost;
}

void grid_volume::tile_split(int &best_split_point,
direction &best_split_direction) const {
const size_t ntot_thresh = 10;
if (ntot() < ntot_thresh) {
best_split_point = 0;
best_split_direction = NO_DIRECTION;
} else if (nx() > 1) {
best_split_point = nx() / 2;
best_split_direction = X;
} else if (ny() > 1) {
best_split_point = ny() / 2;
best_split_direction = Y;
} else {
best_split_point = nz() / 2;
best_split_direction = Z;
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be worth also trying to split along the longest axis.

}

void grid_volume::find_best_split(int desired_chunks, bool fragment_cost,
int &best_split_point,
direction &best_split_direction,
Expand Down