Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding ESPResSo test PR #144

Merged
merged 19 commits into from
Jun 11, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
34 changes: 34 additions & 0 deletions eessi/testsuite/tests/apps/espresso/benchmarks.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"mode","cores","mpi.x","mpi.y","mpi.z","particles","mean","std"
casparvl marked this conversation as resolved.
Show resolved Hide resolved
"weak scaling",4,2,2,1,6912,2.341e-01,8.081e-03
"strong scaling",4,2,2,1,5832,2.496e-01,9.019e-03
"weak scaling",16,4,2,2,27648,2.417e+00,9.576e-02
"strong scaling",16,4,2,2,5832,3.853e-02,1.991e-03
"weak scaling",32,4,4,2,55296,4.263e+00,1.161e+00
"strong scaling",32,4,4,2,5832,2.194e-02,7.303e-04
"weak scaling",1,1,1,1,1728,7.655e-02,3.434e-03
"weak scaling",2,2,1,1,3456,1.456e-01,4.679e-03
"strong scaling",2,2,1,1,5832,3.936e-01,1.098e-02
"strong scaling",1,1,1,1,5832,6.333e-01,1.194e-01
"strong scaling",64,4,4,4,5832,1.910e-02,6.132e-04
"weak scaling",1,1,1,1,1728,9.482e-02,2.956e-03
"weak scaling",2,2,1,1,3456,2.111e-01,6.614e-03
"strong scaling",1,1,1,1,5832,9.133e-01,2.868e-02
"strong scaling",16,4,2,2,5832,4.285e-02,1.327e-03
"strong scaling",64,4,4,4,5832,1.715e-02,5.776e-04
"strong scaling",128,8,4,4,5832,1.980e-02,7.013e-04
"weak scaling",64,4,4,4,110592,4.375e-01,1.414e-02
"weak scaling",100,5,5,4,172800,4.450e-01,1.437e-02
"weak scaling",128,8,4,4,221184,8.720e+00,2.753e-01
"weak scaling",128,8,4,4,221184,8.760e+00,3.110e-01
"weak scaling",4,2,2,1,6912,2.626e-01,8.142e-03
"weak scaling",4,2,2,1,6912,2.780e-01,8.683e-03
"weak scaling",4,2,2,1,6912,2.627e-01,8.391e-03
"weak scaling",4,2,2,1,6912,2.617e-01,8.155e-03
"weak scaling",2,2,1,1,3456,2.028e-01,6.255e-03
"weak scaling",2,2,1,1,3456,3.247e-01,1.026e-02
"weak scaling",2,2,1,1,3456,3.249e-01,1.029e-02
"weak scaling",2,2,1,1,3456,3.257e-01,1.028e-02
"weak scaling",2,2,1,1,3456,3.375e-01,1.095e-02
"weak scaling",2,2,1,1,3456,3.367e-01,1.086e-02
"weak scaling",2,2,1,1,3456,3.241e-01,1.048e-02
"weak scaling",2,2,1,1,3456,3.243e-01,1.038e-02
112 changes: 112 additions & 0 deletions eessi/testsuite/tests/apps/espresso/espresso.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""
This module tests Espresso in available modules containing substring 'ESPResSo' which is different from Quantum Espresso.
Tests included:
- P3M benchmark - Ionic crystals
- Weak scaling
- Strong scaling
Weak and strong scaling are options that need to be provided to the script; the system is either scaled based on
number of cores or kept constant.
"""

import reframe as rfm
import reframe.utility.sanity as sn

from reframe.core.builtins import parameter, run_after # added only to make the linter happy
from reframe.utility import reframe

from hpctestlib.microbenchmarks.mpi.osu import osu_benchmark

from eessi.testsuite import hooks, utils
from eessi.testsuite.constants import *
from eessi.testsuite.utils import find_modules, log

@rfm.simple_test
class EESSI_ESPRESSO_P3M_IONIC_CRYSTALS(rfm.RunOnlyRegressionTest):
    '''P3M ionic-crystals (Madelung) benchmark for ESPResSo modules.'''
    # Scales at which the test may run; filtered later against the ReFrame config.
    scale = parameter(SCALES.keys())
    valid_prog_environs = ['default']
    valid_systems = ['*']
    time_limit = '30m'
    # Need to check if QuantumESPRESSO also gets listed.
    module_name = parameter(find_modules('ESPResSo'))
    # device type is parameterized for an impending CUDA ESPResSo module.
    device_type = parameter([DEVICE_TYPES[CPU]])

    executable = 'python3 madelung.py'

    # Default problem sizes passed to madelung.py via --size.
    default_strong_scaling_system_size = 9
    default_weak_scaling_system_size = 6

    # (benchmark name, short label) pairs; currently only the P3M benchmark.
    benchmark_info = parameter([
        ('mpi.ionic_crystals.p3m', 'p3m'),
    ], fmt=lambda x: x[0], loggable=True)


    @run_after('init')
    def run_after_init(self):
        """Hooks to run after the init phase."""

        # Filter on which scales are supported by the partitions defined in the ReFrame configuration
        hooks.filter_supported_scales(self)

        # Keep only systems that provide the requested device type (CPU here).
        hooks.filter_valid_systems_by_device_type(self, required_device_type=self.device_type)

        # Load the ESPResSo module selected by the module_name parameter.
        hooks.set_modules(self)

        # Set scales as tags
        hooks.set_tag_scale(self)

@run_after('init')
def set_tag_ci(self):
""" Setting tests under CI tag. """
if (self.benchmark_info[0] in ['mpi.ionic_crystals.p3m']):
self.tags.add('CI')
log(f'tags set to {self.tags}')

if (self.benchmark_info[0] == 'mpi.ionic_crystals.p3m'):
self.tags.add('ionic_crystals_p3m')


    @run_after('init')
    def set_mem(self):
        """ Setting an extra job option of memory. """
        # NOTE(review): a flat 50GB request is likely too coarse. Reviewers
        # suggested using hooks.req_memory_per_node(test, app_mem_req) and
        # scaling the requirement with the task count; a fixed request also
        # caused OOM at larger scales. TODO: confirm the per-task memory
        # footprint and switch to that hook.
        self.extra_resources = {'memory': {'size': '50GB'}}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please use

def req_memory_per_node(test: rfm.RegressionTest, app_mem_req):
to request a certain amount of memory.

Also, I assume the memory requirement isn't a fixed 50GB, but depends on the scale at which this is run (i.e. number of tasks)? Or doesn't it? If it does, please define an approximate function to compute the memory requirement as a function of task count. It's fine if it is somewhat conservative (i.e. asks for too much), but be aware that the test will be skipped on systems where insufficient memory is available (so don't over-do it).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was the problem causing OOM and I have fixed it. I am yet to push it into the PR as I am making some more incremental changes to it. I am also observing some crashes on zen4 such as:
https://gitlab.com/eessi/support/-/issues/37#note_1927317164


@run_after('init')
def set_executable_opts(self):
"""Set executable opts based on device_type parameter"""
num_default = 0 # If this test already has executable opts, they must have come from the command line
hooks.check_custom_executable_opts(self, num_default=num_default)
if not self.has_custom_executable_opts:
# By default we run weak scaling since the strong scaling sizes need to change based on max node size and a
# corresponding min node size has to be chozen.
self.executable_opts += ['--size', str(self.default_weak_scaling_system_size), '--weak-scaling']
utils.log(f'executable_opts set to {self.executable_opts}')

    @run_after('setup')
    def set_num_tasks_per_node(self):
        """ Setting number of tasks per node and cpus per task in this function. This function sets num_cpus_per_task
        for 1 node and 2 node options where the request is for full nodes."""
        # Delegate the task/CPU layout to the shared hook, one task per CPU core.
        hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT[CPU])

@deferrable
def assert_completion(self):
'''Check completion'''
cao = sn.extractsingle(r'^resulting parameters:.*cao: (?P<cao>\S+),', self.stdout, 'cao', int)
return (sn.assert_found(r'^Algorithm executed.', self.stdout) and cao)

@deferrable
def assert_convergence(self):
'''Check convergence'''
check_string = sn.assert_found(r'Final convergence met with tolerances:', self.stdout)
energy = sn.extractsingle(r'^\s+energy:\s+(?P<energy>\S+)', self.stdout, 'energy', float)
return (check_string and (energy != 0.0))

@sanity_function
def assert_sanity(self):
'''Check all sanity criteria'''
return sn.all([
self.assert_completion(),
self.assert_convergence(),
])

Empty file.
10 changes: 10 additions & 0 deletions eessi/testsuite/tests/apps/espresso/src/job.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash
# Standalone Slurm job script to run the madelung.py benchmark outside of
# ReFrame: one weak-scaling run (size 6) and one strong-scaling run (size 9).
#SBATCH --time=00:40:00
#SBATCH --output %j.stdout
#SBATCH --error %j.stderr
# Toolchain needed by ESPResSo (compiler, MPI, FFTW, Boost, Python).
module load spack/default gcc/12.3.0 cuda/12.3.0 openmpi/4.1.6 \
    fftw/3.3.10 boost/1.83.0 python/3.12.1
# Virtual environment providing the espressomd Python module.
source ../espresso-4.3/venv/bin/activate
srun --cpu-bind=cores python3 madelung.py --size 6 --weak-scaling
srun --cpu-bind=cores python3 madelung.py --size 9 --strong-scaling
deactivate
152 changes: 152 additions & 0 deletions eessi/testsuite/tests/apps/espresso/src/madelung.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#
# Copyright (C) 2013-2024 The ESPResSo project
#
# This file is part of ESPResSo.
#
# ESPResSo is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# ESPResSo is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

import espressomd
import espressomd.version
import espressomd.electrostatics
import argparse
import pathlib
import time
import numpy as np

# Command-line interface: problem size, device, MPI topology, output file,
# and a mutually exclusive choice between weak and strong scaling.
parser = argparse.ArgumentParser(description="Benchmark P3M simulations.")
# NOTE: the lattice is three-dimensional (see the j/k/l fill loops below), so
# the particle count is (2*S)^3, not (2*S)^2 as the help text originally said
# (e.g. --size 6 on 1 core yields 1728 = 12^3 particles in benchmarks.csv).
parser.add_argument("--size", metavar="S", action="store",
                    default=9, required=False, type=int,
                    help="Problem size, such that the number of particles N is "
                         "equal to (2*S)^3; with --weak-scaling this number N "
                         "is multiplied by the number of cores!")
parser.add_argument("--gpu", action=argparse.BooleanOptionalAction,
                    default=False, required=False, help="Use GPU implementation")
parser.add_argument("--topology", metavar=("X", "Y", "Z"), nargs=3, action="store",
                    default=None, required=False, type=int, help="Cartesian topology")
parser.add_argument("--output", metavar="FILEPATH", action="store",
                    type=str, required=False, default="benchmarks.csv",
                    help="Output file (default: benchmarks.csv)")
group = parser.add_mutually_exclusive_group()
group.add_argument("--weak-scaling", action="store_true",
                   help="Weak scaling benchmark (Gustafson's law: constant work per core)")
group.add_argument("--strong-scaling", action="store_true",
                   help="Strong scaling benchmark (Amdahl's law: constant total work)")
args = parser.parse_args()

def get_reference_values_per_ion(base_vector):
    """Return the analytical per-ion energy and pressure tensor of a NaCl
    lattice with unit charges (Madelung constant), for the given base vector."""
    MADELUNG = -1.74756459463318219
    lattice_tensor = np.eye(3) * base_vector
    pressure_ref = MADELUNG * lattice_tensor / np.trace(lattice_tensor)
    return MADELUNG, pressure_ref

def get_normalized_values_per_ion(system):
    """Return the simulated per-ion Coulomb energy, scalar pressure and
    pressure tensor, normalized for comparison with the reference values."""
    coulomb_energy = system.analysis.energy()["coulomb"]
    scalar_pressure = system.analysis.pressure()["coulomb"]
    tensor_pressure = system.analysis.pressure_tensor()["coulomb"]
    n_ions = len(system.part)
    volume = system.volume()
    norm = 2. / n_ions
    return norm * coulomb_energy, norm * scalar_pressure * volume, norm * tensor_pressure * volume

# initialize system
system = espressomd.System(box_l=[100., 100., 100.])
system.time_step = 0.01
system.cell_system.skin = 0.4

# set MPI Cartesian topology
node_grid = system.cell_system.node_grid.copy()
n_cores = int(np.prod(node_grid))
if args.topology:
    system.cell_system.node_grid = node_grid = args.topology

# place ions on a cubic lattice with alternating +1/-1 charges (NaCl structure)
base_vector = np.array([1., 1., 1.])
lattice_size = 3 * [2 * args.size]
if args.weak_scaling:
    # weak scaling: grow the lattice with the MPI grid so work per core is constant
    lattice_size = np.multiply(lattice_size, node_grid)
system.box_l = np.multiply(lattice_size, base_vector)
for j in range(lattice_size[0]):
    for k in range(lattice_size[1]):
        for l in range(lattice_size[2]):
            # charge sign alternates with lattice parity; particles are fixed in place
            _ = system.part.add(pos=np.multiply([j, k, l], base_vector),
                                q=(-1.)**(j + k + l), fix=3 * [True])

# setup P3M algorithm (GPU variant when --gpu is given)
algorithm = espressomd.electrostatics.P3M
if args.gpu:
    algorithm = espressomd.electrostatics.P3MGPU
solver = algorithm(prefactor=1., accuracy=1e-6)
# ESPResSo 4.2 registers solvers via system.actors; later versions use
# the system.electrostatics.solver attribute.
if (espressomd.version.major(), espressomd.version.minor()) == (4, 2):
    system.actors.add(solver)
else:
    system.electrostatics.solver = solver


# Marker line matched by the ReFrame sanity check in espresso.py.
print("Algorithm executed. \n")

# Absolute and relative tolerances for comparison against the analytical
# Madelung reference values.
atol_energy = atol_pressure = 1e-12
atol_forces = 1e-5
atol_abs_forces = 2e-6

rtol_energy = 5e-6
rtol_pressure = 2e-5
rtol_forces = 0.
rtol_abs_forces = 0.
# run checks; each assert_allclose raises AssertionError (non-zero exit) on mismatch
forces = np.copy(system.part.all().f)
energy, p_scalar, p_tensor = get_normalized_values_per_ion(system)
ref_energy, ref_pressure = get_reference_values_per_ion(base_vector)
np.testing.assert_allclose(energy, ref_energy, atol=atol_energy, rtol=rtol_energy)
np.testing.assert_allclose(p_scalar, np.trace(ref_pressure) / 3.,
                           atol=atol_pressure, rtol=rtol_pressure)
np.testing.assert_allclose(p_tensor, ref_pressure, atol=atol_pressure, rtol=rtol_pressure)
# in a perfect ionic crystal the net force on every ion vanishes
np.testing.assert_allclose(forces, 0., atol=atol_forces, rtol=rtol_forces)
np.testing.assert_allclose(np.median(np.abs(forces)), 0., atol=atol_abs_forces, rtol=rtol_abs_forces)


print("Executing sanity checks...\n")
# The np.testing.assert_allclose calls above raise AssertionError (aborting
# the interpreter with a non-zero exit code) on any mismatch, so reaching
# this point means every check passed. Re-evaluating the same comparisons
# with np.allclose was redundant and made the failure branch unreachable;
# the tolerances are simply reported on success instead. The first line is
# matched by the ReFrame sanity check in espresso.py.
print("Final convergence met with tolerances: \n\
    energy: ", atol_energy, "\n\
    p_scalar: ", atol_pressure, "\n\
    p_tensor: ", atol_pressure, "\n\
    forces: ", atol_forces, "\n\
    abs_forces: ", atol_abs_forces, "\n")
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The sanity checks have actually already executed at lines 111 to 116, and they will interrupt the Python interpreter with an exception if any check fails, so I would suspect the else branch is unreachable. I would also recommend against re-expressing the assertions as np.allclose in the conditional to avoid redundancy and prevent the risk that the assertions and conditional diverge over time, for example due to changes to tolerance values.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @jngrad ,

I have used the same values for tolerances in the both the assertions so the values should not diverge between the assertions and the conditional.

Do these assertions also end the python executions? In that case, I will move the conditional above your original assertions so that the execution can also reach the else part of the code.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have used the same values for tolerances in the both the assertions so the values should not diverge between the assertions and the conditional.

They don't diverge today. But they might in a month's time if multiple people contribute to this file, or if you forget that the conditional block must exactly mirror the assertion block above it.

Do these assertions also end the python executions?

np.testing.assert_allclose() raises an AssertionError which halts the Python interpreter with a non-zero exit code.

In that case, I will move the conditional above your original assertions so that the execution can also reach the else part of the code.

Is the else branch truly needed? np.testing.assert_allclose() already generates a clear error message:

Traceback (most recent call last):
  File "/work/jgrad/espresso/src/madelung.py", line 116, in <module>
    np.testing.assert_allclose(np.median(np.abs(forces)), 0., atol=atol_abs_forces, rtol=rtol_abs_forces)
  File "/tikhome/jgrad/.local/lib/python3.10/site-packages/numpy/testing/_private/utils.py", line 1527, in assert_allclose
    assert_array_compare(compare, actual, desired, err_msg=str(err_msg),
  File "/tikhome/jgrad/.local/lib/python3.10/site-packages/numpy/testing/_private/utils.py", line 844, in assert_array_compare
    raise AssertionError(msg)
AssertionError: 
Not equal to tolerance rtol=0, atol=2e-06

Mismatched elements: 1 / 1 (100%)
Max absolute difference: 2.1e-06
Max relative difference: inf
 x: array(2.1e-06)
 y: array(0.)

I do not totally understand the purpose of this if/else statement. If you need to log the tolerances to stdout, you can do so independently of the success or failure of the assertions, since they are constants. If you need to report the success or failure of the assertions, that is already done by numpy.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, the assertion reports a failure in the manner that you have pointed out but how can I extract a success? Or printing a message right below it would suffice? Since it exits the program anyways.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can print("Success") or check if Python returned exit code 0.


print("Sampling runtime...\n")
# sample runtime: 10 samples of 10 integration steps each, recording the
# per-step wall-clock time
n_steps = 10
timings = []
for _ in range(10):
    tick = time.time()
    system.integrator.run(n_steps)
    tock = time.time()
    timings.append((tock - tick) / n_steps)

# write results to file
# NOTE(review): this CSV write is superfluous when ReFrame captures stdout;
# the author marked its removal as a TODO.
header = '"mode","cores","mpi.x","mpi.y","mpi.z","particles","mean","std"\n'
report = f'"{"weak scaling" if args.weak_scaling else "strong scaling"}",{n_cores},{node_grid[0]},{node_grid[1]},{node_grid[2]},{len(system.part)},{np.mean(timings):.3e},{np.std(timings, ddof=1):.3e}\n'
print(report)
# only emit the header when the file does not exist yet (append mode below)
if pathlib.Path(args.output).is_file():
    header = ""
with open(args.output, "a") as f:
    f.write(header + report)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This write operation is superfluous if the ReFrame runner captures stdout.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I have to remove this and is a part of my TODO. :)

39 changes: 39 additions & 0 deletions eessi/testsuite/tests/apps/espresso/src/plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

# Load the benchmark results and order them so that the first row of each
# group corresponds to the smallest core count (used as the baseline).
df = pd.read_csv("benchmarks.csv")
df = df.sort_values(by=["mode", "cores", "mpi.x", "mpi.y", "mpi.z"])

# --- strong scaling: speed-up vs. core count ---
# (plain string: no placeholders, the useless f-prefix was removed -- ruff F541)
group = df.query("mode == 'strong scaling'")

fig = plt.figure(figsize=(12, 6))
ax = fig.subplots().axes
xdata = group["cores"].to_numpy()
ydata = group["mean"].to_numpy()
# NOTE(review): the ideal-scaling guide line is anchored at (x0, x0) with
# slope 1, i.e. speed-up == cores; if the smallest measured core count is
# not 1, the intended anchor is probably (xdata[0], 1) -- confirm.
ax.axline((xdata[0], xdata[0]), slope=1, linestyle="--", color="grey", label="Theoretical maximum")
ax.plot(xdata, ydata[0] / ydata, "o-", label="Measurements")
ax.set_title("Strong scaling")
ax.set_xlabel("Number of cores")
ax.set_ylabel("Speed-up")
ax.set_xscale("log", base=2)
ax.set_yscale("log", base=10)
ax.legend()
plt.show()

# --- weak scaling: efficiency vs. core count (ideal: constant runtime) ---
group = df.query("mode == 'weak scaling'")

fig = plt.figure(figsize=(12, 6))
ax = fig.subplots().axes
xdata = group["cores"].to_numpy()
ydata = group["mean"].to_numpy()
# horizontal line at efficiency 1 = perfect weak scaling
ax.axline((-np.inf, 1), slope=0, linestyle="--", color="grey", label="Theoretical maximum")
ax.plot(xdata, ydata[0] / ydata, "o-", label="Measurements")
ax.set_title("Weak scaling")
ax.set_xlabel("Number of cores")
ax.set_ylabel("Efficiency")
ax.set_xscale("log", base=2)
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1))
ax.legend()
plt.show()
Binary file not shown.
Loading