From b2092e019ed67497e110e80dcfdb744bdc0e6c5d Mon Sep 17 00:00:00 2001 From: Frederik Van der Veken Date: Thu, 7 Mar 2024 17:32:57 +0100 Subject: [PATCH 1/7] Missing prebuilt kernel for aperture interpolation --- xtrack/prebuilt_kernels/kernel_definitions.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/xtrack/prebuilt_kernels/kernel_definitions.py b/xtrack/prebuilt_kernels/kernel_definitions.py index d2a97f958..f0ab6ad15 100644 --- a/xtrack/prebuilt_kernels/kernel_definitions.py +++ b/xtrack/prebuilt_kernels/kernel_definitions.py @@ -78,6 +78,13 @@ 'config': BASE_CONFIG, 'classes': ONLY_XTRACK_ELEMENTS + NO_SYNRAD_ELEMENTS, }), + ('default_only_xtrack_no_limit', { + 'config': { + **{k: v for k, v in BASE_CONFIG.items() + if k != 'XTRACK_GLOBAL_XY_LIMIT'} + }, + 'classes': ONLY_XTRACK_ELEMENTS + NO_SYNRAD_ELEMENTS, + }), ('only_xtrack_non_tracking_kernels', { 'config': BASE_CONFIG, 'classes': [], @@ -174,6 +181,13 @@ 'config': {}, 'classes': DEFAULT_XCOLL_ELEMENTS, }), + ('default_xcoll_no_limit', { + 'config': { + **{k: v for k, v in BASE_CONFIG.items() + if k != 'XTRACK_GLOBAL_XY_LIMIT'} + }, + 'classes': DEFAULT_XCOLL_ELEMENTS, + }), ('default_xcoll_frozen_longitudinal', { 'config': {**BASE_CONFIG, **FREEZE_LONGITUDINAL}, 'classes': DEFAULT_XCOLL_ELEMENTS, From bde9e0af1ce7f628d49b2e5dd6f91d654a99d03a Mon Sep 17 00:00:00 2001 From: "Frederik F. Van der Veken" Date: Sat, 9 Mar 2024 00:56:01 +0100 Subject: [PATCH 2/7] Added ZetaShift to prebuilt xcoll kernels (for off-momentum loss maps) --- xtrack/prebuilt_kernels/kernel_definitions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xtrack/prebuilt_kernels/kernel_definitions.py b/xtrack/prebuilt_kernels/kernel_definitions.py index f0ab6ad15..c84885841 100644 --- a/xtrack/prebuilt_kernels/kernel_definitions.py +++ b/xtrack/prebuilt_kernels/kernel_definitions.py @@ -166,6 +166,7 @@ DEFAULT_XCOLL_ELEMENTS = [ *ONLY_XTRACK_ELEMENTS, *NO_SYNRAD_ELEMENTS, + ZetaShift, xc.BlackAbsorber, xc.EverestBlock, xc.EverestCollimator, From 8c3ac6b696d19fdd5bb17fd0ebc8a25810f0d367 Mon Sep 17 00:00:00 2001 From: "Frederik F. Van der Veken" Date: Sat, 9 Mar 2024 19:29:38 +0100 Subject: [PATCH 3/7] When initialising the different threads for openmp, the particle size per thread is calculated based on the *active* particles. As the particles are reorganised at the end of tracking, this can produce a huge speed-up when tracking turn-by-turn (as is the case with a progress indicator) and particles often die, freeing up resources for the threads. --- xtrack/tracker.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/xtrack/tracker.py b/xtrack/tracker.py index 72fff5f08..186143114 100644 --- a/xtrack/tracker.py +++ b/xtrack/tracker.py @@ -466,19 +466,20 @@ def _build_kernel( int64_t offset_tbt_monitor, /*gpuglmem*/ int8_t* io_buffer){ - const int64_t capacity = ParticlesData_get__capacity(particles); //only_for_context cpu_openmp - const int num_threads = omp_get_max_threads(); //only_for_context cpu_openmp - const int64_t chunk_size = (capacity + num_threads - 1)/num_threads; // ceil division //only_for_context cpu_openmp - #pragma omp parallel for //only_for_context cpu_openmp - for (int chunk = 0; chunk < num_threads; chunk++) { //only_for_context cpu_openmp - int64_t part_id = chunk * chunk_size; //only_for_context cpu_openmp - int64_t end_id = (chunk + 1) * chunk_size; //only_for_context cpu_openmp - if (end_id > capacity) end_id = capacity; //only_for_context cpu_openmp + const int64_t capacity = ParticlesData_get__capacity(particles); //only_for_context cpu_openmp + const int64_t alive = ParticlesData_get__num_active_particles(particles); //only_for_context cpu_openmp + const int num_threads = omp_get_max_threads(); //only_for_context cpu_openmp + const int64_t chunk_size = (alive + num_threads - 1)/num_threads; // ceil division //only_for_context cpu_openmp + #pragma omp parallel for //only_for_context cpu_openmp + for (int chunk = 0; chunk < num_threads; chunk++) { //only_for_context cpu_openmp + int64_t part_id = chunk * chunk_size; //only_for_context cpu_openmp + int64_t end_id = (chunk + 1) * chunk_size; //only_for_context cpu_openmp + if (end_id > alive) end_id = alive; //only_for_context cpu_openmp int64_t part_id = 0; //only_for_context cpu_serial int64_t part_id = blockDim.x * blockIdx.x + threadIdx.x; //only_for_context cuda int64_t part_id = get_global_id(0); //only_for_context opencl - int64_t end_id = 0; // unused outside of openmp //only_for_context cpu_serial cuda opencl + int64_t end_id = 0; // unused outside of openmp //only_for_context cpu_serial cuda opencl LocalParticle lpart; lpart.io_buffer = io_buffer; From 12e503fb596be5098e456238ae7680c32b087a06 Mon Sep 17 00:00:00 2001 From: "Frederik F. Van der Veken" Date: Sat, 9 Mar 2024 21:41:40 +0100 Subject: [PATCH 4/7] Same for per particle kernels in BeamElement. --- xtrack/base_element.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/xtrack/base_element.py b/xtrack/base_element.py index 8bf0911f4..334b24da0 100644 --- a/xtrack/base_element.py +++ b/xtrack/base_element.py @@ -105,16 +105,17 @@ def _generate_per_particle_kernel_from_local_particle_function( ''' int64_t flag_increment_at_element, /*gpuglmem*/ int8_t* io_buffer){ - const int num_threads = omp_get_max_threads(); //only_for_context cpu_openmp - const int64_t capacity = ParticlesData_get__capacity(particles); //only_for_context cpu_openmp - const int64_t chunk_size = (capacity + num_threads - 1)/num_threads; // ceil division //only_for_context cpu_openmp - #pragma omp parallel for //only_for_context cpu_openmp - for (int64_t batch_id = 0; batch_id < num_threads; batch_id++) { //only_for_context cpu_openmp + const int num_threads = omp_get_max_threads(); //only_for_context cpu_openmp + const int64_t capacity = ParticlesData_get__capacity(particles); //only_for_context cpu_openmp + const int64_t alive = ParticlesData_get__num_active_particles(particles); //only_for_context cpu_openmp + const int64_t chunk_size = (alive + num_threads - 1)/num_threads; // ceil division //only_for_context cpu_openmp + #pragma omp parallel for //only_for_context cpu_openmp + for (int64_t batch_id = 0; batch_id < num_threads; batch_id++) { //only_for_context cpu_openmp LocalParticle lpart; lpart.io_buffer = io_buffer; int64_t part_id = batch_id * chunk_size; //only_for_context cpu_openmp int64_t end_id = (batch_id + 1) * chunk_size; //only_for_context cpu_openmp - if (end_id > capacity) end_id = capacity; //only_for_context cpu_openmp + if (end_id > alive) end_id = alive; //only_for_context cpu_openmp int64_t part_id = 0; //only_for_context cpu_serial int64_t part_id = blockDim.x * blockIdx.x + threadIdx.x; //only_for_context cuda From 2e3a201d2558644532c67efc3547a9e3652f12b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Wed, 13 Mar 2024 08:38:55 +0100 Subject: [PATCH 5/7] rename variables --- xtrack/base_element.py | 6 +++--- xtrack/tracker.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/xtrack/base_element.py b/xtrack/base_element.py index 334b24da0..dd5f6bab1 100644 --- a/xtrack/base_element.py +++ b/xtrack/base_element.py @@ -107,15 +107,15 @@ def _generate_per_particle_kernel_from_local_particle_function( /*gpuglmem*/ int8_t* io_buffer){ const int num_threads = omp_get_max_threads(); //only_for_context cpu_openmp const int64_t capacity = ParticlesData_get__capacity(particles); //only_for_context cpu_openmp - const int64_t alive = ParticlesData_get__num_active_particles(particles); //only_for_context cpu_openmp - const int64_t chunk_size = (alive + num_threads - 1)/num_threads; // ceil division //only_for_context cpu_openmp + const int64_t num_active = ParticlesData_get__num_active_particles(particles); //only_for_context cpu_openmp + const int64_t chunk_size = (num_active + num_threads - 1)/num_threads; // ceil division //only_for_context cpu_openmp #pragma omp parallel for //only_for_context cpu_openmp for (int64_t batch_id = 0; batch_id < num_threads; batch_id++) { //only_for_context cpu_openmp LocalParticle lpart; lpart.io_buffer = io_buffer; int64_t part_id = batch_id * chunk_size; //only_for_context cpu_openmp int64_t end_id = (batch_id + 1) * chunk_size; //only_for_context cpu_openmp - if (end_id > alive) end_id = alive; //only_for_context cpu_openmp + if (end_id > num_active) end_id = num_active; //only_for_context cpu_openmp int64_t part_id = 0; //only_for_context cpu_serial int64_t part_id = blockDim.x * blockIdx.x + threadIdx.x; //only_for_context cuda diff --git a/xtrack/tracker.py b/xtrack/tracker.py index 186143114..123e630b5 100644 --- a/xtrack/tracker.py +++ b/xtrack/tracker.py @@ -467,14 +467,14 @@ def _build_kernel( /*gpuglmem*/ int8_t* io_buffer){ const int64_t capacity = ParticlesData_get__capacity(particles); //only_for_context cpu_openmp - const int64_t alive = ParticlesData_get__num_active_particles(particles); //only_for_context cpu_openmp + const int64_t num_active = ParticlesData_get__num_active_particles(particles); //only_for_context cpu_openmp const int num_threads = omp_get_max_threads(); //only_for_context cpu_openmp - const int64_t chunk_size = (alive + num_threads - 1)/num_threads; // ceil division //only_for_context cpu_openmp + const int64_t chunk_size = (num_active + num_threads - 1)/num_threads; // ceil division //only_for_context cpu_openmp #pragma omp parallel for //only_for_context cpu_openmp for (int chunk = 0; chunk < num_threads; chunk++) { //only_for_context cpu_openmp int64_t part_id = chunk * chunk_size; //only_for_context cpu_openmp int64_t end_id = (chunk + 1) * chunk_size; //only_for_context cpu_openmp - if (end_id > alive) end_id = alive; //only_for_context cpu_openmp + if (end_id > num_active) end_id = num_active; //only_for_context cpu_openmp int64_t part_id = 0; //only_for_context cpu_serial int64_t part_id = blockDim.x * blockIdx.x + threadIdx.x; //only_for_context cuda From fa07c6248d0eba3be8378ec328fcd17f85339a43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Wed, 13 Mar 2024 16:53:29 +0100 Subject: [PATCH 6/7] fixes for skip reordering --- tests/test_particles_api.py | 135 ++++++++++++++++++++++++++++++++++++ xtrack/base_element.py | 32 +++++++-- xtrack/tracker.py | 27 ++++++-- 3 files changed, 183 insertions(+), 11 deletions(-) create mode 100644 tests/test_particles_api.py diff --git a/tests/test_particles_api.py b/tests/test_particles_api.py new file mode 100644 index 000000000..f1a5db34a --- /dev/null +++ b/tests/test_particles_api.py @@ -0,0 +1,135 @@ +# copyright ############################### # +# This file is part of the Xpart Package. # +# Copyright (c) CERN, 2023. # +# ######################################### # +import numpy as np +import pytest + +import xtrack as xt +import xobjects as xo +import xpart as xp + + +def test_check_is_active_sorting_openmp(): + test_context = xo.ContextCpu(omp_num_threads=5) + + class TestElement(xt.BeamElement): + _xofields = { + 'states': xo.Int64[:], + } + + _extra_c_sources = [""" + #define XT_OMP_SKIP_REORGANIZE + + /*gpufun*/ + void TestElement_track_local_particle( + TestElementData el, + LocalParticle* part0 + ) { + //start_per_particle_block (part0->part) + int64_t state = check_is_active(part); + int64_t id = LocalParticle_get_particle_id(part); + TestElementData_set_states(el, id, state); + //end_per_particle_block + } + """] + + el = TestElement( + _context=test_context, + states=np.zeros(18, dtype=np.int64), + ) + particles = xp.Particles( + _context=test_context, + state=[ + 1, 0, 1, 0, 1, # should be reordered to 1, 1, 1, 0, 0 + 0, 0, 0, 0, 0, # should be left intact + 0, 1, 0, 1, 1, # should be reordered to 1, 1, 1, 0, 0 + 1, 1, 0, # should be left intact + ], + _capacity=22, # there are 4 particles that are unallocated + _no_reorganize=True, + ) + + el.track(particles) + + # We have five threads, so the particles should be split into chunks + # of 5, 5, 5, 3 + 2 (unallocated), 2 (unallocated). + assert len(particles.state) == 22 + + # Check that each chunk is reorganized correctly. + # First batch: + assert np.all(particles.state[0:5] == [1, 1, 1, 0, 0]) + assert set(particles.particle_id[0:3]) == {0, 2, 4} + assert set(particles.particle_id[3:5]) == {1, 3} + + # Second batch: + assert np.all(particles.state[5:10] == [0, 0, 0, 0, 0]) + # Don't reorder if not needed: + assert np.all(particles.particle_id[5:10] == [5, 6, 7, 8, 9]) + + # Third batch: + assert np.all(particles.state[10:15] == [1, 1, 1, 0, 0]) + assert set(particles.particle_id[10:13]) == {11, 13, 14} + assert set(particles.particle_id[13:15]) == {10, 12} + + # Fourth batch: + assert np.all(particles.state[15:20] == [1, 1, 0, -999999999, -999999999]) + # Don't reorder if not needed: + assert np.all(particles.particle_id[15:18] == [15, 16, 17]) + + # Fifth batch (unallocated): + assert np.all(particles.state[20:22] == [-999999999, -999999999]) + + +@pytest.mark.parametrize( + 'test_context', + [ + xo.ContextCpu(), + xo.ContextCpu(omp_num_threads=4), + ] +) +def test_check_is_active_sorting_cpu_default(test_context): + class TestElement(xt.BeamElement): + _xofields = { + 'states': xo.Int64[:], + } + + _extra_c_sources = [""" + /*gpufun*/ + void TestElement_track_local_particle( + TestElementData el, + LocalParticle* part0 + ) { + //start_per_particle_block (part0->part) + int64_t state = check_is_active(part); + int64_t id = LocalParticle_get_particle_id(part); + TestElementData_set_states(el, id, state); + //end_per_particle_block + } + """] + + el = TestElement( + _context=test_context, + states=np.zeros(18, dtype=np.int64), + ) + particles = xp.Particles( + _context=test_context, + state=[ + 1, 0, 1, 0, 1, + 0, 0, 0, 0, 0, + 0, 1, 0, 1, 1, + 1, 1, 0, + ], + _no_reorganize=True, + ) + # We want to simulate a situation where a recount is needed, so we put a + # value of active particles to be equal to the total number of particles: + particles._num_active_particles = 18 + + el.track(particles) + + # Here we don't reorganize by batches, so we just check the whole array + # to see if it's sensible: + assert np.all(particles.state == ([1] * 8) + ([0] * 10)) + assert set(particles.particle_id[:8]) == {0, 2, 4, 11, 13, 14, 15, 16} + assert set(particles.particle_id[8:]) == {1, 3, 5, 6, 7, 8, 9, 10, 12, 17} diff --git a/xtrack/base_element.py b/xtrack/base_element.py index dd5f6bab1..3ba5941e0 100644 --- a/xtrack/base_element.py +++ b/xtrack/base_element.py @@ -26,7 +26,7 @@ const int64_t XT_part_block_end_idx = LocalParticle_get__num_active_particles(part0); //only_for_context cpu_serial //#pragma omp simd // TODO: currently does not work, needs investigating - for (int64_t XT_part_block_ii=XT_part_block_start_idx; XT_part_block_ii num_active) end_id = num_active; //only_for_context cpu_openmp + if (end_id > num_particles_to_track) end_id = num_particles_to_track; //only_for_context cpu_openmp int64_t part_id = 0; //only_for_context cpu_serial int64_t part_id = blockDim.x * blockIdx.x + threadIdx.x; //only_for_context cuda diff --git a/xtrack/tracker.py b/xtrack/tracker.py index 123e630b5..d9237b17e 100644 --- a/xtrack/tracker.py +++ b/xtrack/tracker.py @@ -466,15 +466,32 @@ def _build_kernel( int64_t offset_tbt_monitor, /*gpuglmem*/ int8_t* io_buffer){ - const int64_t capacity = ParticlesData_get__capacity(particles); //only_for_context cpu_openmp - const int64_t num_active = ParticlesData_get__num_active_particles(particles); //only_for_context cpu_openmp - const int num_threads = omp_get_max_threads(); //only_for_context cpu_openmp - const int64_t chunk_size = (num_active + num_threads - 1)/num_threads; // ceil division //only_for_context cpu_openmp + #define CONTEXT_OPENMP //only_for_context cpu_openmp + #ifdef CONTEXT_OPENMP + const int64_t capacity = ParticlesData_get__capacity(particles); + const int num_threads = omp_get_max_threads(); + + #ifndef XT_OMP_SKIP_REORGANIZE + const int64_t num_particles_to_track = ParticlesData_get__num_active_particles(particles); + + LocalParticle lpart; + lpart.io_buffer = io_buffer; + Particles_to_LocalParticle(particles, &lpart, 0, capacity); + check_is_active(&lpart); + count_reorganized_particles(&lpart); + LocalParticle_to_Particles(&lpart, particles, 0, capacity); + #else // When we skip reorganize, we cannot just batch active particles + const int64_t num_particles_to_track = capacity; + #endif + + const int64_t chunk_size = (num_particles_to_track + num_threads - 1)/num_threads; // ceil division + #endif // CONTEXT_OPENMP + #pragma omp parallel for //only_for_context cpu_openmp for (int chunk = 0; chunk < num_threads; chunk++) { //only_for_context cpu_openmp int64_t part_id = chunk * chunk_size; //only_for_context cpu_openmp int64_t end_id = (chunk + 1) * chunk_size; //only_for_context cpu_openmp - if (end_id > num_active) end_id = num_active; //only_for_context cpu_openmp + if (end_id > num_particles_to_track) end_id = num_particles_to_track; //only_for_context cpu_openmp int64_t part_id = 0; //only_for_context cpu_serial int64_t part_id = blockDim.x * blockIdx.x + threadIdx.x; //only_for_context cuda From 1d5c4f1f788fe3d6ec94d054a7ddbba16717f70d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Wed, 13 Mar 2024 17:11:34 +0100 Subject: [PATCH 7/7] local scope lpart in tracker on openmp --- xtrack/tracker.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/xtrack/tracker.py b/xtrack/tracker.py index d9237b17e..a47e466f1 100644 --- a/xtrack/tracker.py +++ b/xtrack/tracker.py @@ -473,13 +473,15 @@ def _build_kernel( #ifndef XT_OMP_SKIP_REORGANIZE const int64_t num_particles_to_track = ParticlesData_get__num_active_particles(particles); - - LocalParticle lpart; - lpart.io_buffer = io_buffer; - Particles_to_LocalParticle(particles, &lpart, 0, capacity); - check_is_active(&lpart); - count_reorganized_particles(&lpart); - LocalParticle_to_Particles(&lpart, particles, 0, capacity); + + { + LocalParticle lpart; + lpart.io_buffer = io_buffer; + Particles_to_LocalParticle(particles, &lpart, 0, capacity); + check_is_active(&lpart); + count_reorganized_particles(&lpart); + LocalParticle_to_Particles(&lpart, particles, 0, capacity); + } #else // When we skip reorganize, we cannot just batch active particles const int64_t num_particles_to_track = capacity; #endif