diff --git a/tests/test_particles_api.py b/tests/test_particles_api.py
new file mode 100644
index 000000000..f1a5db34a
--- /dev/null
+++ b/tests/test_particles_api.py
@@ -0,0 +1,135 @@
+# copyright ############################### #
+# This file is part of the Xpart Package.   #
+# Copyright (c) CERN, 2023.                 #
+# ######################################### #
+import numpy as np
+import pytest
+
+import xtrack as xt
+import xobjects as xo
+import xpart as xp
+
+
+def test_check_is_active_sorting_openmp():
+    test_context = xo.ContextCpu(omp_num_threads=5)
+
+    class TestElement(xt.BeamElement):
+        _xofields = {
+            'states': xo.Int64[:],
+        }
+
+        _extra_c_sources = ["""
+            #define XT_OMP_SKIP_REORGANIZE
+
+            /*gpufun*/
+            void TestElement_track_local_particle(
+                TestElementData el,
+                LocalParticle* part0
+            ) {
+                //start_per_particle_block (part0->part)
+                int64_t state = check_is_active(part);
+                int64_t id = LocalParticle_get_particle_id(part);
+                TestElementData_set_states(el, id, state);
+                //end_per_particle_block
+            }
+        """]
+
+    el = TestElement(
+        _context=test_context,
+        states=np.zeros(18, dtype=np.int64),
+    )
+    particles = xp.Particles(
+        _context=test_context,
+        state=[
+            1, 0, 1, 0, 1,  # should be reordered to 1, 1, 1, 0, 0
+            0, 0, 0, 0, 0,  # should be left intact
+            0, 1, 0, 1, 1,  # should be reordered to 1, 1, 1, 0, 0
+            1, 1, 0,        # should be left intact
+        ],
+        _capacity=22,  # there are 4 particles that are unallocated
+        _no_reorganize=True,
+    )
+
+    el.track(particles)
+
+    # We have five threads, so the particles should be split into chunks
+    # of 5, 5, 5, 3 + 2 (unallocated), 2 (unallocated).
+    assert len(particles.state) == 22
+
+    # Check that each chunk is reorganized correctly.
+    # First batch:
+    assert np.all(particles.state[0:5] == [1, 1, 1, 0, 0])
+    assert set(particles.particle_id[0:3]) == {0, 2, 4}
+    assert set(particles.particle_id[3:5]) == {1, 3}
+
+    # Second batch:
+    assert np.all(particles.state[5:10] == [0, 0, 0, 0, 0])
+    # Don't reorder if not needed:
+    assert np.all(particles.particle_id[5:10] == [5, 6, 7, 8, 9])
+
+    # Third batch:
+    assert np.all(particles.state[10:15] == [1, 1, 1, 0, 0])
+    assert set(particles.particle_id[10:13]) == {11, 13, 14}
+    assert set(particles.particle_id[13:15]) == {10, 12}
+
+    # Fourth batch:
+    assert np.all(particles.state[15:20] == [1, 1, 0, -999999999, -999999999])
+    # Don't reorder if not needed:
+    assert np.all(particles.particle_id[15:18] == [15, 16, 17])
+
+    # Fifth batch (unallocated):
+    assert np.all(particles.state[20:22] == [-999999999, -999999999])
+
+
+@pytest.mark.parametrize(
+    'test_context',
+    [
+        xo.ContextCpu(),
+        xo.ContextCpu(omp_num_threads=4),
+    ]
+)
+def test_check_is_active_sorting_cpu_default(test_context):
+    class TestElement(xt.BeamElement):
+        _xofields = {
+            'states': xo.Int64[:],
+        }
+
+        _extra_c_sources = ["""
+            /*gpufun*/
+            void TestElement_track_local_particle(
+                TestElementData el,
+                LocalParticle* part0
+            ) {
+                //start_per_particle_block (part0->part)
+                int64_t state = check_is_active(part);
+                int64_t id = LocalParticle_get_particle_id(part);
+                TestElementData_set_states(el, id, state);
+                //end_per_particle_block
+            }
+        """]
+
+    el = TestElement(
+        _context=test_context,
+        states=np.zeros(18, dtype=np.int64),
+    )
+    particles = xp.Particles(
+        _context=test_context,
+        state=[
+            1, 0, 1, 0, 1,
+            0, 0, 0, 0, 0,
+            0, 1, 0, 1, 1,
+            1, 1, 0,
+        ],
+        _no_reorganize=True,
+    )
+    # We want to simulate a situation where a recount is needed, so we set the
+    # number of active particles equal to the total number of particles:
+    particles._num_active_particles = 18
+
+    el.track(particles)
+
+    # Here we don't reorganize by batches, so we just check the whole array
+    # to see if it's sensible:
+    assert np.all(particles.state == ([1] * 8) + ([0] * 10))
+    assert set(particles.particle_id[:8]) == {0, 2, 4, 11, 13, 14, 15, 16}
+    assert set(particles.particle_id[8:]) == {1, 3, 5, 6, 7, 8, 9, 10, 12, 17}
diff --git a/xtrack/base_element.py b/xtrack/base_element.py
index 8bf0911f4..3ba5941e0 100644
--- a/xtrack/base_element.py
+++ b/xtrack/base_element.py
@@ -26,7 +26,7 @@
     const int64_t XT_part_block_end_idx = LocalParticle_get__num_active_particles(part0); //only_for_context cpu_serial

     //#pragma omp simd // TODO: currently does not work, needs investigating
-    for (int64_t XT_part_block_ii=XT_part_block_start_idx; XT_part_block_ii<[...]
[...]
-    if (end_id > capacity) end_id = capacity; //only_for_context cpu_openmp
+    if (end_id > num_particles_to_track) end_id = num_particles_to_track; //only_for_context cpu_openmp
     int64_t part_id = 0; //only_for_context cpu_serial
     int64_t part_id = blockDim.x * blockIdx.x + threadIdx.x; //only_for_context cuda
diff --git a/xtrack/prebuilt_kernels/kernel_definitions.py b/xtrack/prebuilt_kernels/kernel_definitions.py
index d2a97f958..c84885841 100644
--- a/xtrack/prebuilt_kernels/kernel_definitions.py
+++ b/xtrack/prebuilt_kernels/kernel_definitions.py
@@ -78,6 +78,13 @@
         'config': BASE_CONFIG,
         'classes': ONLY_XTRACK_ELEMENTS + NO_SYNRAD_ELEMENTS,
     }),
+    ('default_only_xtrack_no_limit', {
+        'config': {
+            **{k: v for k, v in BASE_CONFIG.items()
+               if k != 'XTRACK_GLOBAL_XY_LIMIT'}
+        },
+        'classes': ONLY_XTRACK_ELEMENTS + NO_SYNRAD_ELEMENTS,
+    }),
     ('only_xtrack_non_tracking_kernels', {
         'config': BASE_CONFIG,
         'classes': [],
@@ -159,6 +166,7 @@
 DEFAULT_XCOLL_ELEMENTS = [
     *ONLY_XTRACK_ELEMENTS,
     *NO_SYNRAD_ELEMENTS,
+    ZetaShift,
     xc.BlackAbsorber,
     xc.EverestBlock,
     xc.EverestCollimator,
@@ -174,6 +182,13 @@
         'config': {},
         'classes': DEFAULT_XCOLL_ELEMENTS,
     }),
+    ('default_xcoll_no_limit', {
+        'config': {
+            **{k: v for k, v in BASE_CONFIG.items()
+               if k != 'XTRACK_GLOBAL_XY_LIMIT'}
+        },
+        'classes': DEFAULT_XCOLL_ELEMENTS,
+    }),
     ('default_xcoll_frozen_longitudinal', {
         'config': {**BASE_CONFIG, **FREEZE_LONGITUDINAL},
         'classes': DEFAULT_XCOLL_ELEMENTS,
diff --git a/xtrack/tracker.py b/xtrack/tracker.py
index 72fff5f08..a47e466f1 100644
--- a/xtrack/tracker.py
+++ b/xtrack/tracker.py
@@ -466,19 +466,39 @@ def _build_kernel(
                 int64_t offset_tbt_monitor,
                 /*gpuglmem*/ int8_t* io_buffer){

-            const int64_t capacity = ParticlesData_get__capacity(particles); //only_for_context cpu_openmp
-            const int num_threads = omp_get_max_threads(); //only_for_context cpu_openmp
-            const int64_t chunk_size = (capacity + num_threads - 1)/num_threads; // ceil division //only_for_context cpu_openmp
-            #pragma omp parallel for //only_for_context cpu_openmp
-            for (int chunk = 0; chunk < num_threads; chunk++) { //only_for_context cpu_openmp
-            int64_t part_id = chunk * chunk_size; //only_for_context cpu_openmp
-            int64_t end_id = (chunk + 1) * chunk_size; //only_for_context cpu_openmp
-            if (end_id > capacity) end_id = capacity; //only_for_context cpu_openmp
+            #define CONTEXT_OPENMP //only_for_context cpu_openmp
+            #ifdef CONTEXT_OPENMP
+            const int64_t capacity = ParticlesData_get__capacity(particles);
+            const int num_threads = omp_get_max_threads();
+
+            #ifndef XT_OMP_SKIP_REORGANIZE
+            const int64_t num_particles_to_track = ParticlesData_get__num_active_particles(particles);
+
+            {
+                LocalParticle lpart;
+                lpart.io_buffer = io_buffer;
+                Particles_to_LocalParticle(particles, &lpart, 0, capacity);
+                check_is_active(&lpart);
+                count_reorganized_particles(&lpart);
+                LocalParticle_to_Particles(&lpart, particles, 0, capacity);
+            }
+            #else // When we skip reorganize, we cannot just batch active particles
+            const int64_t num_particles_to_track = capacity;
+            #endif
+
+            const int64_t chunk_size = (num_particles_to_track + num_threads - 1)/num_threads; // ceil division
+            #endif // CONTEXT_OPENMP
+
+            #pragma omp parallel for //only_for_context cpu_openmp
+            for (int chunk = 0; chunk < num_threads; chunk++) { //only_for_context cpu_openmp
+            int64_t part_id = chunk * chunk_size; //only_for_context cpu_openmp
+            int64_t end_id = (chunk + 1) * chunk_size; //only_for_context cpu_openmp
+            if (end_id > num_particles_to_track) end_id = num_particles_to_track; //only_for_context cpu_openmp
             int64_t part_id = 0; //only_for_context cpu_serial
             int64_t part_id = blockDim.x * blockIdx.x + threadIdx.x; //only_for_context cuda
             int64_t part_id = get_global_id(0); //only_for_context opencl
-            int64_t end_id = 0; // unused outside of openmp //only_for_context cpu_serial cuda opencl
+            int64_t end_id = 0; // unused outside of openmp //only_for_context cpu_serial cuda opencl
             LocalParticle lpart;
             lpart.io_buffer = io_buffer;
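For reference, here is a minimal Python sketch (not part of the patch; omp_chunks is a hypothetical helper) of the ceil-division chunking that the OpenMP loop in tracker.py performs: each thread handles the half-open range [chunk * chunk_size, min((chunk + 1) * chunk_size, num_particles_to_track)).

    # Illustrative sketch only: mirrors the chunk computation in the OpenMP kernel loop.
    def omp_chunks(num_particles_to_track, num_threads):
        chunk_size = (num_particles_to_track + num_threads - 1) // num_threads  # ceil division
        return [
            (chunk * chunk_size, min((chunk + 1) * chunk_size, num_particles_to_track))
            for chunk in range(num_threads)
        ]

    # With XT_OMP_SKIP_REORGANIZE the whole capacity is chunked, as in the first test
    # (capacity 22, 5 threads):
    print(omp_chunks(22, 5))  # [(0, 5), (5, 10), (10, 15), (15, 20), (20, 22)]

Without the skip flag, num_particles_to_track is the active-particle count obtained after the reorganization pass, so only active particles fall inside the chunked range.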