diff --git a/tests/test_particles_api.py b/tests/test_particles_api.py
new file mode 100644
index 000000000..f1a5db34a
--- /dev/null
+++ b/tests/test_particles_api.py
@@ -0,0 +1,135 @@
+# copyright ############################### #
+# This file is part of the Xpart Package.   #
+# Copyright (c) CERN, 2023.                 #
+# ######################################### #
+import numpy as np
+import pytest
+
+import xtrack as xt
+import xobjects as xo
+import xpart as xp
+
+
+def test_check_is_active_sorting_openmp():
+    test_context = xo.ContextCpu(omp_num_threads=5)
+
+    class TestElement(xt.BeamElement):
+        _xofields = {
+            'states': xo.Int64[:],
+        }
+
+        _extra_c_sources = ["""
+            #define XT_OMP_SKIP_REORGANIZE
+
+            /*gpufun*/
+            void TestElement_track_local_particle(
+                TestElementData el,
+                LocalParticle* part0
+            ) {
+                //start_per_particle_block (part0->part)
+                int64_t state = check_is_active(part);
+                int64_t id = LocalParticle_get_particle_id(part);
+                TestElementData_set_states(el, id, state);
+                //end_per_particle_block
+            }
+        """]
+
+    el = TestElement(
+        _context=test_context,
+        states=np.zeros(18, dtype=np.int64),
+    )
+    particles = xp.Particles(
+        _context=test_context,
+        state=[
+            1, 0, 1, 0, 1,  # should be reordered to 1, 1, 1, 0, 0
+            0, 0, 0, 0, 0,  # should be left intact
+            0, 1, 0, 1, 1,  # should be reordered to 1, 1, 1, 0, 0
+            1, 1, 0,        # should be left intact
+        ],
+        _capacity=22,  # there are 4 particles that are unallocated
+        _no_reorganize=True,
+    )
+
+    el.track(particles)
+
+    # We have five threads, so the particles should be split into chunks
+    # of 5, 5, 5, 3 + 2 (unallocated), 2 (unallocated).
+    assert len(particles.state) == 22
+
+    # Check that each chunk is reorganized correctly.
+    # First batch:
+    assert np.all(particles.state[0:5] == [1, 1, 1, 0, 0])
+    assert set(particles.particle_id[0:3]) == {0, 2, 4}
+    assert set(particles.particle_id[3:5]) == {1, 3}
+
+    # Second batch:
+    assert np.all(particles.state[5:10] == [0, 0, 0, 0, 0])
+    # Don't reorder if not needed:
+    assert np.all(particles.particle_id[5:10] == [5, 6, 7, 8, 9])
+
+    # Third batch:
+    assert np.all(particles.state[10:15] == [1, 1, 1, 0, 0])
+    assert set(particles.particle_id[10:13]) == {11, 13, 14}
+    assert set(particles.particle_id[13:15]) == {10, 12}
+
+    # Fourth batch:
+    assert np.all(particles.state[15:20] == [1, 1, 0, -999999999, -999999999])
+    # Don't reorder if not needed:
+    assert np.all(particles.particle_id[15:18] == [15, 16, 17])
+
+    # Fifth batch (unallocated):
+    assert np.all(particles.state[20:22] == [-999999999, -999999999])
+
+
+@pytest.mark.parametrize(
+    'test_context',
+    [
+        xo.ContextCpu(),
+        xo.ContextCpu(omp_num_threads=4),
+    ]
+)
+def test_check_is_active_sorting_cpu_default(test_context):
+    class TestElement(xt.BeamElement):
+        _xofields = {
+            'states': xo.Int64[:],
+        }
+
+        _extra_c_sources = ["""
+            /*gpufun*/
+            void TestElement_track_local_particle(
+                TestElementData el,
+                LocalParticle* part0
+            ) {
+                //start_per_particle_block (part0->part)
+                int64_t state = check_is_active(part);
+                int64_t id = LocalParticle_get_particle_id(part);
+                TestElementData_set_states(el, id, state);
+                //end_per_particle_block
+            }
+        """]
+
+    el = TestElement(
+        _context=test_context,
+        states=np.zeros(18, dtype=np.int64),
+    )
+    particles = xp.Particles(
+        _context=test_context,
+        state=[
+            1, 0, 1, 0, 1,
+            0, 0, 0, 0, 0,
+            0, 1, 0, 1, 1,
+            1, 1, 0,
+        ],
+        _no_reorganize=True,
+    )
+    # We want to simulate a situation where a recount is needed, so we set the
+    # number of active particles equal to the total number of particles:
+    particles._num_active_particles = 18
+
+    el.track(particles)
+
+    # Here we don't reorganize by batches, so we just check the whole array
+    # to see if it's sensible:
+    assert np.all(particles.state == ([1] * 8) + ([0] * 10))
+    assert set(particles.particle_id[:8]) == {0, 2, 4, 11, 13, 14, 15, 16}
+    assert set(particles.particle_id[8:]) == {1, 3, 5, 6, 7, 8, 9, 10, 12, 17}
diff --git a/xtrack/base_element.py b/xtrack/base_element.py
index 8bf0911f4..3ba5941e0 100644
--- a/xtrack/base_element.py
+++ b/xtrack/base_element.py
@@ -26,7 +26,7 @@
     const int64_t XT_part_block_end_idx = LocalParticle_get__num_active_particles(part0); //only_for_context cpu_serial

     //#pragma omp simd // TODO: currently does not work, needs investigating
-    for (int64_t XT_part_block_ii=XT_part_block_start_idx; XT_part_block_ii<[...]
[...]
-    if (end_id > capacity) end_id = capacity; //only_for_context cpu_openmp
+    if (end_id > num_particles_to_track) end_id = num_particles_to_track; //only_for_context cpu_openmp
     int64_t part_id = 0; //only_for_context cpu_serial
     int64_t part_id = blockDim.x * blockIdx.x + threadIdx.x; //only_for_context cuda
diff --git a/xtrack/prebuilt_kernels/kernel_definitions.py b/xtrack/prebuilt_kernels/kernel_definitions.py
index d2a97f958..c84885841 100644
--- a/xtrack/prebuilt_kernels/kernel_definitions.py
+++ b/xtrack/prebuilt_kernels/kernel_definitions.py
@@ -78,6 +78,13 @@
         'config': BASE_CONFIG,
         'classes': ONLY_XTRACK_ELEMENTS + NO_SYNRAD_ELEMENTS,
     }),
+    ('default_only_xtrack_no_limit', {
+        'config': {
+            **{k: v for k, v in BASE_CONFIG.items()
+               if k != 'XTRACK_GLOBAL_XY_LIMIT'}
+        },
+        'classes': ONLY_XTRACK_ELEMENTS + NO_SYNRAD_ELEMENTS,
+    }),
     ('only_xtrack_non_tracking_kernels', {
         'config': BASE_CONFIG,
         'classes': [],
@@ -159,6 +166,7 @@
 DEFAULT_XCOLL_ELEMENTS = [
     *ONLY_XTRACK_ELEMENTS,
     *NO_SYNRAD_ELEMENTS,
+    ZetaShift,
     xc.BlackAbsorber,
     xc.EverestBlock,
     xc.EverestCollimator,
@@ -174,6 +182,13 @@
         'config': {},
         'classes': DEFAULT_XCOLL_ELEMENTS,
     }),
+    ('default_xcoll_no_limit', {
+        'config': {
+            **{k: v for k, v in BASE_CONFIG.items()
+               if k != 'XTRACK_GLOBAL_XY_LIMIT'}
+        },
+        'classes': DEFAULT_XCOLL_ELEMENTS,
+    }),
     ('default_xcoll_frozen_longitudinal', {
         'config': {**BASE_CONFIG, **FREEZE_LONGITUDINAL},
         'classes': DEFAULT_XCOLL_ELEMENTS,
diff --git a/xtrack/tracker.py b/xtrack/tracker.py
index 72fff5f08..a47e466f1 100644
--- a/xtrack/tracker.py
+++ b/xtrack/tracker.py
@@ -466,19 +466,39 @@ def _build_kernel(
                 int64_t offset_tbt_monitor,
                 /*gpuglmem*/ int8_t* io_buffer){

-            const int64_t capacity = ParticlesData_get__capacity(particles); //only_for_context cpu_openmp
-            const int num_threads = omp_get_max_threads(); //only_for_context cpu_openmp
-            const int64_t chunk_size = (capacity + num_threads - 1)/num_threads; // ceil division //only_for_context cpu_openmp
-            #pragma omp parallel for //only_for_context cpu_openmp
-            for (int chunk = 0; chunk < num_threads; chunk++) { //only_for_context cpu_openmp
-            int64_t part_id = chunk * chunk_size; //only_for_context cpu_openmp
-            int64_t end_id = (chunk + 1) * chunk_size; //only_for_context cpu_openmp
-            if (end_id > capacity) end_id = capacity; //only_for_context cpu_openmp
+            #define CONTEXT_OPENMP //only_for_context cpu_openmp
+            #ifdef CONTEXT_OPENMP
+            const int64_t capacity = ParticlesData_get__capacity(particles);
+            const int num_threads = omp_get_max_threads();
+
+            #ifndef XT_OMP_SKIP_REORGANIZE
+            const int64_t num_particles_to_track = ParticlesData_get__num_active_particles(particles);
+
+            {
+                LocalParticle lpart;
+                lpart.io_buffer = io_buffer;
+                Particles_to_LocalParticle(particles, &lpart, 0, capacity);
+                check_is_active(&lpart);
+                count_reorganized_particles(&lpart);
+                LocalParticle_to_Particles(&lpart, particles, 0, capacity);
+            }
+            #else // When we skip reorganize, we cannot just batch active particles
+            const int64_t num_particles_to_track = capacity;
+            #endif
+
+            const int64_t chunk_size = (num_particles_to_track + num_threads - 1)/num_threads; // ceil division
+            #endif // CONTEXT_OPENMP
+
+            #pragma omp parallel for //only_for_context cpu_openmp
+            for (int chunk = 0; chunk < num_threads; chunk++) { //only_for_context cpu_openmp
+            int64_t part_id = chunk * chunk_size; //only_for_context cpu_openmp
+            int64_t end_id = (chunk + 1) * chunk_size; //only_for_context cpu_openmp
+            if (end_id > num_particles_to_track) end_id = num_particles_to_track; //only_for_context cpu_openmp
             int64_t part_id = 0; //only_for_context cpu_serial
             int64_t part_id = blockDim.x * blockIdx.x + threadIdx.x; //only_for_context cuda
             int64_t part_id = get_global_id(0); //only_for_context opencl
-            int64_t end_id = 0; // unused outside of openmp //only_for_context cpu_serial cuda opencl
+            int64_t end_id = 0; // unused outside of openmp //only_for_context cpu_serial cuda opencl
             LocalParticle lpart;
             lpart.io_buffer = io_buffer;
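For reference, here is a minimal Python sketch (not part of the patch; omp_chunks is a hypothetical helper) of the ceil-division chunking that the OpenMP loop in tracker.py performs: each thread handles the half-open range [chunk * chunk_size, min((chunk + 1) * chunk_size, num_particles_to_track)).

    # Illustrative sketch only: mirrors the chunk computation in the OpenMP kernel loop.
    def omp_chunks(num_particles_to_track, num_threads):
        chunk_size = (num_particles_to_track + num_threads - 1) // num_threads  # ceil division
        return [
            (chunk * chunk_size, min((chunk + 1) * chunk_size, num_particles_to_track))
            for chunk in range(num_threads)
        ]

    # With XT_OMP_SKIP_REORGANIZE the whole capacity is chunked, as in the first test
    # (capacity 22, 5 threads):
    print(omp_chunks(22, 5))  # [(0, 5), (5, 10), (10, 15), (15, 20), (20, 22)]

Without the skip flag, num_particles_to_track is the active-particle count obtained after the reorganization pass, so only active particles fall inside the chunked range.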