From 7ce871712b9d46b5545186379ede7d5e75e3396d Mon Sep 17 00:00:00 2001
From: Love Waern <love.waern@intel.com>
Date: Fri, 5 Apr 2024 15:58:58 +0200
Subject: [PATCH] `thread_aware` now internal & experimental; many fixes; test
 PoC

Keep the existing documentation for now.
---
 lib/1.4/dml-builtins.dml                      | 169 ++++++++++--------
 lib/1.4/utility.dml                           |  22 +--
 py/dml/c_backend.py                           |  33 ++--
 py/dml/structure.py                           |  10 +-
 test/1.4/misc/T_thread_aware.py               |  49 -----
 .../{T_thread_aware.dml => thread_aware.dml}  | 115 +++++++++++-
 test/1.4/misc/thread_aware.py                 |  71 ++++++++
 test/tests.py                                 |  12 +-
 8 files changed, 325 insertions(+), 156 deletions(-)
 delete mode 100644 test/1.4/misc/T_thread_aware.py
 rename test/1.4/misc/{T_thread_aware.dml => thread_aware.dml} (50%)
 create mode 100644 test/1.4/misc/thread_aware.py

diff --git a/lib/1.4/dml-builtins.dml b/lib/1.4/dml-builtins.dml
index 255b89bec..ec305456c 100644
--- a/lib/1.4/dml-builtins.dml
+++ b/lib/1.4/dml-builtins.dml
@@ -538,8 +538,8 @@ The `device` template contains the following parameters:
   this device, as specified by the `--simics-api` command-line argument;
   e.g. `"6"` for the Simics 6 API.
 
-* `thread_aware` *[bool]*: Whether the device model is declared to be
-  *thread-aware*. This is `true` if and only if the [`thread_aware`
+* `_thread_aware` *[bool]*: Whether the device model is declared to be
+  *thread-aware*. This is `true` if and only if the [`_thread_aware`
   template](#device-templates) has been instantiated for the device.
 */
 template device {
@@ -583,8 +583,8 @@ template device {
     // this carried semantics in DML 1.2; deprecated in 1.4
     param _confidentiality          = undefined;
 
-    param _thread_aware default false;
-    param thread_aware = _thread_aware;
+    param __thread_aware default false;
+    param _thread_aware = __thread_aware;
 
     method _init() { _rec_init(); }
     method _post_init() { _rec_post_init(); }
@@ -603,44 +603,46 @@ typically written in order to describe logic which affects the entire device
 model as a whole.
 
 The only built-in device template (save for the `device` template which defines
-the object type itself) is the `thread_aware` template; however, the DML
+the object type itself) is the `_thread_aware` template; however, the DML
 standard library (`utility.dml`) also defines the device templates [`poreset`,
 `hreset`, and `sreset`](utility.html#templates-for-reset), which declare device
 support for different kinds of resets.
 
-The `thread_aware` template is offered as a means of writing *thread-aware* DML
+The `_thread_aware` template is offered as a means of writing *thread-aware* DML
 device models, in the sense defined by the *Simics API Reference Manual* (in
 section 2.7). This can be necessary in a multicore-acceleration setting, if the
 models would otherwise present a bottleneck in the simulation. The
-`thread_aware` template **must be** and **should only be** used for this
+`_thread_aware` template **must be** and **should only be** used for this
 purpose.
 
-Instantiating the `thread_aware` template on top-level declares the device
+Instantiating the `_thread_aware` template on top-level declares the device
 model as being thread-aware, which carries a large number of ramifications and
 responsibilities.
 
-The consequences of instantiating the `thread_aware` template are as follows:
+The consequences of instantiating the `_thread_aware` template are as follows:
+* The [`_thread_aware` parameter](#device-objects) of the device will resolve
+  to constant `true` rather than `false`. This can be leveraged by model code:
+  `dev._thread_aware` is suitable for use together with `#if` to either
+  conditionally generate additional code needed to support thread-aware device
+  models, or to conditionally raise compile-time errors via the [`error`
+  statement/declaration](language.html#error-statements) if some piece of code
+  does not support thread-aware device models.
 * An implementation of the `concurrency_mode` interface is provided for the
   device object, together with an implementation of the `concurrency_group`
   interface to explicitly group all ports, banks, and subdevices together
   with the device.
-    * The `thread_aware` template declares the top-level `supported_modes`
+    * The `_thread_aware` template declares the top-level `supported_modes`
       parameter to allow the modeller to configure the supported concurrency
       modes that the device reports. Its default value is
       `Sim_Concurrency_Mode_Serialized | Sim_Concurrency_Mode_Serialized_Memory`.
-    * The `thread_aware` template declares the top-level
+    * The `_thread_aware` template declares the top-level
       `init_concurrency_mode` parameter to allow the modeller to configure
       the default concurrency mode the device places itself if never told
       to switch modes. Its default value is `Sim_Concurrency_Mode_Serialized`.
     * The current concurrency mode of the device can be accessed within DML via
       `dev.concurrency_mode.current`.
-* The [`thread_aware` parameter](#device-objects) of the device will resolve
-  to constant `true` rather than `false`. This can be leveraged by model code:
-  `dev.thread_aware` is suitable for use together with `#if` to either
-  conditionally generate additional code needed to support thread-aware device
-  models, or to conditionally raise compile-time errors via the [`error`
-  statement/declaration](language.html#error-statements) if some piece of code
-  does not support thread-aware device models.
+* All events (including those generated for `after` statements) of the device
+  model are registered with the `Sim_EC_No_Serialize` flag.
 * **DMLC will insert code to automatically acquire and release locks required
   by any feature directly provided by DML.** The following is guaranteed:
     * DMLC will ensure that the thread domain associated with the device object
@@ -651,9 +653,9 @@ The consequences of instantiating the `thread_aware` template are as follows:
       method is `independent`.)
     * Any call to an interface of a `connect` is automatically enclosed by
       `SIM_ACQUIRE_TARGET`/`SIM_RELEASE_TARGET` on the connected object.
-    * When a connect is being configured by Simics, the target object is
-      guaranteed to be acquired over the course of the subsequent calls to
-      `set()` and `validate()` of the connect object.
+    * When a connect is being configured by Simics, the cell domain of the
+      connected object is guaranteed to be acquired over the course of the
+      subsequent calls to `set()` and `validate()` of the `connect` object.
     * Any operations performed on the device clock by `event` objects, the
       `after` statement, or the `cancel_after()` operation will automatically
       hold the cell domain of the device clock as the operation is performed.
@@ -676,54 +678,48 @@ The consequences of instantiating the `thread_aware` template are as follows:
       to be held are indeed held (except for the thread domain of the device
       object, as DMLC automatically esnures ensures it's acquired.)
         * For example, any manual calls to `set()` and `validate()` methods
-          of connects must ensure that the  passed target is acquired
-          (if non-`NULL`) before the call is made.
+          of connects must ensure that the cell domain of the passed target is
+          acquired (if non-`NULL`) before the call is made.
     * All DML libraries used by the model must be able to support thread-aware
       models.
 */
-template _subdevice_thread_aware {
-    shared method _concurrency_group_attr_list_len() -> (uint32);
-    method _concurrency_group_attr_list_len() -> (uint32) {
+template _subdevice_thread_aware is subdevice {
+    param _each_subdevice_thread_aware : sequence(_subdevice_thread_aware);
+    param _each_subdevice_thread_aware =
+        each _subdevice_thread_aware in (this);
+
+    shared method _concurrency_group_attr_list_len() -> (uint32) {
         local uint32 len = 1;
-        len += (each port in (this)).len;
-        len += (each bank in (this)).len;
-        foreach sd in (each _subdevice_thread_aware in (this)) {
+        foreach sd in (_each_subdevice_thread_aware) {
             len += sd._concurrency_group_attr_list_len();
         }
         return len;
     }
 
-    shared method _populate_concurrency_group_attr_list(attr_value_t *val)
-        -> (uint64);
-    method _populate_concurrency_group_attr_list(attr_value_t *val)
+    shared method _populate_concurrency_group_attr_list(attr_value_t *vals)
         -> (uint64) {
         local uint64 i = 0;
-        val[i++] = SIM_make_attr_object(this.obj);
-        foreach x in (each port in (this)) {
-            val[i++] = SIM_make_attr_object(x._port_obj());
-        }
-        foreach x in (each bank in (this)) {
-            val[i++] = SIM_make_attr_object(x._bank_obj());
-        }
-        foreach sd in (each _subdevice_thread_aware in (this)) {
-            i += sd._populate_concurrency_group_attr_list(val + i);
+        vals[i++] = SIM_make_attr_object(this._port_obj());
+        foreach sd in (_each_subdevice_thread_aware) {
+            i += sd._populate_concurrency_group_attr_list(vals + i);
         }
         return i;
     }
 }
-template thread_aware is (device, _subdevice_thread_aware) {
-    param _thread_aware = true;
+
+template _thread_aware is device {
+    param __thread_aware = true;
     param init_concurrency_mode default Sim_Concurrency_Mode_Serialized;
     param supported_modes default Sim_Concurrency_Mode_Serialized
                                   | Sim_Concurrency_Mode_Serialized_Memory;
 
     group _init_current_concurrency_mode is init {
         method init() {
-            parent.concurrency_mode.current = parent.init_concurrency_mode;
-            assert (parent.concurrency_mode.current
-                    & parent.concurrency_mode.current - 1) == 0;
-            assert (parent.concurrency_mode.current
-                    & parent.supported_modes) != 0;
+            dev.concurrency_mode.current = dev.init_concurrency_mode;
+            assert (dev.concurrency_mode.current
+                    & dev.concurrency_mode.current - 1) == 0;
+            assert (dev.concurrency_mode.current
+                    & dev.supported_modes) != 0;
         }
     }
 
@@ -731,12 +727,12 @@ template thread_aware is (device, _subdevice_thread_aware) {
         saved concurrency_mode_t current;
 
         method supported_modes() -> (concurrency_mode_t) {
-            return parent.supported_modes;
+            return dev.supported_modes;
         }
 
         method switch_mode(concurrency_mode_t mode) {
             assert (mode & mode - 1) == 0;
-            assert (mode & parent.supported_modes) != 0;
+            assert (mode & dev.supported_modes) != 0;
             current = mode;
         }
 
@@ -745,12 +741,35 @@ template thread_aware is (device, _subdevice_thread_aware) {
         }
     }
 
+    in each (subdevice) { is _subdevice_thread_aware; }
+
     implement concurrency_group {
         method execution_group(uint32 group_index) -> (attr_value_t) {
             if (group_index > 0) return SIM_make_attr_nil();
-            local uint32 len = parent._concurrency_group_attr_list_len();
+
+            local uint32 len = 1
+                             + (each bank in (dev)).len
+                             + (each port in (dev)).len;
+            foreach sd in (each _subdevice_thread_aware in (dev)) {
+                len += sd._concurrency_group_attr_list_len();
+            }
+
             local attr_value_t list = SIM_alloc_attr_list(len);
-            parent._populate_concurrency_group_attr_list(SIM_attr_list(list));
+            local attr_value_t *vals = SIM_attr_list(list);
+
+            local uint64 i = 0;
+            vals[i++] = SIM_make_attr_object(dev.obj);
+            foreach x in (each bank in (dev)) {
+                vals[i++] = SIM_make_attr_object(x._bank_obj());
+            }
+            foreach x in (each port in (dev)) {
+                vals[i++] = SIM_make_attr_object(x._port_obj());
+            }
+
+            foreach x in (each _subdevice_thread_aware in (dev)) {
+                i += x._populate_concurrency_group_attr_list(vals + i);
+            }
+
             return list;
         }
 
@@ -1494,7 +1513,7 @@ template connect is _conf_attribute {
 
         // Check if the new setting is valid
         if (obj) {
-            #if (dev.thread_aware) SIM_ACQUIRE_TARGET(obj, &lock);
+            #if (dev._thread_aware) SIM_ACQUIRE_CELL(obj, &lock);
             local bool valid = true;
             foreach iface in (this._each_interface) {
                 if (iface._required) {
@@ -1515,7 +1534,7 @@ template connect is _conf_attribute {
                 }
             }
             if (!valid) {
-                #if (dev.thread_aware) SIM_RELEASE_TARGET(obj, &lock);
+                #if (dev._thread_aware) SIM_RELEASE_CELL(obj, &lock);
                 return Sim_Set_Interface_Not_Found;
             }
             local const char *old_port = this.port;
@@ -1523,7 +1542,7 @@ template connect is _conf_attribute {
             local bool ok = validate(obj);
             this.port = old_port;
             if (!ok) {
-                #if (dev.thread_aware) SIM_RELEASE_TARGET(obj, &lock);
+                #if (dev._thread_aware) SIM_RELEASE_CELL(obj, &lock);
                 return Sim_Set_Illegal_Value;
             }
         }
@@ -1534,7 +1553,7 @@ template connect is _conf_attribute {
         this.port = port ? MM_STRDUP(port) : NULL;
         /*% COVERITY var_deref_model %*/
         this.set(obj);
-        #if (dev.thread_aware) if (obj) SIM_RELEASE_TARGET(obj, &lock);
+        #if (dev._thread_aware) if (obj) SIM_RELEASE_CELL(obj, &lock);
         return Sim_Set_Ok;
     }
 
@@ -1575,7 +1594,7 @@ The `init_as_subobj` inherits the [`init`](#init) and
 template init_as_subobj is (connect, init) {
     param classname : const char *;
     param configuration default "none";
-    #if (dev.thread_aware) {
+    #if (dev._thread_aware) {
         error "`init_as_subobj` can't be used with a thread-aware device "
             + "model";
     }
@@ -3834,9 +3853,9 @@ template event is (object, shown_desc) {
             // without a clock, we cannot have posted any events
             return;
         local domain_lock_t *lock;
-        #if (dev.thread_aware) SIM_ACQUIRE_CELL(dev.obj, &lock);
+        #if (dev._thread_aware) SIM_ACQUIRE_CELL(dev.obj, &lock);
         SIM_event_cancel_time(dev.obj, evclass, dev.obj, NULL, NULL);
-        #if (dev.thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
+        #if (dev._thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
     }
     // internal callbacks, invoked by auto-generated callback functions
     method __describe_event(void *data) -> (char *) default {
@@ -3983,17 +4002,17 @@ template _time_event is _event {
                 + " cannot post event";
         else {
             local domain_lock_t *lock;
-            #if (dev.thread_aware) SIM_ACQUIRE_CELL(dev.obj, &lock);
+            #if (dev._thread_aware) SIM_ACQUIRE_CELL(dev.obj, &lock);
             SIM_event_post_time(clk, *_pevclass, dev.obj, when, data);
-            #if (dev.thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
+            #if (dev._thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
         }
     }
     shared method _next(void *data) -> (double) {
         local domain_lock_t *lock;
-        #if (dev.thread_aware) SIM_ACQUIRE_CELL(dev.obj, &lock);
+        #if (dev._thread_aware) SIM_ACQUIRE_CELL(dev.obj, &lock);
         local double upcoming = SIM_event_find_next_time(
             dev.obj, *_pevclass, dev.obj, DML_pointer_eq, data);
-        #if (dev.thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
+        #if (dev._thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
         return upcoming;
     }
 }
@@ -4006,17 +4025,17 @@ template _cycle_event is _event {
                 + " cannot post event";
         else {
             local domain_lock_t *lock;
-            #if (dev.thread_aware) SIM_ACQUIRE_CELL(dev.obj, &lock);
+            #if (dev._thread_aware) SIM_ACQUIRE_CELL(dev.obj, &lock);
             SIM_event_post_cycle(clk, *_pevclass, dev.obj, when, data);
-            #if (dev.thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
+            #if (dev._thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
         }
     }
     shared method _next(void *data) -> (uint64) {
         local domain_lock_t *lock;
-        #if (dev.thread_aware) SIM_ACQUIRE_CELL(dev.obj, &lock);
+        #if (dev._thread_aware) SIM_ACQUIRE_CELL(dev.obj, &lock);
         local double upcoming = SIM_event_find_next_cycle(
             dev.obj, *_pevclass, dev.obj, DML_pointer_eq, data);
-        #if (dev.thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
+        #if (dev._thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
         return upcoming;
     }
 }
@@ -4047,19 +4066,19 @@ template _simple_event is _event {
 
     shared method posted() -> (bool) {
         local domain_lock_t *lock;
-        #if (dev.thread_aware) SIM_ACQUIRE_CELL(dev.obj, &lock);
+        #if (dev._thread_aware) SIM_ACQUIRE_CELL(dev.obj, &lock);
         local double upcoming = SIM_event_find_next_cycle(
             dev.obj, *_pevclass, dev.obj, DML_pointer_eq, NULL);
-        #if (dev.thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
+        #if (dev._thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
         return upcoming >= 0;
     }
 
     shared method remove() {
         local domain_lock_t *lock;
-        #if (dev.thread_aware) SIM_ACQUIRE_CELL(dev.obj, &lock);
+        #if (dev._thread_aware) SIM_ACQUIRE_CELL(dev.obj, &lock);
         SIM_event_cancel_time(dev.obj, *_pevclass, dev.obj,
                               DML_pointer_eq, NULL);
-        #if (dev.thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
+        #if (dev._thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
     }
 }
 
@@ -4160,19 +4179,19 @@ template _uint64_event is _event {
 
     shared method posted(uint64 data) -> (bool) {
         local domain_lock_t *lock;
-        #if (dev.thread_aware) SIM_ACQUIRE_CELL(dev.obj, &lock);
+        #if (dev._thread_aware) SIM_ACQUIRE_CELL(dev.obj, &lock);
         local double upcoming = SIM_event_find_next_cycle(
             dev.obj, *_pevclass, dev.obj, DML_pointer_eq, _int_to_voidp(data));
-        #if (dev.thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
+        #if (dev._thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
         return upcoming >= 0;
     }
 
     shared method remove(uint64 data) {
         local domain_lock_t *lock;
-        #if (dev.thread_aware) SIM_ACQUIRE_CELL(dev.obj, &lock);
+        #if (dev._thread_aware) SIM_ACQUIRE_CELL(dev.obj, &lock);
         SIM_event_cancel_time(dev.obj, *_pevclass, dev.obj,
                               DML_pointer_eq, _int_to_voidp(data));
-        #if (dev.thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
+        #if (dev._thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
     }
 
     shared method event(uint64 data);
diff --git a/lib/1.4/utility.dml b/lib/1.4/utility.dml
index 55a7e4e4b..cb0028789 100644
--- a/lib/1.4/utility.dml
+++ b/lib/1.4/utility.dml
@@ -1128,7 +1128,7 @@ template function_io_memory {
         foreach b in (each function_mapped_bank in (dev)) {
             if (b.function == map_info.function) {
                 local domain_lock_t *lock;
-                #if (dev.thread_aware) SIM_ACQUIRE_CELL(dev.obj, &lock);
+                #if (dev._thread_aware) SIM_ACQUIRE_CELL(dev.obj, &lock);
                 if (!b.use_io_memory) {
                     local map_target_t *mt = SIM_new_map_target(b._bank_obj(),
                                                                  NULL, NULL);
@@ -1136,7 +1136,7 @@ template function_io_memory {
                         local exception_type_t _exc = SIM_clear_exception();
                         log error: "failed to create map target for %s: %s",
                             SIM_object_name(b._bank_obj()), SIM_last_error();
-                        #if (dev.thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
+                        #if (dev._thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
                         return Sim_PE_IO_Not_Taken;
                     }
                     // hack: VT_map_target_access doesn't accept a map_info
@@ -1149,17 +1149,17 @@ template function_io_memory {
                         = VT_map_target_access(mt, mem_op);
                     SIM_set_mem_op_physical_address(mem_op, before);
                     SIM_free_map_target(mt);
-                    #if (dev.thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
+                    #if (dev._thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
                     return ret;
                 }
                 if (b.io_memory_access(
                         mem_op,
                         SIM_get_mem_op_physical_address(mem_op) + offset,
                         NULL)) {
-                    #if (dev.thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
+                    #if (dev._thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
                     return Sim_PE_No_Exception;
                 } else {
-                    #if (dev.thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
+                    #if (dev._thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
                     return Sim_PE_IO_Not_Taken;
                 }
             }
@@ -1380,14 +1380,14 @@ template map_target is (connect, _qname) {
             }
         }
         local domain_lock_t *lock;
-        #if (dev.thread_aware) SIM_ACQUIRE_TARGET(obj, &lock);
+        #if (dev._thread_aware) SIM_ACQUIRE_CELL(obj, &lock);
         local exception_type_t exc = SIM_issue_transaction(map_target, t, addr);
         local bool fail = exc != Sim_PE_No_Exception;
         log info, (fail ? 2 : 4) : "%s%s %d bytes @ 0x%x in %s",
             fail ? "failed to " : "",
             SIM_transaction_is_read(t) ? "read" : fail ? "write" : "wrote",
             SIM_transaction_size(t), addr, SIM_object_name(obj);
-        #if (dev.thread_aware) SIM_RELEASE_TARGET(obj, &lock);
+        #if (dev._thread_aware) SIM_RELEASE_CELL(obj, &lock);
         return exc;
     }
 }
@@ -1451,10 +1451,10 @@ template signal_connect is (connect, post_init) {
 
     method post_init() default {
         local domain_lock_t *lock;
-        #if (dev.thread_aware) SIM_ACQUIRE_CELL(dev.obj, &lock);
+        #if (dev._thread_aware) SIM_ACQUIRE_CELL(dev.obj, &lock);
         if (!SIM_is_restoring_state(dev.obj) && signal.val && signal.high)
             signal.signal_raise();
-        #if (dev.thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
+        #if (dev._thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
     }
 
     method set(conf_object_t *obj) default {
@@ -1469,10 +1469,10 @@ template signal_connect is (connect, post_init) {
         //   currently high but the effects of the hotplug has already taken
         //   place so we should NOT treat it as a hotplug.
         local domain_lock_t *lock;
-        #if (dev.thread_aware) SIM_ACQUIRE_CELL(dev.obj, &lock);
+        #if (dev._thread_aware) SIM_ACQUIRE_CELL(dev.obj, &lock);
         local bool hotplug = SIM_object_is_configured(dev.obj)
             && !SIM_is_restoring_state(dev.obj);
-        #if (dev.thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
+        #if (dev._thread_aware) SIM_RELEASE_CELL(dev.obj, &lock);
         if (hotplug && signal.val && signal.high)
             signal.signal_lower();
         default(obj);
diff --git a/py/dml/c_backend.py b/py/dml/c_backend.py
index 08ecf9b07..0113e491e 100644
--- a/py/dml/c_backend.py
+++ b/py/dml/c_backend.py
@@ -1269,13 +1269,14 @@ def generate_simple_events_control_methods(device):
     out('{\n', postindent = 1)
     out(crep.structtype(device) + ' *_dev UNUSED = ('
         + crep.structtype(device) + '*)_obj;\n')
-    output_domain_lock_decl('_lock')
-    output_acquire_cell('_obj', '_lock')
-    for key in dml.globals.after_delay_infos:
-        out('SIM_event_cancel_time('
-            + f'SIM_object_clock(_obj), {crep.get_evclass(key)}, _obj, '
-            + '_simple_event_predicate, (lang_void *) &domain);\n')
-    output_release_cell('_obj', '_lock')
+    if dml.globals.after_delay_infos:
+        output_domain_lock_decl('_lock')
+        output_acquire_cell('_obj', '_lock')
+        for key in dml.globals.after_delay_infos:
+            out('SIM_event_cancel_time('
+                + f'SIM_object_clock(_obj), {crep.get_evclass(key)}, _obj, '
+                + '_simple_event_predicate, (lang_void *) &domain);\n')
+        output_release_cell('_obj', '_lock')
 
     site = logging.SimpleSite('<_cancel_simple_events>')
     by_dims = {}
@@ -1336,25 +1337,27 @@ def generate_register_events(device):
     if not events and not dml.globals.after_delay_infos:
         out('return;\n')
     else:
+        flags = 'Sim_EC_No_Serialize' if dml.globals.thread_aware else '0'
         for event in events:
             if (dml.globals.dml_version == (1, 2)
                 and param_str(event, 'timebase') == 'stacked'
                 and event.dimensions > 0):
                 raise ICE(event, "stacked event array not supported")
             for indices in event.all_indices():
-                out('%s%s = SIM_register_event("%s", class, 0, %s);\n'
+                out('%s%s = SIM_register_event("%s", class, %s, %s);\n'
                     % (crep.get_evclass(event),
-                    ''.join('[' + str(i) + ']' for i in indices),
-                    event.logname_anonymized(indices),
-                    ', '.join(
-                        cname
-                        for (_, cname) in event_callbacks(event,
+                       ''.join('[' + str(i) + ']' for i in indices),
+                       event.logname_anonymized(indices),
+                       flags,
+                       ', '.join(
+                           cname
+                           for (_, cname) in event_callbacks(event,
                                                           indices))))
         for (key, info) in dml.globals.after_delay_infos.items():
-            out(('%s = SIM_register_event(%s, class, 0, %s, %s, %s, %s, '
+            out(('%s = SIM_register_event(%s, class, %s, %s, %s, %s, %s, '
                 + 'NULL);\n')
                 % (crep.get_evclass(key), string_literal(info.string_key),
-                   info.cident_callback, '_destroy_simple_event_data',
+                   flags, info.cident_callback, '_destroy_simple_event_data',
                    info.cident_get_value, info.cident_set_value))
     out('}\n\n', preindent = -1)
     splitting_point()
diff --git a/py/dml/structure.py b/py/dml/structure.py
index be5117c89..88f6bed87 100644
--- a/py/dml/structure.py
+++ b/py/dml/structure.py
@@ -1491,6 +1491,13 @@ def mkobj2(obj, obj_specs, params, each_stmts):
         for (issite, tpl) in obj_spec.templates:
             if tpl.trait:
                 obj_traits.append((issite, tpl.trait))
+            # TODO remove once thread-awareness support is public.
+            # Determining thread-awareness via the _thread_aware param is
+            # cleaner, but doesn't provide is-site info.
+            if (obj.objtype == 'device'
+                and tpl.name == '_thread_aware'
+                and dml.globals.dml_version != (1, 2)):
+                report(WEXPERIMENTAL(issite, 'thread-aware device model'))
 
     for obj_spec in obj_specs:
         for (templates, spec) in obj_spec.in_eachs:
@@ -1956,8 +1963,7 @@ def mkobj2(obj, obj_specs, params, each_stmts):
             mark_method_exported(func, name, export.site)
 
         if dml.globals.dml_version != (1, 2):
-            dml.globals.thread_aware = param_bool_fixup(
-                obj, 'thread_aware', False)
+            dml.globals.thread_aware = param_bool(obj, '_thread_aware')
 
     elif obj.objtype == 'bank':
         set_confidential_object(obj)
diff --git a/test/1.4/misc/T_thread_aware.py b/test/1.4/misc/T_thread_aware.py
deleted file mode 100644
index 6c1b17fb5..000000000
--- a/test/1.4/misc/T_thread_aware.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# © 2024 Intel Corporation
-# SPDX-License-Identifier: MPL-2.0
-
-import stest
-
-stest.expect_equal(obj.count, 0)
-
-cpu = SIM_create_object("clock", "clock", [["freq_mhz", 1]])
-# TODO: is this the desired behaviour? attributes not controlled by DML do not
-#       count for the context
-obj.queue = cpu
-
-# stest.expect_equal(obj.count, 0)
-
-obj.a = None
-
-# stest.expect_equal(obj.count, 1)
-
-local = obj.a
-
-# stest.expect_equal(obj.count, 2)
-
-obj.ev = None
-
-# stest.expect_equal(obj.count, 3)
-
-SIM_continue(100000)
-
-# stest.expect_equal(obj.count, 4)
-
-obj.iface.signal.signal_raise()
-
-# stest.expect_equal(obj.count, 5)
-
-SIM_notify(obj, SIM_notifier_type("exported-entry"))
-
-# stest.expect_equal(obj.count, 6)
-
-SIM_notify(obj, SIM_notifier_type("statically-exported-entry"))
-
-# stest.expect_equal(obj.count, 7)
-
-obj.immediate_after = None
-
-# stest.expect_equal(obj.count, 8)
-
-SIM_process_pending_work()
-
-# stest.expect_equal(obj.count, 9)
diff --git a/test/1.4/misc/T_thread_aware.dml b/test/1.4/misc/thread_aware.dml
similarity index 50%
rename from test/1.4/misc/T_thread_aware.dml
rename to test/1.4/misc/thread_aware.dml
index dc2e43c78..6535337fd 100644
--- a/test/1.4/misc/T_thread_aware.dml
+++ b/test/1.4/misc/thread_aware.dml
@@ -7,12 +7,112 @@ dml 1.4;
 device test;
 
 import "utility.dml";
+import "simics/util/os.dml";
 
 saved int s;
 
-is thread_aware;
+// This test should accomplish two things:
+// 1. Only pass if the device is actually considered thread-aware, and is
+//    entered without the cell domain being taken.
+// 2. Only pass if the device acquires and releases locks correctly.
+//
+// Accomplished by two cooperative devices that have a piece of shared state:
+// 1. is accomplished by manipulating the shared state without locking,
+//    relying on sleeps for one thread to observe the changes made by the
+//    other, and performing tests on that state that can only succeed if
+//    the two devices are entered concurrently.
+// 2. is accomplished by manipulating the shared state while holding an
+//    object lock, and then having a device entry that inspects the state
+//    be blocked by an ongoing entry that modifies it. This blocking is
+//    ensured by also leveraging sleeps. The test is practically guaranteed to
+//    succeed only if object locking is done correctly.
+
+/// WARNING WEXPERIMENTAL
+is _thread_aware;
 
-param init_concurrency_mode = Sim_Concurrency_Mode_Serialized;
+header %{
+    static volatile int32 shared_state = 0;
+%}
+
+extern int32 shared_state;
+
+connect buddy {
+    session int32 *state;
+    param configuration = "optional";
+    interface pulse;
+}
+
+attribute setup_main is write_only_attr {
+    param type = "n";
+    method set(attr_value_t val) throws default {
+        after 1 cycles: event();
+    }
+
+    method event() {
+        os_millisleep(50);
+        // Second resolved: the 100 ms sleep in setup_buddy.event() outlasts
+        // the 50 ms sleep above
+
+        // This essentially serves as a check that setup_buddy.event() is
+        // ongoing; and thus the buddy is holding its own lock
+        assert shared_state == 1;
+        // Modify shared state without acquiring lock.
+        shared_state = 2;
+        // Then try to enter the buddy, and become blocked
+        buddy.pulse.pulse();
+    }
+}
+
+attribute setup_buddy is write_only_attr {
+    param type = "n";
+    method set(attr_value_t val) throws default {
+        after 1 cycles: event();
+    }
+
+    method event() {
+        // First resolved (due to the sleep in setup_main.event())
+        shared_state = 1;
+        os_millisleep(100);
+        // Third resolved.
+        assert shared_state == 2;
+        shared_state = 3;
+        // Once this is left, the object lock of the buddy will be released
+    }
+}
+
+implement pulse {
+    method pulse() {
+        // Last resolved, when setup_main.event() becomes unblocked and may
+        // enter the buddy
+        assert shared_state == 3;
+    }
+}
+
+attribute shared_state_attr {
+    param type = "i";
+    method get() -> (attr_value_t) {
+        return SIM_make_attr_int64(shared_state);
+    }
+    method set(attr_value_t val) throws {
+        shared_state = SIM_attr_integer(val);
+    }
+}
+
+bank b;
+port p;
+subdevice sd {
+    subdevice sd {
+        bank b;
+        port p;
+    }
+    bank b;
+    port p;
+}
+
+// Remainder is skeleton code stolen from 1.4/misc/notify_state.
+// We have to do similar tests that object locking is done for the various ways
+// the device may be entered.
+// ... That, or write those tests via ctree_tests.py
 
 header %{
     #include <assert.h>
@@ -127,4 +227,15 @@ implement signal {
 }
 
 
+// port insig is signal_port;
+// connect outsig is signal_connect;
 
+// attribute test_outsig is (pseudo_attr, bool_attr) {
+//     method set(attr_value_t value) throws {
+//         default(value);
+//         if (this.val)
+//             outsig.set_level(1);
+//         else
+//             outsig.set_level(0);
+//     }
+// }
diff --git a/test/1.4/misc/thread_aware.py b/test/1.4/misc/thread_aware.py
new file mode 100644
index 000000000..fd848825f
--- /dev/null
+++ b/test/1.4/misc/thread_aware.py
@@ -0,0 +1,71 @@
+# © 2024 Intel Corporation
+# SPDX-License-Identifier: MPL-2.0
+
+import stest
+
+cpu1 = SIM_create_object("clock", "clock1", freq_mhz=1)
+cpu2 = SIM_create_object("clock", "clock2", freq_mhz=1, cell=cpu1.cell)
+# A second device instance, scheduled on a different clock than obj.
+buddy = SIM_create_object("test", "buddy")
+obj.queue = cpu1
+obj.buddy = buddy
+buddy.queue = cpu2
+# Enable the multicore accelerator and check that both clocks report it.
+SIM_run_command('enable-multicore-accelerator')
+for cpu in (cpu1, cpu2):
+    stest.expect_equal(cpu.multicore_accelerator_enabled, True)
+# Check the concurrency mode and execution group of each device.
+for dev in (obj, buddy):
+    stest.expect_equal(dev.iface.concurrency_mode.current_mode(),
+                       Sim_Concurrency_Mode_Serialized_Memory)
+    stest.expect_equal(
+        sorted(dev.iface.concurrency_group.execution_group(0)),
+        sorted([dev,
+                dev.bank.b, dev.sd.bank.b, dev.sd.sd.bank.b,
+                dev.port.p, dev.sd.port.p, dev.sd.sd.port.p,
+                dev.sd, dev.sd.sd]))
+# Schedule both events (each attribute's set() posts one after 1 cycle).
+obj.setup_main = None
+buddy.setup_buddy = None
+# shared_state ends at 3 only if the events interleaved as expected.
+SIM_continue(1)
+stest.expect_equal(obj.shared_state_attr, 3)
+
+
+# # stest.expect_equal(obj.count, 0)
+
+# obj.a = None
+
+# # stest.expect_equal(obj.count, 1)
+
+# local = obj.a
+
+# # stest.expect_equal(obj.count, 2)
+
+# obj.ev = None
+
+# # stest.expect_equal(obj.count, 3)
+
+# SIM_continue(100000)
+
+# # stest.expect_equal(obj.count, 4)
+
+# obj.iface.signal.signal_raise()
+
+# # stest.expect_equal(obj.count, 5)
+
+# SIM_notify(obj, SIM_notifier_type("exported-entry"))
+
+# # stest.expect_equal(obj.count, 6)
+
+# SIM_notify(obj, SIM_notifier_type("statically-exported-entry"))
+
+# # stest.expect_equal(obj.count, 7)
+
+# obj.immediate_after = None
+
+# # stest.expect_equal(obj.count, 8)
+
+# SIM_process_pending_work()
+
+# # stest.expect_equal(obj.count, 9)
diff --git a/test/tests.py b/test/tests.py
index 96564af00..65a56d4c9 100644
--- a/test/tests.py
+++ b/test/tests.py
@@ -193,7 +193,9 @@ class DMLFileTestCase(BaseTestCase):
         'simics_stderr',
         'extraenv',                     # Extra environment variables
 
-        'status'                        # Expected status
+        'status',                       # Expected status
+
+        'thread_safe',                  # Whether the device module is considered thread-safe
         )
     def __init__(self, fullname, filename, **info):
         BaseTestCase.__init__(self, fullname)
@@ -205,6 +207,7 @@ def __init__(self, fullname, filename, **info):
         self.cc_extraargs = []
         self.status = 0
         self.extraenv = {}
+        self.thread_safe = False
         # Override defaults
         for k,v in info.items():
             setattr(self, k, v)
@@ -649,7 +652,7 @@ class options(object):
             cpumod = None
             date = None
             product = None
-            thread_safe = "no"
+            thread_safe = "yes" if self.thread_safe else "no"
             host_type = host_type()
             py_version = None
             py_iface_lists = []
@@ -1077,6 +1080,11 @@ def run_cc(self, cc_extraargs):
          status = 2,
          dmlc_extraargs = ["--werror"]))
 
+all_tests.append(CTestCase(
+         ["1.4", "misc", "thread_aware"],
+         join(testdir, "1.4", "misc", "thread_aware.dml"),
+         thread_safe=True))
+
 if get_simics_major() == "6":
     all_tests.append(CTestCase(
         ["1.2", "errors", "WREF"],