diff --git a/Chap_SIMD.tex b/Chap_SIMD.tex index 8a57ac9..8f1be13 100644 --- a/Chap_SIMD.tex +++ b/Chap_SIMD.tex @@ -8,34 +8,34 @@ Many processors have SIMD (vector) units that can perform simultaneously 2, 4, 8 or more executions of the same operation (by a single SIMD unit). -Loops without loop-carried backward dependency (or with dependency preserved using -ordered simd) are candidates for vectorization by the compiler for +Loops without loop-carried backward dependences (or with dependences preserved using +\kcode{ordered simd}) are candidates for vectorization by the compiler for execution with SIMD units. In addition, with state-of-the-art vectorization -technology and \code{declare simd} directive extensions for function vectorization +technology and \kcode{declare simd} directive extensions for function vectorization in the OpenMP 4.5 specification, loops with function calls can be vectorized as well. The basic idea is that a scalar function call in a loop can be replaced by a vector version of the function, and the loop can be vectorized simultaneously by combining a loop -vectorization (\code{simd} directive on the loop) and a function -vectorization (\code{declare simd} directive on the function). +vectorization (\kcode{simd} directive on the loop) and a function +vectorization (\kcode{declare simd} directive on the function). -A \code{simd} construct states that SIMD operations be performed on the +A \kcode{simd} construct states that SIMD operations be performed on the data within the loop. A number of clauses are available to provide -data-sharing attributes (\code{private}, \code{linear}, \code{reduction} and -\code{lastprivate}). Other clauses provide vector length preference/restrictions -(\code{simdlen} / \code{safelen}), loop fusion (\code{collapse}), and data -alignment (\code{aligned}). +data-sharing attributes (\kcode{private}, \kcode{linear}, \kcode{reduction} and +\kcode{lastprivate}). 
Other clauses provide vector length preference/restrictions +(\kcode{simdlen} / \kcode{safelen}), loop collapsing (\kcode{collapse}), and data +alignment (\kcode{aligned}). -The \code{declare simd} directive designates +The \kcode{declare simd} directive designates that a vector version of the function should also be constructed for -execution within loops that contain the function and have a \code{simd} -directive. Clauses provide argument specifications (\code{linear}, -\code{uniform}, and \code{aligned}), a requested vector length -(\code{simdlen}), and designate whether the function is always/never -called conditionally in a loop (\code{notinbranch}/\code{inbranch}). +execution within loops that contain the function and have a \kcode{simd} +directive. Clauses provide argument specifications (\kcode{linear}, +\kcode{uniform}, and \kcode{aligned}), a requested vector length +(\kcode{simdlen}), and designate whether the function is always/never +called conditionally in a loop (\kcode{inbranch}/\kcode{notinbranch}). The latter is for optimizing performance. -Also, the \code{simd} construct has been combined with the worksharing loop -constructs (\code{for simd} and \code{do simd}) to enable simultaneous thread +Also, the \kcode{simd} construct has been combined with the worksharing loop +constructs (\kcode{for simd} and \kcode{do simd}) to enable simultaneous thread execution in different SIMD units. 
%Hence, the \code{simd} construct can be %used alone on a loop to direct vectorization (SIMD execution), or in diff --git a/Chap_affinity.tex b/Chap_affinity.tex index d2ec343..3babdc8 100644 --- a/Chap_affinity.tex +++ b/Chap_affinity.tex @@ -1,7 +1,7 @@ \cchapter{OpenMP Affinity}{affinity} \label{chap:openmp_affinity} -OpenMP Affinity consists of a \code{proc\_bind} policy (thread affinity policy) and a specification of +OpenMP Affinity consists of a \kcode{proc_bind} policy (thread affinity policy) and a specification of places (``location units'' or \plc{processors} that may be cores, hardware threads, sockets, etc.). OpenMP Affinity enables users to bind computations on specific places. @@ -11,13 +11,13 @@ if two or more cores (hardware threads, sockets, etc.) have been assigned to a given place. Often the binding can be managed without resorting to explicitly setting places. -Without the specification of places in the \code{OMP\_PLACES} variable, +Without the specification of places in the \kcode{OMP_PLACES} variable, the OpenMP runtime will distribute and bind threads using the entire range of processors for -the OpenMP program, according to the \code{OMP\_PROC\_BIND} environment variable -or the \code{proc\_bind} clause. When places are specified, the OMP runtime +the OpenMP program, according to the \kcode{OMP_PROC_BIND} environment variable +or the \kcode{proc_bind} clause. When places are specified, the OMP runtime binds threads to the places according to a default distribution policy, or -those specified in the \code{OMP\_PROC\_BIND} environment variable or the -\code{proc\_bind} clause. +those specified in the \kcode{OMP_PROC_BIND} environment variable or the +\kcode{proc_bind} clause. In the OpenMP Specifications document a processor refers to an execution unit that is enabled for an OpenMP thread to use. A processor is a core when there is @@ -31,7 +31,7 @@ The processors available to a process may be a subset of the system's processors. 
This restriction may be the result of a -wrapper process controlling the execution (such as \code{numactl} on Linux systems), +wrapper process controlling the execution (such as \plc{numactl} on Linux systems), compiler options, library-specific environment variables, or default kernel settings. For instance, the execution of multiple MPI processes, launched on a single compute node, will each have a subset of processors as @@ -53,20 +53,20 @@ Threads of a team are positioned onto places in a compact manner, a scattered distribution, or onto the primary thread's place, by setting the -\code{OMP\_PROC\_BIND} environment variable or the \code{proc\_bind} clause to -\code{close}, \code{spread}, or \code{primary} (\code{master} has been deprecated), respectively. When -\code{OMP\_PROC\_BIND} is set to FALSE no binding is enforced; and +\kcode{OMP_PROC_BIND} environment variable or the \kcode{proc_bind} clause to +\kcode{close}, \kcode{spread}, or \kcode{primary} (\kcode{master} has been deprecated), respectively. When +\kcode{OMP_PROC_BIND} is set to FALSE no binding is enforced; and when the value is TRUE, the binding is implementation defined to -a set of places in the \code{OMP\_PLACES} variable or to places -defined by the implementation if the \code{OMP\_PLACES} variable +a set of places in the \kcode{OMP_PLACES} variable or to places +defined by the implementation if the \kcode{OMP_PLACES} variable is not set. -The \code{OMP\_PLACES} variable can also be set to an abstract name -(\code{threads}, \code{cores}, \code{sockets}) to specify that a place is +The \kcode{OMP_PLACES} variable can also be set to an abstract name +(\kcode{threads}, \kcode{cores}, \kcode{sockets}) to specify that a place is either a single hardware thread, a core, or a socket, respectively. 
-This description of the \code{OMP\_PLACES} is most useful when the +This description of the \kcode{OMP_PLACES} is most useful when the number of threads is equal to the number of hardware thread, cores -or sockets. It can also be used with a \code{close} or \code{spread} +or sockets. It can also be used with a \kcode{close} or \kcode{spread} distribution policy when the equality doesn't hold. diff --git a/Chap_data_environment.tex b/Chap_data_environment.tex index 18e5ff7..245a12a 100644 --- a/Chap_data_environment.tex +++ b/Chap_data_environment.tex @@ -1,12 +1,12 @@ \cchapter{Data Environment}{data_environment} \label{chap:data_environment} The OpenMP \plc{data environment} contains data attributes of variables and -objects. Many constructs (such as \code{parallel}, \code{simd}, \code{task}) +objects. Many constructs (such as \kcode{parallel}, \kcode{simd}, \kcode{task}) accept clauses to control \plc{data-sharing} attributes of referenced variables in the construct, where \plc{data-sharing} applies to whether the attribute of the variable is \plc{shared}, is \plc{private} storage, or has special operational characteristics -(as found in the \code{firstprivate}, \code{lastprivate}, \code{linear}, or \code{reduction} clause). +(as found in the \kcode{firstprivate}, \kcode{lastprivate}, \kcode{linear}, or \kcode{reduction} clause). The data environment for a device (distinguished as a \plc{device data environment}) is controlled on the host by \plc{data-mapping} attributes, which determine the @@ -21,15 +21,15 @@ Certain variables and objects have predetermined attributes. A commonly found case is the loop iteration variable in associated loops -of a \code{for} or \code{do} construct. It has a private data-sharing attribute. +of a \kcode{for} or \kcode{do} construct. It has a private data-sharing attribute. 
Variables with predetermined data-sharing attributes cannot be listed in a data-sharing clause; but there are some exceptions (mainly concerning loop iteration variables). Variables with explicitly determined data-sharing attributes are those that are referenced in a given construct and are listed in a data-sharing attribute clause on the construct. Some of the common data-sharing clauses are: -\code{shared}, \code{private}, \code{firstprivate}, \code{lastprivate}, -\code{linear}, and \code{reduction}. % Are these all of them? +\kcode{shared}, \kcode{private}, \kcode{firstprivate}, \kcode{lastprivate}, +\kcode{linear}, and \kcode{reduction}. % Are these all of them? Variables with implicitly determined data-sharing attributes are those that are referenced in a given construct, do not have predetermined @@ -37,41 +37,41 @@ attribute clause of an enclosing construct. For a complete list of variables and objects with predetermined and implicitly determined attributes, please refer to the -\plc{Data-sharing Attribute Rules for Variables Referenced in a Construct} +\docref{Data-sharing Attribute Rules for Variables Referenced in a Construct} subsection of the OpenMP Specifications document. \bigskip DATA-MAPPING ATTRIBUTES -The \code{map} clause on a device construct explicitly specifies how the list items in +The \kcode{map} clause on a device construct explicitly specifies how the list items in the clause are mapped from the encountering task's data environment (on the host) to the corresponding item in the device data environment (on the device). The common \plc{list items} are arrays, array sections, scalars, pointers, and structure elements (members). Procedures and global variables have predetermined data mapping if they appear -within the list or block of a \code{declare}~\code{target} directive. Also, a C/C++ pointer +within the list or block of a \kcode{declare target} directive. 
Also, a C/C++ pointer is mapped as a zero-length array section, as is a C++ variable that is a reference to a pointer. % Waiting for response from Eric on this. -Without explicit mapping, non-scalar and non-pointer variables within the scope of the \code{target} -construct are implicitly mapped with a \plc{map-type} of \code{tofrom}. -Without explicit mapping, scalar variables within the scope of the \code{target} +Without explicit mapping, non-scalar and non-pointer variables within the scope of the \kcode{target} +construct are implicitly mapped with a \plc{map-type} of \kcode{tofrom}. +Without explicit mapping, scalar variables within the scope of the \kcode{target} construct are not mapped, but have an implicit firstprivate data-sharing attribute. (That is, the value of the original variable is given to a private variable of the same name on the device.) This behavior can be changed with -the \code{defaultmap} clause. +the \kcode{defaultmap} clause. -The \code{map} clause can appear on \code{target}, \code{target data} and -\code{target enter/exit data} constructs. The operations of creation and +The \kcode{map} clause can appear on \kcode{target}, \kcode{target data} and +\kcode{target enter/exit data} constructs. The operations of creation and removal of device storage as well as assignment of the original list item values to the corresponding list items may be complicated when the list item appears on multiple constructs or when the host and device storage is shared. In these cases the item's reference count, the number of times -it has been referenced (+1 on entry and -1 on exited) in nested (structured) +it has been referenced (incremented by 1 on entry and decremented by 1 on exit) in nested (structured) map regions and/or accumulative (unstructured) mappings, determines the operation. -Details of the \code{map} clause and reference count operation are specified -in the \plc{map Clause} subsection of the OpenMP Specifications document. 
+Details of the \kcode{map} clause and reference count operation are specified +in the \docref{\kcode{map} Clause} subsection of the OpenMP Specifications document. %===== Examples Sections ===== @@ -81,6 +81,7 @@ \input{data_environment/fort_loopvar} \input{data_environment/fort_sp_common} \input{data_environment/fort_sa_private} +\input{data_environment/fort_shared_var} \input{data_environment/carrays_fpriv} \input{data_environment/lastprivate} \input{data_environment/reduction} diff --git a/Chap_devices.tex b/Chap_devices.tex index 4d1c791..0f4f0d0 100644 --- a/Chap_devices.tex +++ b/Chap_devices.tex @@ -1,9 +1,9 @@ \cchapter{Devices}{devices} \label{chap:devices} -The \code{target} construct consists of a \code{target} directive -and an execution region. The \code{target} region is executed on -the default device or the device specified in the \code{device} +The \kcode{target} construct consists of a \kcode{target} directive +and an execution region. The \kcode{target} region is executed on +the default device or the device specified in the \kcode{device} clause. In OpenMP version 4.0, by default, all variables within the lexical @@ -16,39 +16,39 @@ The constructs that explicitly create storage, transfer data, and free storage on the device are categorized as structured and unstructured. The -\code{target} \code{data} construct is structured. It creates -a data region around \code{target} constructs, and is +\kcode{target data} construct is structured. It creates +a data region around \kcode{target} constructs, and is convenient for providing persistent data throughout multiple -\code{target} regions. The \code{target} \code{enter} \code{data} and -\code{target} \code{exit} \code{data} constructs are unstructured, because +\kcode{target} regions. 
The \kcode{target enter data} and +\kcode{target exit data} constructs are unstructured, because they can occur anywhere and do not support a ``structure'' -(a region) for enclosing \code{target} constructs, as does the -\code{target} \code{data} construct. +(a region) for enclosing \kcode{target} constructs, as does the +\kcode{target data} construct. -The \code{map} clause is used on \code{target} +The \kcode{map} clause is used on \kcode{target} constructs and the data-type constructs to map host data. It -specifies the device storage and data movement \code{to} and \code{from} +specifies the device storage and data movement \plc{to} and \plc{from} the device, and controls on the storage duration. There is an important change in the OpenMP 4.5 specification that alters the data model for scalar variables and C/C++ pointer variables. The default behavior for scalar variables and C/C++ pointer variables -in a 4.5 compliant code is \code{firstprivate}. Example +in a 4.5 compliant code is \kcode{firstprivate}. Example codes that have been updated to reflect this new behavior are annotated with a description that describes changes required for correct execution. Often it is a simple matter of mapping -the variable as \code{tofrom} to obtain the intended 4.0 behavior. +the variable as \kcode{tofrom} to obtain the intended 4.0 behavior. In OpenMP version 4.5 the mechanism for target execution is specified as occurring through a \plc{target task}. -When the \code{target} construct is encountered a new -\plc{target task} is generated. The \plc{target task} -completes after the \code{target} region has executed and all data +When the \kcode{target} construct is encountered a new +target task is generated. The target task +completes after the \kcode{target} region has executed and all data transfers have finished. 
This new specification does not affect the execution of pre-4.5 code; it is a necessary element for asynchronous -execution of the \code{target} region when using the new \code{nowait} +execution of the \kcode{target} region when using the new \kcode{nowait} clause introduced in OpenMP 4.5. @@ -59,6 +59,7 @@ \input{devices/target_structure_mapping} \input{devices/target_fort_allocatable_array_mapping} \input{devices/array_sections} +\input{devices/usm} \input{devices/C++_virtual_functions} \input{devices/array_shaping} \input{devices/target_mapper} diff --git a/Chap_directives.tex b/Chap_directives.tex index ef0b6e9..7572009 100644 --- a/Chap_directives.tex +++ b/Chap_directives.tex @@ -2,7 +2,7 @@ \label{chap:directive_syntax} \index{directive syntax} -OpenMP \emph{directives} use base-language mechanisms to specify OpenMP program behavior. +OpenMP \plc{directives} use base-language mechanisms to specify OpenMP program behavior. In C code, the directives are formed exclusively with pragmas, whereas in C++ code, directives are formed from either pragmas or attributes. Fortran directives are formed with comments in free form and fixed form sources (codes). @@ -20,36 +20,36 @@ C/C++ pragmas \begin{indentedcodelist} -\code{\#pragma omp} \plc{directive-specification} +\kcode{\#pragma omp} \plc{directive-specification} \end{indentedcodelist} C++ attributes \begin{indentedcodelist} -\code{[[omp :: directive(} \plc{directive-specification} \code{)]]} -\code{[[using omp : directive(} \plc{directive-specification} \code{)]]} +\kcode{[[omp :: directive( \plc{directive-specification} )]]} +\kcode{[[using omp : directive( \plc{directive-specification} )]]} \end{indentedcodelist} Fortran comments \begin{indentedcodelist} -\code{!\$omp} \plc{directive-specification} +\scode{!$omp} \plc{directive-specification} \end{indentedcodelist} -where \code{c\$omp} and \code{*\$omp} may be used in Fortran fixed form sources. 
+where \scode{c$omp} and \scode{*$omp} may be used in Fortran fixed form sources. Most OpenMP directives accept clauses that alter the semantics of the directive in some way, and some directives also accept parenthesized arguments that follow the directive name. -A clause may just be a keyword (e.g., \scode{untied}) or it may also accept argument lists -(e.g., \scode{shared(x,y,z)}) and/or optional modifiers (e.g., \scode{tofrom} in -\scode{map(tofrom:}~\scode{x,y,z)}). +A clause may just be a keyword (e.g., \kcode{untied}) or it may also accept argument lists +(e.g., \kcode{shared(\ucode{x,y,z})}) and/or optional modifiers (e.g., \kcode{tofrom} in +\kcode{map(tofrom: \ucode{x,y,z})}). Clause modifiers may be ``simple'' or ``complex'' -- a complex modifier consists of a keyword followed by one or more parameters, bracketed by parentheses, while a simple -modifier does not. An example of a complex modifier is the \scode{iterator} modifier, -as in \scode{map(iterator(i=0:n),}~\scode{tofrom:}~\scode{p[i])}, or the \scode{step} modifier, as in -\scode{linear(x:}~\scode{ref,}~\scode{step(4))}. -In the preceding examples, \scode{tofrom} and \scode{ref} are simple modifiers. +modifier does not. An example of a complex modifier is the \kcode{iterator} modifier, +as in \kcode{map(iterator(\ucode{i=0:n}), tofrom: \ucode{p[i]})}, or the \kcode{step} modifier, as in +\kcode{linear(\ucode{x}: ref, step(\ucode{4}))}. +In the preceding examples, \kcode{tofrom} and \kcode{ref} are simple modifiers. -For Fortran, a declarative directive (such as \code{declare}~\code{reduction}) -must appear after any \code{USE}, \code{IMPORT}, and \code{IMPLICIT} statements +For Fortran, a declarative directive (such as \kcode{declare reduction}) +must appear after any \bcode{USE}, \bcode{IMPORT}, and \bcode{IMPLICIT} statements in the specification part. 
diff --git a/Chap_introduction.tex b/Chap_introduction.tex index ac9703a..fce5915 100644 --- a/Chap_introduction.tex +++ b/Chap_introduction.tex @@ -46,24 +46,24 @@ The directives, library routines, and environment variables demonstrated in this document allow users to create and manage parallel programs while permitting -portability. The directives extend the C, C++ and Fortran base languages with single -program multiple data (SPMD) constructs, tasking constructs, device constructs, -worksharing constructs, and synchronization constructs, and they provide support for +portability. The directives extend the C, C++ and Fortran base languages with \plc{single +program multiple data} (SPMD) constructs, \plc{tasking} constructs, \plc{device} constructs, +\plc{worksharing} constructs, and \plc{synchronization} constructs, and they provide support for sharing and privatizing data. The functionality to control the runtime environment is provided by library routines and environment variables. Compilers that support the OpenMP API often include a command line option to the compiler that activates and allows interpretation of all OpenMP directives. The documents and source codes for OpenMP Examples can be downloaded from -\href{https://github.com/OpenMP/Examples}{https://github.com/OpenMP/Examples}. -Each directory holds the contents of a chapter and has a \splc{sources} subdirectory of its codes. +\href{\examplesrepo}{\examplesrepo}. +Each directory holds the contents of a chapter and has a \plc{sources} subdirectory of its codes. This OpenMP Examples \VER{} document and its codes are tagged as -\href{https://github.com/OpenMP/Examples/tree/v\VER}{\plc{v\VER}}. +\examplestree{\VER}{\plc{v\VER}}. 
Complete information about the OpenMP API and a list of the compilers that support the OpenMP API can be found at the OpenMP.org web site -\code{https://www.openmp.org} +\scode{https://www.openmp.org} \clearpage diff --git a/Chap_loop_transformations.tex b/Chap_loop_transformations.tex index 4a77e65..de7d5ad 100644 --- a/Chap_loop_transformations.tex +++ b/Chap_loop_transformations.tex @@ -12,7 +12,7 @@ heuristics methods with compiler options that may not be able to discover optimal transformations. -Loop transformations can be augmented by preprocessor support or OpenMP \code{metadirective} +Loop transformations can be augmented by preprocessor support or OpenMP \kcode{metadirective} directives, to select optimal dimension and size parameters for specific platforms, facilitating a single code base for multiple platforms. Moreover, directive-based transformations make experimenting easier: diff --git a/Chap_memory_model.tex b/Chap_memory_model.tex index c39e099..9a031fa 100644 --- a/Chap_memory_model.tex +++ b/Chap_memory_model.tex @@ -4,10 +4,10 @@ OpenMP provides a shared-memory model that allows all threads on a given device shared access to \emph{memory}. For a given OpenMP region that may be executed by more than one thread or SIMD lane, variables in memory may be -\emph{shared} or \emph{private} with respect to those threads or SIMD lanes. A +\plc{shared} or \plc{private} with respect to those threads or SIMD lanes. A variable's data-sharing attribute indicates whether it is shared (the -\emph{shared} attribute) or private (the \emph{private}, \emph{firstprivate}, -\emph{lastprivate}, \emph{linear}, and \emph{reduction} attributes) in the data +\plc{shared} attribute) or private (the \plc{private}, \plc{firstprivate}, +\plc{lastprivate}, \plc{linear}, and \plc{reduction} attributes) in the data environment of an OpenMP region. 
While private variables in an OpenMP region are new copies of the original variable (with same name) that may then be concurrently accessed or modified by their respective threads or SIMD lanes, a @@ -21,27 +21,27 @@ operations for the purposes of making their temporary view of a variable consistent with the value of the variable in memory. The effect of a given flush operation is characterized by its flush properties -- some combination of -\emph{strong}, \emph{release}, and \emph{acquire} -- and, for \emph{strong} -flushes, a \emph{flush-set}. +\plc{strong}, \plc{release}, and \plc{acquire} -- and, for \plc{strong} +flushes, a \plc{flush-set}. -A \emph{strong} flush will force consistency between the temporary view and the -memory for all variables in its \emph{flush-set}. Furthermore, all strong flushes in a +A \plc{strong} flush will force consistency between the temporary view and the +memory for all variables in its \plc{flush-set}. Furthermore, all strong flushes in a program that have intersecting flush-sets will execute in some total order, and within a thread strong flushes may not be reordered with respect to other -memory operations on variables in its flush-set. \emph{Release} and -\emph{acquire} flushes operate in pairs. A release flush may ``synchronize'' +memory operations on variables in its flush-set. \plc{Release} and +\plc{acquire} flushes operate in pairs. A release flush may ``synchronize'' with an acquire flush, and when it does so the local memory operations that precede the release flush will appear to have been completed before the local memory operations on the same variables that follow the acquire flush. -Flush operations arise from explicit \code{flush} directives, implicit -\code{flush} directives, and also from the execution of \code{atomic} -constructs. The \code{flush} directive forces a consistent view of local -variables of the thread executing the \code{flush}. 
When a list is supplied on +Flush operations arise from explicit \kcode{flush} directives, implicit +\kcode{flush} directives, and also from the execution of \kcode{atomic} +constructs. The \kcode{flush} directive forces a consistent view of local +variables of the thread executing the \kcode{flush}. When a list is supplied on the directive, only the items (variables) in the list are guaranteed to be flushed. Implied flushes exist at prescribed locations of certain constructs. For the complete list of these locations and associated constructs, please -refer to the \plc{flush Construct} section of the OpenMP Specifications +refer to the \docref{\kcode{flush} Construct} section of the OpenMP Specifications document. In this chapter, examples illustrate how race conditions may arise for accesses @@ -53,7 +53,7 @@ races in OpenMP programs result in undefined behavior, and so they should generally be avoided for programs to be correct. The completion order of accesses to a shared variable is guaranteed in OpenMP through a set of memory -consistency rules that are described in the \plc{OpenMP Memory Consistency} +consistency rules that are described in the \docref{OpenMP Memory Consistency} section of the OpenMP Specifications document. %This chapter also includes examples that exhibit non-sequentially consistent diff --git a/Chap_parallel_execution.tex b/Chap_parallel_execution.tex index a639051..22cf431 100644 --- a/Chap_parallel_execution.tex +++ b/Chap_parallel_execution.tex @@ -5,39 +5,39 @@ an OpenMP enabled program, as if the whole program is in an implicit parallel region consisting of an implicit task executed by the \plc{initial thread}. -A \code{parallel} construct encloses code, -forming a parallel region. An \plc{initial thread} encountering a \code{parallel} +A \kcode{parallel} construct encloses code, +forming a parallel region. 
An \plc{initial thread} encountering a \kcode{parallel} region forks (creates) a team of threads at the beginning of the -\code{parallel} region, and joins them (removes from execution) at the +\kcode{parallel} region, and joins them (removes from execution) at the end of the region. The initial thread becomes the primary thread of the team in a -\code{parallel} region with a \plc{thread} number equal to zero, the other +\kcode{parallel} region with a \plc{thread} number equal to zero, the other threads are numbered from 1 to number of threads minus 1. A team may be comprised of just a single thread. -Each thread of a team is assigned an implicit task consisting of code within the -parallel region. The task that creates a parallel region is suspended while the +Each \plc{thread} of a team is assigned an implicit task consisting of code within the +\kcode{parallel} region. The task that creates a \kcode{parallel} region is suspended while the tasks of the team are executed. A thread is tied to its task; that is, only the thread assigned to the task can execute that task. After completion -of the \code{parallel} region, the primary thread resumes execution of the generating task. +of the \kcode{parallel} region, the primary thread resumes execution of the generating task. %After the \code{parallel} region the primary thread becomes the initial %thread again, and continues to execute the \plc{sequential part}. -Any task within a \code{parallel} region is allowed to encounter another -\code{parallel} region to form a nested \code{parallel} region. The -parallelism of a nested \code{parallel} region (whether it forks additional +Any task within a \kcode{parallel} region is allowed to encounter another +\kcode{parallel} region to form a nested \kcode{parallel} region. 
The +parallelism of a nested \kcode{parallel} region (whether it forks additional threads, or is executed serially by the encountering task) can be controlled by the -\code{OMP\_NESTED} environment variable or the \code{omp\_set\_nested()} +\kcode{OMP_NESTED} environment variable or the \kcode{omp_set_nested()} API routine with arguments indicating true or false. -The number of threads of a \code{parallel} region can be set by the \code{OMP\_NUM\_THREADS} -environment variable, the \code{omp\_set\_num\_threads()} routine, or on the \code{parallel} -directive with the \code{num\_threads} +The number of threads of a \kcode{parallel} region can be set by the \kcode{OMP_NUM_THREADS} +environment variable, the \kcode{omp_set_num_threads()} routine, or on the \kcode{parallel} +directive with the \kcode{num_threads} clause. The routine overrides the environment variable, and the clause overrides all. -Use the \code{OMP\_DYNAMIC} -or the \code{omp\_set\_dynamic()} function to specify that the OpenMP +Use the \kcode{OMP_DYNAMIC} environment variable +or the \kcode{omp_set_dynamic()} function to specify that the OpenMP implementation dynamically adjust the number of threads for -\code{parallel} regions. The default setting for dynamic adjustment is implementation +\kcode{parallel} regions. The default setting for dynamic adjustment is implementation defined. When dynamic adjustment is on and the number of threads is specified, the number of threads becomes an upper limit for the number of threads to be provided by the OpenMP runtime. 
@@ -56,56 +56,56 @@ \begin{compactitem} -\item loop constructs: {\code{for} and \code{do} } -\item \code{sections} -\item \code{single} -\item \code{workshare} +\item loop constructs: {\kcode{for} and \kcode{do} } +\item \kcode{sections} +\item \kcode{single} +\item \kcode{workshare} \end{compactitem} -The \code{for} and \code{do} constructs (loop constructs) create a region +The \kcode{for} and \kcode{do} constructs (loop constructs) create a region consisting of a loop. A loop controlled by a loop construct is called an \plc{associated} loop. Nested loops can form a single region when the -\code{collapse} clause (with an integer argument) designates the number of +\kcode{collapse} clause (with an integer argument) designates the number of \plc{associated} loops to be executed in parallel, by forming a ``single iteration space'' for the specified number of nested loops. -The \code{ordered} clause can also control multiple associated loops. +The \kcode{ordered} clause can also control multiple associated loops. An associated loop must adhere to a ``canonical form'' (specified in the -\plc{Canonical Loop Form} of the OpenMP Specifications document) which allows the +\docref{Canonical Loop Form} of the OpenMP Specifications document) which allows the iteration count (of all associated loops) to be computed before the (outermost) loop is executed. %[58:27-29]. Most common loops comply with the canonical form, including C++ iterators. -A \code{single} construct forms a region in which only one thread (any one +A \kcode{single} construct forms a region in which only one thread (any one of the team) executes the region. The other threads wait at the implied -barrier at the end, unless the \code{nowait} clause is specified. +barrier at the end, unless the \kcode{nowait} clause is specified. -The \code{sections} construct forms a region that contains one or more -structured blocks. 
Each block of a \code{sections} directive is -constructed with a \code{section} construct, and executed once by +The \kcode{sections} construct forms a region that contains one or more +structured blocks. Each block of a \kcode{sections} directive is +constructed with a \kcode{section} construct, and executed once by one of the threads (any one) in the team. (If only one block is -formed in the region, the \code{section} construct, which is used to +formed in the region, the \kcode{section} construct, which is used to separate blocks, is not required.) The other threads wait at the implied -barrier at the end, unless the \code{nowait} clause is specified. +barrier at the end, unless the \kcode{nowait} clause is specified. -The \code{workshare} construct is a Fortran feature that consists of a +The \kcode{workshare} construct is a Fortran feature that consists of a region with a single structured block (section of code). Statements in the -\code{workshare} region are divided into units of work, and executed (once) +\kcode{workshare} region are divided into units of work, and executed (once) by threads of the team. \bigskip MASKED CONSTRUCT -The \code{masked} construct is not a worksharing construct. The \code{masked} region is +The \kcode{masked} construct is not a worksharing construct. The \kcode{masked} region is executed only by the primary thread. There is no implicit barrier (and flush) -at the end of the \code{masked} region; hence the other threads of the team continue -execution beyond code statements beyond the \code{masked} region. -The \code{master} construct, which has been deprecated in OpenMP 5.1, has identical semantics -to the \code{masked} construct with no \code{filter} clause. +at the end of the \kcode{masked} region; hence the other threads of the team continue +execution with code statements beyond the \kcode{masked} region.
+The \kcode{master} construct, which has been deprecated in OpenMP 5.1, has identical semantics +to the \kcode{masked} construct with no \kcode{filter} clause. %===== Examples Sections ===== diff --git a/Chap_program_control.tex b/Chap_program_control.tex index 0367eae..8461b0d 100644 --- a/Chap_program_control.tex +++ b/Chap_program_control.tex @@ -7,17 +7,17 @@ \bigskip CONDITIONAL COMPILATION and EXECUTION -Conditional compilation can be performed with conventional \#ifdef directives -in C, C++, and Fortran, and additionally with OpenMP sentinel (\code{!\$}) in Fortran. -The \code{if} clause on some directives +Conditional compilation can be performed with conventional \bcode{\#ifdef} directives +in C, C++, and Fortran, and additionally with OpenMP sentinel (\scode{!$}) in Fortran. +The \kcode{if} clause on some directives can direct the runtime to ignore or alter the behavior of the construct. -Of course, the base-language \code{if} statements can be used to control the execution -of stand-alone directives (such as \code{flush}, \code{barrier}, \code{taskwait}, -and \code{taskyield}). +Of course, the base-language \bcode{if} statements can be used to control the execution +of stand-alone directives (such as \kcode{flush}, \kcode{barrier}, \kcode{taskwait}, +and \kcode{taskyield}). However, the directives must appear in a block structure, and not as a substatement. -The \code{metadirective} and \code{declare}~\code{variant} directives provide conditional +The \kcode{metadirective} and \kcode{declare variant} directives provide conditional selection of directives and routines for compilation (and use), respectively. -The \code{assume} and \code{requires} directives provide invariants +The \kcode{assume} and \kcode{requires} directives provide invariants for optimizing compilation, and essential features for compilation and correct execution, respectively. 
@@ -26,29 +26,29 @@ CANCELLATION Cancellation (termination) of the normal sequence of execution for the threads in an OpenMP region can -be accomplished with the \code{cancel} construct. The construct uses a +be accomplished with the \kcode{cancel} construct. The construct uses a \plc{construct-type-clause} to set the region-type to activate for the cancellation. -That is, inclusion of one of the \plc{construct-type-clause} names \code{parallel}, \code{for}, -\code{do}, \code{sections} or \code{taskgroup} on the directive line +That is, inclusion of one of the \plc{construct-type-clause} names \kcode{parallel}, \kcode{for}, +\kcode{do}, \kcode{sections} or \kcode{taskgroup} on the directive line activates the corresponding region. -The \code{cancel} construct is activated by the first encountering thread, and it +The \kcode{cancel} construct is activated by the first encountering thread, and it continues execution at the end of the named region. -The \code{cancel} construct is also a cancellation point for any other thread of the team +The \kcode{cancel} construct is also a cancellation point for any other thread of the team to also continue execution at the end of the named region. Also, once the specified region has been activated for cancellation any thread that encounters -a \code{cancellation}~\code{point} construct with the same named region (\plc{construct-type-clause}), +a \kcode{cancellation point} construct with the same named region (\plc{construct-type-clause}), continues execution at the end of the region. -For an activated \code{cancel taskgroup} construct, the tasks that +For an activated \kcode{cancel taskgroup} construct, the tasks that belong to the taskgroup set of the innermost enclosing taskgroup region will be canceled. -A task that encounters a \code{cancel}~\code{taskgroup} construct continues execution at the end of its +A task that encounters a \kcode{cancel taskgroup} construct continues execution at the end of its task region. 
Any task of the taskgroup that has already begun execution will run to completion, -unless it encounters a \code{cancellation}~\code{point}; tasks that have not begun execution may be +unless it encounters a \kcode{cancellation point}; tasks that have not begun execution may be discarded as completed tasks. -\bigskip +\pagebreak CONTROL VARIABLES Internal control variables (ICV) are used by implementations to hold values which control the execution @@ -56,7 +56,7 @@ or set and adjusted through environment variables, clauses, and API functions. %Many of the ICV control values are accessible through API function calls. Initial ICV values are reported by the runtime - if the \code{OMP\_DISPLAY\_ENV} environment variable has been set to \code{TRUE} or \code{VERBOSE}. + if the \kcode{OMP_DISPLAY_ENV} environment variable has been set to \vcode{TRUE} or \vcode{VERBOSE}. %As an example, the \plc{nthreads-var} is the ICV that holds the number of threads %to be used in a \code{parallel} region. It can be set with the \code{OMP\_NUM\_THREADS} environment variable, @@ -71,7 +71,7 @@ Certain combinations of nested constructs are permitted, giving rise to \plc{combined} constructs consisting of two or more directives. These can be used when the two (or several) constructs would be used -immediately in succession (closely nested). A \plc{combined} construct can use the clauses of the component +immediately in succession (closely nested). A combined construct can use the clauses of the component constructs without restrictions. A \plc{composite} construct is a combined construct which has one or more clauses with (an often obviously) modified or restricted meaning, relative to when the constructs are uncombined. %%[appear separately (singly). @@ -83,28 +83,28 @@ %explicitly address the ordering of loop chunking \plc{and} SIMD ``combined'' execution. Certain nestings are forbidden, and often the reasoning is obvious. 
For example, worksharing constructs cannot be nested, and -the \code{barrier} construct cannot be nested inside a worksharing construct, or a \code{critical} construct. -Also, \code{target} constructs cannot be nested, unless the nested target is a reverse offload. - -The \code{parallel} construct can be nested, as well as the \code{task} construct. -The parallel execution in the nested parallel construct(s) is controlled by the -\code{OMP\_MAX\_ACTIVE\_LEVELS} environment variable, and the \code{omp\_set\_max\_active\_levels} routine. -Use the \code{omp\_get\_max\_active\_levels} routine to determine the maximum levels provided by an implementation. -As of OpenMP 5.0, use of the \code{OMP\_NESTED} environment variable and the \code{omp\_set\_nested} routine +the \kcode{barrier} construct cannot be nested inside a worksharing construct, or a \kcode{critical} construct. +Also, \kcode{target} constructs cannot be nested, unless the nested target is a reverse offload. + +The \kcode{parallel} construct can be nested, as well as the \kcode{task} construct. +The parallel execution in the nested \kcode{parallel} construct(s) is controlled by the +\kcode{OMP_MAX_ACTIVE_LEVELS} environment variable, and the \kcode{omp_set_max_active_levels} routine. +Use the \kcode{omp_get_max_active_levels} routine to determine the maximum levels provided by an implementation. +As of OpenMP 5.0, use of the \kcode{OMP_NESTED} environment variable and the \kcode{omp_set_nested} routine has been deprecated. -More details on nesting can be found in the \plc{Nesting of Regions} of the \plc{Directives} +More details on nesting can be found in the \docref{Nesting of Regions} of the \docref{Directives} chapter in the OpenMP Specifications document. 
%===== Examples Sections ===== +\input{program_control/assumption} \input{program_control/cond_comp} \input{program_control/icv} \input{program_control/standalone} \input{program_control/cancellation} \input{program_control/requires} -\input{program_control/variant} -\input{program_control/metadirective} +\input{program_control/context_based_variants} \input{program_control/nested_loop} \input{program_control/nesting_restrict} \input{program_control/target_offload} diff --git a/Chap_synchronization.tex b/Chap_synchronization.tex index dfe93d9..bddd6a2 100644 --- a/Chap_synchronization.tex +++ b/Chap_synchronization.tex @@ -1,47 +1,47 @@ \cchapter{Synchronization}{synchronization} \label{chap:synchronization} -The \code{barrier} construct is a stand-alone directive that requires all threads +The \kcode{barrier} construct is a stand-alone directive that requires all threads of a team (within a contention group) to execute the barrier and complete execution of all tasks within the region, before continuing past the barrier. -The \code{critical} construct is a directive that contains a structured block. +The \kcode{critical} construct is a directive that contains a structured block. The construct allows only a single thread at a time to execute the structured block (region). -Multiple critical regions may exist in a parallel region, and may -act cooperatively (only one thread at a time in all \code{critical} regions), -or separately (only one thread at a time in each \code{critical} regions when -a unique name is supplied on each \code{critical} construct). -An optional (lock) \code{hint} clause may be specified on a named \code{critical} +Multiple \kcode{critical} regions may exist in a parallel region, and may +act cooperatively (only one thread at a time in all \kcode{critical} regions), +or separately (only one thread at a time in each \kcode{critical} regions when +a unique name is supplied on each \kcode{critical} construct). 
+An optional (lock) \kcode{hint} clause may be specified on a named \kcode{critical} construct to provide the OpenMP runtime guidance in selecting a locking mechanism. -On a finer scale the \code{atomic} construct allows only a single thread at +On a finer scale the \kcode{atomic} construct allows only a single thread at a time to have atomic access to a storage location involving a single read, write, update or capture statement, and a limited number of combinations -when specifying the \code{capture} \plc{atomic-clause} clause. The +when specifying the \kcode{capture} \plc{atomic-clause} clause. The \plc{atomic-clause} clause is required for some expression statements, but is -not required for \code{update} statements. The \plc{memory-order} clause can be -used to specify the degree of memory ordering enforced by an \code{atomic} -construct. From weakest to strongest, they are \code{relaxed} (the default), -acquire and/or release clauses (specified with \code{acquire}, \code{release}, -or \code{acq\_rel}), and \code{seq\_cst}. Please see the details in the -\plc{atomic Construct} subsection of the \plc{Directives} chapter in the OpenMP +not required for \kcode{update} statements. The \plc{memory-order} clause can be +used to specify the degree of memory ordering enforced by an \kcode{atomic} +construct. From weakest to strongest, they are \kcode{relaxed} (the default), +\plc{acquire} and/or \plc{release} clauses (specified with \kcode{acquire}, \kcode{release}, +or \kcode{acq_rel}), and \kcode{seq_cst}. Please see the details in the +\docref{atomic Construct} subsection of the \docref{Directives} chapter in the OpenMP Specifications document. % The following three sentences were stolen from the spec. -The \code{ordered} construct either specifies a structured block in a loop, +The \kcode{ordered} construct either specifies a structured block in a loop, simd, or loop SIMD region that will be executed in the order of the loop -iterations.
The ordered construct sequentializes and orders the execution -of ordered regions while allowing code outside the region to run in parallel. +iterations. The \kcode{ordered} construct sequentializes and orders the execution +of \kcode{ordered} regions while allowing code outside the region to run in parallel. -Since OpenMP 4.5 the \code{ordered} construct can also be a stand-alone -directive that specifies cross-iteration dependences in a doacross loop nest. -The \code{depend} clause uses a \code{sink} \plc{dependence-type}, along with an -iteration vector argument (vec) to indicate the iteration that satisfies the -dependence. The \code{depend} clause with a \code{source} +Since OpenMP 4.5 the \kcode{ordered} construct can also be a stand-alone +directive that specifies cross-iteration dependences in a \plc{doacross} loop nest. +The \kcode{depend} clause uses a \kcode{sink} \plc{dependence-type}, along with an +iteration vector argument (\plc{vec}) to indicate the iteration that satisfies the +dependence. The \kcode{depend} clause with a \kcode{source} \plc{dependence-type} specifies dependence satisfaction. -The \code{flush} directive is a stand-alone construct for enforcing consistency +The \kcode{flush} directive is a stand-alone construct for enforcing consistency between a thread's view of memory and the view of memory for other threads (see the Memory Model chapter of this document for more details). When the construct is used with an explicit variable list, a \plc{strong flush} that forces a @@ -55,7 +55,7 @@ release memory ordering semantics according to the \plc{memory-order} clause, but no strong flush is performed. A resulting strong flush that applies to a set of variables effectively ensures that no memory (load or store) -operation for the affected variables may be reordered across the \code{flush} +operation for the affected variables may be reordered across the \kcode{flush} directive. 
General-purpose routines provide mutual exclusion semantics through locks, @@ -69,14 +69,14 @@ other lock type. Any explicit task will observe the synchronization prescribed in a -\code{barrier} construct and an implied barrier. Also, additional synchronizations -are available for tasks. All children of a task will wait at a \code{taskwait} (for -their siblings to complete). A \code{taskgroup} construct creates a region in which the +\kcode{barrier} construct and an implied barrier. Also, additional synchronizations +are available for tasks. All children of a task will wait at a \kcode{taskwait} (for +their siblings to complete). A \kcode{taskgroup} construct creates a region in which the current task is suspended at the end of the region until all sibling tasks, and their descendants, have completed. -Scheduling constraints on task execution can be prescribed by the \code{depend} +Scheduling constraints on task execution can be prescribed by the \kcode{depend} clause to enforce dependence on previously generated tasks. -More details on controlling task executions can be found in the \plc{Tasking} Chapter +More details on controlling task executions can be found in the \docref{Tasking} Chapter in the OpenMP Specifications document. %(DO REF. RIGHT.) @@ -87,7 +87,7 @@ \input{synchronization/atomic} \input{synchronization/atomic_cas} \input{synchronization/atomic_restrict} -\input{synchronization/flush_nolist} +\input{synchronization/atomic_hint} \input{synchronization/acquire_release} \input{synchronization/ordered} \input{synchronization/depobj} diff --git a/Chap_tasking.tex b/Chap_tasking.tex index 06134ad..24d5a6a 100644 --- a/Chap_tasking.tex +++ b/Chap_tasking.tex @@ -2,33 +2,33 @@ \label{chap:tasking} Tasking constructs provide units of work to a thread for execution. -Worksharing constructs do this, too (e.g. \code{for}, \code{do}, -\code{sections}, and \code{singles} constructs); +Worksharing constructs do this, too (e.g. 
\kcode{for}, \kcode{do}, +\kcode{sections}, and \kcode{single} constructs); but the work units are tightly controlled by an iteration limit and limited -scheduling, or a limited number of \code{sections} or \code{single} regions. +scheduling, or a limited number of \kcode{sections} or \kcode{single} regions. Worksharing was designed with ``data parallel'' computing in mind. Tasking was designed for ``task parallel'' computing and often involves non-locality or irregularity in memory access. -The \code{task} construct can be used to execute work chunks: in a while loop; +The \kcode{task} construct can be used to execute work chunks: in a while loop; while traversing nodes in a list; at nodes in a tree graph; -or in a normal loop (with a \code{taskloop} construct). +or in a normal loop (with a \kcode{taskloop} construct). Unlike the statically scheduled loop iterations of worksharing, a task is often enqueued, and then dequeued for execution by any of the threads of the team within a parallel region. The generation of tasks can be from a single generating thread (creating sibling tasks), or from multiple generators in recursive graph tree traversals. %(creating a parent-descendents hierarchy of tasks, see example 4 and 7 below). -A \code{taskloop} construct +A \kcode{taskloop} construct bundles iterations of an associated loop into tasks, and provides -similar controls found in the \code{task} construct. +similar controls found in the \kcode{task} construct. -Sibling tasks are synchronized by the \code{taskwait} construct, and tasks +Sibling tasks are synchronized by the \kcode{taskwait} construct, and tasks and their descendent tasks can be synchronized by containing them in -a \code{taskgroup} region. Ordered execution is accomplished by specifying -dependences with a \code{depend} clause. Also, priorities can be -specified as hints to the scheduler through a \code{priority} clause. +a \kcode{taskgroup} region.
Ordered execution is accomplished by specifying +dependences with a \kcode{depend} clause. Also, priorities can be +specified as hints to the scheduler through a \kcode{priority} clause. Various clauses can be used to manage and optimize task generation, as well as reduce the overhead of execution and to relinquish @@ -36,18 +36,18 @@ Once a thread starts executing a task, it is the designated thread for executing the task to completion, even though it may leave the -execution at a scheduling point and return later. The thread is tied -to the task. Scheduling points can be introduced with the \code{taskyield} -construct. With an \code{untied} clause any other thread is allowed to continue -the task. An \code{if} clause with an expression that evaluates to \plc{false} -results in an \emph{undeferred} task, which instructs the runtime to suspend +execution at a scheduling point and return later. The thread is \plc{tied} +to the task. Scheduling points can be introduced with the \kcode{taskyield} +construct. With an \kcode{untied} clause any other thread is allowed to continue +the task. An \kcode{if} clause with an expression that evaluates to \plc{false} +results in an \plc{undeferred} task, which instructs the runtime to suspend the generating task until the undeferred task completes its execution. By including the data environment of the generating task into the generated task with the -\code{mergeable} and \code{final} clauses, task generation overhead can be reduced. +\kcode{mergeable} and \kcode{final} clauses, task generation overhead can be reduced. A complete list of the tasking constructs and details of their clauses -can be found in the \plc{Tasking Constructs} chapter of the OpenMP Specifications, -in the \plc{OpenMP Application Programming Interface} section. +can be found in the \docref{Tasking Constructs} chapter of the OpenMP Specifications. +%in the \docref{OpenMP Application Programming Interface} section. 
%===== Examples Sections ===== diff --git a/Contributions.md b/Contributions.md index eb69dc1..5620f9e 100644 --- a/Contributions.md +++ b/Contributions.md @@ -3,7 +3,8 @@ The usual process for adding new examples, making changes or adding corrections is to submit an issue for discussion and initial evaluation of changes or example additions. When there is a consensus at a meeting about the contribution, -you will be asked to submit a pull request. +the issue will be brought forward for voting at the OpenMP Language +Committee meetings and you will be asked to submit a pull request. Of course, if your contribution is an obvious correction, clarification, or note, you may want to submit a pull request directly. @@ -13,7 +14,7 @@ may want to submit a pull request directly. ## The OpenMP Examples document The OpenMP Examples document is in LaTeX format. -Please see the master LaTeX file, `openmp-examples.tex`, for more information. +Please see the main LaTeX file, `openmp-examples.tex`, for more information. ## Maintainer @@ -22,18 +23,18 @@ For a brief revision history, see `Changes.log` in the repo. ## Git procedure - * Fork your own branch of the OpenMP [examples-internal repo](https:/github.com/openmp/examples-internal) + * Fork your own branch of the OpenMP [examples-internal repo](https://github.com/OpenMP/examples-internal) * Clone your fork locally - * If you are working on generic or old-version updates, create a branch off master. + * If you are working on generic or old-version updates, create a branch off main. * If you are working on an example for a release candidate for version #.#, create a branch off work_#.#. - 1) `git clone --branch https://github.com//examples-internal` - 2) `git checkout -b ` + 1) `git clone --branch https://github.com//examples-internal` + 2) `git checkout -b ` 3) ... `add`, `commit` 4) `git push -u origin ` 5) `make` or `make diff` will create a full-document pdf or just a pdf with differences (do this at any point). 
* `git status` and `git branch -a` are your friends * Submit an issue for your work (usually with a diff pdf), and then you will be asked to submit a pull request - * Create an issue by selecting the (issue tab)[https://github.com/openmp/examples-internal/issues] and clicking on `new issue`. + * Create an issue by selecting the [issue tab](https://github.com/OpenMP/examples-internal/issues) and clicking on `new issue`. * Use this MarkDown Cheatsheet for [issue formatting](https://wordpress.com/support/markdown-quick-reference/) * More MarkDown details are available [here](https://markdown-it.github.io) * You can cut and paste markdown formatted text in a [reader](https://dillinger.io) to see formatting effects. @@ -50,7 +51,7 @@ For a brief revision history, see `Changes.log` in the repo. * The example name may be a Section name (e.g. affinity), or a Subsection name (affinity_display) * If you are creating a new Chapter, it may be the chapter name. * New examples are usually added at the end of a Section or Subsection. Number it as the next number in the sequence numbers for examples in that Section or Subsection. - * The compiler suffix `` is `c`, `cpp`, `f`, and `f90` for C, C++ and Fortran codes. + * The compiler suffix `` is `c`, `cpp`, `f`, and `f90` for C, C++ and Fortran (fixed/free form) codes. * Insert the code in the sources directory for each chapter, and include the following metadata: * Metadata Tags for example sources: ``` @@ -62,34 +63,34 @@ For a brief revision history, see `Changes.log` in the repo. @@env: @@depend: ``` - * **name** - is the name of an example - * **type** - is the source code type, which can be translated into or from proper file extension (C:c,C++:cpp,F-fixed:f,F-free:f90) - * **operation** - indicates how the source code is treated.
Possible values are: - `view` - code for illustration only, not compilable; - `compile` - incomplete program, such as function or subroutine; - `link` - complete program, but no verification value; - `run` - complete program with verification value. - * **expect** - indicates some expected result for testing purpose. - `success` means no issue; - `ct-error` applies to the result of code compilation; - `rt-error` is for a case where compilation may be successful, but the code + * **name** + - is the name of an example + * **type** + - is the source code type, which can be translated into or from proper file extension (C:c,C++:cpp,F-fixed:f,F-free:f90) + * **operation** + - indicates how the source code is treated. Possible values are: + - `view` - code for illustration only, not compilable; + - `compile` - incomplete program, such as function or subroutine; + - `link` - complete program, but no verification value; + - `run` - complete program with verification value. + * **expect** + - indicates some expected result for testing purpose. + - `success` means no issue; + - `ct-error` applies to the result of code compilation; + - `rt-error` is for a case where compilation may be successful, but the code contains potential runtime issues (including race condition); - `unspecified` could result from a non-conforming code or is for code + - `unspecified` could result from a non-conforming code or is for code that is viewable only. - * **version** - indicates that the example uses features in a specific OpenMP version, such as "`omp_5.0`" + * **version** + - indicates that the example uses features in a specific OpenMP version, such as "`omp_5.0`". The prefix `pre_` indicates that the example uses features prior to a specific version, such as "`pre_omp_3.0`". - * **env** - specifies any environment variables needed to run the code. + * **env** + - specifies any environment variables needed to run the code. This tag is optional and can be repeated. 
- * **depend** - specifies a source code file on which the current code depends. + * **depend** + - specifies a source code file on which the current code depends. This tag is optional and can be repeated. - * For **env** and **depend**, make sure to specify + * For **env** and **depend**, make sure to specify a proper skipping number `` in the LaTeX macros described below to match with the number of `env` and `depend` tags. @@ -105,7 +106,7 @@ For a brief revision history, see `Changes.log` in the repo. * Shepherd your issue to acceptance (discussed at weekly Examples meeting and in issue comments) * When it is in a ready state, you should then submit a pull request. * It will be reviewed and voted on, and changes will be requested. - * Once the last changes are made, it will be verified and merged into an appropriate branch (either the `master` branch or a working branch). + * Once the last changes are made, it will be verified and merged into an appropriate branch (either the `main` branch or a working branch). @@ -116,6 +117,7 @@ The following describes LaTeX macros defined specifically for examples. * Source code with language h-rules * Source code without language h-rules * Language h-rules +* Macros for keywords in text description * Other macros * See `openmp.sty` for more information @@ -143,7 +145,7 @@ The following describes LaTeX macros defined specifically for examples. prefix `` with an underscore '\_' symbol in the macro. The exception is macro `\srcnexample`, for which the corresponding - source code should not contain any `@@` metadata tags. The `ext` argument + source code might not contain any `@@` metadata tags. The `ext` argument to this macro is the file extension (such as `h`, `hpp`, `inc`). The `` option to each macro allows finer-control of any additional lines @@ -156,13 +158,43 @@ The following describes LaTeX macros defined specifically for examples. 
\cppspecificstart, \cppspecificend \ccppspecificstart, \ccppspecificend \fortranspecificstart, \fortranspecificend + \topmarker{Lang} ``` + The macro `\topmarker` puts a dashed blue line floater at the top of a page for + "Lang (cont.)" where `Lang` can be `C/C++`, `C++`, `Fortran`. + +### Macros for keywords in text description +A partial list: +- `\kcode{}` - for OpenMP keywords, such as directives, clauses, environment variables, API routines. Supports direct use of '_' (underscore) and ' ' (space). +- `\scode{}` - OpenMP specifier with special chars, such as '`$`' in "`!$omp`" +- `\bcode{}` - base language keywords (such as `ASSOCIATE` in Fortran) +- `\vcode{}` - values of a keyword, such as `TRUE`, `FALSE`, `VERBOSE` +- `\plc{}` - OpenMP concept, such as ICV names; `\splc{}` - escape '_' (underscore) +- `\example{}` - example names, such as `\example{taskloop_reduction.1}` +- `\docref{}` - chapter or section name of a document, such as the spec +- `\ucode{}` - program variables, procedure names, or expressions in example codes. Supports direct use of '_' (underscore) and ' ' (space). +- `\pout{}` - program outputs + +Examples: +- `\kcode{declare reduction}` for **declare reduction** +- `\scode{!$omp}` sentinel, however, `\kcode{\#pragma omp}` +- `\kcode{map(iterator(\ucode{i=0:n}), tofrom: \ucode{p[i]})}` for **map(iterator(**_i=0:n_**), tofrom:** _p[i]_**)** +- Fortran `\bcode{IMPLICIT NONE}` statement +- The `\vcode{VERBOSE}` value for `\kcode{OMP_DISPLAY_ENV}` +- OpenMP `\plc{directives}`, the `\plc{num-threads}` ICV +- This is an example name `\example{taskloop_reduction.1}` +- `(\ucode{x,y,z})` argument for procedure `\ucode{a_proc_name}` +- structure constructor `\ucode{point($\ldots$)}` +- This is a code output `"\pout{x = 1}"` + ### Other macros ``` \cchapter{}{} \hexentry[ext1]{}[ext2]{} \hexmentry[ext1]{}[ext2]{}{} + \examplesref{} + \examplesblob{} ``` The `\cchapter` macro is used for starting a chapter with proper page spacing.
@@ -184,6 +216,9 @@ in the earlier version. `\hexentry` assumes no name change for an example in different versions; `\hexmentry` can be used to specify a prior name if it is different. +The two macros `\examplesref` and `\examplesblob` are for referencing +a specific version of, or a file in, the GitHub Examples repository. + ## License For copyright information, please see [omp_copyright.txt](omp_copyright.txt). diff --git a/Deprecated_Features.tex b/Deprecated_Features.tex index bc5bd9e..09ebe24 100644 --- a/Deprecated_Features.tex +++ b/Deprecated_Features.tex @@ -3,6 +3,33 @@ \label{sec:deprecated_features} \index{deprecated features} +\newcommand\tabpcont[1]{\multicolumn{2}{l}{\small\slshape table continued #1 page}} +\newcommand\tabpheader{\textbf{Version} & \textbf{Deprecated Feature} & + \textbf{Replacement}} +\newcommand\tabuheader{\textbf{Example Name} & \textbf{Earlier Version} & + \textbf{Feature Updated}} +\newcommand\dpftable[1]{ + \renewcommand{\arraystretch}{1.0} + \tablefirsthead{% + \hline\\[-2ex] + \tabuheader\\[2pt] + \hline\\[-2ex] + } + \tablehead{% + \tabpcont{from previous}\\[2pt] + \hline\\[-2ex] + \tabuheader\\[2pt] + \hline\\[-2ex] + } + \tabletail{% + \hline\\[-2.5ex] + \tabpcont{on next}\\ + } + \tablelasttail{\hline\\[-1ex]} + \tablecaption{Updated Examples for Features Deprecated in Version #1\label{tab:Updated Examples #1}} +} + + Deprecation of features began in OpenMP 5.0. Examples that use a deprecated feature have been updated with an equivalent replacement feature. 
@@ -14,45 +41,45 @@ \nolinenumbers \renewcommand{\arraystretch}{1.4} \tablefirsthead{% -\hline -\textbf{Version} & \textbf{Deprecated Feature} & \textbf{Replacement}\\ -\hline\\[-3.5ex] + \hline + \tabpheader\\ + \hline\\[-3.5ex] } \tablehead{% -\multicolumn{2}{l}{\small\slshape table continued from previous page}\\ -\hline -\textbf{Version} & \textbf{Deprecated Feature} & \textbf{Replacement}\\ -\hline\\[-3ex] + \tabpcont{from previous}\\ + \hline + \tabpheader\\ + \hline\\[-3ex] } \tabletail{% -\hline\\[-4ex] -\multicolumn{2}{l}{\small\slshape table continued on next page}\\ + \hline\\[-4ex] + \tabpcont{on next}\\ } \tablelasttail{\hline\\[-2ex]} \tablecaption{Deprecated Features and Their Replacements\label{tab:Deprecated Features}} \begin{supertabular}{p{0.4in} p{2.3in} p{2.2in}} -5.2 & \scode{default} clause on metadirectives - & \scode{otherwise} clause \\ -5.2 & delimited \scode{declare}~\scode{target} directive for C/C++ - & \scode{begin}~\scode{declare}~\scode{target} directive \\ -5.2 & \scode{to} clause on \scode{declare}~\scode{target} directive - & \scode{enter} clause \\ -5.2 & non-argument \scode{destroy} clause on \scode{depobj} construct - & \scode{destroy(}\plc{argument}\code{)} \\ -5.2 & \scode{allocate} construct for Fortran \scode{ALLOCATE} statements - & \scode{allocators} construct \\ -5.2 & \scode{depend} clause on \scode{ordered} construct - & \scode{doacross} clause \\ -5.2 & \scode{linear(}\plc{modifier(list): linear-step}\code{)} clause - & \scode{linear(}\plc{list:}~\scode{step(}\plc{linear-step}\scode{)}\plc{, modifier}\scode{)} clause \\ +5.2 & \kcode{default} clause on metadirectives + & \kcode{otherwise} clause \\ +5.2 & delimited \kcode{declare target} directive for C/C++ + & \kcode{begin declare target} directive \\ +5.2 & \kcode{to} clause on \kcode{declare target} directive + & \kcode{enter} clause \\ +5.2 & non-argument \kcode{destroy} clause on \kcode{depobj} construct + & \kcode{destroy(\plc{argument})} \\ +5.2 & 
\kcode{allocate} directive for Fortran \bcode{ALLOCATE} statements + & \kcode{allocators} directive \\ +5.2 & \kcode{depend} clause on \kcode{ordered} construct + & \kcode{doacross} clause \\ +5.2 & \kcode{linear(\plc{modifier(list): linear-step})} clause + & \kcode{linear(\plc{list:} step(\plc{linear-step})\plc{, modifier})} clause \\ \hline -5.1 & \scode{master} construct - & \scode{masked} construct \\ -5.1 & \scode{master} affinity policy - & \scode{primary} affinity policy \\ +5.1 & \kcode{master} construct + & \kcode{masked} construct \\ +5.1 & \kcode{master} affinity policy + & \kcode{primary} affinity policy \\ \hline -5.0 & \scode{omp_lock_hint_*} constants - & \scode{omp_sync_hint_*} constants \\[2pt] +5.0 & \kcode{omp_lock_hint_*} constants + & \kcode{omp_sync_hint_*} constants \\[2pt] \end{supertabular} \linenumbers @@ -70,60 +97,42 @@ \section{Updated Examples for Different Versions} the tables shows the version tag of the earlier version. It also shows the prior name of an example when it has been renamed. -Table~\ref{tab:Updated Examples 5.2} lists the updated examples for OpenMP 5.2 -in the Examples Document Version -\href{https://github.com/OpenMP/Examples/tree/v5.2}{5.2}. + +Table~\ref{tab:Updated Examples 5.2} lists the updated examples for +features deprecated in OpenMP 5.2 +in the Examples Document Version \examplesref{5.2}. The \emph{Earlier Version} column of the table lists the earlier version tags of the examples that can be found in -the Examples Document Version -\href{https://github.com/OpenMP/Examples/tree/v5.1}{5.1}. 
- -\index{clauses!default@\code{default}} -\index{clauses!otherwise@\code{otherwise}} -\index{clauses!to@\code{to}} -\index{clauses!enter@\code{enter}} -\index{clauses!depend@\code{depend}} -\index{clauses!doacross@\code{doacross}} -\index{clauses!linear@\code{linear}} -\index{clauses!destroy@\code{destroy}} -\index{default clause@\code{default} clause} -\index{otherwise clause@\code{otherwise} clause} -\index{to clause@\code{to} clause} -\index{enter clause@\code{enter} clause} -\index{depend clause@\code{depend} clause} -\index{doacross clause@\code{doacross} clause} -\index{linear clause@\code{linear} clause} -\index{destroy clause@\code{destroy} clause} -\index{directives!begin declare target@\code{begin}~\code{declare}~\code{target}} -\index{begin declare target directive@\code{begin}~\code{declare}~\code{target} directive} -\index{allocate construct@\code{allocate} construct} -\index{allocators construct@\code{allocators} construct} +the Examples Document Version \examplesref{5.1}. 
+ +\index{clauses!default@\kcode{default}} +\index{clauses!otherwise@\kcode{otherwise}} +\index{clauses!to@\kcode{to}} +\index{clauses!enter@\kcode{enter}} +\index{clauses!depend@\kcode{depend}} +\index{clauses!doacross@\kcode{doacross}} +\index{clauses!linear@\kcode{linear}} +\index{clauses!destroy@\kcode{destroy}} +\index{default clause@\kcode{default} clause} +\index{otherwise clause@\kcode{otherwise} clause} +\index{to clause@\kcode{to} clause} +\index{enter clause@\kcode{enter} clause} +\index{depend clause@\kcode{depend} clause} +\index{doacross clause@\kcode{doacross} clause} +\index{linear clause@\kcode{linear} clause} +\index{destroy clause@\kcode{destroy} clause} +\index{directives!begin declare target@\kcode{begin declare target}} +\index{begin declare target directive@\kcode{begin declare target} directive} +\index{allocate directive@\kcode{allocate} directive} +\index{allocators directive@\kcode{allocators} directive} \nolinenumbers -\renewcommand{\arraystretch}{1.0} -\tablefirsthead{% -\hline\\[-2ex] -\textbf{Example Name} & \textbf{Earlier Version} & \textbf{Feature Updated} -\\[2pt] -\hline\\[-2ex] -} -\tablehead{% -\multicolumn{2}{l}{\small\slshape table continued from previous page}\\[2pt] -\hline\\[-2ex] -\textbf{Example Name} & \textbf{Earlier Version} & \textbf{Feature Updated}\\[2pt] -\hline\\[-2ex] -} -\tabletail{% -\hline\\[-2.5ex] -\multicolumn{2}{l}{\small\slshape table continued on next page}\\ -} -\tablelasttail{\hline\\[-1ex]} -\tablecaption{Updated Examples for Version 5.2\label{tab:Updated Examples 5.2}} +\dpftable{5.2} \begin{supertabular}{p{1.7in} p{1.2in} p{2.1in}} \hexentry{error.1}[f90]{5.1} & - \scode{default} clause on metadirectives \\ + \kcode{default} clause on metadirectives \\ \hexentry{metadirective.1}[f90]{5.0} & - replaced with \scode{otherwise} clause \\ + replaced with \kcode{otherwise} clause \\ \hexentry{metadirective.2}[f90]{5.0} & \\ \hexentry{metadirective.3}[f90]{5.0} & \\ \hexentry{metadirective.4}[f90]{5.1} & 
\\ @@ -131,19 +140,19 @@ \section{Updated Examples for Different Versions} \hexentry{target_ptr_map.5}[f90]{5.1} & \\[2pt] \hline\\[-2ex] \hexentry[f90]{array_shaping.1}{5.0} & - \scode{to} clause on \scode{declare} \scode{target} \\ + \kcode{to} clause on \kcode{declare target} \\ \hexentry{target_reverse_offload.7}{5.0} & - directive replaced with \scode{enter} clause \\ + directive replaced with \kcode{enter} clause \\ \hexentry{target_task_reduction.1}[f90]{5.1} & \\ \hexentry{target_task_reduction.2a}[f90]{5.0} & \\ \hexentry{target_task_reduction.2b}[f90]{5.1} &\\[2pt] \hline\\[-2ex] \hexentry{array_shaping.1}{5.0} & - delimited \scode{declare}~\scode{target} \\ + delimited \kcode{declare target} \\ \hexentry{async_target.1}{4.0} & directive replaced with \\ \hexentry{async_target.2}{4.0} & - \scode{begin}~\scode{declare}~\scode{target} \\ + \kcode{begin declare target} \\ \hexentry{declare_target.1}{4.0} & directive for C/C++ \\ \hexentry[cpp]{declare_target.2c}{4.0} & \\ @@ -163,72 +172,53 @@ \section{Updated Examples for Different Versions} \hexentry{target_struct_map.4}{5.0} & \\[2pt] \hline\\[-2ex] \hexentry{doacross.1}[f90]{4.5} & - \scode{depend} clause on \scode{ordered} \\ + \kcode{depend} clause on \kcode{ordered} \\ \hexentry{doacross.2}[f90]{4.5} & - construct replaced with \scode{doacross} \\ + construct replaced with \kcode{doacross} \\ \hexentry{doacross.3}[f90]{4.5} & clause \\ \hexentry{doacross.4}[f90]{4.5} & \\[2pt] \hline\\[-2ex] \hexentry[cpp]{linear_modifier.1}[f90]{4.5} & - modifier syntax change for \scode{linear} \\ + modifier syntax change for \kcode{linear} \\ \hexentry[cpp]{linear_modifier.2}[f90]{4.5} & - clause on \scode{declare}~\scode{simd} directive \\ + clause on \kcode{declare simd} directive \\ \hexentry{linear_modifier.3}[f90]{4.5} & \\[2pt] \hline\\[-2ex] \hexentry[f90]{allocators.1}{5.0} & - \scode{allocate} construct replaced with \scode{allocators} construct - for Fortran allocate statements \\[2pt] + \kcode{allocate} 
directive replaced with \kcode{allocators} directive + for Fortran \bcode{allocate} statements \\[2pt] \hline\\[-2ex] \hexentry{depobj.1}[f90]{5.0} & - argument added to \scode{destroy} clause on \scode{depobj} + argument added to \kcode{destroy} clause on \kcode{depobj} construct \\[2pt] \end{supertabular} \linenumbers -Table~\ref{tab:Updated Examples 5.1} lists the updated examples for OpenMP 5.1 -in the Examples Document Version -\href{https://github.com/OpenMP/Examples/tree/v5.1}{5.1}. +Table~\ref{tab:Updated Examples 5.1} lists the updated examples for +features deprecated in OpenMP 5.1 +in the Examples Document Version \examplesref{5.1}. The \emph{Earlier Version} column of the table lists the earlier version tags and prior names of the examples that can be found in -the Examples Document Version -\href{https://github.com/OpenMP/Examples/tree/v5.0.1}{5.0.1}. +the Examples Document Version \examplesref{5.0.1}. -\index{affinity!master policy@\code{master} policy} -\index{affinity!primary policy@\code{primary} policy} -\index{constructs!master@\code{master}} -\index{constructs!masked@\code{masked}} -\index{master construct@\code{master} construct} -\index{masked construct@\code{masked} construct} +\index{affinity!master policy@\kcode{master} policy} +\index{affinity!primary policy@\kcode{primary} policy} +\index{constructs!master@\kcode{master}} +\index{constructs!masked@\kcode{masked}} +\index{master construct@\kcode{master} construct} +\index{masked construct@\kcode{masked} construct} \nolinenumbers -\renewcommand{\arraystretch}{1.0} -\tablefirsthead{% -\hline\\[-2ex] -\textbf{Example Name} & \textbf{Earlier Version} & \textbf{Feature Updated} -\\[2pt] -\hline\\[-2ex] -} -\tablehead{% -\multicolumn{2}{l}{\small\slshape table continued from previous page}\\[2pt] -\hline\\[-2ex] -\textbf{Example Name} & \textbf{Earlier Version} & \textbf{Feature Updated}\\[2pt] -\hline\\[-2ex] -} -\tabletail{% -\hline\\[-2.5ex] -\multicolumn{2}{l}{\small\slshape table continued 
on next page}\\ -} -\tablelasttail{\hline\\[-1ex]} -\tablecaption{Updated Examples for Version 5.1\label{tab:Updated Examples 5.1}} +\dpftable{5.1} \begin{supertabular}{p{1.8in} p{1.4in} p{1.8in}} \hexentry{affinity.5}[f]{4.0} & - \scode{master} affinity policy replaced with \scode{primary} policy \\[2pt] + \kcode{master} affinity policy replaced with \kcode{primary} policy \\[2pt] \hline\\[-2ex] \hexentry{async_target.3}[f90]{5.0} & - \scode{master} construct replaced \\ + \kcode{master} construct replaced \\ \hexentry{cancellation.2}[f90]{4.0} & - with \scode{masked} construct \\ + with \kcode{masked} construct \\ \hexentry{copyprivate.2}[f]{3.0} & \\ \hexentry[f]{fort_sa_private.5}{3.0} & \\ \hexentry{lock_owner.1}[f]{3.0} & \\ @@ -242,39 +232,20 @@ \section{Updated Examples for Different Versions} \end{supertabular} \linenumbers -Table~\ref{tab:Updated Examples 5.0} lists the updated examples for OpenMP 5.0 -in the Examples Document Version -\href{https://github.com/OpenMP/Examples/tree/v5.1}{5.1}. +Table~\ref{tab:Updated Examples 5.0} lists the updated examples for +features deprecated in OpenMP 5.0 +in the Examples Document Version \examplesref{5.1}. The \emph{Earlier Version} column of the table lists the earlier version tags of the examples that can be found in -the Examples Document Version -\href{https://github.com/OpenMP/Examples/tree/v5.0.1}{5.0.1}. +the Examples Document Version \examplesref{5.0.1}. 
\nolinenumbers -\renewcommand{\arraystretch}{1.0} -\tablefirsthead{% -\hline\\[-2ex] -\textbf{Example Name} & \textbf{Earlier Version} & \textbf{Feature Updated} -\\[2pt] -\hline\\[-2ex] -} -\tablehead{% -\multicolumn{2}{l}{\small\slshape table continued from previous page}\\[2pt] -\hline\\[-2ex] -\textbf{Example Name} & \textbf{Earlier Version} & \textbf{Feature Updated}\\[2pt] -\hline\\[-2ex] -} -\tabletail{% -\hline\\[-2.5ex] -\multicolumn{2}{l}{\small\slshape table continued on next page}\\ -} -\tablelasttail{\hline\\[-1ex]} -\tablecaption{Updated Examples for Version 5.0\label{tab:Updated Examples 5.0}} +\dpftable{5.0} \begin{supertabular}{p{1.6in} p{1.3in} p{2.1in}} \hexentry{critical.2}[f]{4.5} & - \scode{omp_lock_hint_*} constants \\ + \kcode{omp_lock_hint_*} constants \\ \hexentry[cpp]{init_lock_with_hint.1}[f]{4.5} & - replaced with \scode{omp_sync_hint_*} constants \\[2pt] + replaced with \kcode{omp_sync_hint_*} constants \\[2pt] \end{supertabular} \linenumbers diff --git a/Foreword_Chapt.tex b/Foreword_Chapt.tex index 3298858..6499f31 100644 --- a/Foreword_Chapt.tex +++ b/Foreword_Chapt.tex @@ -7,7 +7,8 @@ \chapter*{Foreword} Text describing an example with a \PVER\ feature specifically states that the feature support begins in the OpenMP \PVER\ Specification. Also, -an \code{\small omp\_\PVER} keyword is included in the metadata of the source code. +an \kcode{\small{}omp_\PVER} keyword is included in the metadata of the source code. + These distinctions are presented to remind readers that a \PVER\ compliant OpenMP implementation is necessary to use these features in codes. 
@@ -28,6 +29,13 @@ \chapter*{Foreword} \bigskip Examples Subcommittee Co-chairs: \smallskip\linebreak Henry Jin (\textsc{NASA} Ames Research Center) \linebreak -Swaroop Pophale (Oak Ridge National Labortory) +Swaroop Pophale (Oak Ridge National Laboratory) + +\bigskip +\bigskip +Past Examples Subcommittee Co-chairs: +\begin{itemize} +\item Kent Milfeld (2014 - 2022) +\end{itemize} diff --git a/History.tex b/History.tex index ccf19ae..30bc30c 100644 --- a/History.tex +++ b/History.tex @@ -1,6 +1,73 @@ \cchapter{Document Revision History}{history} \label{chap:history} +%===================================== +\section{Changes from 5.2.1 to 5.2.2} +\label{sec:history_521_to_522} + +\begin{itemize} +\item To improve the style of the document, a set of macros was introduced + and consistently used for language keywords, names, concepts, and user codes + in the text description of the document. Refer to the content of + \examplesblob{v5.2.2/Contributions.md} + for details. + +\item Added the following examples: +\begin{itemize} + \item Orphaned and nested \kcode{loop} constructs (\specref{sec:loop}) + \item \kcode{all} variable category for the \kcode{defaultmap} clause + (\specref{sec:defaultmap}) + \item \kcode{target update} construct using a custom mapper + (\specref{subsec:target_update_mapper}) + \item \kcode{indirect} clause for indirect procedure calls in a + \kcode{target} region (\specref{subsec:indirect}) + \item \kcode{omp_target_memcpy_async} routine with depend object + (\specref{subsec:target_mem_and_device_ptrs}) + \item Synchronization hint for atomic operation (\specref{sec:atomic_hint}) + \item Implication of passing shared variable to a procedure + in Fortran (\specref{sec:fort_shared_var}) + \item Assumption directives for providing additional information + about program properties (\specref{sec:assumption}) + \item Mapping behavior of scalars, pointers, references (C++) and associate names + (Fortran) when unified shared memory is required + 
(\specref{sec:requires}) + \item \kcode{begin declare variant} paired with \kcode{end declare variant} + example to show use of nested declare variant + directives (\specref{subsec:declare_variant}) + \item Explicit scoring in context selectors + (\specref{subsec:context_selector_scoring}) +\end{itemize} + +\item Miscellaneous changes: +\begin{itemize} + \item Included a general statement in Introduction about the number of + threads used throughout the examples document (\specref{sec:examples}) + \item Clarified the mapping of virtual functions in \kcode{target} regions + (\specref{sec:virtual_functions}) + \item Added missing \kcode{declare target} directive for procedures + called inside \kcode{target} region in \example{Examples} + \example{declare_mapper.1.f90} (\specref{sec:declare_mapper}), + \example{target_reduction.*.f90} (\specref{subsec:target_reduction}), + and \example{target_task_reduction.*.f90} + (\specref{subsec:target_task_reduction}) + \item Added missing \kcode{end target} directive in + \example{Example declare_mapper.3.f90} + (\specref{sec:declare_mapper}) + \item Removed example for \kcode{flush} without a list from Synchronization + since the example is confusing and the use of \kcode{flush} is already + covered in other examples + (\specref{chap:synchronization}) + \item \docref{declare variant Directive} and \docref{Metadirective} sections were moved to + subsections in the new \docref{Context-based Variant Selection} section, + with a section introduction on context selectors. 
+ (\specref{sec:context_based_variants}) + \item Fixed a typo (`\kcode{for}' $\rightarrow$ `\kcode{do}') in + \example{Example metadirective.4.f90} + (\specref{subsec:metadirective}) +\end{itemize} + +\end{itemize} + %===================================== \section{Changes from 5.2 to 5.2.1} \label{sec:history_52_to_521} @@ -9,44 +76,57 @@ \section{Changes from 5.2 to 5.2.1} \item General changes: \begin{itemize} \item Updated source metadata tags for all examples to use an improved form - (see \href{https://github.com/OpenMP/Examples/blob/v\VER/Contributions.md}% - {https://github.com/OpenMP/Examples/blob/v\VER/Contributions.md}) + (see \examplesblob{v5.2.1/Contributions.md}) \item Explicitly included the version tag \verlabel[pre\_]{3.0} in those examples that did not contain a version tag previously \end{itemize} \item Added the following examples for the 5.2 features: \begin{itemize} - \item \scode{uses_allocators} clause for the use of allocators in - \code{target} regions (\specref{sec:allocators}) + \item \kcode{uses_allocators} clause for the use of allocators in + \kcode{target} regions (\specref{sec:allocators}) \end{itemize} \item Added the following examples for the 5.1 features: \begin{itemize} - \item The \scode{inoutset} dependence type (\specref{subsec:task_concurrent_depend}) + \item The \kcode{inoutset} dependence type (\specref{subsec:task_concurrent_depend}) \item Atomic compare and capture (\specref{sec:cas}) \end{itemize} \item Added the following examples for the 5.0 features: \begin{itemize} - \item \code{declare}~\code{target} directive with \scode{device_type(nohost)} + \item \kcode{declare target} directive with \kcode{device_type(nohost)} clause (\specref{subsec:declare_target_device_type}) - \item \scode{omp_pause_resource} and \scode{omp_pause_resource_all} + \item \kcode{omp_pause_resource} and \kcode{omp_pause_resource_all} routines (\specref{sec:pause_resource}) \end{itemize} \item Miscellaneous fixes: \begin{itemize} -\item Fixed an 
inconsistent use of mapper in \splc{Example target_mapper.3.f90} - (\specref{sec:declare_mapper}) -\item Fixed mismatched argument list in \splc{Example fort_sa_private.5.f} - (\specref{sec:fort_sa_private}) -\item Moved the placement of \code{declare}~\code{target}~\code{enter} - directive after function declaration - (\specref{subsec:target_task_reduction}) -\item Fixed an incorrect use of \scode{omp_in_parallel} routine in - \splc{Example metadirective.4} - (\specref{sec:metadirective}) -\item Fixed an incorrect value for \code{at} clause - (\specref{subsec:error}) + \item Cast to implementation-defined enum type \kcode{omp_event_handle_t} + now uses \bcode{uintptr_t} (not \bcode{void *}) in + \example{Example task_detach.2.c} + (\specref{sec:task_detachment}) + \item Moved Fortran \kcode{requires} directive into program main (\ucode{rev_off}), + the program unit, in \example{Example target_reverse_offload.7.f90} + (\specref{subsec:target_reverse_offload}) + \item Fixed an inconsistent use of mapper in \example{Example target_mapper.3.f90} + (\specref{sec:declare_mapper}) + \item Added a missing semicolon at end of \ucode{XOR1} class definition in + \example{Example declare_target.2a.cpp} + (\specref{subsec:declare_target_class}) + \item Fixed the placement of \kcode{declare simd} directive in + \example{Examples linear_modifier.*.f90} (\specref{sec:linear_modifier}) + and added a general statement about where a Fortran declarative + directive can appear (\specref{chap:directive_syntax}) + \item Fixed mismatched argument list in \example{Example fort_sa_private.5.f} + (\specref{sec:fort_sa_private}) + \item Moved the placement of \kcode{declare target enter} + directive after function declaration + (\specref{subsec:target_task_reduction}) + \item Fixed an incorrect use of \kcode{omp_in_parallel} routine in + \example{Example metadirective.4} + (\specref{subsec:metadirective}) + \item Fixed an incorrect value for \kcode{at} clause + (\specref{subsec:error}) 
\end{itemize} \end{itemize} @@ -75,7 +155,7 @@ \section{Changes from 5.1 to 5.2} \begin{itemize} \item Mapping class objects with virtual functions (\specref{sec:virtual_functions}) - \item \scode{allocators} construct for Fortran \code{allocate} statement + \item \kcode{allocators} construct for Fortran \bcode{allocate} statement (\specref{sec:allocators}) \item Behavior of reallocation of variables through OpenMP allocator in Fortran (\specref{sec:allocators}) @@ -83,36 +163,36 @@ \section{Changes from 5.1 to 5.2} \item Added the following examples for the 5.1 features: \begin{itemize} - \item Clarification of optional \code{end} directive for strictly structured + \item Clarification of optional \kcode{end} directive for strictly structured block in Fortran (\specref{sec:fortran_free_format_comments}) - \item \scode{filter} clause on \scode{masked} construct (\specref{sec:masked}) - \item \scode{omp_all_memory} reserved locator for specifying task dependences + \item \kcode{filter} clause on \kcode{masked} construct (\specref{sec:masked}) + \item \kcode{omp_all_memory} reserved locator for specifying task dependences (\specref{subsec:depend_undefer_task}) - \item Behavior of Fortran allocatable variables in \code{target} regions + \item Behavior of Fortran allocatable variables in \kcode{target} regions (\specref{sec:fort_allocatable_array_mapping}) \item Device memory routines in Fortran (\specref{subsec:target_mem_and_device_ptrs}) - \item Partial tiles from \scode{tile} construct + \item Partial tiles from \kcode{tile} construct (\specref{sec:incomplete_tiles}) - \item Fortran associate names and selectors in \code{target} region + \item Fortran associate names and selectors in \kcode{target} region (\specref{sec:associate_target}) - \item \scode{allocate} directive for variable declarations and - \scode{allocate} clause on \scode{task} constructs + \item \kcode{allocate} directive for variable declarations and + \kcode{allocate} clause on \kcode{task} 
constructs (\specref{sec:allocators}) - \item Controlling concurrency and reproducibility with \code{order} clause + \item Controlling concurrency and reproducibility with \kcode{order} clause (\specref{sec:reproducible_modifier}) \end{itemize} \item Added other examples: \begin{itemize} - \item Using lambda expressions with \scode{target} constructs + \item Using lambda expressions with \kcode{target} constructs (\specref{sec:lambda_expressions}) \item Target memory and device pointer routines (\specref{subsec:target_mem_and_device_ptrs}) \item Examples to illustrate the ordering properties of the \plc{flush} operation (\specref{sec:mem_model}) - \item User selector in the \code{metadirective} directive - (\specref{sec:metadirective}) + \item User selector in the \kcode{metadirective} directive + (\specref{subsec:metadirective}) \end{itemize} \end{itemize} @@ -124,11 +204,11 @@ \section{Changes from 5.0.1 to 5.1} \begin{itemize} \item General changes: \begin{itemize} - \item Replaced \code{master} construct example with equivalent \code{masked} construct example (\specref{sec:masked}) + \item Replaced \kcode{master} construct example with equivalent \kcode{masked} construct example (\specref{sec:masked}) \item Primary thread is now used to describe thread number 0 in the current team - \item \code{primary} thread affinity policy is now used to specify that every + \item \kcode{primary} thread affinity policy is now used to specify that every thread in the team is assigned to the same place as the primary thread (\specref{subsec:affinity_primary}) - \item The \scode{omp_lock_hint_*} constants have been renamed \scode{omp_sync_hint_*} (\specref{sec:critical}, \specref{sec:locks}) + \item The \kcode{omp_lock_hint_*} constants have been renamed \kcode{omp_sync_hint_*} (\specref{sec:critical}, \specref{sec:locks}) \end{itemize} \item Added the following new chapters: @@ -143,34 +223,34 @@ \section{Changes from 5.0.1 to 5.1} \begin{itemize} \item OpenMP directives in 
C++ \plc{attribute} specifiers (\specref{sec:attributes}) - \item Directive syntax adjustment to allow Fortran \code{BLOCK} ... - \code{END}~\code{BLOCK} as a structured block + \item Directive syntax adjustment to allow Fortran \bcode{BLOCK} ... + \bcode{END BLOCK} as a structured block (\specref{sec:fortran_free_format_comments}) - \item \code{omp\_target\_is\_accessible} API routine + \item \kcode{omp_target_is_accessible} API routine (\specref{sec:pointer_mapping}) - \item Fortran allocatable array mapping in \code{target} regions (\specref{sec:fort_allocatable_array_mapping}) - \item \code{begin}~\code{declare}~\code{target} (with - \code{end}~\code{declare}~\code{target}) directive + \item Fortran allocatable array mapping in \kcode{target} regions (\specref{sec:fort_allocatable_array_mapping}) + \item \kcode{begin declare target} (with + \kcode{end declare target}) directive (\specref{subsec:declare_target_class}) - \item \code{tile} construct (\specref{sec:tile}) - \item \code{unroll} construct (\specref{sec:unroll}) - \item Reduction with the \code{scope} construct + \item \kcode{tile} construct (\specref{sec:tile}) + \item \kcode{unroll} construct (\specref{sec:unroll}) + \item Reduction with the \kcode{scope} construct (\specref{subsec:reduction_scope}) - \item \code{metadirective} directive with dynamic \code{condition} selector - (\specref{sec:metadirective}) - \item \code{interop} construct (\specref{sec:interop}) - \item Environment display with the \scode{omp_display_env} routine + \item \kcode{metadirective} directive with dynamic \kcode{condition} selector + (\specref{subsec:metadirective}) + \item \kcode{interop} construct (\specref{sec:interop}) + \item Environment display with the \kcode{omp_display_env} routine (\specref{subsec:display_env}) - \item \code{error} directive (\specref{subsec:error}) + \item \kcode{error} directive (\specref{subsec:error}) \end{itemize} \item Included additional examples for the 5.0 features: \begin{itemize} - 
\item \code{collapse} clause for non-rectangular loop nest + \item \kcode{collapse} clause for non-rectangular loop nest (\specref{sec:collapse}) - \item \code{detach} clause for tasks (\specref{sec:task_detachment}) + \item \kcode{detach} clause for tasks (\specref{sec:task_detachment}) \item Pointer attachment for a structure member (\specref{sec:structure_mapping}) - \item Host and device pointer association with the \scode{omp_target_associate_ptr} routine (\specref{sec:target_associate_ptr}) + \item Host and device pointer association with the \kcode{omp_target_associate_ptr} routine (\specref{sec:target_associate_ptr}) \item Sample code on activating the tool interface (\specref{sec:ompt_start}) @@ -178,7 +258,7 @@ \section{Changes from 5.0.1 to 5.1} \item Added other examples: \begin{itemize} - \item The \scode{omp_get_wtime} routine (\specref{subsec:get_wtime}) + \item The \kcode{omp_get_wtime} routine (\specref{subsec:get_wtime}) \end{itemize} \end{itemize} @@ -188,22 +268,22 @@ \section{Changes from 5.0.0 to 5.0.1} \label{sec:history_50_to_501} \begin{itemize} -\item Added version tags (\code{\small{}omp\_}\plc{x.y}) in example labels +\item Added version tags \verlabel{\plc{x.y}} in example labels and the corresponding source codes for all examples that feature OpenMP 3.0 and later. 
\item Included additional examples for the 5.0 features: \begin{itemize} -\item Extension to the \code{defaultmap} clause +\item Extension to the \kcode{defaultmap} clause (\specref{sec:defaultmap}) -\item Transferring noncontiguous data with the \code{target}~\code{update} directive in Fortran (\specref{sec:array-shaping}) -\item \code{conditional} modifier for the \code{lastprivate} clause (\specref{sec:lastprivate}) -\item \code{task} modifier for the \code{reduction} clause (\specref{subsec:task_reduction}) +\item Transferring noncontiguous data with the \kcode{target update} directive in Fortran (\specref{sec:array-shaping}) +\item \kcode{conditional} modifier for the \kcode{lastprivate} clause (\specref{sec:lastprivate}) +\item \kcode{task} modifier for the \kcode{reduction} clause (\specref{subsec:task_reduction}) \item Reduction on combined target constructs (\specref{subsec:target_reduction}) -\item Task reduction with \code{target} constructs +\item Task reduction with \kcode{target} constructs (\specref{subsec:target_task_reduction}) -\item \code{scan} directive for returning the \emph{prefix sum} of a reduction (\specref{sec:scan}) +\item \kcode{scan} directive for returning the \emph{prefix sum} of a reduction (\specref{sec:scan}) \end{itemize} @@ -212,7 +292,7 @@ \section{Changes from 5.0.0 to 5.0.1} \begin{itemize} \item Dependence for undeferred tasks (\specref{subsec:depend_undefer_task}) -\item \code{ref}, \code{val}, \code{uval} modifiers for \code{linear} clause (\specref{sec:linear_modifier}) +\item \kcode{ref}, \kcode{val}, \kcode{uval} modifiers for \kcode{linear} clause (\specref{sec:linear_modifier}) \end{itemize} @@ -231,37 +311,39 @@ \section{Changes from 4.5.0 to 5.0.0} \item Added the following examples for the 5.0 features: \begin{itemize} -\item Extended \code{teams} construct for host execution (\specref{sec:host_teams}) -\item \code{loop} and \code{teams}~\code{loop} constructs specify loop iterations that can execute concurrently 
+\item Extended \kcode{teams} construct for host execution (\specref{sec:host_teams}) +\item \kcode{loop} and \kcode{teams loop} constructs specify loop iterations that can execute concurrently (\specref{sec:loop}) -\item Task data affinity is indicated by \code{affinity} clause of \code{task} construct +\item Task data affinity is indicated by \kcode{affinity} clause of \kcode{task} construct (\specref{sec: task_affinity}) -\item Display thread affinity with \code{OMP\_DISPLAY\_AFFINITY} environment variable or \code{omp\_display\_affinity()} API routine +\item Display thread affinity with \kcode{OMP_DISPLAY_AFFINITY} environment variable or \kcode{omp_display_affinity()} API routine (\specref{sec:affinity_display}) -\item \code{taskwait} with dependences (\specref{subsec:taskwait_depend}) -\item \code{mutexinoutset} task dependences (\specref{subsec:task_dep_mutexinoutset}) -\item Multidependence Iterators (in \code{depend} clauses) (\specref{subsec:depend_iterator}) -\item Combined constructs: \code{parallel}~\code{master}~\code{taskloop} and \code{parallel}~\code{master}~\code{taskloop}~\code{simd} +\item \kcode{taskwait} with dependences (\specref{subsec:taskwait_depend}) +\item \kcode{mutexinoutset} task dependences (\specref{subsec:task_dep_mutexinoutset}) +\item Multidependence Iterators (in \kcode{depend} clauses) (\specref{subsec:depend_iterator}) +\item Combined constructs: \kcode{parallel master taskloop} and \kcode{parallel master taskloop simd} (\specref{sec:parallel_masked_taskloop}) -\item Reverse Offload through \plc{ancestor} modifier of \code{device} clause. (\specref{subsec:target_reverse_offload}) +\item Reverse Offload through \kcode{ancestor} modifier of \kcode{device} clause. 
(\specref{subsec:target_reverse_offload}) \item Pointer Mapping - behavior of mapped pointers (\specref{sec:pointer_mapping}) %Example_target_ptr_map* \item Structure Mapping - behavior of mapped structures (\specref{sec:structure_mapping}) %Examples_target_structure_mapping.tex target_struct_map* \item Array Shaping with the \plc{shape-operator} (\specref{sec:array-shaping}) -\item The \code{declare}~\code{mapper} directive (\specref{sec:declare_mapper}) +\item The \kcode{declare mapper} directive (\specref{sec:declare_mapper}) \item Acquire and Release Semantics Synchronization: Memory ordering - clauses \code{acquire}, \code{release}, and \code{acq\_rel} were added + clauses \kcode{acquire}, \kcode{release}, and \kcode{acq_rel} were added to flush and atomic constructs (\specref{sec:acquire_and_release_semantics}) -\item \code{depobj} construct provides dependence objects for subsequent use in \code{depend} clauses +\item \kcode{depobj} construct provides dependence objects for subsequent use in \kcode{depend} clauses (\specref{sec:depobj}) -\item \code{reduction} clause for \code{task} construct (\specref{subsec:task_reduction}) -\item \code{reduction} clause for \code{taskloop} construct (\specref{subsec:taskloop_reduction}) -\item \code{reduction} clause for \code{taskloop}~\code{simd} construct (\specref{subsec:taskloop_reduction}) +\item \kcode{reduction} clause for \kcode{task} construct (\specref{subsec:task_reduction}) +\item \kcode{reduction} clause for \kcode{taskloop} construct (\specref{subsec:taskloop_reduction}) +\item \kcode{reduction} clause for \kcode{taskloop simd} construct (\specref{subsec:taskloop_reduction}) \item Memory Allocators for making OpenMP memory requests with traits (\specref{sec:allocators}) -\item \code{requires} directive specifies required features of implementation (\specref{sec:requires}) -\item \code{declare}~\code{variant} directive - for function variants (\specref{sec:declare_variant}) -\item \code{metadirective} 
directive - for directive variants (\specref{sec:metadirective}) -\item \code{OMP\_TARGET\_OFFLOAD} Environment Variable - controls offload behavior (\specref{sec:target_offload}) +\item \kcode{requires} directive specifies required features of implementation (\specref{sec:requires}) +\item \kcode{declare variant} directive - for function variants +(\specref{subsec:declare_variant}) +\item \kcode{metadirective} directive - for directive variants +(\specref{subsec:metadirective}) +\item \kcode{OMP_TARGET_OFFLOAD} Environment Variable - controls offload behavior (\specref{sec:target_offload}) \end{itemize} \item Included the following additional examples for the 4.x features: @@ -278,22 +360,22 @@ \section{Changes from 4.0.2 to 4.5.0} \begin{itemize} \item Reorganized into chapters of major topics \item Included file extensions in example labels to indicate source type -\item Applied the explicit \code{map(tofrom)} for scalar variables +\item Applied the explicit \kcode{map(tofrom)} for scalar variables in a number of examples to comply with the change of the default behavior for scalar variables from - \code{map(tofrom)} to \code{firstprivate} in the 4.5 specification + \kcode{map(tofrom)} to \kcode{firstprivate} in the 4.5 specification \item Added the following new examples: \begin{itemize} -\item \code{linear} clause in loop constructs (\specref{sec:linear_in_loop}) -\item \code{priority} clause for \code{task} construct (\specref{sec:task_priority}) -\item \code{taskloop} construct (\specref{sec:taskloop}) -\item \plc{directive-name} modifier in multiple \code{if} clauses on +\item \kcode{linear} clause in loop constructs (\specref{sec:linear_in_loop}) +\item \kcode{priority} clause for \kcode{task} construct (\specref{sec:task_priority}) +\item \kcode{taskloop} construct (\specref{sec:taskloop}) +\item \plc{directive-name} modifier in multiple \kcode{if} clauses on a combined construct (\specref{subsec:target_if}) \item unstructured data mapping 
(\specref{sec:target_enter_exit_data}) -\item \code{link} clause for \code{declare}~\code{target} directive +\item \kcode{link} clause for \kcode{declare target} directive (\specref{subsec:declare_target_link}) -\item asynchronous target execution with \code{nowait} clause (\specref{sec:async_target_exec_depend}) +\item asynchronous target execution with \kcode{nowait} clause (\specref{sec:async_target_exec_depend}) \item device memory routines and device pointers (\specref{subsec:target_mem_and_device_ptrs}) \item doacross loop nest (\specref{sec:doacross}) \item locks with hints (\specref{sec:locks}) @@ -316,8 +398,8 @@ \section{Changes from 4.0 to 4.0.1} Added the following new examples: \begin{itemize} -\item the \code{proc\_bind} clause (\specref{sec:affinity}) -\item the \code{taskgroup} construct (\specref{sec:taskgroup}) +\item the \kcode{proc_bind} clause (\specref{sec:affinity}) +\item the \kcode{taskgroup} construct (\specref{sec:taskgroup}) \end{itemize} \section{Changes from 3.1 to 4.0} @@ -329,13 +411,13 @@ \section{Changes from 3.1 to 4.0} \begin{itemize} \item task dependences (\specref{sec:task_depend}) -\item \code{target} construct (\specref{sec:target}) +\item \kcode{target} construct (\specref{sec:target}) \item array sections in device constructs (\specref{sec:array_sections}) -\item \code{target}~\code{data} construct (\specref{sec:target_data}) -\item \code{target}~\code{update} construct (\specref{sec:target_update}) -\item \code{declare}~\code{target} directive (\specref{sec:declare_target}) -\item \code{teams} constructs (\specref{sec:teams}) -\item asynchronous execution of a \code{target} region using tasks (\specref{subsec:async_target_with_tasks}) +\item \kcode{target data} construct (\specref{sec:target_data}) +\item \kcode{target update} construct (\specref{sec:target_update}) +\item \kcode{declare target} directive (\specref{sec:declare_target}) +\item \kcode{teams} constructs (\specref{sec:teams}) +\item asynchronous execution of 
a \kcode{target} region using tasks (\specref{subsec:async_target_with_tasks}) \item device runtime routines (\specref{sec:device}) \item Fortran ASSOCIATE construct (\specref{sec:associate}) \item cancellation constructs (\specref{sec:cancellation}) diff --git a/Makefile b/Makefile index 4c74dd0..5944af8 100644 --- a/Makefile +++ b/Makefile @@ -1,13 +1,17 @@ # Makefile for the OpenMP Examples document in LaTex format. # For more information, see the main document, openmp-examples.tex. -version=5.2.1 +include versioninfo + default: openmp-examples.pdf diff: openmp-diff-abridged.pdf book: BOOK_BUILD="\\\\def\\\\bookbuild{1}" +book: VERSIONSTR="$(version_date)" book: clean openmp-examples.pdf mv openmp-examples-${version}.pdf openmp-examples-${version}-book.pdf +release: VERSIONSTR="$(version_date)" +release: clean openmp-examples.pdf CHAPTERS=Title_Page.tex \ Foreword_Chapt.tex \ @@ -37,6 +41,8 @@ LATEXDCMD=$(LATEXCMD) -draftmode # check for branches names with "name_XXX" DIFF_TICKET_ID=$(shell git rev-parse --abbrev-ref HEAD) +GITREV=$(shell git rev-parse --short HEAD) +VERSIONSTR="GIT rev $(GITREV)" openmp-examples.pdf: $(CHAPTERS) $(SOURCES) openmp.sty openmp-examples.tex openmp-logo.png generated-include.tex rm -f $(INTERMEDIATE_FILES) @@ -47,6 +53,9 @@ openmp-examples.pdf: $(CHAPTERS) $(SOURCES) openmp.sty openmp-examples.tex openm $(LATEXCMD) openmp-examples.tex cp openmp-examples.pdf openmp-examples-${version}.pdf +check: + sources/check_tags + clean: rm -f $(INTERMEDIATE_FILES) rm -f generated-include.tex @@ -66,11 +75,11 @@ endif ifdef DIFF_FROM VC_DIFF_FROM := -r ${DIFF_FROM} else - VC_DIFF_FROM := -r work_5.2 + VC_DIFF_FROM := -r main endif DIFF_TO:=HEAD -DIFF_FROM:=work_5.2 +DIFF_FROM:=main DIFF_TYPE:=UNDERLINE COMMON_DIFF_OPTS:=--math-markup=whole \ @@ -84,6 +93,9 @@ VC_DIFF_MINIMAL_OPTS:= --only-changes --force generated-include.tex: echo "$(BOOK_BUILD)" echo "$(BOOK_BUILD)" > $@ + echo "\def\VER{${version}}" >> $@ + echo "\def\PVER{${version_spec}}" 
>> $@ + echo "\def\VERDATE{${VERSIONSTR}}" >> $@ util/list_tags -vtag */sources/* >> $@ %.tmpdir: $(wildcard *.sty) $(wildcard *.png) $(wildcard *.aux) openmp-examples.pdf diff --git a/SIMD/SIMD.tex b/SIMD/SIMD.tex index 159a72b..8e6e5f7 100644 --- a/SIMD/SIMD.tex +++ b/SIMD/SIMD.tex @@ -1,10 +1,10 @@ %\pagebreak -\section{\code{simd} and \code{declare} \code{simd} Directives} +\section{\kcode{simd} and \kcode{declare simd} Directives} \label{sec:SIMD} -\index{constructs!simd@\code{simd}} -\index{simd construct@\code{simd} construct} -The following example illustrates the basic use of the \code{simd} construct +\index{constructs!simd@\kcode{simd}} +\index{simd construct@\kcode{simd} construct} +The following example illustrates the basic use of the \kcode{simd} construct to assure the compiler that the loop can be vectorized. \cexample[4.0]{SIMD}{1} @@ -12,38 +12,38 @@ \section{\code{simd} and \code{declare} \code{simd} Directives} \ffreeexample[4.0]{SIMD}{1} -\index{directives!declare simd@\code{declare}~\code{simd}} -\index{declare simd directive@\code{declare}~\code{simd} directive} -\index{clauses!uniform@\code{uniform}} -\index{uniform clause@\code{uniform} clause} -\index{clauses!linear@\code{linear}} -\index{linear clause@\code{linear} clause} +\index{directives!declare simd@\kcode{declare simd}} +\index{declare simd directive@\kcode{declare simd} directive} +\index{clauses!uniform@\kcode{uniform}} +\index{uniform clause@\kcode{uniform} clause} +\index{clauses!linear@\kcode{linear}} +\index{linear clause@\kcode{linear} clause} When a function can be inlined within a loop the compiler has an opportunity to vectorize the loop. By guaranteeing SIMD behavior of a function's operations, characterizing the arguments of the function and privatizing temporary variables of the loop, the compiler can often create faster, vector code for -the loop. 
In the examples below the \code{declare} \code{simd} directive is -used on the \plc{add1} and \plc{add2} functions to enable creation of their +the loop. In the examples below the \kcode{declare simd} directive is +used on the \ucode{add1} and \ucode{add2} functions to enable creation of their corresponding SIMD function versions for execution within the associated SIMD loop. The functions characterize two different approaches of accessing data within the function: by a single variable and as an element in a data array, -respectively. The \plc{add3} C function uses dereferencing. +respectively. The \ucode{add3} C function uses dereferencing. -The \code{declare} \code{simd} directives also illustrate the use of -\code{uniform} and \code{linear} clauses. The \code{uniform(fact)} clause -indicates that the variable \plc{fact} is invariant across the SIMD lanes. In -the \plc{add2} function \plc{a} and \plc{b} are included in the \code{uniform} +The \kcode{declare simd} directives also illustrate the use of +\kcode{uniform} and \kcode{linear} clauses. The \kcode{uniform(\ucode{fact})} clause +indicates that the variable \ucode{fact} is invariant across the SIMD lanes. In +the \ucode{add2} function \ucode{a} and \ucode{b} are included in the \kcode{uniform} list because the C pointer and the Fortran array references are constant. The -\plc{i} index used in the \plc{add2} function is included in a \code{linear} +\ucode{i} index used in the \ucode{add2} function is included in a \kcode{linear} clause with a constant-linear-step of 1, to guarantee a unity increment of the -associated loop. In the \code{declare} \code{simd} directive for the \plc{add3} -C function the \code{linear(a,b:1)} clause instructs the compiler to generate +associated loop. 
In the \kcode{declare simd} directive for the \ucode{add3} +C function the \kcode{linear(\ucode{a,b:1})} clause instructs the compiler to generate unit-stride loads across the SIMD lanes; otherwise, costly \emph{gather} instructions would be generated for the unknown sequence of access of the pointer dereferences. -In the \code{simd} constructs for the loops the \code{private(tmp)} clause is -necessary to assure that the each vector operation has its own \plc{tmp} +In the \kcode{simd} constructs for the loops the \kcode{private(\ucode{tmp})} clause is +necessary to assure that each vector operation has its own \ucode{tmp} variable. \cexample[4.0]{SIMD}{2} @@ -51,16 +51,16 @@ \section{\code{simd} and \code{declare} \code{simd} Directives} \ffreeexample[4.0]{SIMD}{2} %\pagebreak -\index{clauses!private@\code{private}} -\index{private clause@\code{private} clause} -\index{clauses!reduction@\code{reduction}} -\index{reduction clause@\code{reduction} clause} -\index{reductions!reduction clause@\code{reduction} clause} +\index{clauses!private@\kcode{private}} +\index{private clause@\kcode{private} clause} +\index{clauses!reduction@\kcode{reduction}} +\index{reduction clause@\kcode{reduction} clause} +\index{reductions!reduction clause@\kcode{reduction} clause} A thread that encounters a SIMD construct executes a vectorized code of the iterations. Similar to the concerns of a worksharing loop a loop vectorized with a SIMD construct must assure that temporary and reduction variables are privatized and declared as reductions with clauses. The example below -illustrates the use of \code{private} and \code{reduction} clauses in a SIMD +illustrates the use of \kcode{private} and \kcode{reduction} clauses in a SIMD construct. 
\cexample[4.0]{SIMD}{3} @@ -69,16 +69,16 @@ \section{\code{simd} and \code{declare} \code{simd} Directives} %\pagebreak -\index{clauses!safelen@\code{safelen}} -\index{safelen clause@\code{safelen} clause} -A \code{safelen(N)} clause in a \code{simd} construct assures the compiler that -there are no loop-carried dependencies for vectors of size \plc{N} or below. If -the \code{safelen} clause is not specified, then the default safelen value is +\index{clauses!safelen@\kcode{safelen}} +\index{safelen clause@\kcode{safelen} clause} +A \kcode{safelen(\ucode{N})} clause in a \kcode{simd} construct assures the compiler that +there are no loop-carried dependences for vectors of size \ucode{N} or below. If +the \kcode{safelen} clause is not specified, then the default safelen value is the number of loop iterations. -The \code{safelen(16)} clause in the example below guarantees that the vector -code is safe for vectors up to and including size 16. In the loop, \plc{m} can -be 16 or greater, for correct code execution. If the value of \plc{m} is less +The \kcode{safelen(\ucode{16})} clause in the example below guarantees that the vector +code is safe for vectors up to and including size 16. In the loop, \ucode{m} can +be 16 or greater, for correct code execution. If the value of \ucode{m} is less than 16, the behavior is undefined. 
\cexample[4.0]{SIMD}{4} @@ -86,10 +86,10 @@ \section{\code{simd} and \code{declare} \code{simd} Directives} \ffreeexample[4.0]{SIMD}{4} %\pagebreak -\index{clauses!collapse@\code{collapse}} -\index{collapse clause@\code{collapse} clause} -The following SIMD construct instructs the compiler to collapse the \plc{i} and -\plc{j} loops into a single SIMD loop in which SIMD chunks are executed by +\index{clauses!collapse@\kcode{collapse}} +\index{collapse clause@\kcode{collapse} clause} +The following SIMD construct instructs the compiler to collapse the \ucode{i} and +\ucode{j} loops into a single SIMD loop in which SIMD chunks are executed by threads of the team. Within the workshared loop chunks of a thread, the SIMD chunks are executed in the lanes of the vector units. @@ -99,31 +99,31 @@ \section{\code{simd} and \code{declare} \code{simd} Directives} %%% section -\section{\code{inbranch} and \code{notinbranch} Clauses} +\section{\kcode{inbranch} and \kcode{notinbranch} Clauses} \label{sec:SIMD_branch} -\index{clauses!inbranch@\code{inbranch}} -\index{inbranch clause@\code{inbranch} clause} -\index{clauses!notinbranch@\code{notinbranch}} -\index{notinbranch clause@\code{notinbranch} clause} - -The following examples illustrate the use of the \code{declare} \code{simd} -directive with the \code{inbranch} and \code{notinbranch} clauses. The -\code{notinbranch} clause informs the compiler that the function \plc{foo} is -never called conditionally in the SIMD loop of the function \plc{myaddint}. On -the other hand, the \code{inbranch} clause for the function goo indicates that +\index{clauses!inbranch@\kcode{inbranch}} +\index{inbranch clause@\kcode{inbranch} clause} +\index{clauses!notinbranch@\kcode{notinbranch}} +\index{notinbranch clause@\kcode{notinbranch} clause} + +The following examples illustrate the use of the \kcode{declare simd} +directive with the \kcode{inbranch} and \kcode{notinbranch} clauses. 
The +\kcode{notinbranch} clause informs the compiler that the function \ucode{foo} is +never called conditionally in the SIMD loop of the function \ucode{myaddint}. On +the other hand, the \kcode{inbranch} clause for the function \ucode{goo} indicates that the function is always called conditionally in the SIMD loop inside -the function \plc{myaddfloat}. +the function \ucode{myaddfloat}. \cexample[4.0]{SIMD}{6} \ffreeexample[4.0]{SIMD}{6} -In the code below, the function \plc{fib()} is called in the main program and -also recursively called in the function \plc{fib()} within an \code{if} +In the code below, the function \ucode{fib()} is called in the main program and +also recursively called in the function \ucode{fib()} within an \bcode{if} condition. The compiler creates a masked vector version and a non-masked vector -version for the function \plc{fib()} while retaining the original scalar -version of the \plc{fib()} function. +version for the function \ucode{fib()} while retaining the original scalar +version of the \ucode{fib()} function. \cexample[4.0]{SIMD}{7} @@ -132,7 +132,7 @@ \section{\code{inbranch} and \code{notinbranch} Clauses} %%% section -\pagebreak +%\pagebreak \section{Loop-Carried Lexical Forward Dependence} \label{sec:SIMD_forward_dep} \index{dependences!loop-carried lexical forward} @@ -140,9 +140,9 @@ \section{Loop-Carried Lexical Forward Dependence} The following example tests the restriction on an SIMD loop with the loop-carried lexical forward-dependence. This dependence must be preserved for the correct execution of SIMD loops. -A loop can be vectorized even though the iterations are not completely independent when it has loop-carried dependences that are forward lexical dependences, indicated in the code below by the read of \plc{A[j+1]} and the write to \plc{A[j]} in C/C++ code (or \plc{A(j+1)} and \plc{A(j)} in Fortran). 
That is, the read of \plc{A[j+1]} (or \plc{A(j+1)} in Fortran) before the write to \plc{A[j]} (or \plc{A(j)} in Fortran) ordering must be preserved for each iteration in \plc{j} for valid SIMD code generation. +A loop can be vectorized even though the iterations are not completely independent when it has loop-carried dependences that are forward lexical dependences, indicated in the code below by the read of \ucode{A[j+1]} and the write to \ucode{A[j]} in C/C++ code (or \ucode{A(j+1)} and \ucode{A(j)} in Fortran). That is, the read of \ucode{A[j+1]} (or \ucode{A(j+1)} in Fortran) before the write to \ucode{A[j]} (or \ucode{A(j)} in Fortran) ordering must be preserved for each iteration in \ucode{j} for valid SIMD code generation. -This test assures that the compiler preserves the loop carried lexical forward-dependence for generating a correct SIMD code. +This test assures that the compiler preserves the loop-carried lexical forward-dependence for generating a correct SIMD code. \cexample[4.0]{SIMD}{8} diff --git a/SIMD/linear_modifier.tex b/SIMD/linear_modifier.tex index 2ea89f8..5a493c7 100644 --- a/SIMD/linear_modifier.tex +++ b/SIMD/linear_modifier.tex @@ -1,31 +1,31 @@ %%% section -\section{\code{ref}, \code{val}, \code{uval} Modifiers for \code{linear} Clause} +\section{\kcode{ref}, \kcode{val}, \kcode{uval} Modifiers for \kcode{linear} Clause} \label{sec:linear_modifier} -\index{modifiers, linear@modifiers, \code{linear}!ref@\code{ref}} -\index{modifiers, linear@modifiers, \code{linear}!val@\code{val}} -\index{modifiers, linear@modifiers, \code{linear}!uval@\code{uval}} -\index{clauses!linear@\code{linear}} -\index{linear clause@\code{linear} clause} +\index{modifiers, linear@modifiers, \kcode{linear}!ref@\kcode{ref}} +\index{modifiers, linear@modifiers, \kcode{linear}!val@\kcode{val}} +\index{modifiers, linear@modifiers, \kcode{linear}!uval@\kcode{uval}} +\index{clauses!linear@\kcode{linear}} +\index{linear clause@\kcode{linear} clause} -When generating 
vector functions from \code{declare}~\code{simd} directives, +When generating vector functions from \kcode{declare simd} directives, it is important for a compiler to know the proper types of function arguments in order to generate efficient codes. This is especially true for C++ reference types and Fortran arguments. -In the following example, the function \plc{add\_one2} has a C++ reference -parameter (or Fortran argument) \plc{p}. Variable \plc{p} gets incremented by 1 in the function. -The caller loop \plc{i} in the main program passes -a variable \plc{k} as a reference to the function \plc{add\_one2} call. -The \code{ref} modifier for the \code{linear} clause on the -\code{declare}~\code{simd} directive specifies that the -reference-type parameter \plc{p} is to match the property of the variable -\plc{k} in the loop. +In the following example, the function \ucode{add_one2} has a C++ reference +parameter (or Fortran argument) \ucode{p}. Variable \ucode{p} gets incremented by 1 in the function. +The caller loop \ucode{i} in the main program passes +a variable \ucode{k} as a reference to the function \ucode{add_one2} call. +The \kcode{ref} modifier for the \kcode{linear} clause on the +\kcode{declare simd} directive specifies that the +reference-type parameter \ucode{p} is to match the property of the variable +\ucode{k} in the loop. This use of reference type is equivalent to the second call to -\plc{add\_one2} with a direct passing of the array element \plc{a[i]}. +\ucode{add_one2} with a direct passing of the array element \ucode{a[i]}. In the example, the preferred vector length 8 is specified for both the caller loop and the callee function. 
-When \code{linear(p:~ref)} is applied to an argument passed by reference, +When \kcode{linear(\ucode{p}: ref)} is applied to an argument passed by reference, it tells the compiler that the addresses in its vector argument are consecutive, and so the compiler can generate a single vector load or store instead of a gather or scatter. This allows more efficient SIMD code to be generated with @@ -33,25 +33,25 @@ \section{\code{ref}, \code{val}, \code{uval} Modifiers for \code{linear} Clause} \cppexample[5.2]{linear_modifier}{1} \ffreeexample[5.2]{linear_modifier}{1} -\clearpage +%\clearpage -The following example is a variant of the above example. The function \plc{add\_one2} -in the C++ code includes an additional C++ reference parameter \plc{i}. -The loop index \plc{i} of the caller loop \plc{i} in the main program -is passed as a reference to the function \plc{add\_one2} call. -The loop index \plc{i} has a uniform address with +The following example is a variant of the above example. The function \ucode{add_one2} +in the C++ code includes an additional C++ reference parameter \ucode{i}. +The loop index \ucode{i} of the caller loop \ucode{i} in the main program +is passed as a reference to the function \ucode{add_one2} call. +The loop index \ucode{i} has a uniform address with linear value of step 1 across SIMD lanes. -Thus, the \code{uval} modifier is used for the \code{linear} clause -to specify that the C++ reference-type parameter \plc{i} is to match -the property of loop index \plc{i}. +Thus, the \kcode{uval} modifier is used for the \kcode{linear} clause +to specify that the C++ reference-type parameter \ucode{i} is to match +the property of loop index \ucode{i}. -In the corresponding Fortran code the arguments \plc{p} and -\plc{i} in the routine \plc{add\_on2} are passed by references. 
-Similar modifiers are used for these variables in the \code{linear} clauses +In the corresponding Fortran code the arguments \ucode{p} and +\ucode{i} in the routine \ucode{add_one2} are passed by reference. +Similar modifiers are used for these variables in the \kcode{linear} clauses to match with the property at the caller loop in the main program. -When \code{linear(i:~uval)} is applied to an argument passed by reference, it +When \kcode{linear(\ucode{i}: uval)} is applied to an argument passed by reference, it tells the compiler that its addresses in the vector argument are uniform so that the compiler can generate a scalar load or scalar store and create linear values. This allows more efficient SIMD code to be generated with @@ -60,20 +60,20 @@ \section{\code{ref}, \code{val}, \code{uval} Modifiers for \code{linear} Clause} \cppexample[5.2]{linear_modifier}{2} \ffreeexample[5.2]{linear_modifier}{2} -In the following example, the function \plc{func} takes arrays \plc{x} and \plc{y} -as arguments, and accesses the array elements referenced by the index \plc{i}. -The caller loop \plc{i} in the main program passes a linear copy of -the variable \plc{k} to the function \plc{func}. -The \code{val} modifier is used for the \code{linear} clause -in the \code{declare}~\code{simd} directive for the function -\plc{func} to specify that the argument \plc{i} is to match the property of -the actual argument \plc{k} passed in the SIMD loop. -Arrays \plc{x} and \plc{y} have uniform addresses across SIMD lanes. +In the following example, the function \ucode{func} takes arrays \ucode{x} and \ucode{y} +as arguments, and accesses the array elements referenced by the index \ucode{i}. +The caller loop \ucode{i} in the main program passes a linear copy of +the variable \ucode{k} to the function \ucode{func}. 
+The \kcode{val} modifier is used for the \kcode{linear} clause +in the \kcode{declare simd} directive for the function +\ucode{func} to specify that the argument \ucode{i} is to match the property of +the actual argument \ucode{k} passed in the SIMD loop. +Arrays \ucode{x} and \ucode{y} have uniform addresses across SIMD lanes. -When \code{linear(i:~val,step(1))} is applied to an argument, +When \kcode{linear(\ucode{i}: val,step(\ucode{1}))} is applied to an argument, it tells the compiler that its addresses in the vector argument may not be -consecutive, however, their values are linear (with stride 1 here). When the value of \plc{i} is used -in subscript of array references (e.g., \plc{x[i]}), the compiler can generate +consecutive; however, their values are linear (with stride 1 here). When the value of \ucode{i} is used +in subscripts of array references (e.g., \ucode{x[i]}), the compiler can generate a vector load or store instead of a gather or scatter. This allows more efficient SIMD code to be generated with less source changes. diff --git a/Title_Page.tex b/Title_Page.tex index 31a295f..6d28390 100644 --- a/Title_Page.tex +++ b/Title_Page.tex @@ -24,11 +24,10 @@ \vspace{2.3in} %was 3.0 Source codes for OpenMP Examples \VER{} are available at - \href{https://github.com/OpenMP/Examples/tree/v\VER}% - {github (https://github.com/OpenMP/Examples/tree/v\VER)}.\\ + \examplestree{\VER}{github (\examplesrepo/tree/v\VER)}.\\ \begin{adjustwidth}{0pt}{1em}\setlength{\parskip}{0.25\baselineskip}% -Copyright \copyright{} 1997-2024 OpenMP Architecture Review Board.\\ +Copyright \copyright{} 1997-2024 OpenMP Architecture Review Board.\\ Permission to copy without fee all or part of this material is granted, provided the OpenMP Architecture Review Board copyright notice and the title of this document appear. 
Notice is given that copying is by diff --git a/affinity/affinity.tex b/affinity/affinity.tex index d42753c..54e5e97 100644 --- a/affinity/affinity.tex +++ b/affinity/affinity.tex @@ -1,12 +1,12 @@ \pagebreak -\section{\code{proc\_bind} Clause} +\section{\kcode{proc_bind} Clause} \label{sec:affinity} -\index{affinity!proc_bind clause@\scode{proc_bind} clause} -\index{clauses!proc_bind@\scode{proc_bind}} -\index{proc_bind clause@\scode{proc_bind} clause} +\index{affinity!proc_bind clause@\kcode{proc_bind} clause} +\index{clauses!proc_bind@\kcode{proc_bind}} +\index{proc_bind clause@\kcode{proc_bind} clause} -The following examples demonstrate how to use the \code{proc\_bind} clause to -control the thread binding for a team of threads in a \code{parallel} region. +The following examples demonstrate how to use the \kcode{proc_bind} clause to +control the thread binding for a team of threads in a \kcode{parallel} region. The machine architecture is depicted in Figure~\ref{fig:mach_arch}. It consists of two sockets, each equipped with a quad-core processor and configured to execute two hardware threads simultaneously on each core. 
These examples assume a contiguous core numbering @@ -24,19 +24,19 @@ \section{\code{proc\_bind} Clause} The following equivalent place list declarations consist of eight places (which we designate as p0 to p7): -\code{OMP\_PLACES=\texttt{"}\{0,1\},\{2,3\},\{4,5\},\{6,7\},\{8,9\},\{10,11\},\{12,13\},\{14,15\}\texttt{"}} +\kcode{OMP_PLACES}=\verb+"{0,1},{2,3},{4,5},{6,7},{8,9},{10,11},{12,13},{14,15}"+ or -\code{OMP\_PLACES=\texttt{"}\{0:2\}:8:2\texttt{"}} +\kcode{OMP_PLACES}=\verb+"{0:2}:8:2"+ \subsection{Spread Affinity Policy} \label{subsec:affinity_spread} -\index{affinity!spread policy@\code{spread} policy} -\index{spread policy@\code{spread} policy} +\index{affinity!spread policy@\kcode{spread} policy} +\index{spread policy@\kcode{spread} policy} -The following example shows the result of the \code{spread} affinity policy on +The following example shows the result of the \kcode{spread} affinity policy on the partition list when the number of threads is less than or equal to the number of places in the parent's place partition, for the machine architecture depicted above. Note that the threads are bound to the first place of each subpartition. @@ -73,13 +73,13 @@ \subsection{Spread Affinity Policy} \item thread 3 executes on p0 with the place partition p0,p1 \end{compactitem} -The following example illustrates the \code{spread} thread affinity policy when +The following example illustrates the \kcode{spread} thread affinity policy when the number of threads is greater than the number of places in the parent's place partition. -Let \plc{T} be the number of threads in the team, and \plc{P} be the number of places in the -parent's place partition. The first \plc{T/P} threads of the team (including the primary -thread) execute on the parent's place. The next \plc{T/P} threads execute on the next +Let \ucode{T} be the number of threads in the team, and \ucode{P} be the number of places in the +parent's place partition. 
The first \ucode{T/P} threads of the team (including the primary +thread) execute on the parent's place. The next \ucode{T/P} threads execute on the next place in the place partition, and so on, with wrap around. \cexample[4.0]{affinity}{2} @@ -131,13 +131,13 @@ \subsection{Spread Affinity Policy} \subsection{Close Affinity Policy} \label{subsec:affinity_close} -\index{affinity!close policy@\code{close} policy} -\index{close policy@\code{close} policy} +\index{affinity!close policy@\kcode{close} policy} +\index{close policy@\kcode{close} policy} -The following example shows the result of the \code{close} affinity policy on +The following example shows the result of the \kcode{close} affinity policy on the partition list when the number of threads is less than or equal to the number of places in parent's place partition, for the machine architecture depicted above. -The place partition is not changed by the \code{close} policy. +The place partition is not changed by the \kcode{close} policy. \cexample[4.0]{affinity}{3} @@ -145,7 +145,7 @@ \subsection{Close Affinity Policy} It is unspecified on which place the primary thread is initially started. If the primary thread is initially started on p0, the following placement of threads will -be applied in the \code{parallel} region: +be applied in the \kcode{parallel} region: \begin{compactitem} \item thread 0 executes on p0 with the place partition p0-p7 @@ -170,15 +170,15 @@ \subsection{Close Affinity Policy} \item thread 3 executes on p5 with the place partition p0-p7 \end{compactitem} -The following example illustrates the \code{close} thread affinity policy when +The following example illustrates the \kcode{close} thread affinity policy when the number of threads is greater than the number of places in the parent's place partition. -Let \plc{T} be the number of threads in the team, and \plc{P} be the number of places in the -parent's place partition. 
The first \plc{T/P} threads of the team (including the primary -thread) execute on the parent's place. The next \plc{T/P} threads execute on the next +Let \ucode{T} be the number of threads in the team, and \ucode{P} be the number of places in the +parent's place partition. The first \ucode{T/P} threads of the team (including the primary +thread) execute on the parent's place. The next \ucode{T/P} threads execute on the next place in the place partition, and so on, with wrap around. The place partition -is not changed by the \code{close} policy. +is not changed by the \kcode{close} policy. \cexample[4.0]{affinity}{4} @@ -229,10 +229,10 @@ \subsection{Close Affinity Policy} \subsection{Primary Affinity Policy} \label{subsec:affinity_primary} -\index{affinity!primary policy@\code{primary} policy} -\index{primary policy@\code{primary} policy} +\index{affinity!primary policy@\kcode{primary} policy} +\index{primary policy@\kcode{primary} policy} -The following example shows the result of the \code{primary} affinity policy on +The following example shows the result of the \kcode{primary} affinity policy on the partition list for the machine architecture depicted above. The place partition is not changed by the primary policy. 
diff --git a/affinity/affinity_display.tex b/affinity/affinity_display.tex index 0a619fd..0b06920 100644 --- a/affinity/affinity_display.tex +++ b/affinity/affinity_display.tex @@ -1,32 +1,32 @@ \section{Affinity Display} \label{sec:affinity_display} -\index{affinity display!OMP_DISPLAY_AFFINITY@\scode{OMP_DISPLAY_AFFINITY}} -\index{environment variables!OMP_DISPLAY_AFFINITY@\scode{OMP_DISPLAY_AFFINITY}} -\index{OMP_DISPLAY_AFFINITY@\scode{OMP_DISPLAY_AFFINITY}} -\index{affinity display!OMP_AFFINITY_FORMAT@\scode{OMP_AFFINITY_FORMAT}} -\index{environment variables!OMP_AFFINITY_FORMAT@\scode{OMP_AFFINITY_FORMAT}} -\index{OMP_AFFINITY_FORMAT@\scode{OMP_AFFINITY_FORMAT}} -\index{affinity display!omp_display_affinity routine@\scode{omp_display_affinity} routine} -\index{routines!omp_display_affinity@\scode{omp_display_affinity}} -\index{omp_display_affinity routine@\scode{omp_display_affinity} routine} +\index{affinity display!OMP_DISPLAY_AFFINITY@\kcode{OMP_DISPLAY_AFFINITY}} +\index{environment variables!OMP_DISPLAY_AFFINITY@\kcode{OMP_DISPLAY_AFFINITY}} +\index{OMP_DISPLAY_AFFINITY@\kcode{OMP_DISPLAY_AFFINITY}} +\index{affinity display!OMP_AFFINITY_FORMAT@\kcode{OMP_AFFINITY_FORMAT}} +\index{environment variables!OMP_AFFINITY_FORMAT@\kcode{OMP_AFFINITY_FORMAT}} +\index{OMP_AFFINITY_FORMAT@\kcode{OMP_AFFINITY_FORMAT}} +\index{affinity display!omp_display_affinity routine@\kcode{omp_display_affinity} routine} +\index{routines!omp_display_affinity@\kcode{omp_display_affinity}} +\index{omp_display_affinity routine@\kcode{omp_display_affinity} routine} The following examples illustrate ways to display thread affinity. Automatic display of affinity can be invoked by setting -the \code{OMP\_DISPLAY\_AFFINITY} environment variable to \code{TRUE}. +the \kcode{OMP_DISPLAY_AFFINITY} environment variable to \vcode{TRUE}. The format of the output can be customized by setting the -\code{OMP\_AFFINITY\_FORMAT} environment variable to an appropriate string. 
+\kcode{OMP_AFFINITY_FORMAT} environment variable to an appropriate string. Also, there are API calls for the user to display thread affinity at selected locations within code. -For the first example the environment variable \code{OMP\_DISPLAY\_AFFINITY} has been -set to \code{TRUE}, and execution occurs on an 8-core system with \code{OMP\_NUM\_THREADS} set to 8. +For the first example the environment variable \kcode{OMP_DISPLAY_AFFINITY} has been +set to \vcode{TRUE}, and execution occurs on an 8-core system with \kcode{OMP_NUM_THREADS} set to 8. The affinity for the primary thread is reported through a call to the API -\code{omp\_display\_affinity()} routine. For default affinity settings +\kcode{omp_display_affinity()} routine. For default affinity settings the report shows that the primary thread can execute on any of the cores. In the following parallel region the affinity for each of the team threads is reported -automatically since the \code{OMP\_DISPLAY\_AFFINITY} environment variable has been set -to \code{TRUE}. +automatically since the \kcode{OMP_DISPLAY_AFFINITY} environment variable has been set +to \vcode{TRUE}. These two reports are often useful (as in hybrid codes using both MPI and OpenMP) to observe the affinity (for an MPI task) before the parallel region, @@ -48,25 +48,25 @@ \section{Affinity Display} These OpenMP environment variables have been set: \begin{compactitem} -\item \code{OMP\_PROC\_BIND}="TRUE" -\item \code{OMP\_NUM\_THREADS}="2,4" -\item \code{OMP\_PLACES}="\{0,2,4,6\},\{1,3,5,7\}" -\item \code{OMP\_AFFINITY\_FORMAT}="nest\_level= \%L, parent\_thrd\_num= \%a, thrd\_num= \%n, thrd\_affinity= \%A" +\item \kcode{OMP_PROC_BIND}=\verb+"TRUE"+ +\item \kcode{OMP_NUM_THREADS}=\verb+"2,4"+ +\item \kcode{OMP_PLACES}=\verb+"{0,2,4,6},{1,3,5,7}"+ +\item \kcode{OMP_AFFINITY_FORMAT}=\verb+"nest_level= %L, parent_thrd_num= %a,+ \verb+thrd_num= %n, thrd_affinity= %A"+ \end{compactitem} -where the numbers correspond to core ids for the system. 
Note, \code{OMP\_DISPLAY\_AFFINITY} is not -set and is \code{FALSE} by default. This example shows how to use API routines to +where the numbers correspond to core ids for the system. Note, \kcode{OMP_DISPLAY_AFFINITY} is not +set and is \vcode{FALSE} by default. This example shows how to use API routines to perform affinity display operations. -\index{environment variables!OMP_PLACES@\scode{OMP_PLACES}} -\index{OMP_PLACES@\scode{OMP_PLACES}} -For each of the two first-level threads the \code{OMP\_PLACES} variable specifies +\index{environment variables!OMP_PLACES@\kcode{OMP_PLACES}} +\index{OMP_PLACES@\kcode{OMP_PLACES}} +For each of the two first-level threads the \kcode{OMP_PLACES} variable specifies a place with all the core-ids of the socket (\{0,2,4,6\} for one thread and \{1,3,5,7\} for the other). (As is sometimes the case in 2-socket systems, one socket may consist of the even id numbers, while the other may have the odd id numbers.) The affinities -are printed according to the \code{OMP\_AFFINITY\_FORMAT} format: providing -the parallel nesting level (\%L), the ancestor thread number (\%a), the thread number (\%n) -and the thread affinity (\%A). In the nested parallel region within the \plc{socket\_work} routine +are printed according to the \kcode{OMP_AFFINITY_FORMAT} format: providing +the parallel nesting level (\ucode{\%L}), the ancestor thread number (\ucode{\%a}), the thread number (\ucode{\%n}) +and the thread affinity (\ucode{\%A}). In the nested parallel region within the \ucode{socket_work} routine the affinities for the threads on each socket are printed according to this format. 
\cexample[5.0]{affinity_display}{2}[3] @@ -74,23 +74,23 @@ \section{Affinity Display} \ffreeexample[5.0]{affinity_display}{2}[3] %\newpage -\index{affinity display!omp_get_affinity_format routine@\scode{omp_get_affinity_format} routine} -\index{routines!omp_get_affinity_format@\scode{omp_get_affinity_format}} -\index{omp_get_affinity_format routine@\scode{omp_get_affinity_format} routine} -\index{affinity display!omp_set_affinity_format routine@\scode{omp_set_affinity_format} routine} -\index{routines!omp_set_affinity_format@\scode{omp_set_affinity_format}} -\index{omp_set_affinity_format routine@\scode{omp_set_affinity_format} routine} +\index{affinity display!omp_get_affinity_format routine@\kcode{omp_get_affinity_format} routine} +\index{routines!omp_get_affinity_format@\kcode{omp_get_affinity_format}} +\index{omp_get_affinity_format routine@\kcode{omp_get_affinity_format} routine} +\index{affinity display!omp_set_affinity_format routine@\kcode{omp_set_affinity_format} routine} +\index{routines!omp_set_affinity_format@\kcode{omp_set_affinity_format}} +\index{omp_set_affinity_format routine@\kcode{omp_set_affinity_format} routine} The next example illustrates more details about affinity formatting. -First, the \code{omp\_get\_affinity\_format()} API routine is used to +First, the \kcode{omp_get_affinity_format()} API routine is used to obtain the default format. The code checks to make sure the storage provides enough space to hold the format. -Next, the \code{omp\_set\_affinity\_format()} API routine sets a user-defined -format: \plc{host=\%20H thrd\_num=\%0.4n binds\_to=\%A}. +Next, the \kcode{omp_set_affinity_format()} API routine sets a user-defined +format: \ucode{host=\%20H~thrd_num=\%0.4n~binds_to=\%A}. 
-The host, thread number and affinity fields are specified by \plc{\%20H}, -\plc{\%0.4n} and \plc{\%A}: \plc{H}, \plc{n} and \plc{A} are single character ``short names'' +The host, thread number and affinity fields are specified by \ucode{\%20H}, +\ucode{\%0.4n} and \ucode{\%A}: \ucode{H}, \ucode{n} and \ucode{A} are single character ``short names'' for the host, thread\_num and thread\_affinity data to be printed, -with format sizes of \plc{20}, \plc{4}, and ``size as needed''. +with format sizes of \ucode{20}, \ucode{4}, and ``size as needed''. The period (.) indicates that the field is displayed right-justified (default is left-justified) and the ``0'' indicates that any unused space is to be prefixed with zeros (e.g. instead of ``1'', ``0001'' is displayed for the field size of 4). @@ -101,12 +101,12 @@ \section{Affinity Display} %The period (\plc{.}) indicates right justified and \plc{0} leading zeros. %All other text in the format is just user narrative. -\index{affinity display!omp_capture_affinity routine@\scode{omp_capture_affinity} routine} -\index{routines!omp_capture_affinity@\scode{omp_capture_affinity}} -\index{omp_capture_affinity routine@\scode{omp_capture_affinity} routine} +\index{affinity display!omp_capture_affinity routine@\kcode{omp_capture_affinity} routine} +\index{routines!omp_capture_affinity@\kcode{omp_capture_affinity}} +\index{omp_capture_affinity routine@\kcode{omp_capture_affinity} routine} Within the parallel region the affinity for each thread is captured by -\code{omp\_capture\_affinity()} into a buffer array with elements indexed -by the thread number (\plc{thrd\_num}). +\kcode{omp_capture_affinity()} into a buffer array with elements indexed +by the thread number (\ucode{thrd_num}). After the parallel region, the thread affinities are printed in thread-number order. 
If the storage area in buffer is inadequate for holding the affinity @@ -114,10 +114,10 @@ \section{Affinity Display} %The \plc{max} reduction on the required storage, returned by %\code{omp\_capture\_affinity} in \plc{nchars}, is used to report %possible truncation (if \plc{max\_req\_store} > \plc{buffer\_store}). -The maximum value for the number of characters (\plc{nchars}) returned by -\code{omp\_capture\_affinity} is captured by the \code{reduction(max:max\_req\_store)} -clause and the \plc{if(nchars >= max\_req\_store) max\_req\_store=nchars} statement. -It is used to report possible truncation (if \plc{max\_req\_store} > \plc{buffer\_store}). +The maximum value for the number of characters (\ucode{nchars}) returned by +\kcode{omp_capture_affinity} is captured by the \kcode{reduction(max: \ucode{max_req_store})} +clause and the \ucode{if(nchars >= max_req_store) max_req_store=nchars} statement. +It is used to report possible truncation (if \ucode{max_req_store} > \ucode{buffer_store}). 
\cexample[5.0]{affinity_display}{3} diff --git a/affinity/affinity_query.tex b/affinity/affinity_query.tex index b78b4ed..f9ba882 100644 --- a/affinity/affinity_query.tex +++ b/affinity/affinity_query.tex @@ -1,19 +1,19 @@ \newpage \section{Affinity Query Functions} \label{sec: affinity_query} -\index{affinity query!omp_get_num_places routine@\scode{omp_get_num_places} routine} -\index{routines!omp_get_num_places@\scode{omp_get_num_places}} -\index{omp_get_num_places routine@\scode{omp_get_num_places} routine} -\index{affinity query!omp_get_place_num routine@\scode{omp_get_place_num} routine} -\index{routines!omp_get_place_num@\scode{omp_get_place_num}} -\index{omp_get_place_num routine@\scode{omp_get_place_num} routine} -\index{affinity query!omp_get_place_num_procs routine@\scode{omp_get_place_num_procs} routine} -\index{routines!omp_get_place_num_procs@\scode{omp_get_place_num_procs}} -\index{omp_get_place_num_procs routine@\scode{omp_get_place_num_procs} routine} -\index{affinity!spread policy@\code{spread} policy} -\index{spread policy@\code{spread} policy} -\index{environment variables!OMP_PLACES@\scode{OMP_PLACES}} -\index{OMP_PLACES@\scode{OMP_PLACES}} +\index{affinity query!omp_get_num_places routine@\kcode{omp_get_num_places} routine} +\index{routines!omp_get_num_places@\kcode{omp_get_num_places}} +\index{omp_get_num_places routine@\kcode{omp_get_num_places} routine} +\index{affinity query!omp_get_place_num routine@\kcode{omp_get_place_num} routine} +\index{routines!omp_get_place_num@\kcode{omp_get_place_num}} +\index{omp_get_place_num routine@\kcode{omp_get_place_num} routine} +\index{affinity query!omp_get_place_num_procs routine@\kcode{omp_get_place_num_procs} routine} +\index{routines!omp_get_place_num_procs@\kcode{omp_get_place_num_procs}} +\index{omp_get_place_num_procs routine@\kcode{omp_get_place_num_procs} routine} +\index{affinity!spread policy@\kcode{spread} policy} +\index{spread policy@\kcode{spread} policy} +\index{environment 
variables!OMP_PLACES@\kcode{OMP_PLACES}} +\index{OMP_PLACES@\kcode{OMP_PLACES}} In the example below a team of threads is generated on each socket of the system, using nested parallelism. Several query functions are used @@ -23,25 +23,25 @@ \section{Affinity Query Functions} For proper execution of the code, the user must create a place partition, such that each place is a listing of the core numbers for a socket. For example, in a 2 socket system with 8 cores in each socket, and sequential numbering -in the socket for the core numbers, the \code{OMP\_PLACES} variable would be set -to "\{0:8\},\{8:8\}", using the place syntax \{\plc{lower\_bound}:\plc{length}:\plc{stride}\}, +in the socket for the core numbers, the \kcode{OMP_PLACES} variable would be set +to "\{0:8\},\{8:8\}", using the place syntax \{\splc{lower_bound:length:stride}\}, and the default stride of 1. -The code determines the number of sockets (\plc{n\_sockets}) -using the \code{omp\_get\_num\_places()} query function. +The code determines the number of sockets (\ucode{n_sockets}) +using the \kcode{omp_get_num_places()} query function. In this example each place is constructed with a list of each socket's core numbers, hence the number of places is equal to the number of sockets. The outer parallel region forms a team of threads, and each thread -executes on a socket (place) because the \code{proc\_bind} clause uses -\code{spread} in the outer \code{parallel} construct. -Next, in the \plc{socket\_init} function, an inner parallel region creates a team +executes on a socket (place) because the \kcode{proc_bind} clause uses +\kcode{spread} in the outer \kcode{parallel} construct. +Next, in the \ucode{socket_init} function, an inner parallel region creates a team of threads equal to the number of elements (core numbers) from the place -of the parent thread. 
Because the outer \code{parallel} construct uses -a \code{spread} affinity policy, each of its threads inherits a subpartition of -the original partition. Hence, the \code{omp\_get\_place\_num\_procs} query function -returns the number of elements (here procs = cores) in the subpartition of the thread. +of the parent thread. Because the outer \kcode{parallel} construct uses +a \kcode{spread} affinity policy, each of its threads inherits a sub-partition of +the original partition. Hence, the \kcode{omp_get_place_num_procs} query function +returns the number of elements (here procs = cores) in the sub-partition of the thread. After each parent thread creates its nested parallel region on the section, the socket number and thread number are reported. diff --git a/affinity/task_affinity.tex b/affinity/task_affinity.tex index 767d0cb..2ab881c 100644 --- a/affinity/task_affinity.tex +++ b/affinity/task_affinity.tex @@ -1,13 +1,13 @@ \section{Task Affinity} \label{sec: task_affinity} \index{affinity!task affinity} -\index{affinity!affinity clause@\code{affinity} clause} -\index{clauses!affinity@\code{affinity}} -\index{affinity clause@\code{affinity} clause} +\index{affinity!affinity clause@\kcode{affinity} clause} +\index{clauses!affinity@\kcode{affinity}} +\index{affinity clause@\kcode{affinity} clause} -The next example illustrates the use of the \code{affinity} -clause with a \code{task} construct. -The variables in the \code{affinity} clause provide a +The next example illustrates the use of the \kcode{affinity} +clause with a \kcode{task} construct. +The variables in the \kcode{affinity} clause provide a hint to the runtime that the task should execute ``close'' to the physical storage location of the variables. For example, on a two-socket platform with a local memory component @@ -15,19 +15,19 @@ \section{Task Affinity} schedule the task execution on the socket where the storage is located. 
Because the C/C++ code employs a pointer, an array section is used in -the \code{affinity} clause. +the \kcode{affinity} clause. Fortran code can use an array reference to specify the storage, as shown here. -Note, in the second task of the C/C++ code the \plc{B} pointer is declared +Note, in the second task of the C/C++ code the \ucode{B} pointer is declared shared. Otherwise, by default, it would be firstprivate since it is a local variable, and would probably be saved for the second task before being assigned a storage address by the first task. Also, one might think it reasonable to use -the \code{affinity} clause \plc{affinity(B[:N])} on the second \code{task} construct. -However, the storage behind \plc{B} is created in the first task, and the +the \kcode{affinity} clause \kcode{affinity(\ucode{B[:N]})} on the second \kcode{task} construct. +However, the storage behind \ucode{B} is created in the first task, and the array section reference may not be valid when the second task is generated. -The use of the \plc{A} array is sufficient for this case, because one -would expect the storage for \plc{A} and \plc{B} would be physically ``close'' +The use of the \ucode{A} array is sufficient for this case, because one +would expect the storage for \ucode{A} and \ucode{B} would be physically ``close'' (as provided by the hint in the first task). 
\cexample[5.0]{affinity}{6} diff --git a/data_environment/associate.tex b/data_environment/associate.tex index dea9204..3b75b3f 100644 --- a/data_environment/associate.tex +++ b/data_environment/associate.tex @@ -1,67 +1,65 @@ -\pagebreak -\section{Fortran \code{ASSOCIATE} Construct} +%\pagebreak +\section{Fortran \bcode{ASSOCIATE} Construct} \fortranspecificstart \label{sec:associate} -\index{ASSOCIATE construct, Fortran@\code{ASSOCIATE} construct, Fortran} +\index{ASSOCIATE construct, Fortran@\bcode{ASSOCIATE} construct, Fortran} The following is an invalid example of specifying an associate name on a data-sharing attribute -clause. The constraint in the Data Sharing Attribute Rules section in the OpenMP -4.0 API Specifications states that an associate name preserves the association -with the selector established at the \code{ASSOCIATE} statement. The associate -name \plc{b} is associated with the shared variable \plc{a}. With the predetermined data-sharing -attribute rule, the associate name \plc{b} is not allowed to be specified on the \code{private} clause. +clause. The constraint in the \docref{Data Sharing Attribute Rules} section in the OpenMP +4.0 API Specification states that an associate name preserves the association +with the selector established at the \bcode{ASSOCIATE} statement. The associate +name \ucode{b} is associated with the shared variable \ucode{a}. With the predetermined data-sharing +attribute rule, the associate name \ucode{b} is not allowed to be specified on the \kcode{private} clause. +\pagebreak \fnexample[4.0]{associate}{1} -In next example, within the \code{parallel} construct, the association name \plc{thread\_id} -is associated with the private copy of \plc{i}. The print statement should output the +In the next example, within the \kcode{parallel} construct, the associate name \ucode{thread_id} +is associated with the private copy of \ucode{i}. The print statement should output the unique thread number.
+\topmarker{Fortran} \fnexample[4.0]{associate}{2} The following example illustrates the effect of specifying a selector name on a data-sharing -attribute clause. The associate name \plc{u} is associated with \plc{v} and the variable \plc{v} -is specified on the \code{private} clause of the \code{parallel} construct. -The construct association is established prior to the \code{parallel} region. -The association between \plc{u} and the original \plc{v} is retained (see the Data Sharing -Attribute Rules section in the OpenMP 4.0 API Specifications). Inside the \code{parallel} -region, \plc{v} has the value of -1 and \plc{u} has the value of the original \plc{v}. +attribute clause. The associate name \ucode{u} is associated with \ucode{v} and the variable \ucode{v} +is specified on the \kcode{private} clause of the \kcode{parallel} construct. +The construct association is established prior to the \kcode{parallel} region. +The association between \ucode{u} and the original \ucode{v} is retained (see the \docref{Data Sharing +Attribute Rules} section in the OpenMP 4.0 API Specification). Inside the \kcode{parallel} +region, \ucode{v} has the value of -1 and \ucode{u} has the value of the original \ucode{v}. -\pagebreak \ffreenexample[4.0]{associate}{3} -% blue line floater at top of this page for "Fortran, cont." -\begin{figure}[t!] -\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em} -\end{figure} +\topmarker{Fortran} \label{sec:associate_target} \bigskip The following example illustrates mapping behavior for a Fortran -associate name and its selector for a \scode{target} construct. +associate name and its selector for a \kcode{target} construct. -For the first 3 \scode{target} constructs the associate name \splc{a_aray} is -associated with the selector \splc{aray}, an array. -For the \scode{target} construct of code block TARGET 1 just the selector -\splc{aray} is used and is implicitly mapped, -likewise for the associate name \splc{a_aray} in the TARGET 2 block. 
+For the first 3 \kcode{target} constructs the associate name \ucode{a_aray} is +associated with the selector \ucode{aray}, an array. +For the \kcode{target} construct of code block TARGET 1 just the selector +\ucode{aray} is used and is implicitly mapped, +likewise for the associate name \ucode{a_aray} in the TARGET 2 block. However, mapping an associate name and its selector is not valid for the same -\scode{target} construct. Hence the TARGET 3 block is non-conforming. +\kcode{target} construct. Hence the TARGET 3 block is non-conforming. -In TARGET 4, the \splc{scalr} selector used in the \scode{target} region +In TARGET 4, the \ucode{scalr} selector used in the \kcode{target} region has an implicit data-sharing attribute of firstprivate since it is a scalar. Hence, the assigned value is not returned. -In TARGET 5, the associate name \splc{a_scalr} is implicitly mapped and the -assigned value is returned to the host (default \scode{tofrom} mapping behavior). -In TARGET 6, the use of the associate name and its selector in the \scode{target} +In TARGET 5, the associate name \ucode{a_scalr} is implicitly mapped and the +assigned value is returned to the host (default \kcode{tofrom} mapping behavior). +In TARGET 6, the use of the associate name and its selector in the \kcode{target} region is conforming because the scalar firstprivate behavior of the selector and the implicit mapping of the associate name are allowed. -At the end of the \scode{target} region only the +At the end of the \kcode{target} region only the associate name's value is returned to the host. In TARGET 7, the selector and associate name appear in -an explicit mapping for the same \scode{target} construct, +an explicit mapping for the same \kcode{target} construct, hence the code block is non-conforming. 
\ffreenexample[5.1]{associate}{4} diff --git a/data_environment/carrays_fpriv.tex b/data_environment/carrays_fpriv.tex index fb94285..607ff0d 100644 --- a/data_environment/carrays_fpriv.tex +++ b/data_environment/carrays_fpriv.tex @@ -1,33 +1,33 @@ -\pagebreak -\section{C/C++ Arrays in a \code{firstprivate} Clause} +%\pagebreak +\section{C/C++ Arrays in a \kcode{firstprivate} Clause} \ccppspecificstart \label{sec:carrays_fpriv} -\index{clauses!firstprivate@\code{firstprivate}} -\index{firstprivate clause@\code{firstprivate} clause!C/C++ arrays in} +\index{clauses!firstprivate@\kcode{firstprivate}} +\index{firstprivate clause@\kcode{firstprivate} clause!C/C++ arrays in} The following example illustrates the size and value of list items of array or -pointer type in a \code{firstprivate} clause . The size of new list items is +pointer type in a \kcode{firstprivate} clause. The size of new list items is based on the type of the corresponding original list item, as determined by the base language. In this example: \begin{compactitem} -\item The type of \code{A} is array of two arrays of two ints. +\item The type of \ucode{A} is array of two arrays of two \bcode{int}s. -\item The type of \code{B} is adjusted to pointer to array of \code{n} -ints, because it is a function parameter. +\item The type of \ucode{B} is adjusted to pointer to array of \ucode{n} +\bcode{int}s, because it is a function parameter. -\item The type of \code{C} is adjusted to pointer to int, because +\item The type of \ucode{C} is adjusted to pointer to \bcode{int}, because it is a function parameter. -\item The type of \code{D} is array of two arrays of two ints. +\item The type of \ucode{D} is array of two arrays of two \bcode{int}s. -\item The type of \code{E} is array of \code{n} arrays of \code{n} -ints. +\item The type of \ucode{E} is array of \ucode{n} arrays of \ucode{n} +\bcode{int}s. \end{compactitem} -Note that \code{B} and \code{E} involve variable length array types. 
+Note that \ucode{B} and \ucode{E} involve variable length array types. The new items of array type are initialized as if each integer element of the original array is assigned to the corresponding element of the new array. Those of pointer diff --git a/data_environment/copyin.tex b/data_environment/copyin.tex index 103166e..57361b6 100644 --- a/data_environment/copyin.tex +++ b/data_environment/copyin.tex @@ -1,13 +1,13 @@ -\pagebreak -\section{\code{copyin} Clause} +%\pagebreak +\section{\kcode{copyin} Clause} \label{sec:copyin} -\index{clauses!copyin@\code{copyin}} -\index{copyin clause@\code{copyin} clause} -\index{directives!threadprivate@\code{threadprivate}} -\index{threadprivate directive@\code{threadprivate} directive} +\index{clauses!copyin@\kcode{copyin}} +\index{copyin clause@\kcode{copyin} clause} +\index{directives!threadprivate@\kcode{threadprivate}} +\index{threadprivate directive@\kcode{threadprivate} directive} -The \code{copyin} clause is used to initialize threadprivate data upon entry -to a \code{parallel} region. The value of the threadprivate variable in the primary +The \kcode{copyin} clause is used to initialize threadprivate data upon entry +to a \kcode{parallel} region. The value of the threadprivate variable in the primary thread is copied to the threadprivate variable of each other team member. 
\cexample{copyin}{1} diff --git a/data_environment/copyprivate.tex b/data_environment/copyprivate.tex index 8387048..463995a 100644 --- a/data_environment/copyprivate.tex +++ b/data_environment/copyprivate.tex @@ -1,22 +1,22 @@ -\pagebreak -\section{\code{copyprivate} Clause} +%\pagebreak +\section{\kcode{copyprivate} Clause} \label{sec:copyprivate} -\index{clauses!copyprivate@\code{copyprivate}} -\index{copyprivate clause@\code{copyprivate} clause} +\index{clauses!copyprivate@\kcode{copyprivate}} +\index{copyprivate clause@\kcode{copyprivate} clause} -The \code{copyprivate} clause can be used to broadcast values acquired by a single +The \kcode{copyprivate} clause can be used to broadcast values acquired by a single thread directly to all instances of the private variables in the other threads. In this example, if the routine is called from the sequential part, its behavior -is not affected by the presence of the directives. If it is called from a \code{parallel} -region, then the actual arguments with which \code{a} and \code{b} are associated +is not affected by the presence of the directives. If it is called from a \kcode{parallel} +region, then the actual arguments with which \ucode{a} and \ucode{b} are associated must be private. -\index{constructs!single@\code{single}} -\index{single construct@\code{single} construct} -The thread that executes the structured block associated with the \code{single} - construct broadcasts the values of the private variables \code{a}, \code{b}, -\code{x}, and -\code{y} from its implicit task's data environment to the data environments +\index{constructs!single@\kcode{single}} +\index{single construct@\kcode{single} construct} +The thread that executes the structured block associated with the \kcode{single} + construct broadcasts the values of the private variables \ucode{a}, \ucode{b}, +\ucode{x}, and +\ucode{y} from its implicit task's data environment to the data environments of the other implicit tasks in the thread team. 
The broadcast completes before any of the threads have left the barrier at the end of the construct. @@ -24,32 +24,32 @@ \section{\code{copyprivate} Clause} \fexample{copyprivate}{1} -\index{constructs!masked@\code{masked}} -\index{masked construct@\code{masked} construct} +\index{constructs!masked@\kcode{masked}} +\index{masked construct@\kcode{masked} construct} In this example, assume that the input must be performed by the primary thread. -Since the \code{masked} construct does not support the \code{copyprivate} clause, -it cannot broadcast the input value that is read. However, \code{copyprivate} +Since the \kcode{masked} construct does not support the \kcode{copyprivate} clause, +it cannot broadcast the input value that is read. However, \kcode{copyprivate} is used to broadcast an address where the input value is stored. \cexample[5.1]{copyprivate}{2} \fexample[5.1]{copyprivate}{2} -Suppose that the number of lock variables required within a \code{parallel} region -cannot easily be determined prior to entering it. The \code{copyprivate} clause +Suppose that the number of lock variables required within a \kcode{parallel} region +cannot easily be determined prior to entering it. The \kcode{copyprivate} clause can be used to provide access to shared lock variables that are allocated within -that \code{parallel} region. +that \kcode{parallel} region. \cexample{copyprivate}{3} \fortranspecificstart \fnexample{copyprivate}{3} -Note that the effect of the \code{copyprivate} clause on a variable with the -\code{allocatable} attribute is different than on a variable with the \code{pointer} -attribute. The value of \code{A} is copied (as if by intrinsic assignment) and -the pointer \code{B} is copied (as if by pointer assignment) to the corresponding -list items in the other implicit tasks belonging to the \code{parallel} region. 
+Note that the effect of the \kcode{copyprivate} clause on a variable with the +\bcode{allocatable} attribute is different than on a variable with the \bcode{pointer} +attribute. The value of \ucode{A} is copied (as if by intrinsic assignment) and +the pointer \ucode{B} is copied (as if by pointer assignment) to the corresponding +list items in the other implicit tasks belonging to the \kcode{parallel} region. \fnexample{copyprivate}{4} \fortranspecificend diff --git a/data_environment/cpp_reference.tex b/data_environment/cpp_reference.tex index 6022330..e77c0a9 100644 --- a/data_environment/cpp_reference.tex +++ b/data_environment/cpp_reference.tex @@ -5,11 +5,11 @@ \section{C++ Reference in Data-Sharing Clauses} \index{data-sharing clauses, C++ reference in} C++ reference types are allowed in data-sharing attribute clauses as of OpenMP 4.5, except -for the \code{threadprivate}, \code{copyin} and \code{copyprivate} clauses. -(See the Data-Sharing Attribute Clauses Section of the 4.5 OpenMP specification.) +for the \kcode{threadprivate}, \kcode{copyin} and \kcode{copyprivate} clauses. +(See the \docref{Data-Sharing Attribute Clauses} section of the 4.5 OpenMP specification.) When a variable with C++ reference type is privatized, the object the reference refers to is privatized in addition to the reference itself. The following example shows the use of reference types in data-sharing clauses in the usual way. -Additionally it shows how the data-sharing of formal arguments with a C++ reference type on an orphaned task generating construct is determined implicitly. (See the Data-sharing Attribute Rules for Variables Referenced in a Construct Section of the 4.5 OpenMP specification.) +Additionally it shows how the data-sharing of formal arguments with a C++ reference type on an orphaned task generating construct is determined implicitly. (See the \docref{Data-sharing Attribute Rules for Variables Referenced in a Construct} section of the 4.5 OpenMP specification.) 
\cppnexample[4.5]{cpp_reference}{1} diff --git a/data_environment/default_none.tex b/data_environment/default_none.tex index 93a189a..da244c4 100644 --- a/data_environment/default_none.tex +++ b/data_environment/default_none.tex @@ -1,17 +1,17 @@ -\pagebreak -\section{\code{default(none)} Clause} +%\pagebreak +\section{\kcode{default(none)} Clause} \label{sec:default_none} -\index{clauses!default(none)@\code{default(none)}} -\index{default(none) clause@\code{default(none)} clause} +\index{clauses!default(none)@\kcode{default(none)}} +\index{default(none) clause@\kcode{default(none)} clause} -The following example distinguishes the variables that are affected by the \code{default(none)} +The following example distinguishes the variables that are affected by the \kcode{default(none)} clause from those that are not. \ccppspecificstart -Beginning with OpenMP 4.0, variables with \code{const}-qualified type and no mutable member -are no longer predetermined shared. Thus, these variables (variable \plc{c} in the example) +Beginning with OpenMP 4.0, variables with \bcode{const}-qualified type and no mutable member +are no longer predetermined shared. Thus, these variables (variable \ucode{c} in the example) need to be explicitly listed -in data-sharing attribute clauses when the \code{default(none)} clause is specified. +in data-sharing attribute clauses when the \kcode{default(none)} clause is specified. 
\cnexample{default_none}{1} \ccppspecificend diff --git a/data_environment/fort_loopvar.tex b/data_environment/fort_loopvar.tex index 0cd098f..52af4bb 100644 --- a/data_environment/fort_loopvar.tex +++ b/data_environment/fort_loopvar.tex @@ -1,14 +1,15 @@ -\pagebreak +%\pagebreak \section{Fortran Private Loop Iteration Variables} \label{sec:fort_loopvar} \fortranspecificstart \index{loop variables, Fortran} In general loop iteration variables will be private, when used in the \plc{do-loop} -of a \code{do} and \code{parallel do} construct or in sequential loops in a -\code{parallel} construct (see Section 2.7.1 and Section 2.14.1 of +of a \kcode{do} and \kcode{parallel do} construct or in sequential loops in a +\kcode{parallel} construct (see the \docref{Loop Construct} section and +the \docref{Data-sharing Attribute Rules} section of the OpenMP 4.0 specification). In the following example of a sequential -loop in a \code{parallel} construct the loop iteration variable \plc{I} will +loop in a \kcode{parallel} construct the loop iteration variable \ucode{I} will be private. 
\ffreenexample{fort_loopvar}{1} diff --git a/data_environment/fort_sa_private.tex b/data_environment/fort_sa_private.tex index e0e1098..2c3af7f 100644 --- a/data_environment/fort_sa_private.tex +++ b/data_environment/fort_sa_private.tex @@ -1,26 +1,23 @@ -\pagebreak -\section{Fortran Restrictions on Storage Association with the \code{private} Clause} -\fortranspecificstart +%\pagebreak +\section{Fortran Restrictions on Storage Association with the \kcode{private} Clause} \label{sec:fort_sa_private} -\index{clauses!private@\code{private}} -\index{private clause@\code{private} clause!storage association, Fortran} +\index{clauses!private@\kcode{private}} +\index{private clause@\kcode{private} clause!storage association, Fortran} -The following non-conforming examples illustrate the implications of the \code{private} +The following non-conforming examples illustrate the implications of the \kcode{private} clause rules with regard to storage association. +\pagebreak +\fortranspecificstart \fnexample{fort_sa_private}{1} \fnexample{fort_sa_private}{2} -\clearpage \fnexample{fort_sa_private}{3} -% blue line floater at top of this page for "Fortran, cont." -\begin{figure}[t!] 
-\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em} -\end{figure} \fnexample{fort_sa_private}{4} +\topmarker{Fortran} \fnexample[5.1]{fort_sa_private}{5} \fortranspecificend diff --git a/data_environment/fort_shared_var.tex b/data_environment/fort_shared_var.tex new file mode 100644 index 0000000..57faa54 --- /dev/null +++ b/data_environment/fort_shared_var.tex @@ -0,0 +1,45 @@ +%\pagebreak +\section{Passing Shared Variable to Procedure in Fortran} +\fortranspecificstart +\label{sec:fort_shared_var} +\index{clauses!shared@\kcode{shared}} +\index{shared clause@\kcode{shared} clause!storage association, Fortran} + +Passing a shared variable to a procedure in Fortran may result in the use of +temporary storage in place of the actual argument when the corresponding dummy +argument does not have the \bcode{VALUE} or \bcode{CONTIGUOUS} attribute and +its data-sharing attribute is implementation-defined as per the rules in +Section \docref{Variables Referenced in a Region but not in a Construct} of +the OpenMP Specification. +These conditions effectively result in references to, and definitions of, the +temporary storage during the procedure reference. Furthermore, the value of the +shared variable is copied into the intervening temporary storage before the +procedure reference when the dummy argument does not have the +\bcode{INTENT(OUT)} attribute, and is copied out of the temporary storage into +the shared variable when the dummy argument does not have the +\bcode{INTENT(IN)} attribute. Any references to (or definitions of) the shared +storage that is associated with the dummy argument by any other task must be +synchronized with the procedure reference to avoid possible data races. + +The following examples illustrate the implications of passing a shared +variable \ucode{a} to subroutine \ucode{sub1} or \ucode{sub2} in +a \kcode{parallel} region. 
+For \ucode{sub1}, an implementation may or may not generate a copy-in/copy-out +for the temporary storage associated with variable \ucode{b}. +If there is a copy-in/copy-out, the code for copy-in/copy-out will result in +a race condition, even though there is an \kcode{atomic} +directive for the update of variable \ucode{b(i)} in the subroutine. +If the implementation can create a temporary descriptor for \ucode{a(::2)} +with the correct stride and pass it to subroutine \ucode{sub1}, +the same memory is accessed inside the subroutine and the result +(\ucode{sum1}) is then well defined. +For \ucode{sub2}, there is the \bcode{CONTIGUOUS} attribute for +variable \ucode{b} and the implementation will generate a copy-in/copy-out +for the temporary storage. +The code will have a race condition and the result (\ucode{sum2}) is +not well defined. + +\topmarker{Fortran} +\ffreenexample{fort_shared_var}{1} +\fortranspecificend + diff --git a/data_environment/fort_sp_common.tex b/data_environment/fort_sp_common.tex index 962b44d..1fbebe7 100644 --- a/data_environment/fort_sp_common.tex +++ b/data_environment/fort_sp_common.tex @@ -1,36 +1,33 @@ -\pagebreak -\section{Fortran Restrictions on \code{shared} and \code{private} Clauses with Common Blocks} +%\pagebreak +\section{Fortran Restrictions on \kcode{shared} and \kcode{private} Clauses with Common Blocks} \fortranspecificstart \label{sec:fort_sp_common} -\index{clauses!private@\code{private}} -\index{clauses!shared@\code{shared}} -\index{private clause@\code{private} clause!common blocks, Fortran} -\index{shared clause@\code{shared} clause!common blocks, Fortran} +\index{clauses!private@\kcode{private}} +\index{clauses!shared@\kcode{shared}} +\index{private clause@\kcode{private} clause!common blocks, Fortran} +\index{shared clause@\kcode{shared} clause!common blocks, Fortran} -When a named common block is specified in a \code{private}, \code{firstprivate}, -or \code{lastprivate} clause of a construct, none of its
members may be declared +When a named common block is specified in a \kcode{private}, \kcode{firstprivate}, +or \kcode{lastprivate} clause of a construct, none of its members may be declared in another data-sharing attribute clause on that construct. The following examples illustrate this point. The following example is conforming: +\pagebreak \fnexample{fort_sp_common}{1} The following example is also conforming: \fnexample{fort_sp_common}{2} -% blue line floater at top of this page for "Fortran, cont." -%\begin{figure}[t!] -%\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em} -%\end{figure} -\clearpage +\topmarker{Fortran} The following example is conforming: \fnexample{fort_sp_common}{3} -The following example is non-conforming because \code{x} is a constituent element -of \code{c}: +The following example is non-conforming because \ucode{x} is a constituent element +of \ucode{c}: \fnexample{fort_sp_common}{4} diff --git a/data_environment/lastprivate.tex b/data_environment/lastprivate.tex index 306b4f6..bf644a1 100644 --- a/data_environment/lastprivate.tex +++ b/data_environment/lastprivate.tex @@ -1,11 +1,11 @@ -\pagebreak -\section{\code{lastprivate} Clause} +%\pagebreak +\section{\kcode{lastprivate} Clause} \label{sec:lastprivate} -\index{clauses!lastprivate@\code{lastprivate}} -\index{lastprivate clause@\code{lastprivate} clause} +\index{clauses!lastprivate@\kcode{lastprivate}} +\index{lastprivate clause@\kcode{lastprivate} clause} Correct execution sometimes depends on the value that the last iteration of a loop -assigns to a variable. Such programs must list all such variables in a \code{lastprivate} +assigns to a variable. Such programs must list all such variables in a \kcode{lastprivate} clause so that the values of the variables are the same as when the loop is executed sequentially. 
@@ -13,11 +13,10 @@ \section{\code{lastprivate} Clause} \fexample{lastprivate}{1} -\clearpage -\index{lastprivate clause@\code{lastprivate} clause!conditional modifier@\code{conditional} modifier} -\index{conditional modifier@\code{conditional} modifier} -The next example illustrates the use of the \code{conditional} modifier in -a \code{lastprivate} clause to return the last value when it may not come from +\index{lastprivate clause@\kcode{lastprivate} clause!conditional modifier@\kcode{conditional} modifier} +\index{conditional modifier@\kcode{conditional} modifier} +The next example illustrates the use of the \kcode{conditional} modifier in +a \kcode{lastprivate} clause to return the last value when it may not come from the last iteration of a loop. That is, users can preserve the serial equivalence semantics of the loop. The conditional lastprivate ensures the final value of the variable after the loop diff --git a/data_environment/private.tex b/data_environment/private.tex index 2950fb0..0c9d2cd 100644 --- a/data_environment/private.tex +++ b/data_environment/private.tex @@ -1,28 +1,29 @@ -\pagebreak -\section{\code{private} Clause} +%\pagebreak +\section{\kcode{private} Clause} \label{sec:private} -\index{clauses!private@\code{private}} -\index{private clause@\code{private} clause} +\index{clauses!private@\kcode{private}} +\index{private clause@\kcode{private} clause} -In the following example, the values of original list items \plc{i} and \plc{j} -are retained on exit from the \code{parallel} region, while the private list -items \plc{i} and \plc{j} are modified within the \code{parallel} construct. +In the following example, the values of original list items \ucode{i} and \ucode{j} +are retained on exit from the \kcode{parallel} region, while the private list +items \ucode{i} and \ucode{j} are modified within the \kcode{parallel} construct. 
\cexample{private}{1} \fexample{private}{1} -In the following example, all uses of the variable \plc{a} within the loop construct -in the routine \plc{f} refer to a private list item \plc{a}, while it is -unspecified whether references to \plc{a} in the routine \plc{g} are to a +\pagebreak +In the following example, all uses of the variable \ucode{a} within the loop construct +in the routine \ucode{f} refer to a private list item \ucode{a}, while it is +unspecified whether references to \ucode{a} in the routine \ucode{g} are to a private list item or the original list item. \cexample{private}{2} \fexample{private}{2} -The following example demonstrates that a list item that appears in a \code{private} - clause in a \code{parallel} construct may also appear in a \code{private} +The following example demonstrates that a list item that appears in a \kcode{private} + clause in a \kcode{parallel} construct may also appear in a \kcode{private} clause in an enclosed worksharing construct, which results in an additional private copy. diff --git a/data_environment/reduction.tex b/data_environment/reduction.tex index df321e1..ceeebcb 100644 --- a/data_environment/reduction.tex +++ b/data_environment/reduction.tex @@ -5,15 +5,15 @@ \section{Reduction} This section covers ways to perform reductions in parallel, task, taskloop, and SIMD regions. 
-\subsection{\code{reduction} Clause} +\subsection{\kcode{reduction} Clause} \label{subsec:reduction} -\index{clauses!reduction@\code{reduction}} -\index{reduction clause@\code{reduction} clause} -\index{reductions!reduction clause@\code{reduction} clause} +\index{clauses!reduction@\kcode{reduction}} +\index{reduction clause@\kcode{reduction} clause} +\index{reductions!reduction clause@\kcode{reduction} clause} -The following example demonstrates the \code{reduction} clause; note that some -reductions can be expressed in the loop in several ways, as shown for the \code{max} -and \code{min} reductions below: +The following example demonstrates the \kcode{reduction} clause; note that some +reductions can be expressed in the loop in several ways, as shown for the \kcode{max} +and \kcode{min} reductions below: \cexample[3.1]{reduction}{1} @@ -30,46 +30,43 @@ \subsection{\code{reduction} Clause} \ffreenexample{reduction}{2} The following program is non-conforming because the reduction is on the -\emph{intrinsic procedure name} \code{MAX} but that name has been redefined to be the variable -named \code{MAX}. +\emph{intrinsic procedure name} \bcode{MAX} but that name has been redefined to be the variable +named \ucode{MAX}. \ffreenexample{reduction}{3} -% blue line floater at top of this page for "Fortran, cont." -\begin{figure}[t!] -\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em} -\end{figure} +\topmarker{Fortran} The following conforming program performs the reduction using the -\emph{intrinsic procedure name} \code{MAX} even though the intrinsic \code{MAX} has been renamed -to \code{REN}. +\emph{intrinsic procedure name} \kcode{MAX} even though the intrinsic \bcode{MAX} has been renamed +to \ucode{REN}. \ffreenexample{reduction}{4} The following conforming program performs the reduction using -\plc{intrinsic procedure name} \code{MAX} even though the intrinsic \code{MAX} has been renamed -to \code{MIN}. 
+\plc{intrinsic procedure name} \kcode{MAX} even though the intrinsic \bcode{MAX} has been renamed +to \ucode{MIN}. \ffreenexample{reduction}{5} \fortranspecificend %\pagebreak -The following example is non-conforming because the initialization (\code{a = -0}) of the original list item \code{a} is not synchronized with the update of -\code{a} as a result of the reduction computation in the \code{for} loop. Therefore, -the example may print an incorrect value for \code{a}. +The following example is non-conforming because the initialization (\ucode{a = +0}) of the original list item \ucode{a} is not synchronized with the update of +\ucode{a} as a result of the reduction computation in the \bcode{for} loop. Therefore, +the example may print an incorrect value for \ucode{a}. -To avoid this problem, the initialization of the original list item \code{a} -should complete before any update of \code{a} as a result of the \code{reduction} +To avoid this problem, the initialization of the original list item \ucode{a} +should complete before any update of \ucode{a} as a result of the \kcode{reduction} clause. This can be achieved by adding an explicit barrier after the assignment -\code{a = 0}, or by enclosing the assignment \code{a = 0} in a \code{single} -directive (which has an implied barrier), or by initializing \code{a} before -the start of the \code{parallel} region. +\ucode{a = 0}, or by enclosing the assignment \ucode{a = 0} in a \kcode{single} +directive (which has an implied barrier), or by initializing \ucode{a} before +the start of the \kcode{parallel} region. \cexample[5.1]{reduction}{6} \fexample[5.1]{reduction}{6} -The following example demonstrates the reduction of array \plc{a}. In C/C++ this is illustrated by the explicit use of an array section \plc{a[0:N]} in the \code{reduction} clause. The corresponding Fortran example uses array syntax supported in the base language. 
As of the OpenMP 4.5 specification the explicit use of array section in the \code{reduction} clause in Fortran is not permitted. But this oversight has been fixed in the OpenMP 5.0 specification. +The following example demonstrates the reduction of array \ucode{a}. In C/C++ this is illustrated by the explicit use of an array section \ucode{a[0:N]} in the \kcode{reduction} clause. The corresponding Fortran example uses array syntax supported in the base language. As of the OpenMP 4.5 specification the explicit use of array section in the \kcode{reduction} clause in Fortran is not permitted. But this oversight has been fixed in the OpenMP 5.0 specification. \cexample[4.5]{reduction}{7} @@ -78,27 +75,27 @@ \subsection{\code{reduction} Clause} \subsection{Task Reduction} \label{subsec:task_reduction} -\index{clauses!task_reduction@\scode{task_reduction}} -\index{task_reduction clause@\scode{task_reduction} clause} -\index{reductions!task_reduction clause@\scode{task_reduction} clause} -\index{clauses!in_reduction@\scode{in_reduction}} -\index{in_reduction clause@\scode{in_reduction} clause} -\index{reductions!in_reduction clause@\scode{in_reduction} clause} +\index{clauses!task_reduction@\kcode{task_reduction}} +\index{task_reduction clause@\kcode{task_reduction} clause} +\index{reductions!task_reduction clause@\kcode{task_reduction} clause} +\index{clauses!in_reduction@\kcode{in_reduction}} +\index{in_reduction clause@\kcode{in_reduction} clause} +\index{reductions!in_reduction clause@\kcode{in_reduction} clause} -In OpenMP 5.0 the \code{task\_reduction} clause was created for the \code{taskgroup} construct, -to allow reductions among explicit tasks that have an \code{in\_reduction} clause. +In OpenMP 5.0 the \kcode{task_reduction} clause was created for the \kcode{taskgroup} construct, +to allow reductions among explicit tasks that have an \kcode{in_reduction} clause. 
-In the \plc{task\_reduction.1} example below a reduction is performed as the algorithm +In the \example{task_reduction.1} example below a reduction is performed as the algorithm traverses a linked list. The reduction statement is assigned to be an explicit task using -a \code{task} construct and is specified to be a reduction participant with -the \code{in\_reduction} clause. -A \code{taskgroup} construct encloses the tasks participating in the reduction, and -specifies, with the \code{task\_reduction} clause, that the taskgroup has tasks participating -in a reduction. After the \code{taskgroup} region the original variable will contain +a \kcode{task} construct and is specified to be a reduction participant with +the \kcode{in_reduction} clause. +A \kcode{taskgroup} construct encloses the tasks participating in the reduction, and +specifies, with the \kcode{task_reduction} clause, that the taskgroup has tasks participating +in a reduction. After the \kcode{taskgroup} region the original variable will contain the final value of the reduction. -Note: The \plc{res} variable is private in the \plc{linked\_list\_sum} routine -and is not required to be shared (as in the case of a \code{parallel} construct +Note: The \ucode{res} variable is private in the \ucode{linked_list_sum} routine +and is not required to be shared (as in the case of a \kcode{parallel} construct reduction). 
@@ -106,36 +103,36 @@ \subsection{Task Reduction} \ffreeexample[5.0]{task_reduction}{1} -\index{reduction clause@\code{reduction} clause!task modifier@\code{task} modifier} -\index{task modifier@\code{task} modifier} -In OpenMP 5.0 the \code{task} \plc{reduction-modifier} for the \code{reduction} clause was +\index{reduction clause@\kcode{reduction} clause!task modifier@\kcode{task} modifier} +\index{task modifier@\kcode{task} modifier} +In OpenMP 5.0 the \kcode{task} \plc{reduction-modifier} for the \kcode{reduction} clause was introduced to provide a means of performing reductions among implicit and explicit tasks. -The \code{reduction} clause of a \code{parallel} or worksharing construct may -specify the \code{task} \plc{reduction-modifier} to include explicit task reductions +The \kcode{reduction} clause of a \kcode{parallel} or worksharing construct may +specify the \kcode{task} \plc{reduction-modifier} to include explicit task reductions within their region, provided the reduction operators (\plc{reduction-identifiers}) -and variables (\plc{list items}) of the participating tasks match those of the +and variables (list items) of the participating tasks match those of the implicit tasks. -There are 2 reduction use cases (identified by USE CASE \#) in the \plc{task\_reduction.2} example below. +There are 2 reduction use cases (identified by USE CASE \#) in the \example{task_reduction.2} example below. -In USE CASE 1 a \code{task} modifier in the \code{reduction} clause -of the \code{parallel} construct is used to include the reductions of any -participating tasks, those with an \code{in\_reduction} clause and matching -\plc{reduction-identifiers} (\code{+}) and list items (\code{x}). 
+In USE CASE 1 a \kcode{task} modifier in the \kcode{reduction} clause +of the \kcode{parallel} construct is used to include the reductions of any +participating tasks, those with an \kcode{in_reduction} clause and matching +\plc{reduction-identifiers} (\kcode{+}) and list items (\ucode{x}). -Note, a \code{taskgroup} construct (with a \code{task\_reduction} clause) in not +Note, a \kcode{taskgroup} construct (with a \kcode{task_reduction} clause) is not necessary to scope the explicit task reduction (as seen in the example above). -Hence, even without the implicit task reduction statement (without the C \code{x++\;} -and Fortran \code{x=x+1} statements), the \code{task} \plc{reduction-modifier} -in a \code{reduction} clause of the \code{parallel} construct -can be used to avoid having to create a \code{taskgroup} construct -(and its \code{task\_reduction} clause) around the task generating structure. +Hence, even without the implicit task reduction statement (without the C \ucode{x++;} +and Fortran \ucode{x=x+1} statements), the \kcode{task} \plc{reduction-modifier} +in a \kcode{reduction} clause of the \kcode{parallel} construct +can be used to avoid having to create a \kcode{taskgroup} construct +(and its \kcode{task_reduction} clause) around the task generating structure. In USE CASE 2 tasks participating in the reduction are within a worksharing region (a parallel worksharing-loop construct). -Here, too, no \code{taskgroup} is required, and the \plc{reduction-identifier} (\code{+}) -and list item (variable \code{x}) match as required. +Here, too, no \kcode{taskgroup} is required, and the \plc{reduction-identifier} (\kcode{+}) +and list item (variable \ucode{x}) match as required. 
\cexample[5.0]{task_reduction}{2} @@ -145,39 +142,45 @@ \subsection{Task Reduction} \subsection{Reduction on Combined Target Constructs} \label{subsec:target_reduction} -\index{reduction clause@\code{reduction} clause!on target construct@on \code{target} construct} -\index{constructs!target@\code{target}} -\index{target construct@\code{target} construct} +\index{reduction clause@\kcode{reduction} clause!on target construct@on \kcode{target} construct} +\index{constructs!target@\kcode{target}} +\index{target construct@\kcode{target} construct} -When a \code{reduction} clause appears on a combined construct that combines -a \code{target} construct with another construct, there is an implicit map -of the list items with a \code{tofrom} map type for the \code{target} construct. +When a \kcode{reduction} clause appears on a combined construct that combines +a \kcode{target} construct with another construct, there is an implicit map +of the list items with a \kcode{tofrom} map type for the \kcode{target} construct. Otherwise, the list items (if they are scalar variables) would be -treated as firstprivate by default in the \code{target} construct, which +treated as firstprivate by default in the \kcode{target} construct, which is unlikely to provide the intended behavior since the result of the reduction that is in the firstprivate variable would be discarded -at the end of the \code{target} region. +at the end of the \kcode{target} region. -In the following example, the use of the \code{reduction} clause on \code{sum1} -or \code{sum2} should, by default, result in an implicit \code{tofrom} map for -that variable. So long as neither \code{sum1} nor \code{sum2} were already +In the following example, the use of the \kcode{reduction} clause on \ucode{sum1} +or \ucode{sum2} should, by default, result in an implicit \kcode{tofrom} map for +that variable. 
So long as neither \ucode{sum1} nor \ucode{sum2} were already present on the device, the mapping behavior ensures the value for -\code{sum1} computed in the first \code{target} construct is used in the -second \code{target} construct. +\ucode{sum1} computed in the first \kcode{target} construct is used in the +second \kcode{target} construct. + +Note: a \kcode{declare target} directive is needed for procedures, +\ucode{f} and \ucode{g}, called in a \kcode{target} region in Fortran codes. +This directive is not required in C codes because functions, \ucode{f} +and \ucode{g}, are defined in the same compilation unit of the \kcode{target} +construct in which these functions are called. \cexample[5.0]{target_reduction}{1} \ffreeexample[5.0]{target_reduction}{1} %\clearpage -In next example, the variables \code{sum1} and \code{sum2} remain on the -device for the duration of the \code{target}~\code{data} region so that it is +In the next example, the variables \ucode{sum1} and \ucode{sum2} remain on the +device for the duration of the \kcode{target data} region so that it is their device copies that are updated by the reductions. Note the significance -of mapping \code{sum1} on the second \code{target} construct; otherwise, it +of mapping \ucode{sum1} on the second \kcode{target} construct; otherwise, it would be treated by default as firstprivate and the result computed for -\code{sum1} in the prior \code{target} region may not be used. Alternatively, a -\code{target}~\code{update} construct could be used between the two -\code{target} constructs to update the host version of \code{sum1} with the +\ucode{sum1} in the prior \kcode{target} region may not be used. Alternatively, a +\kcode{target update} construct could be used between the two +\kcode{target} constructs to update the host version of \ucode{sum1} with the value that is in the corresponding device version after the completion of the first construct.
@@ -188,20 +191,20 @@ \subsection{Reduction on Combined Target Constructs} \subsection{Task Reduction with Target Constructs} \label{subsec:target_task_reduction} -\index{in_reduction clause@\scode{in_reduction} clause} -\index{constructs!target@\code{target}} -\index{target construct@\code{target} construct} +\index{in_reduction clause@\kcode{in_reduction} clause} +\index{constructs!target@\kcode{target}} +\index{target construct@\kcode{target} construct} -\index{clauses!enter@\code{enter}} -\index{enter clause@\code{enter} clause} +\index{clauses!enter@\kcode{enter}} +\index{enter clause@\kcode{enter} clause} The following examples illustrate how task reductions can apply to target tasks -that result from a \code{target} construct with the \code{in\_reduction} -clause. Here, the \code{in\_reduction} clause specifies that the target task +that result from a \kcode{target} construct with the \kcode{in_reduction} +clause. Here, the \kcode{in_reduction} clause specifies that the target task participates in the task reduction defined in the scope of the enclosing -\code{taskgroup} construct. Partial results from all tasks participating in the +\kcode{taskgroup} construct. Partial results from all tasks participating in the task reduction will be combined (in some order) into the original variable -listed in the \code{task\_reduction} clause before exiting the \code{taskgroup} +listed in the \kcode{task_reduction} clause before exiting the \kcode{taskgroup} region. 
\cexample[5.2]{target_task_reduction}{1} @@ -209,26 +212,26 @@ \subsection{Task Reduction with Target Constructs} \ffreeexample[5.2]{target_task_reduction}{1} \clearpage -\index{reduction clause@\code{reduction} clause!task modifier@\code{task} modifier} -\index{task modifier@\code{task} modifier} +\index{reduction clause@\kcode{reduction} clause!task modifier@\kcode{task} modifier} +\index{task modifier@\kcode{task} modifier} In the next pair of examples, the task reduction is defined by a -\code{reduction} clause with the \code{task} modifier, rather than a -\code{task\_reduction} clause on a \code{taskgroup} construct. Again, the +\kcode{reduction} clause with the \kcode{task} modifier, rather than a +\kcode{task_reduction} clause on a \kcode{taskgroup} construct. Again, the partial results from the participating tasks will be combined in some order -into the original reduction variable, \code{sum}. +into the original reduction variable, \ucode{sum}. \cexample[5.2]{target_task_reduction}{2a} \ffreeexample[5.2]{target_task_reduction}{2a} -\index{in_reduction clause@\scode{in_reduction} clause!with target construct@with \code{target} construct} -\index{constructs!target@\code{target}} -\index{target construct@\code{target} construct} -Next, the \code{task} modifier is again used to define a task reduction over +\index{in_reduction clause@\kcode{in_reduction} clause!with target construct@with \kcode{target} construct} +\index{constructs!target@\kcode{target}} +\index{target construct@\kcode{target} construct} +Next, the \kcode{task} modifier is again used to define a task reduction over participating tasks. This time, the participating tasks are a target task -resulting from a \code{target} construct with the \code{in\_reduction} clause, +resulting from a \kcode{target} construct with the \kcode{in_reduction} clause, and the implicit task (executing on the primary thread) that calls -\code{host\_compute}. 
As before, the partial results from these participating +\ucode{host_compute}. As before, the partial results from these participating tasks are combined in some order into the original reduction variable. \cexample[5.2]{target_task_reduction}{2b} @@ -238,25 +241,25 @@ \subsection{Task Reduction with Target Constructs} \subsection{Taskloop Reduction} \label{subsec:taskloop_reduction} -\index{reduction clause@\code{reduction} clause!on taskloop construct@on \code{taskloop} construct} -\index{constructs!taskloop@\code{taskloop}} -\index{taskloop construct@\code{taskloop} construct} +\index{reduction clause@\kcode{reduction} clause!on taskloop construct@on \kcode{taskloop} construct} +\index{constructs!taskloop@\kcode{taskloop}} +\index{taskloop construct@\kcode{taskloop} construct} -In the OpenMP 5.0 Specification the \code{taskloop} construct +In the OpenMP 5.0 Specification the \kcode{taskloop} construct was extended to include the reductions. The following two examples show how to implement a reduction over an array using taskloop reduction in two different ways. In the first -example we apply the \code{reduction} clause to the \code{taskloop} construct. As it was +example we apply the \kcode{reduction} clause to the \kcode{taskloop} construct. As it was explained above in the task reduction examples, a reduction over tasks is divided in two components: the scope of the reduction, which is defined by a -\code{taskgroup} region, and the tasks that participate in the reduction. In this -example, the \code{reduction} clause defines both semantics. First, it specifies that -the implicit \code{taskgroup} region associated with the \code{taskloop} construct is the scope of the -reduction, and second, it defines all tasks created by the \code{taskloop} construct as +\kcode{taskgroup} region, and the tasks that participate in the reduction. In this +example, the \kcode{reduction} clause defines both semantics. 
First, it specifies that +the implicit \kcode{taskgroup} region associated with the \kcode{taskloop} construct is the scope of the +reduction, and second, it defines all tasks created by the \kcode{taskloop} construct as participants of the reduction. About the first property, it is important to note -that if we add the \code{nogroup} clause to the \code{taskloop} construct the code will be +that if we add the \kcode{nogroup} clause to the \kcode{taskloop} construct the code will be nonconforming, basically because we have a set of tasks that participate in a reduction that has not been defined. @@ -278,18 +281,18 @@ \subsection{Taskloop Reduction} %create a new reduction and also that all tasks generated by the taskloop will %participate on it. -The second example computes exactly the same value as in the preceding \plc{taskloop\_reduction.1} code section, +The second example computes exactly the same value as in the preceding \example{taskloop_reduction.1} code section, but in a very different way. -First, in the \plc{array\_sum} function a \code{taskgroup} region is created -that defines the scope of a new reduction using the \code{task\_reduction} clause. +First, in the \ucode{array_sum} function a \kcode{taskgroup} region is created +that defines the scope of a new reduction using the \kcode{task_reduction} clause. After that, a task and also the tasks generated by a taskloop participate in -that reduction by using the \code{in\_reduction} clause on the \code{task} -and \code{taskloop} constructs, respectively. -Note that the \code{nogroup} clause was added to the \code{taskloop} construct. -This is allowed because what is expressed with the \code{in\_reduction} clause -is different from what is expressed with the \code{reduction} clause. +that reduction by using the \kcode{in_reduction} clause on the \kcode{task} +and \kcode{taskloop} constructs, respectively. +Note that the \kcode{nogroup} clause was added to the \kcode{taskloop} construct. 
+This is allowed because what is expressed with the \kcode{in_reduction} clause +is different from what is expressed with the \kcode{reduction} clause. In one case the generated tasks are specified to participate in a previously -declared reduction (\code{in\_reduction} clause) whereas in the other case +declared reduction (\kcode{in_reduction} clause) whereas in the other case creation of a new reduction is specified and also all tasks generated by the taskloop will participate on it. @@ -297,70 +300,70 @@ \subsection{Taskloop Reduction} \ffreeexample[5.0]{taskloop_reduction}{2} %\clearpage -In the OpenMP 5.0 Specification, \code{reduction} clauses for the -\code{taskloop}~\code{ simd} construct were also added. +In the OpenMP 5.0 Specification, \kcode{reduction} clauses for the +\kcode{taskloop simd} construct were also added. -\index{reduction clause@\code{reduction} clause!on taskloop simd construct@on \code{taskloop}~\code{simd} construct} -\index{combined constructs!taskloop simd@\code{taskloop}~\code{simd}} -\index{taskloop simd construct@\code{taskloop}~\code{simd} construct} -The examples below compare reductions for the \code{taskloop} and the \code{taskloop}~\code{simd} constructs. -These examples illustrate the use of \code{reduction} clauses within -``stand-alone'' \code{taskloop} constructs, and the use of \code{in\_reduction} clauses for tasks of taskloops to participate +\index{reduction clause@\kcode{reduction} clause!on taskloop simd construct@on \kcode{taskloop simd} construct} +\index{combined constructs!taskloop simd@\kcode{taskloop simd}} +\index{taskloop simd construct@\kcode{taskloop simd} construct} +The examples below compare reductions for the \kcode{taskloop} and the \kcode{taskloop simd} constructs. 
+These examples illustrate the use of \kcode{reduction} clauses within +``stand-alone'' \kcode{taskloop} constructs, and the use of \kcode{in_reduction} clauses for tasks of taskloops to participate with other reductions within the scope of a parallel region. \textbf{taskloop reductions:} In the \plc{taskloop reductions} section of the example below, -\plc{taskloop 1} uses the \code{reduction} clause -in a \code{taskloop} construct for a sum reduction, accumulated in \plc{asum}. -The behavior is as though a \code{taskgroup} construct encloses the -taskloop region with a \code{task\_reduction} clause, and each taskloop -task has an \code{in\_reduction} clause with the specifications -of the \code{reduction} clause. -At the end of the taskloop region \plc{asum} contains the result of the reduction. - -The next taskloop, \plc{taskloop 2}, illustrates the use of the -\code{in\_reduction} clause to participate in a previously defined -reduction scope of a \code{parallel} construct. - -The task reductions of \plc{task 2} and \plc{taskloop 2} are combined -across the \code{taskloop} construct and the single \code{task} construct, as specified -in the \code{reduction(task,}~\code{+:asum)} clause of the \code{parallel} construct. -At the end of the parallel region \plc{asum} contains the combined result of all reductions. +\example{taskloop 1} uses the \kcode{reduction} clause +in a \kcode{taskloop} construct for a sum reduction, accumulated in \ucode{asum}. +The behavior is as though a \kcode{taskgroup} construct encloses the +taskloop region with a \kcode{task_reduction} clause, and each taskloop +task has an \kcode{in_reduction} clause with the specifications +of the \kcode{reduction} clause. +At the end of the taskloop region \ucode{asum} contains the result of the reduction. + +The next taskloop, \example{taskloop 2}, illustrates the use of the +\kcode{in_reduction} clause to participate in a previously defined +reduction scope of a \kcode{parallel} construct. 
+ +The task reductions of \example{task 2} and \example{taskloop 2} are combined +across the \kcode{taskloop} construct and the single \kcode{task} construct, as specified +in the \kcode{reduction(task,+: \ucode{asum})} clause of the \kcode{parallel} construct. +At the end of the parallel region \ucode{asum} contains the combined result of all reductions. \textbf{taskloop simd reductions:} -Reductions for the \code{taskloop}~\code{simd} construct are shown in the second half of the code. -Since each component construct, \code{taskloop} and \code{simd}, -can accept a reduction-type clause, the \code{taskloop}~\code{simd} construct +Reductions for the \kcode{taskloop simd} construct are shown in the second half of the code. +Since each component construct, \kcode{taskloop} and \kcode{simd}, +can accept a reduction clause, the \kcode{taskloop simd} construct is a composite construct, and the specific application of the reduction clause is defined -within the \code{taskloop}~\code{simd} construct section of the OpenMP 5.0 Specification. +within the \docref{\kcode{taskloop simd} Construct} section of the OpenMP 5.0 Specification. The code below illustrates use cases for these reductions. In the \plc{taskloop simd reduction} section of the example below, -\plc{taskloop simd 3} uses the \code{reduction} clause -in a \code{taskloop}~\code{simd} construct for a sum reduction within a loop. -For this case a \code{reduction} clause is used, as one would use -for a \code{simd} construct. +\example{taskloop simd 3} uses the \kcode{reduction} clause +in a \kcode{taskloop simd} construct for a sum reduction within a loop. +For this case a \kcode{reduction} clause is used, as one would use +for a \kcode{simd} construct. The SIMD reductions of each task are combined, and the results of these tasks are further -combined just as in the \code{taskloop} construct with the \code{reduction} clause for \plc{taskloop 1}. 
-At the end of the taskloop region \plc{asum} contains the combined result of all reductions. +combined just as in the \kcode{taskloop} construct with the \kcode{reduction} clause for \example{taskloop 1}. +At the end of the taskloop region \ucode{asum} contains the combined result of all reductions. -If a \code{taskloop}~\code{simd} construct is to participate in a previously defined +If a \kcode{taskloop simd} construct is to participate in a previously defined reduction scope, the reduction participation should be specified with -a \code{in\_reduction} clause, as shown in the \code{parallel} region enclosing -\plc{task 4} and \plc{taskloop simd 4} code sections. +a \kcode{in_reduction} clause, as shown in the \kcode{parallel} region enclosing +\example{task 4} and \example{taskloop simd 4} code sections. -Here the \code{taskloop}~\code{simd} construct's -\code{in\_reduction} clause specifies participation of the construct's tasks as +Here the \kcode{taskloop simd} construct's +\kcode{in_reduction} clause specifies participation of the construct's tasks as a task reduction within the scope of the parallel region. -That is, the results of each task of the \code{taskloop} construct component -contribute to the reduction in a broader level, just as in \plc{parallel reduction a} code section above. -Also, each \code{simd}-component construct -occurs as if it has a \code{reduction} clause, and the +That is, the results of each task of the \kcode{taskloop} construct component +contribute to the reduction in a broader level, just as in \example{parallel reduction a} code section above. +Also, each \kcode{simd}-component construct +occurs as if it has a \kcode{reduction} clause, and the SIMD results of each task are combined as though to form a single result for -each task (that participates in the \code{in\_reduction} clause). -At the end of the parallel region \plc{asum} contains the combined result of all reductions. 
+each task (that participates in the \kcode{in_reduction} clause). +At the end of the parallel region \ucode{asum} contains the combined result of all reductions. %Just as in \plc{parallel reduction a} the %\code{taskloop simd} construct reduction results are combined @@ -376,15 +379,15 @@ \subsection{Taskloop Reduction} \ffreeexample[5.1]{taskloop_simd_reduction}{1} -\subsection{Reduction with the \code{scope} Construct} +\subsection{Reduction with the \kcode{scope} Construct} \label{subsec:reduction_scope} -\index{reduction clause@\code{reduction} clause!on scope construct@on \code{scope} construct} -\index{constructs!scope@\code{scope}} -\index{scope construct@\code{scope} construct} +\index{reduction clause@\kcode{reduction} clause!on scope construct@on \kcode{scope} construct} +\index{constructs!scope@\kcode{scope}} +\index{scope construct@\kcode{scope} construct} -The following example illustrates the use of the \code{scope} construct -to perform a reduction in a \code{parallel} region. The case is useful for -producing a reduction and accessing reduction variables inside a \code{parallel} region +The following example illustrates the use of the \kcode{scope} construct +to perform a reduction in a \kcode{parallel} region. The case is useful for +producing a reduction and accessing reduction variables inside a \kcode{parallel} region without using a worksharing-loop construct. 
\cppexample[5.1]{scope_reduction}{1} diff --git a/data_environment/scan.tex b/data_environment/scan.tex index fd47523..4de05bd 100644 --- a/data_environment/scan.tex +++ b/data_environment/scan.tex @@ -1,47 +1,47 @@ -\pagebreak -\section{\code{scan} Directive} +%\pagebreak +\section{\kcode{scan} Directive} \label{sec:scan} -\index{directives!scan@\code{scan}} -\index{scan directive@\code{scan} directive} -\index{reduction clause@\code{reduction} clause!inscan modifier@\code{inscan} modifier} -\index{inscan modifier@\code{inscan} modifier} +\index{directives!scan@\kcode{scan}} +\index{scan directive@\kcode{scan} directive} +\index{reduction clause@\kcode{reduction} clause!inscan modifier@\kcode{inscan} modifier} +\index{inscan modifier@\kcode{inscan} modifier} The following examples illustrate how to parallelize a loop that saves the \emph{prefix sum} of a reduction. This is accomplished by using -the \code{inscan} modifier in the \code{reduction} clause for the input -variable of the scan, and specifying with a \code{scan} directive whether +the \kcode{inscan} modifier in the \kcode{reduction} clause for the input +variable of the scan, and specifying with a \kcode{scan} directive whether the storage statement includes or excludes the scan input of the present -iteration (\texttt{k}). - -\index{scan directive@\code{scan} directive!inclusive clause@\code{inclusive} clause} -\index{scan directive@\code{scan} directive!exclusive clause@\code{exclusive} clause} -\index{clauses!inclusive@\code{inclusive}} -\index{inclusive clause@\code{inclusive} clause} -\index{clauses!exclusive@\code{exclusive}} -\index{exclusive clause@\code{exclusive} clause} -Basically, the \code{inscan} modifier connects a loop and/or SIMD reduction to -the scan operation, and a \code{scan} construct with an \code{inclusive} or -\code{exclusive} clause specifies whether the ``scan phase'' (lexical block +iteration (\ucode{k}). 
+ +\index{scan directive@\kcode{scan} directive!inclusive clause@\kcode{inclusive} clause} +\index{scan directive@\kcode{scan} directive!exclusive clause@\kcode{exclusive} clause} +\index{clauses!inclusive@\kcode{inclusive}} +\index{inclusive clause@\kcode{inclusive} clause} +\index{clauses!exclusive@\kcode{exclusive}} +\index{exclusive clause@\kcode{exclusive} clause} +Basically, the \kcode{inscan} modifier connects a loop and/or SIMD reduction to +the scan operation, and a \kcode{scan} construct with an \kcode{inclusive} or +\kcode{exclusive} clause specifies whether the ``scan phase'' (lexical block before and after the directive, respectively) is to use an \plc{inclusive} or -\plc{exclusive} scan value for the list item (\texttt{x}). +\plc{exclusive} scan value for the list item (\ucode{x}). The first example uses the \plc{inclusive} scan operation on a composite -loop-SIMD construct. The \code{scan} directive separates the reduction -statement on variable \texttt{x} from the use of \texttt{x} (saving to array \texttt{b}). +loop-SIMD construct. The \kcode{scan} directive separates the reduction +statement on variable \ucode{x} from the use of \ucode{x} (saving to array \ucode{b}). The order of the statements in this example indicates that -value \texttt{a[k]} (\texttt{a(k)} in Fortran) is included in the computation of -the prefix sum \texttt{b[k]} (\texttt{b(k)} in Fortran) for iteration \texttt{k}. +value \ucode{a[k]} (\ucode{a(k)} in Fortran) is included in the computation of +the prefix sum \ucode{b[k]} (\ucode{b(k)} in Fortran) for iteration \ucode{k}. \cexample[5.0]{scan}{1} \ffreeexample[5.0]{scan}{1} The second example uses the \plc{exclusive} scan operation on a composite -loop-SIMD construct. The \code{scan} directive separates the use of \texttt{x} -(saving to array \texttt{b}) from the reduction statement on variable \texttt{x}. +loop-SIMD construct. 
The \kcode{scan} directive separates the use of \ucode{x} +(saving to array \ucode{b}) from the reduction statement on variable \ucode{x}. The order of the statements in this example indicates that -value \texttt{a[k]} (\texttt{a(k)} in Fortran) is excluded from the computation -of the prefix sum \texttt{b[k]} (\texttt{b(k)} in Fortran) for iteration \texttt{k}. +value \ucode{a[k]} (\ucode{a(k)} in Fortran) is excluded from the computation +of the prefix sum \ucode{b[k]} (\ucode{b(k)} in Fortran) for iteration \ucode{k}. \cexample[5.0]{scan}{2} diff --git a/data_environment/sources/fort_shared_var.1.f90 b/data_environment/sources/fort_shared_var.1.f90 new file mode 100644 index 0000000..85970a2 --- /dev/null +++ b/data_environment/sources/fort_shared_var.1.f90 @@ -0,0 +1,51 @@ +! @@name: fort_shared_var.1 +! @@type: F-free +! @@operation: run +! @@expect: undefined +! @@version: pre_omp_3.0 +program fort_shared_var + implicit none + integer, parameter :: N = 100 + integer a(N) + integer i + interface + subroutine sub1(b) + integer b(:) + end subroutine + subroutine sub2(b) + integer, contiguous :: b(:) + end subroutine + end interface + + a = [(i, i=1,N)] + !$omp parallel shared(a) num_threads(2) + call sub1(a(::2)) ! copy-in/copy-out may or may not occur + !$omp end parallel + print *, 'sum1 =', sum(a) ! sum1 may/may not be well defined + + a = [(i, i=1,N)] + !$omp parallel shared(a) num_threads(2) + call sub2(a(::2)) ! copy-in/copy-out result in a data race + !$omp end parallel + print *, 'sum2 =', sum(a) ! 
sum2 is not well defined +end + +subroutine sub1(b) + implicit none + integer b(:) + integer i + do i = 1, size(b) + !$omp atomic + b(i) = b(i) + 1 + end do +end subroutine + +subroutine sub2(b) + implicit none + integer, contiguous :: b(:) + integer i + do i = 1, size(b) + !$omp atomic + b(i) = b(i) + 1 + end do +end subroutine diff --git a/data_environment/sources/target_reduction.1.f90 b/data_environment/sources/target_reduction.1.f90 index 122a797..ffcf993 100644 --- a/data_environment/sources/target_reduction.1.f90 +++ b/data_environment/sources/target_reduction.1.f90 @@ -31,9 +31,11 @@ function g(res) integer function f(res) integer :: res + !$omp declare target enter(f) f = res*2 end function integer function g(res) integer :: res + !$omp declare target enter(g) g = res*3 end function diff --git a/data_environment/sources/target_reduction.2.f90 b/data_environment/sources/target_reduction.2.f90 index 3a7a911..2e4088d 100644 --- a/data_environment/sources/target_reduction.2.f90 +++ b/data_environment/sources/target_reduction.2.f90 @@ -33,9 +33,11 @@ function g(res) integer function f(res) integer :: res + !$omp declare target enter(f) f = res*2 end function integer function g(res) integer :: res + !$omp declare target enter(g) g = res*3 end function diff --git a/data_environment/sources/target_task_reduction.1.f90 b/data_environment/sources/target_task_reduction.1.f90 index b1f4d57..387eea5 100644 --- a/data_environment/sources/target_task_reduction.1.f90 +++ b/data_environment/sources/target_task_reduction.1.f90 @@ -31,6 +31,7 @@ end subroutine host_compute subroutine device_compute(sum) integer :: sum + !$omp declare target enter(device_compute) sum = 1 end subroutine subroutine host_compute(sum) diff --git a/data_environment/sources/target_task_reduction.2a.f90 b/data_environment/sources/target_task_reduction.2a.f90 index 1ef81b7..3c576a3 100644 --- a/data_environment/sources/target_task_reduction.2a.f90 +++ 
b/data_environment/sources/target_task_reduction.2a.f90 @@ -29,6 +29,7 @@ end subroutine host_compute subroutine device_compute(sum) integer :: sum + !$omp declare target enter(device_compute) sum = 1 end subroutine subroutine host_compute(sum) diff --git a/data_environment/sources/target_task_reduction.2b.f90 b/data_environment/sources/target_task_reduction.2b.f90 index 882f166..67bac59 100644 --- a/data_environment/sources/target_task_reduction.2b.f90 +++ b/data_environment/sources/target_task_reduction.2b.f90 @@ -28,6 +28,7 @@ end subroutine host_compute subroutine device_compute(sum) integer :: sum + !$omp declare target enter(device_compute) sum = 1 end subroutine subroutine host_compute(sum) diff --git a/data_environment/sources/udr.3.f90 b/data_environment/sources/udr.3.f90 index dfb374b..412d5a9 100644 --- a/data_environment/sources/udr.3.f90 +++ b/data_environment/sources/udr.3.f90 @@ -5,7 +5,6 @@ ! @@version: omp_4.0 program max_loc implicit none - type :: mx_s real value integer index diff --git a/data_environment/threadprivate.tex b/data_environment/threadprivate.tex index 961f2af..1bc19b2 100644 --- a/data_environment/threadprivate.tex +++ b/data_environment/threadprivate.tex @@ -1,33 +1,34 @@ -\pagebreak -\section{\code{threadprivate} Directive} +%\pagebreak +\section{\kcode{threadprivate} Directive} \label{sec:threadprivate} -\index{directives!threadprivate@\code{threadprivate}} -\index{threadprivate directive@\code{threadprivate} directive} +\index{directives!threadprivate@\kcode{threadprivate}} +\index{threadprivate directive@\kcode{threadprivate} directive} -The following examples demonstrate how to use the \code{threadprivate} directive +The following examples demonstrate how to use the \kcode{threadprivate} directive to give each thread a separate counter. 
\cexample{threadprivate}{1} \fexample{threadprivate}{1} +\pagebreak \ccppspecificstart -The following example uses \code{threadprivate} on a static variable: +The following example uses \kcode{threadprivate} on a static variable: \cnexample{threadprivate}{2} The following example demonstrates unspecified behavior for the initialization -of a \code{threadprivate} variable. A \code{threadprivate} variable is initialized -once at an unspecified point before its first reference. Because \code{a} is -constructed using the value of \code{x} (which is modified by the statement -\code{x++}), the value of \code{a.val} at the start of the \code{parallel} -region could be either 1 or 2. This problem is avoided for \code{b}, which uses -an auxiliary \code{const} variable and a copy-constructor. +of a \kcode{threadprivate} variable. A \kcode{threadprivate} variable is initialized +once at an unspecified point before its first reference. Because \ucode{a} is +constructed using the value of \ucode{x} (which is modified by the statement +\ucode{x++}), the value of \ucode{a.val} at the start of the \kcode{parallel} +region could be either 1 or 2. This problem is avoided for \ucode{b}, which uses +an auxiliary \bcode{const} variable and a copy-constructor. \cppnexample{threadprivate}{3} \ccppspecificend -The following examples show non-conforming uses and correct uses of the \code{threadprivate} +The following examples show non-conforming uses and correct uses of the \kcode{threadprivate} directive. \fortranspecificstart @@ -45,62 +46,56 @@ \section{\code{threadprivate} Directive} \fnexample{threadprivate}{4} -The following is an example of the use of \code{threadprivate} for local variables: -% blue line floater at top of this page for "Fortran, cont." -\begin{figure}[t!] 
-\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em} -\end{figure} +The following is an example of the use of \kcode{threadprivate} for local variables: +\topmarker{Fortran} \fnexample{threadprivate}{5} The above program, if executed by two threads, will print one of the following two sets of output: -\code{a = 11 12 13} +\pout{a = 11 12 13} \\ -\code{ptr = 4} +\pout{ptr = 4} \\ -\code{i = 15} +\pout{i = 15} -\code{A is not allocated} +\pout{A is not allocated} \\ -\code{ptr = 4} +\pout{ptr = 4} \\ -\code{i = 5} +\pout{i = 5} or -\code{A is not allocated} +\pout{A is not allocated} \\ -\code{ptr = 4} +\pout{ptr = 4} \\ -\code{i = 15} +\pout{i = 15} -\code{a = 1 2 3} +\pout{a = 1 2 3} \\ -\code{ptr = 4} +\pout{ptr = 4} \\ -\code{i = 5} +\pout{i = 5} -The following is an example of the use of \code{threadprivate} for module variables: -% blue line floater at top of this page for "Fortran, cont." -\begin{figure}[t!] -\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em} -\end{figure} +The following is an example of the use of \kcode{threadprivate} for module variables: +\topmarker{Fortran} \fnexample{threadprivate}{6} \fortranspecificend \cppspecificstart -The following example illustrates initialization of \code{threadprivate} variables -for class-type \code{T}. \code{t1} is default constructed, \code{t2} is constructed -taking a constructor accepting one argument of integer type, \code{t3} is copy -constructed with argument \code{f()}: +The following example illustrates initialization of \kcode{threadprivate} variables +for class-type \ucode{T}. \ucode{t1} is default constructed, \ucode{t2} is constructed +taking a constructor accepting one argument of integer type, \ucode{t3} is copy +constructed with argument \ucode{f()}: \cppnexample{threadprivate}{4} -The following example illustrates the use of \code{threadprivate} for static -class members. 
The \code{threadprivate} directive for a static class member must +The following example illustrates the use of \kcode{threadprivate} for static +class members. The \kcode{threadprivate} directive for a static class member must be placed inside the class definition. \cppnexample{threadprivate}{5} diff --git a/data_environment/udr.tex b/data_environment/udr.tex index 6eabd02..210a11c 100644 --- a/data_environment/udr.tex +++ b/data_environment/udr.tex @@ -1,20 +1,20 @@ \subsection{User-Defined Reduction} \label{subsec:UDR} \index{reductions!user-defined} -\index{reductions!declare reduction directive@\code{declare}~\code{reduction} directive} -\index{declare reduction directive@\code{declare}~\code{reduction} directive} -\index{directives!declare reduction@\code{declare}~\code{reduction}} -\index{declare reduction directive@\code{declare}~\code{reduction} directive!initializer clause@\code{initializer} clause} -\index{declare reduction directive@\code{declare}~\code{reduction} directive!combiner} -\index{declare reduction directive@\code{declare}~\code{reduction} directive!OpenMP variable identifiers} -\index{OpenMP variable identifiers!omp_in@\scode{omp_in}} -\index{OpenMP variable identifiers!omp_out@\scode{omp_out}} -\index{OpenMP variable identifiers!omp_priv@\scode{omp_priv}} +\index{reductions!declare reduction directive@\kcode{declare reduction} directive} +\index{declare reduction directive@\kcode{declare reduction} directive} +\index{directives!declare reduction@\kcode{declare reduction}} +\index{declare reduction directive@\kcode{declare reduction} directive!initializer clause@\kcode{initializer} clause} +\index{declare reduction directive@\kcode{declare reduction} directive!combiner} +\index{declare reduction directive@\kcode{declare reduction} directive!OpenMP variable identifiers} +\index{OpenMP variable identifiers!omp_in@\kcode{omp_in}} +\index{OpenMP variable identifiers!omp_out@\kcode{omp_out}} +\index{OpenMP variable 
identifiers!omp_priv@\kcode{omp_priv}} \index{combiner} -\index{clauses!initializer@\code{initializer}} -\index{initializer clause@\code{initializer} clause} +\index{clauses!initializer@\kcode{initializer}} +\index{initializer clause@\kcode{initializer} clause} -The \code{declare}~\code{reduction} directive can be used to specify +The \kcode{declare reduction} directive can be used to specify user-defined reductions (UDR) for user data types. %The following examples show how user-defined reductions can be used to support user data types in the \code{reduction} clause. @@ -22,81 +22,105 @@ \subsection{User-Defined Reduction} %The following example computes the enclosing rectangle of a set of points. The point data structure (\code{struct}~\code{point}) is not supported by the \code{reduction} clause. Using two \code{declare}~\code{reduction} directives we define how a reduction for the point data structure is done for the \plc{min} and \plc{max} operations. Each \code{declare}~\code{reduction} directive calls the appropriate function that passes the two special variables that can be used in the user-defined reduction expression: \code{omp\_in}, which holds one of the two values to reduce, and \code{omp\_out}, which holds the other value and should hold also the result of the reduction once the expression has been executed. Note, also, that when defining the user-defined reduction for \plc{min} we specify how the private variables of each thread are to be initialized (that is, the neutral value). This is not the case for \plc{max} as the default values (that is, zero filling) are already adequate. 
-In the following example, \code{declare}~\code{reduction} directives are used to define -\plc{min} and \plc{max} operations for the \plc{point} data structure for computing +In the following example, \kcode{declare reduction} directives are used to define +\plc{min} and \plc{max} operations for the \ucode{point} data structure for computing the rectangle that encloses a set of 2-D points. -Each \code{declare}~\code{reduction} directive defines new reduction identifiers, -\plc{min} and \plc{max}, to be used in a \code{reduction} clause. The next item in the -declaration list is the data type (\plc{struct} \plc{point}) used in the reduction, -followed by the combiner, here the functions \plc{minproc} and \plc{maxproc} perform -the min and max operations, respectively, on the user data (of type \plc{struct} \plc{point}). -In the function argument list are two special OpenMP variable identifiers, \code{omp\_in} and \code{omp\_out}, +Each \kcode{declare reduction} directive defines new reduction identifiers, +\ucode{min} and \ucode{max}, to be used in a \kcode{reduction} clause. The next item in the +declaration list is the data type (\ucode{struct point}) used in the reduction, +followed by the combiner, here the functions \ucode{minproc} and \ucode{maxproc} perform +the min and max operations, respectively, on the user data (of type \ucode{struct point}). +In the function argument list are two special OpenMP variable identifiers, \kcode{omp_in} and \kcode{omp_out}, that denote the two values to be combined in the ``real'' function; -the \code{omp\_out} identifier indicates which one is to hold the result. +the \kcode{omp_out} identifier indicates which one is to hold the result. -The initializer of the \code{declare}~\code{reduction} directive specifies +The initializer of the \kcode{declare reduction} directive specifies the initial value for the private variable of each implicit task. -The \code{omp\_priv} identifier is used to denote the private variable. 
+The \kcode{omp_priv} identifier is used to denote the private variable. \cexample[4.0]{udr}{1} %\clearpage The following example shows the corresponding code in Fortran. -The \code{declare}~\code{reduction} directives are specified as part of -the declaration in subroutine \plc{find\_enclosing\_rectangle} and +The \kcode{declare reduction} directives are specified as part of +the declaration in subroutine \ucode{find_enclosing_rectangle} and the procedures that perform the min and max operations are specified as subprograms. \ffreeexample[4.0]{udr}{1} -The following example shows the same computation as \plc{udr.1} but it illustrates that you can craft complex expressions in the user-defined reduction declaration. In this case, instead of calling the \plc{minproc} and \plc{maxproc} functions we inline the code in a single expression. +The following example shows the same computation as \example{udr.1} but it illustrates that you can craft complex expressions in the user-defined +reduction declaration. In this case, instead of calling the \ucode{minproc} +and \ucode{maxproc} functions we inline the code in a single expression. \cexample[4.0]{udr}{2} The corresponding code of the same example in Fortran is very similar -except that the assignment expression in the \code{declare}~\code{reduction} +except that the assignment expression in the \kcode{declare reduction} directive can only be used for a single variable, in this case through -a type structure constructor \plc{point($\ldots$)}. +a type structure constructor \ucode{point($\ldots$)}. \ffreeexample[4.0]{udr}{2} -\index{OpenMP variable identifiers!omp_orig@\scode{omp_orig}} -The following example shows the use of special variables in arguments for combiner (\code{omp\_in} and \code{omp\_out}) and initializer (\code{omp\_priv} and \code{omp\_orig}) routines. This example returns the maximum value of an array and the corresponding index value. 
The \code{declare}~\code{reduction} directive specifies a user-defined reduction operation \plc{maxloc} for data type \plc{struct} \plc{mx\_s}. The function \plc{mx\_combine} is the combiner and the function \plc{mx\_init} is the initializer. +\index{OpenMP variable identifiers!omp_orig@\kcode{omp_orig}} +The following example shows the use of special variables in arguments for +combiner (\kcode{omp_in} and \kcode{omp_out}) and initializer (\kcode{omp_priv} +and \kcode{omp_orig}) routines. This example returns the maximum value of an +array and the corresponding index value. The \kcode{declare reduction} +directive specifies a user-defined reduction operation \ucode{maxloc} for +data type \ucode{struct mx_s}. The function \ucode{mx_combine} is the combiner +and the function \ucode{mx_init} is the initializer. \cexample[4.0]{udr}{3} -Below is the corresponding Fortran version of the above example. The \code{declare}~\code{reduction} directive specifies the user-defined operation \plc{maxloc} for user-derived type \plc{mx\_s}. The combiner \plc{mx\_combine} and the initializer \plc{mx\_init} are specified as subprograms. +Below is the corresponding Fortran version of the above example. The +\kcode{declare reduction} directive specifies the user-defined operation +\ucode{maxloc} for user-derived type \ucode{mx_s}. The combiner +\ucode{mx_combine} and the initializer \ucode{mx_init} are specified as +subprograms. \ffreeexample[4.0]{udr}{3} The following example explains a few details of the user-defined reduction -in Fortran through modules. The \code{declare}~\code{reduction} directive is declared in a module (\plc{data\_red}). -The reduction-identifier \plc{.add.} is a user-defined operator that is +in Fortran through modules. The \kcode{declare reduction} directive is declared in a module (\ucode{data_red}). +The reduction-identifier \ucode{.add.} is a user-defined operator that is to allow accessibility in the scope that performs the reduction operation. 
-The user-defined operator \plc{.add.} and the subroutine \plc{dt\_init} specified in the \code{initializer} clause are defined in the same subprogram. +The user-defined operator \ucode{.add.} and the subroutine \ucode{dt_init} specified in the \kcode{initializer} clause are defined in the same subprogram. -The reduction operation (that is, the \code{reduction} clause) is in the main program. -The reduction identifier \plc{.add.} is accessible by use association. -Since \plc{.add.} is a user-defined operator, the explicit interface +The reduction operation (that is, the \kcode{reduction} clause) is in the main program. +The reduction identifier \ucode{.add.} is accessible by use association. +Since \ucode{.add.} is a user-defined operator, the explicit interface should also be accessible by use association in the current program unit. -Since the \code{declare}~\code{reduction} associated to this \code{reduction} clause -has the \code{initializer} clause, the subroutine specified on the clause +Since the \kcode{declare reduction} associated to this \kcode{reduction} clause +has the \kcode{initializer} clause, the subroutine specified on the clause must be accessible in the current scoping unit. In this case, -the subroutine \plc{dt\_init} is accessible by use association. +the subroutine \ucode{dt_init} is accessible by use association. \ffreeexample[4.0]{udr}{4} -The following example uses user-defined reductions to declare a plus (+) reduction for a C++ class. As the \code{declare}~\code{reduction} directive is inside the context of the \plc{V} class the expressions in the \code{declare}~\code{reduction} directive are resolved in the context of the class. Also, note that the \code{initializer} clause uses a copy constructor to initialize the private variables of the reduction and it uses as parameter to its original variable by using the special variable \code{omp\_orig}. 
+The following example uses user-defined reductions to declare a plus (\kcode{+}) +reduction for a C++ class. As the \kcode{declare reduction} directive is inside +the context of the \ucode{V} class the expressions in the \kcode{declare +reduction} directive are resolved in the context of the class. Also, note that +the \kcode{initializer} clause uses a copy constructor to initialize the +private variables of the reduction and it passes the original +variable as the parameter by using the special variable \kcode{omp_orig}. \cppexample[4.0]{udr}{5} -The following examples shows how user-defined reductions can be defined for some STL containers. The first \code{declare}~\code{reduction} defines the plus (+) operation for \plc{std::vector} by making use of the \plc{std::transform} algorithm. The second and third define the merge (or concatenation) operation for \plc{std::vector} and \plc{std::list}. +The following example shows how user-defined reductions can be defined for +some STL containers. The first \kcode{declare reduction} defines the plus +(\kcode{+}) +operation for \ucode{std::vector} by making use of the +\ucode{std::transform} algorithm. The second and third define the merge +(or concatenation) operation for \ucode{std::vector} and +\ucode{std::list}. %It shows how the same user-defined reduction operation can be defined to be done differently depending on the specified data type. It shows how the user-defined reduction operation can be applied to specific data types of an STL.
diff --git a/devices/C++_virtual_functions.tex b/devices/C++_virtual_functions.tex index c989258..5819b5a 100644 --- a/devices/C++_virtual_functions.tex +++ b/devices/C++_virtual_functions.tex @@ -1,4 +1,4 @@ -\pagebreak +%\pagebreak \section{C++ Virtual Functions} \label{sec:virtual_functions} @@ -6,33 +6,45 @@ \section{C++ Virtual Functions} The 5.2 OpenMP Specification clarified restrictions on the use of polymorphic classes and virtual functions when used within -\scode{target} regions. The following example identifies -problem cases in which the restrictions are not followed -(for Unified Shared Memory, as prescribed by the \scode{requires} -directive). +\kcode{target} regions. The following examples illustrate +use cases and the limitations imposed by restrictions for +references and the use of Unified Shared Memory. + +The first example illustrates two simple cases of using a virtual function +through a pointer and reference without Unified Shared Memory. + +A class, \ucode{D}, is derived from class \ucode{A}. +Function \ucode{vf} in class \ucode{A} is declared +virtual, and the function \ucode{vf} in class \ucode{D} is declared with override. +An object, \ucode{d} of type \ucode{D}, is created and mapped through a \kcode{target data} +directive. + +In the first case, a pointer of type \ucode{A}, \ucode{ap}, is assigned to point +to the derived object \ucode{d}, and in the first \kcode{target} region the pointer +is used to call the \ucode{vf} function of the derived class, \ucode{D}. + +In the second case, the reference variable \ucode{ar} of type \ucode{A} references +the derived object \ucode{d}. The use of the reference variable \ucode{ar} +in the \kcode{target} region is illegal due to the restriction that the static and dynamic +types must match when mapping an object for the first time. +That is, the behavior of the implicit map of \ucode{ar} +is non-conforming -- its static type \ucode{A} doesn't match its dynamic type \ucode{D}.
+Hence the behavior of the access to the virtual functions is unspecified. -The first section illustrates the restriction -that when mapping an object for the first time, -the static and dynamic types must match. - -For the first target region the behavior of the implicit map of \splc{ar} -is not specified-- its static type (A) doesn't match its dynamic type (D). -Hence access to the virtual functions is undefined. -However, the second target region can access \splc{D::vf()} -since the object to which \splc{ap} points is not mapped and -therefore the restriction does not apply. +\cppexample[5.2]{virtual_functions}{1} -The second section illustrates the restriction: +The second example illustrates the restriction: \emph{``Invoking a virtual member function of an object on a device other than the device on which the object was constructed results in unspecified behavior, unless the object is accessible and was constructed on the host device.''} -An instantiation of a polymorphic class (\splc{A}) occurs in the -\scode{target} region, and access of its virtual function +In the first case, an instantiation \ucode{ap} of a polymorphic class (\ucode{A}) occurs in the +\kcode{target} region, and access of its virtual function is incorrectly attempted on the host (another device). -However, once the object is deleted on -the target device and instantiated on the host, access within -the next \scode{target} region is permitted. -\cppexample[5.2]{virtual_functions}{1} +In the second case, the object \ucode{ap} is instantiated on the host; access of \ucode{ap} within +the next \kcode{target} region is permitted. (Unified Shared Memory is +used here to minimize mapping concerns.) 
+ +\cppexample[5.2]{virtual_functions}{2} diff --git a/devices/array_sections.tex b/devices/array_sections.tex index 87dd1c1..d0b3726 100644 --- a/devices/array_sections.tex +++ b/devices/array_sections.tex @@ -1,19 +1,19 @@ \pagebreak \section{Array Sections in Device Constructs} \label{sec:array_sections} -\index{array sections!in map clause@in \code{map} clause} -\index{map clause@\code{map} clause} -\index{clauses!map@\code{map}} -\index{target construct@\code{target} construct} -\index{constructs!target@\code{target}} -\index{target data construct@\code{target}~\code{data} construct} -\index{constructs!target data@\code{target}~\code{data}} +\index{array sections!in map clause@in \kcode{map} clause} +\index{map clause@\kcode{map} clause} +\index{clauses!map@\kcode{map}} +\index{target construct@\kcode{target} construct} +\index{constructs!target@\kcode{target}} +\index{target data construct@\kcode{target data} construct} +\index{constructs!target data@\kcode{target data}} -The following examples show the usage of array sections in \code{map} clauses -on \code{target} and \code{target} \code{data} constructs. +The following examples show the usage of \plc{array sections} in \kcode{map} clauses +on \kcode{target} and \kcode{target data} constructs. This example shows the invalid usage of two separate sections of the same array -inside of a \code{target} construct. +inside of a \kcode{target} construct. \cexample[4.0]{array_sections}{1} @@ -21,7 +21,7 @@ \section{Array Sections in Device Constructs} \pagebreak This example shows the invalid usage of two separate sections of the same array -inside of a \code{target} construct. +inside of a \kcode{target} construct. \cexample[4.0]{array_sections}{2} @@ -29,7 +29,7 @@ \section{Array Sections in Device Constructs} \pagebreak This example shows the valid usage of two separate sections of the same array inside -of a \code{target} construct. +of a \kcode{target} construct. 
\cexample[4.0]{array_sections}{3} @@ -37,7 +37,7 @@ \section{Array Sections in Device Constructs} \pagebreak This example shows the valid usage of a wholly contained array section of an already -mapped array section inside of a \code{target} construct. +mapped array section inside of a \kcode{target} construct. \cexample[4.0]{array_sections}{4} diff --git a/devices/array_shaping.tex b/devices/array_shaping.tex index 6615ebb..cfdb1e3 100644 --- a/devices/array_shaping.tex +++ b/devices/array_shaping.tex @@ -1,15 +1,15 @@ \section{Array Shaping} \label{sec:array-shaping} \index{array shaping!in motion-clause@in \plc{motion-clause}} -\index{constructs!target update@\code{target}~\code{update}} -\index{target update construct@\code{target}~\code{update} construct!to clause@\code{to} clause} -\index{target update construct@\code{target}~\code{update} construct!from clause@\code{from} clause} +\index{constructs!target update@\kcode{target update}} +\index{target update construct@\kcode{target update} construct!to clause@\kcode{to} clause} +\index{target update construct@\kcode{target update} construct!from clause@\kcode{from} clause} -\index{directives!declare target@\code{declare}~\code{target}} -\index{declare target directive@\code{declare}~\code{target} directive} +\index{directives!declare target@\kcode{declare target}} +\index{declare target directive@\kcode{declare target} directive} -\index{directives!begin declare target@\code{begin}~\code{declare}~\code{target}} -\index{begin declare target directive@\code{begin}~\code{declare}~\code{target} directive} +\index{directives!begin declare target@\kcode{begin declare target}} +\index{begin declare target directive@\kcode{begin declare target} directive} \ccppspecificstart A pointer variable can be shaped to a multi-dimensional array to facilitate @@ -20,27 +20,28 @@ \section{Array Shaping} \end{description} where each $s_i$ is an integral-type expression of positive value. 
The shape-operator can appear in either the \plc{motion-clause} -of the \code{target}~\code{update} directive or the \code{depend} clause. +of the \kcode{target update} directive or the \kcode{depend} clause. The following example shows the use of the shape-operator in the -\code{target}~\code{update} directive. The shape-operator \code{([nx][ny+2])} -casts pointer variable $a$ to a 2-dimensional array of size -\plc{nx}$\times$\plc{(ny+2)}. The resulting array is then accessed as -array sections (such as \code{[0:nx][1]} and \code{[0:nx][ny]}) -in the \code{from} or \code{to} clause for transferring two columns of +\kcode{target update} directive. The shape-operator \ucode{([nx][ny+2])} +casts pointer variable \ucode{a} to a 2-dimensional array of size +%\ucode{nx}$\times$\ucode{(ny+2)}. The resulting array is then accessed as +\ucode{nx}*\ucode{(ny+2)}. The resulting array is then accessed as +array sections (such as \ucode{[0:nx][1]} and \ucode{[0:nx][ny]}) +in the \kcode{from} or \kcode{to} clause for transferring two columns of noncontiguous boundary data from or to the device. Note the use of additional parentheses -around the shape-operator and $a$ to ensure the correct precedence +around the shape-operator and \ucode{a} to ensure the correct precedence over array-section operations. \cnexample[5.1]{array_shaping}{1} \ccppspecificend -\clearpage +%\clearpage The shape operator is not defined for Fortran. Explicit array shaping of procedure arguments can be used instead to achieve a similar goal. -Below is the Fortran-equivalent of the above example that illustrates +Below is the Fortran equivalent of the above example that illustrates the support of transferring two rows of noncontiguous boundary -data in the \code{target}~\code{update} directive. +data in the \kcode{target update} directive. 
\ffreeexample[5.2]{array_shaping}{1} diff --git a/devices/async_target_depend.tex b/devices/async_target_depend.tex index 5410d8a..ea5b742 100644 --- a/devices/async_target_depend.tex +++ b/devices/async_target_depend.tex @@ -1,14 +1,14 @@ \pagebreak -\section{Asynchronous \code{target} Execution and Dependences} +\section{Asynchronous \kcode{target} Execution and Dependences} \label{sec:async_target_exec_depend} -Asynchronous execution of a \code{target} region can be accomplished -by creating an explicit task around the \code{target} region. Examples +Asynchronous execution of a \kcode{target} region can be accomplished +by creating an explicit task around the \kcode{target} region. Examples with explicit tasks are shown at the beginning of this section. -As of OpenMP 4.5 and beyond the \code{nowait} clause can be used on the -\code{target} directive for asynchronous execution. Examples with -\code{nowait} clauses follow the explicit \code{task} examples. +As of OpenMP 4.5 and beyond the \kcode{nowait} clause can be used on the +\kcode{target} directive for asynchronous execution. Examples with +\kcode{nowait} clauses follow the examples with explicit tasks. -This section also shows the use of \code{depend} clauses to order +This section also shows the use of \kcode{depend} clauses to order executions through dependences. 
diff --git a/devices/async_target_nowait.tex b/devices/async_target_nowait.tex index c60ea8e..5e14fe0 100644 --- a/devices/async_target_nowait.tex +++ b/devices/async_target_nowait.tex @@ -1,33 +1,33 @@ -\subsection{\code{nowait} Clause on \code{target} Construct} +\subsection{\kcode{nowait} Clause on \kcode{target} Construct} \label{subsec:target_nowait_clause} -\index{target construct@\code{target} construct!nowait clause@\code{nowait} clause} -\index{clauses!nowait@\code{nowait}} -\index{nowait clause@\code{nowait} clause} +\index{target construct@\kcode{target} construct!nowait clause@\kcode{nowait} clause} +\index{clauses!nowait@\kcode{nowait}} +\index{nowait clause@\kcode{nowait} clause} The following example shows how to execute code asynchronously on a -device without an explicit task. The \code{nowait} clause on a \code{target} +device without an explicit task. The \kcode{nowait} clause on a \kcode{target} construct allows the thread of the \plc{target task} to perform other -work while waiting for the \code{target} region execution to complete. -Hence, the \code{target} region can execute asynchronously on the +work while waiting for the \kcode{target} region execution to complete. +Hence, the \kcode{target} region can execute asynchronously on the device (without requiring a host thread to idle while waiting for -the \plc{target task} execution to complete). +the target task execution to complete). -In this example the product of two vectors (arrays), \plc{v1} -and \plc{v2}, is formed. One half of the operations is performed +In this example the product of two vectors (arrays), \ucode{v1} +and \ucode{v2}, is formed. One half of the operations is performed on the device, and the last half on the host, concurrently. 
After a team of threads is formed the primary thread generates -the \plc{target task} while the other threads can continue on, without a barrier, +the target task while the other threads can continue on, without a barrier, to the execution of the host portion of the vector product. -The completion of the \plc{target task} (asynchronous target execution) is +The completion of the target task (asynchronous target execution) is guaranteed by the synchronization in the implicit barrier at the end of the -host vector-product worksharing loop region. See the \code{barrier} +host vector-product worksharing loop region. See the \kcode{barrier} glossary entry in the OpenMP specification for details. -The host loop scheduling is \code{dynamic}, to balance the host thread executions, since +The host loop scheduling is \kcode{dynamic}, to balance the host thread executions, since one thread is being used for offload generation. In the situation where -little time is spent by the \plc{target task} in setting -up and tearing down the target execution, \code{static} scheduling may be desired. +little time is spent by the target task in setting +up and tearing down the target execution, \kcode{static} scheduling may be desired. 
\cexample[5.1]{async_target}{3} diff --git a/devices/async_target_nowait_depend.tex b/devices/async_target_nowait_depend.tex index 8ea4e01..3b1b55b 100644 --- a/devices/async_target_nowait_depend.tex +++ b/devices/async_target_nowait_depend.tex @@ -1,21 +1,21 @@ %begin -\subsection{Asynchronous \code{target} with \code{nowait} and \code{depend} Clauses} +\subsection{Asynchronous \kcode{target} with \kcode{nowait} and \kcode{depend} Clauses} \label{subsec:async_target_nowait_depend} -\index{target construct@\code{target} construct!nowait clause@\code{nowait} clause} -\index{target construct@\code{target} construct!depend clause@\code{depend} clause} -\index{nowait clause@\code{nowait} clause} -\index{depend clause@\code{depend} clause} -\index{clauses!nowait@\code{nowait}} -\index{clauses!depend@\code{depend}} +\index{target construct@\kcode{target} construct!nowait clause@\kcode{nowait} clause} +\index{target construct@\kcode{target} construct!depend clause@\kcode{depend} clause} +\index{nowait clause@\kcode{nowait} clause} +\index{depend clause@\kcode{depend} clause} +\index{clauses!nowait@\kcode{nowait}} +\index{clauses!depend@\kcode{depend}} More details on dependences can be found in \specref{sec:task_depend}, Task Dependences. In this example, there are three flow dependences. In the first two dependences the target task does not execute until the preceding explicit tasks have finished. These -dependences are produced by arrays \plc{v1} and \plc{v2} with the \code{out} dependence type in the first two tasks, and the \code{in} dependence type in the target task. +dependences are produced by arrays \ucode{v1} and \ucode{v2} with the \kcode{out} dependence type in the first two tasks, and the \kcode{in} dependence type in the \plc{target task}. -The last dependence is produced by array \plc{p} with the \code{out} dependence type in the target task, and the \code{in} dependence type in the last task. The last task does not execute until the target task finishes. 
+The last dependence is produced by array \ucode{p} with the \kcode{out} dependence type in the target task, and the \kcode{in} dependence type in the last task. The last task does not execute until the target task finishes. -The \code{nowait} clause on the \code{target} construct creates a deferrable \plc{target task}, allowing the encountering task to continue execution without waiting for the completion of the \plc{target task}. +The \kcode{nowait} clause on the \kcode{target} construct creates a deferrable target task, allowing the encountering task to continue execution without waiting for the completion of the target task. \cexample[4.5]{async_target}{4} diff --git a/devices/async_target_with_tasks.tex b/devices/async_target_with_tasks.tex index ab274f8..83c76d4 100644 --- a/devices/async_target_with_tasks.tex +++ b/devices/async_target_with_tasks.tex @@ -1,34 +1,34 @@ -\subsection{Asynchronous \code{target} with Tasks} +\subsection{Asynchronous \kcode{target} with Tasks} \label{subsec:async_target_with_tasks} -\index{target construct@\code{target} construct} -\index{task construct@\code{task} construct} +\index{target construct@\kcode{target} construct} +\index{task construct@\kcode{task} construct} -\index{directives!declare target@\code{declare}~\code{target}} -\index{declare target directive@\code{declare}~\code{target} directive} +\index{directives!declare target@\kcode{declare target}} +\index{declare target directive@\kcode{declare target} directive} -\index{directives!begin declare target@\code{begin}~\code{declare}~\code{target}} -\index{begin declare target directive@\code{begin}~\code{declare}~\code{target} directive} +\index{directives!begin declare target@\kcode{begin declare target}} +\index{begin declare target directive@\kcode{begin declare target} directive} -The following example shows how the \code{task} and \code{target} constructs -are used to execute multiple \code{target} regions asynchronously. 
The task that -encounters the \code{task} construct generates an explicit task that contains -a \code{target} region. The thread executing the explicit task encounters a task -scheduling point while waiting for the execution of the \code{target} region +The following example shows how the \kcode{task} and \kcode{target} constructs +are used to execute multiple \kcode{target} regions asynchronously. The task that +encounters the \kcode{task} construct generates an explicit task that contains +a \kcode{target} region. The thread executing the explicit task encounters a task +scheduling point while waiting for the execution of the \kcode{target} region to complete, allowing the thread to switch back to the execution of the encountering task or one of the previously generated explicit tasks. \cexample[5.1]{async_target}{1} \pagebreak -\index{directives!declare target@\code{declare}~\code{target}} -\index{declare target directive@\code{declare}~\code{target} directive} -The Fortran version has an interface block that contains the \code{declare} \code{target}. +\index{directives!declare target@\kcode{declare target}} +\index{declare target directive@\kcode{declare target} directive} +The Fortran version has an interface block that contains the \kcode{declare target}. An identical statement exists in the function declaration (not shown here). \ffreeexample[4.0]{async_target}{1} -The following example shows how the \code{task} and \code{target} constructs -are used to execute multiple \code{target} regions asynchronously. The task dependence +The following example shows how the \kcode{task} and \kcode{target} constructs +are used to execute multiple \kcode{target} regions asynchronously. The task dependence ensures that the storage is allocated and initialized on the device before it is accessed. @@ -36,31 +36,31 @@ \subsection{Asynchronous \code{target} with Tasks} The Fortran example below is similar to the C version above. 
Instead of pointers, though, it uses the convenience of Fortran allocatable arrays on the device. In order to preserve the arrays -allocated on the device across multiple \code{target} regions, a \code{target}~\code{data} region +allocated on the device across multiple \kcode{target} regions, a \kcode{target data} region is used in this case. -If there is no shape specified for an allocatable array in a \code{map} clause, only the array descriptor +If there is no shape specified for an allocatable array in a \kcode{map} clause, only the array descriptor (also called a dope vector) is mapped. That is, device space is created for the descriptor, and it -is initially populated with host values. In this case, the \plc{v1} and \plc{v2} arrays will be in a -non-associated state on the device. When space for \plc{v1} and \plc{v2} is allocated on the device -in the first \code{target} region the addresses to the space will be included in their descriptors. +is initially populated with host values. In this case, the \ucode{v1} and \ucode{v2} arrays will be in a +non-associated state on the device. When space for \ucode{v1} and \ucode{v2} is allocated on the device +in the first \kcode{target} region the addresses to the space will be included in their descriptors. -At the end of the first \code{target} region, the arrays \plc{v1} and \plc{v2} are preserved on the device -for access in the second \code{target} region. At the end of the second \code{target} region, the data -in array \plc{p} is copied back, the arrays \plc{v1} and \plc{v2} are not. +At the end of the first \kcode{target} region, the arrays \ucode{v1} and \ucode{v2} are preserved on the device +for access in the second \kcode{target} region. At the end of the second \kcode{target} region, the data +in array \ucode{p} is copied back, the arrays \ucode{v1} and \ucode{v2} are not. 
-\index{task construct@\code{task} construct!depend clause@\code{depend} clause} -\index{clauses!depend@\code{depend}} -\index{depend clause@\code{depend} clause} -A \code{depend} clause is used in the \code{task} directive to provide a wait at the beginning of the second -\code{target} region, to insure that there is no race condition with \plc{v1} and \plc{v2} in the two tasks. -It would be noncompliant to use \plc{v1} and/or \plc{v2} in lieu of \plc{N} in the \code{depend} clauses, -because the use of non-allocated allocatable arrays as list items in a \code{depend} clause would +\index{task construct@\kcode{task} construct!depend clause@\kcode{depend} clause} +\index{clauses!depend@\kcode{depend}} +\index{depend clause@\kcode{depend} clause} +A \kcode{depend} clause is used in the \kcode{task} directive to provide a wait at the beginning of the second +\kcode{target} region, to ensure that there is no race condition with \ucode{v1} and \ucode{v2} in the two tasks. +It would be noncompliant to use \ucode{v1} and/or \ucode{v2} in lieu of \ucode{N} in the \kcode{depend} clauses, +because the use of non-allocated allocatable arrays as list items in a \kcode{depend} clause would lead to unspecified behavior. \noteheader{--} This example is not strictly compliant with the OpenMP 4.5 specification since the allocation status -of allocatable arrays \plc{v1} and \plc{v2} is changed inside the \code{target} region, which is not allowed. -(See the restrictions for the \code{map} clause in the \plc{Data-mapping Attribute Rules and Clauses} +of allocatable arrays \ucode{v1} and \ucode{v2} is changed inside the \kcode{target} region, which is not allowed. +(See the restrictions for the \kcode{map} clause in the \docref{Data-mapping Attribute Rules and Clauses} section of the specification.) However, the intention is to relax the restrictions on mapping of allocatable variables in the next release of the specification so that the example will be compliant.
diff --git a/devices/declare_target.tex b/devices/declare_target.tex index c83d352..f1e5a7c 100644 --- a/devices/declare_target.tex +++ b/devices/declare_target.tex @@ -1,8 +1,8 @@ -\pagebreak +%\pagebreak \section{Declare Target Directive} \label{sec:declare_target} -%\index{declare target directive@\code{declare}~\code{target} directive!enter clause@\code{enter} clause} +%\index{declare target directive@\code{declare target} directive!enter clause@\code{enter} clause} %\index{enter clause@\code{enter} clause} %\index{clauses!enter@\code{enter}} @@ -10,96 +10,126 @@ \subsection{Declare Target Directive for a Procedure} \label{subsec:declare_target_function} -\index{directives!declare target@\code{declare}~\code{target}} -\index{declare target directive@\code{declare}~\code{target} directive} +\index{directives!declare target@\kcode{declare target}} +\index{declare target directive@\kcode{declare target} directive} -\index{directives!begin declare target@\code{begin}~\code{declare}~\code{target}} -\index{begin declare target directive@\code{begin}~\code{declare}~\code{target} directive} +\index{directives!begin declare target@\kcode{begin declare target}} +\index{begin declare target directive@\kcode{begin declare target} directive} The following example shows how the declare target directive -is used to indicate that the corresponding call inside a \code{target} region -is to a \code{fib} function that can execute on the default target device. +is used to indicate that the corresponding call inside a \kcode{target} region +is to a \ucode{fib} procedure that can execute on the default target device. -A version of the function is also available on the host device. When the \code{if} -clause conditional expression on the \code{target} construct evaluates to \plc{false}, -the \code{target} region (thus \code{fib}) will execute on the host device. +A version of the function is also available on the host device. 
When the \kcode{if} +clause conditional expression on the \kcode{target} construct evaluates to \plc{false}, +the \kcode{target} region (thus \ucode{fib}) will execute on the host device. -For the following C/C++ code the declaration of the function \code{fib} appears between the -\code{begin}~\code{declare}~\code{target} and \code{end}~\code{declare}~\code{target} directives. -In the corresponding Fortran code, the \code{declare}~\code{target} directive appears at the +For the following C/C++ code the declaration of the function \ucode{fib} appears between the +\kcode{begin declare target} and \kcode{end declare target} directives. +In the corresponding Fortran code, the \kcode{declare target} directive appears at the end of the specification part of the subroutine. \cexample[5.1]{declare_target}{1} -The Fortran \code{fib} subroutine contains a \code{declare} \code{target} declaration +The Fortran \ucode{fib} subroutine contains a \kcode{declare target} declaration to indicate to the compiler to create an device executable version of the procedure. -The subroutine name has not been included on the \code{declare} \code{target} +The subroutine name has not been included on the \kcode{declare target} directive and is, therefore, implicitly assumed. -The program uses the \code{module\_fib} module, which presents an explicit interface to -the compiler with the \code{declare} \code{target} declarations for processing -the \code{fib} call. +The program uses the \ucode{module_fib} module, which presents an explicit interface to +the compiler with the \kcode{declare target} declarations for processing +the \ucode{fib} call. \ffreeexample[4.0]{declare_target}{1} +\pagebreak The next Fortran example shows the use of an external subroutine. 
As the subroutine -is neither use associated nor an internal procedure, the \code{declare} \code{target} +is neither use associated nor an internal procedure, the \kcode{declare target} declarations within a external subroutine are unknown to the main program unit; -therefore, a \code{declare} \code{target} must be provided within the program +therefore, a \kcode{declare target} must be provided within the program scope for the compiler to determine that a target binary should be available. \ffreeexample[4.0]{declare_target}{2} +\subsection{Declare Target Directive for Indirect Procedure Call} +\label{subsec:indirect} + +\index{clauses!indirect@\kcode{indirect}} +\index{indirect clause@\kcode{indirect} clause} + +In the OpenMP 5.1 Specification the \kcode{indirect} clause was added to allow +indirect procedure calls, via function pointers, in a \kcode{target} region. +The functions to be allowed indirect invocation are specified in an \kcode{enter} +clause of a declare target directive, along with the \kcode{indirect} clause. +The clause has an optional enabling/disabling argument (default enabled). In the +absence of the \kcode{indirect} clause the function pointer would be mapped as a scalar +(firstprivate) that would point to the host versions of the functions. +The \kcode{indirect} clause informs the compiler that the functions may be +invoked via function pointers and that their device versions are to be used within +the \kcode{target} region. + +Only with an enabled \kcode{indirect} clause and a function specification in an \kcode{enter} clause +of a declare target directive may a function be called with an indirect invocation in a \kcode{target} region. +(Note: this feature limits the number of functions that can be used by function +pointers in the \kcode{target} region to a restricted list for the compiler.) +%% KFM should be "to a restricted... -> to those listed in the \code{enter} clause. 
+ +In the following example, the \kcode{declare target} \kcode{enter(\ucode{fun1,fun2})} +\kcode{indirect} directive specifies that the \ucode{fun1} and \ucode{fun2} functions may +be invoked with a function pointer in the \kcode{target} region. +Either the \ucode{fun1} or \ucode{fun2} function is invoked by the \ucode{fptr} function +pointer in the \kcode{target} construct, as determined by the value of \ucode{count}. + +\cexample[5.2]{declare_target_indirect_call}{1} +\ffreeexample[5.2]{declare_target_indirect_call}{1} + \subsection{Declare Target Directive for Class Type} \label{subsec:declare_target_class} -\index{directives!begin declare target@\code{begin}~\code{declare}~\code{target}} -\index{begin declare target directive@\code{begin}~\code{declare}~\code{target} directive} +\index{directives!begin declare target@\kcode{begin declare target}} +\index{begin declare target directive@\kcode{begin declare target} directive} -The following example shows the use of the \code{begin}~\code{declare}~\code{target} -and \code{end}~\code{declare}~\code{target} pair to designate the beginning and +The following example shows the use of the \kcode{begin declare target} +and \kcode{end declare target} pair to designate the beginning and end of the affected declarations, as introduced in OpenMP 5.1. -The \code{begin}~\code{declare}~\code{target} directive was defined +The \kcode{begin declare target} directive was defined to symmetrically complement the terminating (``end'') directive. \cppspecificstart -The example also shows 3 different ways to use a declare target directive for a -class and an external member-function definition (for the \plc{XOR1}, \plc{XOR2}, -and \plc{XOR3} classes and definitions for their corresponding \plc{foo} member functions). 
+The example also shows 3 different ways to use a \kcode{declare target} directive for a +class and an external member-function definition (for the \ucode{XOR1}, \ucode{XOR2}, +and \ucode{XOR3} classes and definitions for their corresponding \ucode{foo()} member functions). -For \plc{XOR1}, a \code{begin}~\code{declare}~\code{target} and -\code{end}~\code{declare}~\code{target} directive +For \ucode{XOR1}, a \kcode{begin declare target} and +\kcode{end declare target} directive enclose both the class and its member function definition. The compiler immediately -knows to create a device version of the function for execution in a \code{target} region. +knows to create a device version of the function for execution in a \kcode{target} region. -For \plc{XOR2}, the class member function definition is not specified with a -declare target directive. +For \ucode{XOR2}, the class member function definition is not specified with a +\kcode{declare target} directive. An implicit declare target is created for the member function definition. The same applies if this declaration arrangement for the class and function are included through a header file. -For \plc{XOR3}, the class and its member function are not enclosed by \code{begin}~\code{declare}~\code{target} -and \code{end}~\code{declare}~\code{target} directives, +For \ucode{XOR3}, the class and its member function are not enclosed by \kcode{begin declare target} +and \kcode{end declare target} directives, but there is an implicit declare target since the class, its function -and the \code{target} construct are in the same file scope. That is, the class -and its function are treated as if delimited by a declare target directive. +and the \kcode{target} construct are in the same file scope. That is, the class +and its function are treated as if delimited by a \kcode{declare target} directive. The same applies if the class and function are included through a header file. 
\cppnexample[5.1]{declare_target}{2a} -% blue line floater at top of this page for "C++, cont." -\begin{figure}[t!] -\linewitharrows{-1}{dashed}{C++ (cont.)}{8em} -\end{figure} +\topmarker{C++} Often class definitions and their function definitions are included in separate files, -as shown in \splc{declare_target.2b_classes.hpp} and \splc{declare_target.2b_functions.cpp} below. -In this case, it is necessary to specify in a declare target directive for the classes. -However, as long as the \splc{2b_functions.cpp} file includes the corresponding declare target classes, +as shown in \example{declare_target.2b_classes.hpp} and \example{declare_target.2b_functions.cpp} example code files below. +In this case, it is necessary to specify a declare target directive for the classes. +However, as long as the \example{2b_functions.cpp} file includes the corresponding declare target classes, there is no need to specify the functions with a declare target directive. The functions are treated as if they are specified with a declare target directive. -Compiling the \splc{declare_target.2b_functions.cpp} and \splc{declare_target.2b_main.cpp} files +Compiling the \example{declare_target.2b_functions.cpp} and \example{declare_target.2b_main.cpp} files separately and linking them, will create appropriate executable device functions for the target device. \srcnexample[5.1]{declare_target}{2b_classes}{hpp} @@ -109,23 +139,19 @@ \subsection{Declare Target Directive for Class Type} \cppnexample[5.1]{declare_target}{2b_main}[1] %\cppspecificend -% blue line floater at top of this page for "C++, cont." -\begin{figure}[t!] -\linewitharrows{-1}{dashed}{C++ (cont.)}{8em} -\end{figure} +\topmarker{C++} %\cppspecificstart -The following example shows how the \code{begin}~\code{declare} \code{target} and \code{end} -\code{declare} \code{target} directives are used to enclose the declaration -of a variable \plc{varY} with a class type \code{typeY}. 
+The following example shows how the \kcode{begin declare target} and \kcode{end declare target} directives are used to enclose the declaration +of a variable \ucode{varY} with a class type \ucode{typeY}. %Prior to OpenMP 5.0, the member function \code{typeY::foo()} cannot -%be accessed on a target device because its declaration did not appear between \code{begin}~\code{declare} -%\code{target} and \code{end} \code{declare} \code{target} directives. +%be accessed on a target device because its declaration did not appear between \code{begin declare} +%\code{target} and \code{end declare target} directives. -This example shows pre-OpenMP 5.0 behavior for the \plc{varY.foo()} function call (an error). -The member function \code{typeY::foo()} cannot be accessed on a target device because its -declaration does not appear between \code{begin}~\code{declare}~\code{target} and -\code{end}~\code{declare}~\code{target} directives. As of OpenMP 5.0, the +This example shows pre-OpenMP 5.0 behavior for the \ucode{varY.foo()} function call (an error). +The member function \ucode{typeY::foo()} cannot be accessed on a target device because its +declaration does not appear between \kcode{begin declare target} and +\kcode{end declare target} directives. As of OpenMP 5.0, the function is implicitly declared with a declare target directive and will successfully execute the function on the device. See previous examples. 
%as if it were included in list or block of a declare target directive, @@ -136,76 +162,78 @@ \subsection{Declare Target Directive for Class Type} \subsection{Declare Target Directive for Variables} \label{subsec:declare_target_variables} -\index{directives!declare target@\code{declare}~\code{target}} -\index{declare target directive@\code{declare}~\code{target} directive} +\index{directives!declare target@\kcode{declare target}} +\index{declare target directive@\kcode{declare target} directive} -\index{directives!begin declare target@\code{begin}~\code{declare}~\code{target}} -\index{begin declare target directive@\code{begin}~\code{declare}~\code{target} directive} +\index{directives!begin declare target@\kcode{begin declare target}} +\index{begin declare target directive@\kcode{begin declare target} directive} The following examples show how the declare target directive is used to indicate that global variables are mapped to the implicit device data environment of each target device. -In the following example, the declarations of the variables \plc{p}, \plc{v1}, and \plc{v2} appear -between \code{begin}~\code{declare}~\code{target} and \code{end}~\code{declare}~\code{target} +In the following example, the declarations of the variables \ucode{p}, \ucode{v1}, and \ucode{v2} appear +between \kcode{begin declare target} and \kcode{end declare target} directives indicating that the variables are mapped to the implicit device data -environment of each target device. The \code{target} \code{update} directive -is then used to manage the consistency of the variables \plc{p}, \plc{v1}, and \plc{v2} between the +environment of each target device. The \kcode{target update} directive +is then used to manage the consistency of the variables \ucode{p}, \ucode{v1}, and \ucode{v2} between the data environment of the encountering host device task and the implicit device data environment of the default target device. 
\cexample[5.1]{declare_target}{3} The Fortran version of the above C code uses a different syntax. Fortran modules -use a list syntax on the \code{declare} \code{target} directive to declare +use a list syntax on the \kcode{declare target} directive to declare mapped variables. \ffreeexample[4.0]{declare_target}{3} -The following example also indicates that the function \plc{Pfun()} is available on the -target device, as well as the variable \plc{Q}, which is mapped to the implicit device -data environment of each target device. The \code{target} \code{update} directive -is then used to manage the consistency of the variable \plc{Q} between the data environment +\pagebreak +The following example also indicates that the function \ucode{Pfun()} is available on the +target device, as well as the variable \ucode{Q}, which is mapped to the implicit device +data environment of each target device. The \kcode{target update} directive +is then used to manage the consistency of the variable \ucode{Q} between the data environment of the encountering host device task and the implicit device data environment of the default target device. In the following example, the function and variable declarations appear between -the \code{begin}~\code{declare}~\code{target} and \code{end}~\code{declare}~\code{target} +the \kcode{begin declare target} and \kcode{end declare target} directives. \cexample[5.1]{declare_target}{4} The Fortran version of the above C code uses a different syntax. In Fortran modules -a list syntax on the \code{declare} \code{target} directive is used to declare -mapped variables and procedures. The \plc{N} and \plc{Q} variables are declared as a comma -separated list. When the \code{declare} \code{target} directive is used to +a list syntax on the \kcode{declare target} directive is used to declare +mapped variables and procedures. The \ucode{N} and \ucode{Q} variables are declared as a comma +separated list. 
When the \kcode{declare target} directive is used to declare just the procedure, the procedure name need not be listed -- it is implicitly -assumed, as illustrated in the \plc{Pfun()} function. +assumed, as illustrated in the \ucode{Pfun()} function. \ffreeexample[4.0]{declare_target}{4} -\subsection{Declare Target Directive with \code{declare}~\code{simd}} +\subsection{Declare Target Directive with \kcode{declare simd}} \label{subsec:declare_target_simd} -\index{directives!declare target@\code{declare}~\code{target}} -\index{declare target directive@\code{declare}~\code{target} directive} +\index{directives!declare target@\kcode{declare target}} +\index{declare target directive@\kcode{declare target} directive} -\index{directives!begin declare target@\code{begin}~\code{declare}~\code{target}} -\index{begin declare target directive@\code{begin}~\code{declare}~\code{target} directive} +\index{directives!begin declare target@\kcode{begin declare target}} +\index{begin declare target directive@\kcode{begin declare target} directive} -\index{directives!declare simd@\code{declare}~\code{simd}} -\index{declare simd directive@\code{declare}~\code{simd} directive} +\index{directives!declare simd@\kcode{declare simd}} +\index{declare simd directive@\kcode{declare simd} directive} -The following example shows how the \code{begin}~\code{declare}~\code{target} and -\code{end}~\code{declare}~\code{target} directives are used to indicate that a function -is available on a target device. The \code{declare} \code{simd} directive indicates -that there is a SIMD version of the function \plc{P()} that is available on the target +The following example shows how the \kcode{begin declare target} and +\kcode{end declare target} directives are used to indicate that a function +is available on a target device. 
The \kcode{declare simd} directive indicates +that there is a SIMD version of the function \ucode{P()} that is available on the target device as well as one that is available on the host device. \cexample[5.1]{declare_target}{5} The Fortran version of the above C code uses a different syntax. Fortran modules -use a list syntax of the \code{declare} \code{target} declaration for the mapping. -Here the \plc{N} and \plc{Q} variables are declared in the list form as a comma separated list. +use a list syntax of the \kcode{declare target} declaration for the mapping. +%%KFM use a list syntax in the \kcode{declare target} directive for the mapping. +Here the \ucode{N} and \ucode{Q} variables are declared in the list form as a comma separated list. The function declaration does not use a list and implicitly assumes the function name. In this Fortran example row and column indices are reversed relative to the C/C++ example, as is usual for codes optimized for memory access. @@ -213,57 +241,58 @@ \subsection{Declare Target Directive with \code{declare}~\code{simd}} \ffreeexample[4.0]{declare_target}{5} -\subsection{Declare Target Directive with \code{link} Clause} +\subsection{Declare Target Directive with \kcode{link} Clause} \label{subsec:declare_target_link} -\index{directives!declare target@\code{declare}~\code{target}} -\index{declare target directive@\code{declare}~\code{target} directive} +\index{directives!declare target@\kcode{declare target}} +\index{declare target directive@\kcode{declare target} directive} -\index{directives!begin declare target@\code{begin}~\code{declare}~\code{target}} -\index{begin declare target directive@\code{begin}~\code{declare}~\code{target} directive} +\index{directives!begin declare target@\kcode{begin declare target}} +\index{begin declare target directive@\kcode{begin declare target} directive} -\index{clauses!link@\code{link}} -\index{link clause@\code{link} clause} +\index{clauses!link@\kcode{link}} +\index{link 
clause@\kcode{link} clause} -In the OpenMP 4.5 standard the declare target directive was extended to allow static -data to be mapped, \emph{when needed}, through a \code{link} clause. +In the OpenMP 4.5 standard the \kcode{declare target} directive was extended to allow static +data to be mapped, \emph{when needed}, through a \kcode{link} clause. -Data storage for items listed in the \code{link} clause becomes available on the device -when it is mapped implicitly or explicitly in a \code{map} clause, and it persists for the scope of -the mapping (as specified by a \code{target} construct, -a \code{target}~\code{data} construct, or -\code{target}~\code{enter/exit}~\code{data} constructs). +Data storage for items listed in the \kcode{link} clause becomes available on the device +when it is mapped implicitly or explicitly in a \kcode{map} clause, and it persists for the scope of +the mapping (as specified by a \kcode{target} construct, +a \kcode{target data} construct, or +\kcode{target enter/exit data} constructs). Tip: When all the global data items will not fit on a device and are not needed -simultaneously, use the \code{link} clause and map the data only when it is needed. +simultaneously, use the \kcode{link} clause and map the data only when it is needed. +%%KFM simultaneously, use the \kcode{link} clause and map sections of the data only when it is needed. The following C and Fortran examples show two sets of data (single precision and double precision) that are global on the host for the entire execution on the host; but are only used globally on the device for part of the program execution. The single precision data -are allocated and persist only for the first \code{target} region. Similarly, the -double precision data are in scope on the device only for the second \code{target} region. +are allocated and persist only for the first \kcode{target} region. 
Similarly, the +double precision data are in scope on the device only for the second \kcode{target} region. \cexample[5.1]{declare_target}{6} \ffreeexample[4.5]{declare_target}{6} -\subsection{Declare Target Directive with \code{device\_type} Clause} +\subsection{Declare Target Directive with \kcode{device_type} Clause} \label{subsec:declare_target_device_type} -\index{directives!declare target@\code{declare}~\code{target}} -\index{declare target directive@\code{declare}~\code{target} directive} +\index{directives!declare target@\kcode{declare target}} +\index{declare target directive@\kcode{declare target} directive} -\index{directives!begin declare target@\code{begin}~\code{declare}~\code{target}} -\index{begin declare target directive@\code{begin}~\code{declare}~\code{target} directive} +\index{directives!begin declare target@\kcode{begin declare target}} +\index{begin declare target directive@\kcode{begin declare target} directive} -\index{clauses!device_type@\code{device\_type}} -\index{device_type clause@\code{device\_type} clause} +\index{clauses!device_type@\kcode{device_type}} +\index{device_type clause@\kcode{device_type} clause} -The \code{declare}~\code{target} directives apply to procedures to ensure that they can be executed or accessed on a device. -The \code{device\_type} clause specifies whether a version of the procedure or variable should be made available on the host, device or both. -This example uses \code{nohost} for a procedure \plc{foo}. Only a device version of the procedure \plc{foo} is made available. -If the variant function \plc{foo\_onhost} is not specified for the host fallback execution, the call to \plc{foo} from the \code{target} region will result in a link time error due to the code generated for host execution of the target region. -This is because host symbol for the device routine \plc{foo} marked as \code{nohost} is not required to be present in the host environment. 
+The \kcode{declare target} directives apply to procedures to ensure that they can be executed or accessed on a device. +The \kcode{device_type} clause specifies whether a version of the procedure or variable should be made available on the host, device or both. +This example uses \kcode{nohost} for a procedure \ucode{foo()}. Only a device version of the procedure \ucode{foo()} is made available. +If the variant function \ucode{foo_onhost()} is not specified for the host fallback execution, the call to \ucode{foo()} from the \kcode{target} region will result in a link-time error due to the code generated for host execution of the target region. +This is because the host symbol for the device routine \ucode{foo()}, marked as \kcode{nohost}, is not required to be present in the host environment. \cexample[5.2]{declare_target}{7} \ffreeexample[5.2]{declare_target}{7} diff --git a/devices/device.tex b/devices/device.tex index ac28963..0593a55 100644 --- a/devices/device.tex +++ b/devices/device.tex @@ -1,44 +1,44 @@ -\pagebreak +%\pagebreak \section{Device Routines} \label{sec:device} -\subsection{\code{omp\_is\_initial\_device} Routine} +\subsection{\kcode{omp_is_initial_device} Routine} \label{subsec:device_is_initial} -\index{routines!omp_is_initial_device@\scode{omp_is_initial_device}} -\index{omp_is_initial_device routine@\scode{omp_is_initial_device} routine} +\index{routines!omp_is_initial_device@\kcode{omp_is_initial_device}} +\index{omp_is_initial_device routine@\kcode{omp_is_initial_device} routine} -\index{directives!declare target@\code{declare}~\code{target}} -\index{declare target directive@\code{declare}~\code{target} directive} +\index{directives!declare target@\kcode{declare target}} +\index{declare target directive@\kcode{declare target} directive} -\index{directives!begin declare target@\code{begin}~\code{declare}~\code{target}} -\index{begin declare target directive@\code{begin}~\code{declare}~\code{target} directive} +\index{directives!begin declare 
target@\kcode{begin declare target}} +\index{begin declare target directive@\kcode{begin declare target} directive} -The following example shows how the \code{omp\_is\_initial\_device} runtime library routine +The following example shows how the \kcode{omp_is_initial_device} runtime library routine can be used to query if a code is executing on the initial host device or on a -target device. The example then sets the number of threads in the \code{parallel} +target device. The example then sets the number of threads in the \kcode{parallel} region based on where the code is executing. \cexample[5.1]{device}{1} \ffreeexample[4.0]{device}{1} -\subsection{\code{omp\_get\_num\_devices} Routine} +\subsection{\kcode{omp_get_num_devices} Routine} \label{subsec:device_num_devices} -The following example shows how the \code{omp\_get\_num\_devices} runtime library routine +The following example shows how the \kcode{omp_get_num_devices} runtime library routine can be used to determine the number of devices. \cexample[4.0]{device}{2} \ffreeexample[4.0]{device}{2} -\subsection{\code{omp\_set\_default\_device} and \\ -\code{omp\_get\_default\_device} Routines} +\subsection{\kcode{omp_set_default_device} and \\ +\kcode{omp_get_default_device} Routines} \label{subsec:device_is_set_get_default} -\index{routines!omp_set_default_device@\scode{omp_set_default_device}} -\index{omp_set_default_device routine@\scode{omp_set_default_device} routine} +\index{routines!omp_set_default_device@\kcode{omp_set_default_device}} +\index{omp_set_default_device routine@\kcode{omp_set_default_device} routine} -The following example shows how the \code{omp\_set\_default\_device} and \code{omp\_get\_default\_device} +The following example shows how the \kcode{omp_set_default_device} and \kcode{omp_get_default_device} runtime library routines can be used to set the default device and determine the default device respectively. 
@@ -53,102 +53,138 @@ \subsection{\code{omp\_set\_default\_device} and \\ \subsection{Target Memory and Device Pointers Routines} \label{subsec:target_mem_and_device_ptrs} -\index{routines!omp_target_alloc@\scode{omp_target_alloc}} -\index{omp_target_alloc routine@\scode{omp_target_alloc} routine} -\index{routines!omp_target_memcpy@\scode{omp_target_memcpy}} -\index{omp_target_memcpy routine@\scode{omp_target_memcpy} routine} -\index{routines!omp_target_free@\scode{omp_target_free}} -\index{omp_target_free routine@\scode{omp_target_free} routine} +\index{routines!omp_target_alloc@\kcode{omp_target_alloc}} +\index{omp_target_alloc routine@\kcode{omp_target_alloc} routine} +\index{routines!omp_target_memcpy@\kcode{omp_target_memcpy}} +\index{omp_target_memcpy routine@\kcode{omp_target_memcpy} routine} +\index{routines!omp_target_free@\kcode{omp_target_free}} +\index{omp_target_free routine@\kcode{omp_target_free} routine} The following example shows how to create space on a device, transfer data to and from that space, and free the space, using API calls. The API calls directly execute allocation, copy and free operations on the device, without invoking -any mapping through a \code{target} directive. The \code{omp\_target\_alloc} routine allocates space -and returns a device pointer for referencing the space in the \code{omp\_target\_memcpy} -API routine on the host. The \code{omp\_target\_free} routine frees the space on the device. +any mapping through a \kcode{target} directive. The \kcode{omp_target_alloc} routine allocates space +and returns a device pointer for referencing the space in the \kcode{omp_target_memcpy} +API routine on the host. The \kcode{omp_target_free} routine frees the space on the device. 
-\index{target construct@\code{target} construct!is_device_ptr clause@\scode{is_device_ptr} clause} -\index{is_device_ptr clause@\scode{is_device_ptr} clause} -\index{clauses!is_device_ptr@\scode{is_device_ptr}} +\index{target construct@\kcode{target} construct!is_device_ptr clause@\kcode{is_device_ptr} clause} +\index{is_device_ptr clause@\kcode{is_device_ptr} clause} +\index{clauses!is_device_ptr@\kcode{is_device_ptr}} The example also illustrates how to access that space -in a \code{target} region by exposing the device pointer in an \code{is\_device\_ptr} clause. +in a \kcode{target} region by exposing the device pointer in an \kcode{is_device_ptr} clause. The example creates an array of cosine values on the default device, to be used on the host device. The function fails if a default device is not available. \cexample[4.5]{device}{4} -\index{routines!omp_target_is_present@\scode{omp_target_is_present}} -\index{omp_target_is_present routine@\scode{omp_target_is_present} routine} -\index{routines!omp_target_associate_ptr@\scode{omp_target_associate_ptr}} -\index{omp_target_associate_ptr routine@\scode{omp_target_associate_ptr} routine} -The following Fortran example illustrates how to use the \code{omp\_target\_alloc} -and \code{omp\_target\_memcpy} functions to directly allocate device +\index{routines!omp_target_is_present@\kcode{omp_target_is_present}} +\index{omp_target_is_present routine@\kcode{omp_target_is_present} routine} +\index{routines!omp_target_associate_ptr@\kcode{omp_target_associate_ptr}} +\index{omp_target_associate_ptr routine@\kcode{omp_target_associate_ptr} routine} +The following Fortran example illustrates how to use the \kcode{omp_target_alloc} +and \kcode{omp_target_memcpy} functions to directly allocate device storage and transfer data to and from a device. 
It also shows how to check for -the presence of device data with the \code{omp\_target\_is\_present} function and -to associate host and device storage with the \code{omp\_target\_associate\_ptr} function. +the presence of device data with the \kcode{omp_target_is_present} function and +to associate host and device storage with the \kcode{omp_target_associate_ptr} function. In Section 1 of the code, 40 bytes of storage are allocated on the default device -with the \code{omp\_target\_alloc} function, which returns a value (of type -\texttt{C\_PTR}) that contains the device address of the storage. -%A Fortran pointer (\texttt{fp}) is associated by the Fortran iso\_c\_binding -%\texttt{c\_f\_pointer} routine with the target of the C pointer (\texttt{cp}). -In the subsequent \code{target} construct, \texttt{cp} is specified on the -\code{is\_device\_ptr} clause to instruct the compiler that \texttt{cp} is +with the \kcode{omp_target_alloc} function, which returns a value (of type +\bcode{C_PTR}) that contains the device address of the storage. +%A Fortran pointer (\ucode{fp}) is associated by the Fortran \splc{iso_c_binding} +%\bcode{c_f_pointer} routine with the target of the C pointer (\ucode{cp}). +In the subsequent \kcode{target} construct, \ucode{cp} is specified on the +\kcode{is_device_ptr} clause to instruct the compiler that \ucode{cp} is a device pointer. -The device pointer (\texttt{cp}) is then associated with the Fortran pointer -(\texttt{fp}) via the \texttt{c\_f\_pointer} routine inside the \code{target} +The device pointer (\ucode{cp}) is then associated with the Fortran pointer +(\ucode{fp}) via the \bcode{c_f_pointer} routine inside the \kcode{target} construct. -As a result, \texttt{fp} points to the storage on the device that is allocated -by the \code{omp\_target\_alloc} routine. 
-In the \code{target} region, the value 4 is assigned to the storage on the device, +As a result, \ucode{fp} points to the storage on the device that is allocated +by the \kcode{omp_target_alloc} routine. +In the \kcode{target} region, the value 4 is assigned to the storage on the device, using the Fortran pointer. A trivial test checks that all values were correctly assigned. -The Fortran pointer (\texttt{fp}) is nullified before the end of the \code{target} region. -After the \code{target} construct, the space on the device is freed with the -\code{omp\_target\_free} function, using the device \texttt{cp} pointer +The Fortran pointer (\ucode{fp}) is nullified before the end of the \kcode{target} region. +After the \kcode{target} construct, the space on the device is freed with the +\kcode{omp_target_free} function, using the device \ucode{cp} pointer which is set to null after the call. In Section 2, the content of the storage allocated on the host is directly copied to the OpenMP allocated storage on the device. -First, storage is allocated for the device and host using \code{omp\_target\_alloc}. +First, storage is allocated for the device and host using \kcode{omp_target_alloc}. Next, on the host the device pointer, returned from the allocation -\code{omp\_target\_alloc} function, is associated with a Fortran pointer, and +\kcode{omp_target_alloc} function, is associated with a Fortran pointer, and values are assigned to the storage. Similarly, values are assigned on the device -to the device storage, after associating a Fortran pointer (\texttt{fp\_dst}) -with the device's storage pointer (\texttt{cp\_dst}). +to the device storage, after associating a Fortran pointer (\ucode{fp_dst}) +with the device's storage pointer (\ucode{cp_dst}). -Next the \code{omp\_target\_memcpy} function directly copies the host data +Next the \kcode{omp_target_memcpy} function directly copies the host data to the device storage, specified by the respective host and device pointers. 
This copy will overwrite -1 values in the device storage, and is checked in the -next \code{target} construct. +next \kcode{target} construct. Keyword arguments are used here for clarity. (A positional argument list is used in the next Section.) -In Section 3, space is allocated (with a Fortran ALLOCATE statement) and initialized using a -host Fortran pointer (\texttt{h\_fp}), and the address of the storage is directly assigned to a -host C pointer (\texttt{h\_cp}). -The following \code{omp\_target\_is\_present} function returns \texttt{0} (false, of integer(C\_INT) type) -to indicate that \texttt{h\_cp} does not have any corresponding storage on the default device. +In Section 3, space is allocated (with a Fortran \bcode{ALLOCATE} statement) and initialized using a +host Fortran pointer (\ucode{h_fp}), and the address of the storage is directly assigned to a +host C pointer (\ucode{h_cp}). +The following \kcode{omp_target_is_present} function returns \ucode{0} (\plc{false}, of \bcode{integer(C_INT)} type) +to indicate that \ucode{h_cp} does not have any corresponding storage on the default device. Next, the same amount of space is allocated on the default device with -the \code{omp\_target\_alloc} function, which returns a device pointer (\texttt{d\_cp}). -The device pointer \texttt{d\_cp} and host pointer \texttt{h\_cp} -are then associated using the \code{omp\_target\_associate\_ptr} function. -The device storage to which \texttt{d\_cp} points becomes the corresponding storage of -the host storage to which \texttt{h\_cp} points. -The following \code{omp\_target\_is\_present} call confirms this, by returning -a non-zero value of integer(C\_INT) type for true. - -After the association, the content of the host storage -is copied to the device using the \code{omp\_target\_memcpy} function. -In the final \code{target} construct an array section of \texttt{h\_fp} +the \kcode{omp_target_alloc} function, which returns a device pointer (\ucode{d_cp}). 
+The device pointer \ucode{d_cp} and host pointer \ucode{h_cp} +are then associated using the \kcode{omp_target_associate_ptr} function. +The device storage to which \ucode{d_cp} points becomes the corresponding storage of +the host storage to which \ucode{h_cp} points. +The following \kcode{omp_target_is_present} call confirms this, by returning +a non-zero value of \bcode{integer(C_INT)} type for true. + +After the association, the content of the host storage +is copied to the device using the \kcode{omp_target_memcpy} function. +In the final \kcode{target} construct an array section of \ucode{h_fp} is mapped to the device, and evaluated for correctness. -The mapping establishes a connection of \texttt{h\_fp} with -the corresponding device data in the \code{target} construct, -but does not produce an update on the device because the previous \scode{omp_target_associate_ptr} routine sets the +The mapping establishes a connection of \ucode{h_fp} with +the corresponding device data in the \kcode{target} construct, +but does not produce an update on the device because the previous \kcode{omp_target_associate_ptr} routine sets the reference count of the mapped object to infinity, meaning a mapping -without the \code{always} modifier will not +without the \kcode{always} modifier will not update the device object. \ffreeexample[5.0]{device}{4} + +\index{routines!omp_target_memcpy_async@\kcode{omp_target_memcpy_async}} +\index{omp_target_memcpy_async routine@\kcode{omp_target_memcpy_async} routine} +The following example illustrates the use of the \kcode{omp_target_memcpy_async} +routine to perform asynchronous memory copies. +The routine acts as if it is a deferrable task so that +a \kcode{taskwait} construct can be used to wait for the completion +of the deferrable task. +In the example the \kcode{omp_target_memcpy_async} routine copies host data +(\ucode{h_buf}) to device (\ucode{d_buf}). 
+The Fortran code uses the intrinsic \bcode{c_loc} function to get +the corresponding C pointer (\ucode{c_hbuf}) for +passing to the \kcode{omp_target_memcpy_async} routine. +The last two arguments (\ucode{0} and \ucode{NULL}) to the routine +indicate that there is no specified dependence associated with the call. +The Fortran code omits the unused last argument. + +\cexample[5.2]{device}{5} +\ffreeexample[5.2]{device}{5} + +\index{directives!depobj@\kcode{depobj}} +\index{depobj directive@\kcode{depobj} directive} +The following is a more complicated example that shows the use of +the \kcode{omp_target_memcpy_async} routine with a depend object \ucode{obj} to +overlap the memory copy with computation performed by \ucode{do_work}. +The depend object \ucode{obj} was created by the \kcode{depobj} directive +and initialized to an \kcode{out} dependence on the data \ucode{d_buf[0:N]} +(or \ucode{d_buf(1:N)} for Fortran) in advance. +The \kcode{depend(depobj: \ucode{obj})} (or alternatively +\kcode{depend(in: \ucode{d_buf[0:N]})}) clause on the \kcode{target} construct +ensures the asynchronous memory copy is complete before the data \ucode{d_buf} +can be used in the \kcode{target} region. + +\cexample[5.2]{device}{6} +\ffreeexample[5.2]{device}{6} + diff --git a/devices/lambda_expressions.tex b/devices/lambda_expressions.tex index 3d0750d..fabcc7d 100644 --- a/devices/lambda_expressions.tex +++ b/devices/lambda_expressions.tex @@ -1,4 +1,4 @@ -\pagebreak +%\pagebreak \section{Lambda Expressions} \label{sec:lambda_expressions} @@ -7,45 +7,45 @@ \section{Lambda Expressions} \cppspecificstart The following example illustrates the usage of lambda expressions and their -corresponding closure objects within a \scode{target} region. - -In CASE 1, a lambda expression is defined inside a \scode{target} construct -that implicitly maps the structure \textit{s}. 
Inside the construct, the -lambda captures (by reference) the corresponding \textit{s}, and the resulting -closure object is assigned to \textit{lambda1}. When the call operator is -invoked on \textit{lambda1}, the captured reference to \textit{s} is used in -the call. The modified \textit{s} is then copied back to the host device on -exit from the \scode{target} construct. - -In CASE 2, a lambda expression is instead defined before the \scode{target} -construct and captures (by copy) the pointer \textit{sp}. A -\scode{target}~\scode{data} construct is used to first map the structure, and -then the \scode{target} construct implicitly maps the closure object -referenced by \textit{lambda2}, a zero-length array section based on the -structure pointer \textit{sp}, and a zero-length array section based on the +corresponding closure objects within a \kcode{target} region. + +In Case 1, a lambda expression is defined inside a \kcode{target} construct +that implicitly maps the structure \ucode{s}. Inside the construct, the +lambda captures (by reference) the corresponding \ucode{s}, and the resulting +closure object is assigned to \ucode{lambda1}. When the call operator is +invoked on \ucode{lambda1}, the captured reference to \ucode{s} is used in +the call. The modified \ucode{s} is then copied back to the host device on +exit from the \kcode{target} construct. + +In Case 2, a lambda expression is instead defined before the \kcode{target} +construct and captures (by copy) the pointer \ucode{sp}. A +\kcode{target data} construct is used to first map the structure, and +then the \kcode{target} construct implicitly maps the closure object +referenced by \ucode{lambda2}, a zero-length array section based on the +structure pointer \ucode{sp}, and a zero-length array section based on the captured pointer in the closure object. The implicit maps result in attached -pointers to the corresponding structure. 
The call for \textit{lambda2} inside -the \scode{target} construct will access \textit{sp->a} and \textit{sp->b} +pointers to the corresponding structure. The call for \ucode{lambda2} inside +the \kcode{target} construct will access \ucode{sp->a} and \ucode{sp->b} from the corresponding structure. -CASE 3 is similar to CASE 2, except \textit{s} is instead captured by -reference by the lambda expression. As for CASE 2, the structure is first -mapped by an enclosing \scode{target}~\scode{data} construct, and then the -\scode{target} construct implicitly maps \textit{s} and the closure object -referenced by \textit{lambda3}. The effect of the map is to make the -the call for \textit{lambda3} refer to the corresponding \textit{s} inside the -\scode{target} construct rather than the original \textit{s}. - -In CASE 4, the program defines a static variable \textit{ss} of the same -structure type as \textit{s}. While the body of the lambda expression refers -to \textit{ss}, it is not captured. In order for \textit{lambda4} to be -callable in the \scode{target} region, the reference to \textit{ss} should be -to a device copy of \textit{ss} that also has static storage. This is achieved -with the use of the \scode{declare}~\scode{target} directive. Inside the -\scode{target} construct, all references to \textit{ss}, including in the -\textit{lambda4()} call, will refer to the corresponding \textit{ss} that -results from the \scode{declare}~\scode{target} directive. The \scode{always} -modifier is used on the \scode{map} clause to transfer the updated values for +Case 3 is similar to Case 2, except \ucode{s} is instead captured by +reference by the lambda expression. As for Case 2, the structure is first +mapped by an enclosing \kcode{target data} construct, and then the +\kcode{target} construct implicitly maps \ucode{s} and the closure object +referenced by \ucode{lambda3}. 
The effect of the map is to make the +call for \ucode{lambda3} refer to the corresponding \ucode{s} inside the +\kcode{target} construct rather than the original \ucode{s}. + +In Case 4, the program defines a static variable \ucode{ss} of the same +structure type as \ucode{s}. While the body of the lambda expression refers +to \ucode{ss}, it is not captured. In order for \ucode{lambda4} to be +callable in the \kcode{target} region, the reference to \ucode{ss} should be +to a device copy of \ucode{ss} that also has static storage. This is achieved +with the use of the \kcode{declare target} directive. Inside the +\kcode{target} construct, all references to \ucode{ss}, including in the +\ucode{lambda4} call, will refer to the corresponding \ucode{ss} that +results from the \kcode{declare target} directive. The \kcode{always} +modifier is used on the \kcode{map} clause to transfer the updated values for the structure back to the host device. \cppnexample[5.0]{lambda_expressions}{1} diff --git a/devices/sources/declare_target_indirect_call.1.c b/devices/sources/declare_target_indirect_call.1.c new file mode 100644 index 0000000..ab5ae25 --- /dev/null +++ b/devices/sources/declare_target_indirect_call.1.c @@ -0,0 +1,30 @@ +/* +* @@name: declare_target_indirect_call.1 +* @@type: C +* @@operation: run +* @@expect: success +* @@version: omp_5.2 +*/ +#include <stdio.h> +#include <stdlib.h> + +typedef int(*funcptr)(); + +int fun1() {return 1;} +int fun2() {return 2;} +#pragma omp declare target enter(fun1, fun2) indirect + // indirect defaults to true +int main() +{ + int ret_val=0; + const int choice = rand()%2 + 1; // create runtime number 1 or 2 + + funcptr fptr = (choice == 1) ?
&fun1 : &fun2; //select fun1/fun2 for 1/2 + + #pragma omp target map(from: ret_val) + ret_val = fptr(); // ret_val = 1/2 from fun1/fun2 + + if (ret_val != choice) { printf("FAILED\n"); exit(1); } + + return 0; +} diff --git a/devices/sources/declare_target_indirect_call.1.f90 b/devices/sources/declare_target_indirect_call.1.f90 new file mode 100644 index 0000000..16b7425 --- /dev/null +++ b/devices/sources/declare_target_indirect_call.1.f90 @@ -0,0 +1,53 @@ +! @@name: declare_target_indirect_call.1 +! @@type: F-free +! @@operation: run +! @@expect: success +! @@version: omp_5.2 +module funcs + implicit none + + interface + function func() result(i) + integer :: i + end function + end interface + + contains + function fun1() result(i) + !$omp declare target enter(fun1) indirect !! indirect defaults to true + integer :: i + i=1 + return + end function + + function fun2() result(i) + !$omp declare target enter(fun2) indirect !! indirect defaults to true + integer :: i + i=2 + return + end function + +end module + +program main + use funcs + implicit none + procedure (func), pointer :: fptr=>null() + integer :: ret_val=0, choice=0 + real :: rand_no + + call random_number(rand_no) !! create random ( [0.0 - 1.0) ) + choice = nint(rand_no)+1 !! runtime number 1 or 2 + + if (choice == 1 ) fptr => fun1 + if (choice == 2 ) fptr => fun2 + + !$omp target map(from: ret_val) + ret_val = fptr() !! 
ret_val = 1/2 from fun1/fun2 + !$omp end target + + if (ret_val /= choice) then + print*, "FAILED"; error stop 1 + endif + +end program diff --git a/devices/sources/device.5.c b/devices/sources/device.5.c new file mode 100644 index 0000000..24b8aba --- /dev/null +++ b/devices/sources/device.5.c @@ -0,0 +1,46 @@ +/* +* @@name: device.5 +* @@type: C +* @@operation: compile +* @@expect: success +* @@version: omp_5.2 +*/ +#include +#include +#include + +#define N 128 +extern void do_work(); + +void async_memcpy() { + int h_dev = omp_get_initial_device(); + int d_dev = omp_get_default_device(); + size_t dsize; + float h_buf[N]; + void *d_buf; + int i; + + /* allocate device memory */ + dsize = N * sizeof(float); + d_buf = omp_target_alloc(dsize, d_dev); + if (!d_buf) + abort(); + + /* set up host data */ + for (i = 0; i < N; i++) { + h_buf[i] = i*0.1f; + } + + /* copy data from host to device asynchronously */ + if (omp_target_memcpy_async(d_buf, h_buf, dsize, 0, 0, + d_dev, h_dev, 0, NULL)) + abort(); + + /* do some work here at the same time */ + do_work(); + + /* wait for task completion */ + #pragma omp taskwait + + omp_target_free(d_buf, d_dev); +} diff --git a/devices/sources/device.5.f90 b/devices/sources/device.5.f90 new file mode 100644 index 0000000..8c1bfd2 --- /dev/null +++ b/devices/sources/device.5.f90 @@ -0,0 +1,46 @@ +! @@name: device.5 +! @@type: F-free +! @@operation: compile +! @@expect: success +! @@version: omp_5.2 +subroutine async_memcpy + use omp_lib + use, intrinsic :: iso_c_binding + + implicit none + + integer, parameter :: N = 128 + real, target :: h_buf(N) + type(c_ptr) :: c_dbuf, c_hbuf + integer(c_int) :: d_dev, h_dev + integer(c_size_t) :: dsize + + integer :: i + + h_dev = omp_get_initial_device() + d_dev = omp_get_default_device() + dsize = N * c_sizeof(h_buf(1)) + + ! allocate device memory + c_dbuf = omp_target_alloc(dsize, d_dev) + if (.not.c_associated(c_dbuf)) stop + c_hbuf = c_loc(h_buf) + + ! 
set up host data + h_buf = [(i*0.1, i = 1, N)] + + ! copy data from host to device asynchronously + if (omp_target_memcpy_async(c_dbuf, c_hbuf, dsize, 0, 0, & + d_dev, h_dev, 0) /= 0) then + stop + endif + + ! do some work here at the same time + call do_work + + ! wait for task completion + !$omp taskwait + + call omp_target_free(c_dbuf, d_dev) + +end subroutine diff --git a/devices/sources/device.6.c b/devices/sources/device.6.c new file mode 100644 index 0000000..f40e22d --- /dev/null +++ b/devices/sources/device.6.c @@ -0,0 +1,38 @@ +/* +* @@name: device.6 +* @@type: C +* @@operation: compile +* @@expect: success +* @@version: omp_5.2 +*/ +#include <stdlib.h> +#include <omp.h> +extern void do_work(int, float *); +extern void do_more_work(int, float *); +#pragma omp declare target enter(do_more_work) + +void async_work(int N, float *d_buf, float *h_buf) +{ + omp_depend_t obj; + int d_dev, h_dev; + size_t dsize; + + h_dev = omp_get_initial_device(); + d_dev = omp_get_default_device(); + dsize = N * sizeof(float); + + // initialize a depend object 'obj' + #pragma omp depobj(obj) depend(out: d_buf[0:N]) + + // start the async memcpy of h_buf to d_buf on device + if (omp_target_memcpy_async(d_buf, h_buf, dsize, 0, 0, + d_dev, h_dev, 1, &obj)) + abort(); + + // do some useful work at the same time on host + do_work(N, h_buf); + + // wait until memcpy finishes before using d_buf in the target region + #pragma omp target is_device_ptr(d_buf) depend(depobj: obj) + do_more_work(N, d_buf); +} diff --git a/devices/sources/device.6.f90 b/devices/sources/device.6.f90 new file mode 100644 index 0000000..719d7b9 --- /dev/null +++ b/devices/sources/device.6.f90 @@ -0,0 +1,48 @@ +! @@name: device.6 +! @@type: F-free +! @@operation: compile +! @@expect: success +!
@@version: omp_5.2 +subroutine async_work(N, d_buf, h_buf) + use omp_lib + use, intrinsic :: iso_c_binding + + implicit none + integer :: N + real, pointer :: d_buf(:), h_buf(:) + + type(c_ptr) :: c_dp, c_hp + integer(c_int) :: d_dev, h_dev + integer(c_size_t) :: dsize + integer(omp_depend_kind) :: obj(1) + + external :: do_work + external :: do_more_work + !$omp declare target enter(do_more_work) + integer :: i + + h_dev = omp_get_initial_device() + d_dev = omp_get_default_device() + dsize = N * c_sizeof(d_buf(1)) + + c_dp = c_loc(d_buf) + c_hp = c_loc(h_buf) + + ! initialize a depend object 'obj' + !$omp depobj(obj) depend(out: d_buf(1:N)) + + ! start the async memcpy of h_buf to d_buf on device + if (omp_target_memcpy_async(c_dp, c_hp, dsize, 0, 0, & + d_dev, h_dev, 1, obj) /= 0) then + stop + endif + + ! do some useful work at the same time on host + call do_work(N, h_buf) + + ! wait until memcpy finishes before using d_buf in the target region + !$omp target has_device_addr(d_buf) depend(depobj: obj) + call do_more_work(N, d_buf) + !$omp end target + +end subroutine diff --git a/devices/sources/target_defaultmap.1.c b/devices/sources/target_defaultmap.1.c index 3b497d2..e5a1d86 100644 --- a/devices/sources/target_defaultmap.1.c +++ b/devices/sources/target_defaultmap.1.c @@ -3,7 +3,7 @@ * @@type: C * @@operation: run * @@expect: success -* @@version: omp_5.0 +* @@version: omp_5.2 */ #include #include @@ -12,7 +12,6 @@ int main(){ typedef struct S_struct { int s; int A[N]; } S_struct_t; - int s; //scalar int variable (scalar) int A[N]; //aggregate variable (array) S_struct_t S; //aggregate variable (structure) @@ -29,8 +28,8 @@ int main(){ // Uses defaultmap to set scalars, aggregates & // pointers to normal defaults. 
#pragma omp target \ - defaultmap(firstprivate: scalar) /* could also be default */ \ - defaultmap(tofrom: aggregate)/* could also be default */ \ + defaultmap(firstprivate: scalar) /* may also be default */ \ + defaultmap(tofrom: aggregate)/* may also be default */ \ defaultmap(default: pointer) /* must be default */ { s = 3; //SCALAR firstprivate, value not returned @@ -43,10 +42,9 @@ int main(){ ptr = &A[0]; //POINTER is private ptr[0] = 2; ptr[1] = 2; - } if(s==2 && A[0]==2 && S.s==2 && S.A[0]==2) - printf(" PASSED 1 of 4\n"); + printf(" PASSED 1 of 5\n"); // Target Region 2 @@ -59,7 +57,7 @@ int main(){ S.A[0]+=5; S.A[1]+=5; } if(s==7 && A[0]==7 && S.s==7 && S.A[0]==7) - printf(" PASSED 2 of 4\n"); + printf(" PASSED 2 of 5\n"); // Target Region 3 @@ -72,7 +70,7 @@ int main(){ s2 += 5; // firstprivate (s2 value not returned to host) s3 += s1 + s2; // mapped as tofrom } - if(s1==1 && s2==1 && s3==13 ) printf(" PASSED 3 of 4\n"); + if(s1==1 && s2==1 && s3==13 ) printf(" PASSED 3 of 5\n"); // Target Region 4 @@ -83,12 +81,24 @@ int main(){ #pragma omp target defaultmap(firstprivate: aggregate) \ map(from: s1, s2) { - A[0]+=1; S.A[0]+=1; //Aggregate changes not returned to host A[1]+=1; S.A[1]+=1; //Aggregate changes not returned to host s1 = A[0]+S.A[0]; //s1 value returned to host s2 = A[1]+S.A[1]; //s1 value returned to host } - if( A[0]==0 && S.A[0]==0 && s1==2 ) printf(" PASSED 4 of 4\n"); + if( A[0]==0 && S.A[0]==0 && s1==2 ) printf(" PASSED 4 of 5\n"); + +// Target Region 5 + // defaultmap using all variable category + + s1=s2=s3=1; + + #pragma omp target defaultmap(to: all) map(from: s3) + { + s1 += 5; // mapped as to + s2 += 5; // mapped as to + s3 = s1 + s2; // mapped as from + } + if(s1==1 && s2==1 && s3==12 ) printf(" PASSED 5 of 5\n"); } diff --git a/devices/sources/target_defaultmap.1.f90 b/devices/sources/target_defaultmap.1.f90 index d36dfdc..2c3cf42 100644 --- a/devices/sources/target_defaultmap.1.f90 +++ b/devices/sources/target_defaultmap.1.f90 
@@ -2,7 +2,7 @@ ! @@type: F-free ! @@operation: run ! @@expect: success -! @@version: omp_5.0 +! @@version: omp_5.2 program defaultmap integer, parameter :: N=2 @@ -19,7 +19,7 @@ program defaultmap ! Assign values to scalar, array, allocatable, and pointers - s=2; + s=2 s1=0; s2=0; s3=0 D%s=0; D%A(1)=0; D%A(2)=0 A(1)=0; A(2)=0 @@ -52,7 +52,7 @@ program defaultmap !$omp end target if(s==2 .and. A(1)==2 .and. D%s==2 .and. D%A(1)==2 .and. H(1) == 2) & - print*," PASSED 1 of 4" + print*," PASSED 1 of 5" !! Target Region 2 !! no implicit mapping allowed @@ -65,7 +65,7 @@ program defaultmap !$omp end target if(s==7 .and. A(1)==7 .and. D%s==7 .and. D%A(1)==7) & - print*," PASSED 2 of 4" + print*," PASSED 2 of 5" !! Target Region 3 !! defaultmap & explicit data-sharing clause @@ -73,12 +73,12 @@ program defaultmap s1=1; s2=1; s3=1 !$omp target defaultmap(tofrom: scalar) firstprivate(s1,s2) - s1 = s1+5; !! firstprivate (s1 value not returned to host) - s2 = s2+5; !! firstprivate (s2 value not returned to host) - s3 = s3 +s1 + s2; !! mapped as tofrom + s1 = s1+5 !! firstprivate (s1 value not returned to host) + s2 = s2+5 !! firstprivate (s2 value not returned to host) + s3 = s3 +s1 + s2 !! mapped as tofrom !$omp end target - if(s1==1 .and. s2==1 .and. s3==13) print*," PASSED 3 of 4" + if(s1==1 .and. s2==1 .and. s3==13) print*," PASSED 3 of 5" !! Target Region 4 A(1)=0; A(2)=0 @@ -98,7 +98,20 @@ program defaultmap !$omp end target if(A(1)==0 .and. D%A(1)==0 .and. H(1)==0 .and. s1==3) & - print*," PASSED 4 of 4" + print*," PASSED 4 of 5" + +!! Target Region 5 + !! defaultmap & explicit data-sharing clause + !! with variables in same category + s1=1; s2=1; s3=1 + !$omp target defaultmap(to: all) map(from: s3) + + s1 = s1+5 !! mapped as to + s2 = s2+5 !! mapped as to + s3 = s1 + s2 !! mapped as from + + !$omp end target + if(s1==1 .and. s2==1 .and. 
s3==12) print*," PASSED 5 of 5" deallocate(H) diff --git a/devices/sources/target_mapper.1.f90 b/devices/sources/target_mapper.1.f90 index 08f1b8c..9aa1541 100644 --- a/devices/sources/target_mapper.1.f90 +++ b/devices/sources/target_mapper.1.f90 @@ -33,6 +33,6 @@ program main subroutine init(s) use my_structures type(myvec_t) :: s - + !$omp declare target s%data = [ (i, i=1,s%len) ] end subroutine diff --git a/devices/sources/target_mapper.3.f90 b/devices/sources/target_mapper.3.f90 index b24a1ba..18a1da5 100644 --- a/devices/sources/target_mapper.3.f90 +++ b/devices/sources/target_mapper.3.f90 @@ -32,5 +32,6 @@ program main !$omp target map(P) call eval_mypts_array(P) + !$omp end target end program diff --git a/devices/sources/target_struct_map.4.c b/devices/sources/target_struct_map.4.c index f0f1e5c..f65fee5 100644 --- a/devices/sources/target_struct_map.4.c +++ b/devices/sources/target_struct_map.4.c @@ -47,7 +47,6 @@ int main() #pragma omp target //implicit map of S1 saxpyfun(&S1); - // Case 2 S2.a = 2.0; diff --git a/devices/sources/target_update.3.c b/devices/sources/target_update.3.c new file mode 100644 index 0000000..30c1832 --- /dev/null +++ b/devices/sources/target_update.3.c @@ -0,0 +1,60 @@ +/* +* @@name: target_update.3 +* @@type: C +* @@operation: run +* @@expect: success +* @@version: omp_5.1 +*/ +#include +#include + +typedef struct{ + int x; + int y; + int z; +}T; + +#pragma omp declare mapper(custom: T S) map(to:S.x) \ + map(from:S.y) map(alloc: S.z) + +int main() +{ + T s; + + s.x = 5; + s.y = 5; + s.z = 5; + #pragma omp target data map(mapper(custom),tofrom: s) + { + int a,b,c; + s.x += 5; + s.y += 5; + s.z += 5; + + #pragma omp target update to(mapper(custom): s) + // becomes #pragma omp target update to(s.x) + + #pragma omp target map(from: a,b,c) + { + a = s.x; + b = s.y; //s.y is undefined here + c = s.z; //s.z is undefined here + + s.y = 5; + + printf("s.x:%d, s.y:%d \n", s.x, s.y); + // s.x:10, s.y:5 (value of s.z is undefined) + } + 
#pragma omp target update from(mapper(custom): s) + // becomes #pragma omp target update from(s.y) + + printf("s.y:%d \n", s.y); + // s.y:5 + printf("a:%d \n", a); + // a:10 (values of b and c are undefined) + } + printf("s.x:%d, s.y:%d, s.z:%d\n", s.x, s.y, s.z); + // s.x:10, s.y:5, s.z:10 + + return 0; +} diff --git a/devices/sources/target_update.3.f90 b/devices/sources/target_update.3.f90 new file mode 100644 index 0000000..e77ef36 --- /dev/null +++ b/devices/sources/target_update.3.f90 @@ -0,0 +1,52 @@ +! @@name: target_update.3 +! @@type: F-free +! @@operation: run +! @@expect: success +! @@version: omp_5.1 + module my_struct + type T + integer :: x,y,z + end type + end module + + program main + use my_struct + integer, parameter :: N=100 + integer :: a,b,c + + !$omp declare mapper(custom: T :: v) & + !$omp& map(to:v%x) map(from:v%y) map(alloc: v%z) + + type(T) :: s + + s%x = 5 + s%y = 5 + s%z = 5 + + !$omp target data map(mapper(custom),tofrom: s) + + s%x = s%x + 5 + s%y = s%y + 5 + s%z = s%z + 5 + + !$omp target update to(mapper(custom) : s) + + !$omp target map(from: a,b,c) + a = s%x + b = s%y + c = s%z + + s%y = 5 + print*,"s%x:", s%x, " s%y:", s%y + !! s%x:10, s%y:5 (value of s%z is undefined) + !$omp end target + + !$omp target update from(mapper(custom) : s) + print*, "s%y:", s%y !! s%y:5 + print*, "a:", a !! a:10 (values of b and c are undefined) + + !$omp end target data + + print*, "s%x:", s%x, " s%y:", s%y, " s%z:", s%z + !! 
s%x:10, s%y:5, s%z:10 + end program diff --git a/devices/sources/usm_scalar_ptr_ref_asc.1.cpp b/devices/sources/usm_scalar_ptr_ref_asc.1.cpp new file mode 100644 index 0000000..efc84f7 --- /dev/null +++ b/devices/sources/usm_scalar_ptr_ref_asc.1.cpp @@ -0,0 +1,47 @@ +/* +* @@name: usm_scalar_ptr_ref_asc.1 +* @@type: C++ +* @@operation: run +* @@expect: success +* @@version: omp_5.2 +*/ +#include <cstdio> + +#pragma omp requires unified_shared_memory + +int main(){ + int x = 0; // scalar + int *ptr = &x; // pointer to a scalar + int &ref = x; // reference to a scalar + + bool pass = true; + + // Case 1: x is firstprivate + #pragma omp target + { + x++; + } + if( x != 0 ) pass = false; + + x = 0; + // Case 2: ptr is firstprivate + // (uses address assigned in host data environment) + #pragma omp target + { + (*ptr)++; + } + if( x != 1 ) pass = false; + + x = 0; + // Case 3: ref and its object are mapped + #pragma omp target + { + ref++; + } + if( x != 1 ) pass = false; + + // Verification + if( pass ) { printf("PASSED\n"); return 0; } + else { printf("FAILED\n"); return 1; } + +} diff --git a/devices/sources/usm_scalar_ptr_ref_asc.1.f90 b/devices/sources/usm_scalar_ptr_ref_asc.1.f90 new file mode 100644 index 0000000..fbb4e71 --- /dev/null +++ b/devices/sources/usm_scalar_ptr_ref_asc.1.f90 @@ -0,0 +1,57 @@ +! @@name: usm_scalar_ptr_ref_asc.1 +! @@type: F-free +! @@operation: compile +! @@expect: success +! @@version: omp_5.2 +program main + !$omp requires unified_shared_memory + + logical :: pass=.TRUE. + + integer :: x + integer, target :: y + integer, pointer :: ptr + + x = 0 + ! Case 1 : x is firstprivate + !$omp target + x = x + 1 + !$omp end target + if(x /= 0 ) pass = .FALSE. + + x = 0 + ASSOCIATE( ax => x) + + ! Case 2 : + !$omp target + ax = ax + 1 + !$omp end target + if(x /= 1 ) pass = .FALSE. + + end ASSOCIATE + + y = 0 + ptr => y + + ! Case 3a : ptr is mapped + !$omp target + ptr = ptr + 1 + !$omp end target + if(y /= 1 ) pass = .FALSE. + + y = 0 + + ! 
Case 3b : y is mapped + !$omp target + y = y + 1 + !$omp end target + if(y /= 1 ) pass = .FALSE. + + + if(pass) then + print*, "PASSED" + else + print*, "FAILED"; stop 1 + endif + +end program diff --git a/devices/sources/virtual_functions.1.cpp b/devices/sources/virtual_functions.1.cpp index 5f06620..eec9ee0 100644 --- a/devices/sources/virtual_functions.1.cpp +++ b/devices/sources/virtual_functions.1.cpp @@ -6,7 +6,6 @@ * @@version: omp_5.2 */ #include -#pragma omp requires unified_shared_memory #pragma omp begin declare target class A { @@ -22,39 +21,24 @@ class D: public A { int main(){ - // Section 1 -------------------------------------------------------- - D d; // D derives from A, and A::vf() is virtual - A &ar = d; // reference to Derived object d - - #pragma omp target // implicit map of ar is illegal here - { - ar.vf(); // unspecified whether A::vf() or D::vf() is called - } - - A *ap = &d; // pointer to derived object d - #pragma omp target // No need for mapping with Unified Share Memory - { // implicit ap[:0] map is fine - ap->vf(); // calls D::vf() - } - - // Section 2 -------------------------------------------------------- - ap = nullptr; - #pragma omp target map(ap) - { - ap = new A(); - } - - ap->vf(); // illegal - - #pragma omp target - { - delete ap; - } - ap = new A(); - #pragma omp target // No need for mapping with Unified Share Memory - { - ap->vf(); // ok - } + D d; // D derives from A, and vf() is virtual + + #pragma omp target data map(d) + { + // Case 1 + A *ap = &d; // pointer to derived object d + #pragma omp target // ap is firstprivate + { + ap->vf(); // calls D::vf() + } + + // Case 2 + A &ar = d; // reference to Derived object d + #pragma omp target // ar is implicitly mapped + { + ar.vf(); // unspecified behavior + } + } return 0; } diff --git a/devices/sources/virtual_functions.2.cpp b/devices/sources/virtual_functions.2.cpp new file mode 100644 index 0000000..5188383 --- /dev/null +++ b/devices/sources/virtual_functions.2.cpp 
@@ -0,0 +1,45 @@ +/* +* @@name: virtual_functions.2 +* @@type: C++ +* @@operation: link +* @@expect: rt-error +* @@version: omp_5.2 +*/ +#include <iostream> +#pragma omp requires unified_shared_memory + +#pragma omp begin declare target +class A { + public: + virtual void vf() { std::cout << "In A\n"; } +}; + +class D: public A { + public: + void vf() override { std::cout << "In D\n"; } +}; +#pragma omp end declare target + +int main(){ + + A *ap = nullptr; + // Case 1 + #pragma omp target + { + ap = new D(); + } + ap->vf(); // illegal + #pragma omp target + { + delete ap; + } + + // Case 2 + ap = new D(); + #pragma omp target // No need for mapping with Unified Shared Memory + { + ap->vf(); // ok + } + + return 0; +} diff --git a/devices/target.tex b/devices/target.tex index 0c349c8..face171 100644 --- a/devices/target.tex +++ b/devices/target.tex @@ -1,77 +1,77 @@ \pagebreak -\section{\code{target} Construct} +\section{\kcode{target} Construct} \label{sec:target} -\subsection{\code{target} Construct on \code{parallel} Construct} +\subsection{\kcode{target} Construct on \kcode{parallel} Construct} \label{subsec:target_parallel} -\index{constructs!target@\code{target}} -\index{target construct@\code{target} construct} -\index{target construct@\code{target} construct!implicit mapping} +\index{constructs!target@\kcode{target}} +\index{target construct@\kcode{target} construct} +\index{target construct@\kcode{target} construct!implicit mapping} -This following example shows how the \code{target} construct offloads a code -region to a target device. The variables \plc{p}, \plc{v1}, \plc{v2}, and \plc{N} are implicitly mapped +The following example shows how the \kcode{target} construct offloads a code +region to a target device. The variables \ucode{p}, \ucode{v1}, \ucode{v2}, and \ucode{N} are implicitly mapped to the target device. 
\cexample[4.0]{target}{1} \ffreeexample[4.0]{target}{1} -\subsection{\code{target} Construct with \code{map} Clause} +\subsection{\kcode{target} Construct with \kcode{map} Clause} \label{subsec:target_map} -\index{target construct@\code{target} construct!map clause@\code{map} clause} -\index{clauses!map@\code{map}} -\index{map clause@\code{map} clause} +\index{target construct@\kcode{target} construct!map clause@\kcode{map} clause} +\index{clauses!map@\kcode{map}} +\index{map clause@\kcode{map} clause} -This following example shows how the \code{target} construct offloads a code -region to a target device. The variables \plc{p}, \plc{v1} and \plc{v2} are explicitly mapped to the -target device using the \code{map} clause. The variable \plc{N} is implicitly mapped to +The following example shows how the \kcode{target} construct offloads a code +region to a target device. The variables \ucode{p}, \ucode{v1} and \ucode{v2} are explicitly mapped to the +target device using the \kcode{map} clause. The variable \ucode{N} is implicitly mapped to the target device. \cexample[4.0]{target}{2} \ffreeexample[4.0]{target}{2} -\subsection{\code{map} Clause with \code{to}/\code{from} map-types} +\subsection{\kcode{map} Clause with \kcode{to}/\kcode{from} map-types} \label{subsec:target_map_tofrom} -\index{map clause@\code{map} clause!to map-type@\code{to} map-type} -\index{map clause@\code{map} clause!from map-type@\code{from} map-type} +\index{map clause@\kcode{map} clause!to map-type@\kcode{to} map-type} +\index{map clause@\kcode{map} clause!from map-type@\kcode{from} map-type} -The following example shows how the \code{target} construct offloads a code region -to a target device. In the \code{map} clause, the \code{to} and \code{from} +The following example shows how the \kcode{target} construct offloads a code region +to a target device. 
In the \kcode{map} clause, the \kcode{to} and \kcode{from} map-types define the mapping between the original (host) data and the target (device) -data. The \code{to} map-type specifies that the data will only be read on the -device, and the \code{from} map-type specifies that the data will only be written +data. The \kcode{to} map-type specifies that the data will only be read on the +device, and the \kcode{from} map-type specifies that the data will only be written to on the device. By specifying a guaranteed access on the device, data transfers -can be reduced for the \code{target} region. +can be reduced for the \kcode{target} region. -The \code{to} map-type indicates that at the start of the \code{target} region -the variables \plc{v1} and \plc{v2} are initialized with the values of the corresponding variables -on the host device, and at the end of the \code{target} region the variables -\plc{v1} and \plc{v2} are not assigned to their corresponding variables on the host device. +The \kcode{to} map-type indicates that at the start of the \kcode{target} region +the variables \ucode{v1} and \ucode{v2} are initialized with the values of the corresponding variables +on the host device, and at the end of the \kcode{target} region the variables +\ucode{v1} and \ucode{v2} are not assigned to their corresponding variables on the host device. -The \code{from} map-type indicates that at the start of the \code{target} region -the variable \plc{p} is not initialized with the value of the corresponding variable -on the host device, and at the end of the \code{target} region the variable \plc{p} +The \kcode{from} map-type indicates that at the start of the \kcode{target} region +the variable \ucode{p} is not initialized with the value of the corresponding variable +on the host device, and at the end of the \kcode{target} region the variable \ucode{p} is assigned to the corresponding variable on the host device. 
\cexample[4.0]{target}{3} -The \code{to} and \code{from} map-types allow programmers to optimize data -motion. Since data for the \plc{v} arrays are not returned, and data for the \plc{p} array +The \kcode{to} and \kcode{from} map-types allow programmers to optimize data +motion. Since data for the \ucode{v} arrays are not returned, and data for the \ucode{p} array are not transferred to the device, only one-half of the data is moved, compared to the default behavior of an implicit mapping. \ffreeexample[4.0]{target}{3} -\subsection{\code{map} Clause with Array Sections} +\subsection{\kcode{map} Clause with Array Sections} \label{subsec:target_array_section} -\index{map clause@\code{map} clause!array sections in} +\index{map clause@\kcode{map} clause!array sections in} -The following example shows how the \code{target} construct offloads a code region -to a target device. In the \code{map} clause, map-types are used to optimize -the mapping of variables to the target device. Because variables \plc{p}, \plc{v1} and \plc{v2} are -pointers, array section notation must be used to map the arrays. The notation \code{:N} -is equivalent to \code{0:N}. +The following example shows how the \kcode{target} construct offloads a code region +to a target device. In the \kcode{map} clause, map-types are used to optimize +the mapping of variables to the target device. Because variables \ucode{p}, \ucode{v1} and \ucode{v2} are +pointers, array section notation must be used to map the arrays. The notation \ucode{:N} +is equivalent to \ucode{0:N}. \cexample[4.0]{target}{4} \clearpage @@ -79,46 +79,46 @@ \subsection{\code{map} Clause with Array Sections} In C, the length of the pointed-to array must be specified. In Fortran the extent of the array is known and the length need not be specified. A section of the array can be specified with the usual Fortran syntax, as shown in the following example. -The value 1 is assumed for the lower bound for array section \plc{v2(:N)}. 
+The value 1 is assumed for the lower bound for array section \ucode{v2(:N)}. \ffreeexample[4.0]{target}{4} -A more realistic situation in which an assumed-size array is passed to \code{vec\_mult} +A more realistic situation in which an assumed-size array is passed to \ucode{vec_mult} requires that the length of the arrays be specified, because the compiler does not know the size of the storage. A section of the array must be specified with the usual Fortran syntax, as shown in the following example. The value 1 is assumed -for the lower bound for array section \plc{v2(:N)}. +for the lower bound for array section \ucode{v2(:N)}. \ffreeexample[4.0]{target}{4b} -\subsection{\code{target} Construct with \code{if} Clause} +\subsection{\kcode{target} Construct with \kcode{if} Clause} \label{subsec:target_if} -\index{target construct@\code{target} construct!if clause@\code{if} clause} -\index{clauses!if@\code{if}} -\index{if clause@\code{if} clause} +\index{target construct@\kcode{target} construct!if clause@\kcode{if} clause} +\index{clauses!if@\kcode{if}} +\index{if clause@\kcode{if} clause} -The following example shows how the \code{target} construct offloads a code region +The following example shows how the \kcode{target} construct offloads a code region to a target device. -The \code{if} clause on the \code{target} construct indicates that if the variable -\plc{N} is smaller than a given threshold, then the \code{target} region will be executed +The \kcode{if} clause on the \kcode{target} construct indicates that if the variable +\ucode{N} is smaller than a given threshold, then the \kcode{target} region will be executed by the host device. 
-The \code{if} clause on the \code{parallel} construct indicates that if the -variable \plc{N} is smaller than a second threshold then the \code{parallel} region +The \kcode{if} clause on the \kcode{parallel} construct indicates that if the +variable \ucode{N} is smaller than a second threshold then the \kcode{parallel} region is inactive. \cexample[4.0]{target}{5} \ffreeexample[4.0]{target}{5} -The following example is a modification of the above \plc{target.5} code to show the combined \code{target} -and parallel loop directives. It uses the \plc{directive-name} modifier in multiple \code{if} +The following example is a modification of the above \example{target.5} code to show the combined \kcode{target} +and \kcode{parallel} directives. It uses the \plc{directive-name} modifier in multiple \kcode{if} clauses to specify the component directive to which it applies. -The \code{if} clause with the \code{target} modifier applies to the \code{target} component of the -combined directive, and the \code{if} clause with the \code{parallel} modifier applies -to the \code{parallel} component of the combined directive. +The \kcode{if} clause with the \kcode{target} modifier applies to the \kcode{target} component of the +combined directive, and the \kcode{if} clause with the \kcode{parallel} modifier applies +to the \kcode{parallel} component of the combined directive. 
\cexample[4.5]{target}{6} @@ -126,44 +126,44 @@ \subsection{\code{target} Construct with \code{if} Clause} \subsection{Target Reverse Offload} \label{subsec:target_reverse_offload} -\index{target reverse offload!reverse_offload clause@\scode{reverse_offload} clause} -\index{target reverse offload!requires directive@\code{requires} directive} -\index{requires directive@\code{requires} directive!reverse_offload clause@\scode{reverse_offload} clause} -\index{directives!requires@\code{requires}} -\index{clauses!reverse_offload@\scode{reverse_offload}} -\index{reverse_offload clause@\scode{reverse_offload} clause} -\index{target construct@\code{target} construct!device clause@\code{device} clause} -\index{clauses!device@\code{device}} -\index{device clause@\code{device} clause!ancestor modifier@\code{ancestor} modifier} -\index{ancestor modifier@\code{ancestor} modifier} -\index{declare target directive@\code{declare}~\code{target} directive!device_type clause@\scode{device_type} clause} -\index{clauses!device_type@\scode{device_type}} -\index{device_type clause@\scode{device_type} clause} -\index{clauses!enter@\code{enter}} -\index{enter clause@\code{enter} clause} - -\index{directives!declare target@\code{declare}~\code{target}} -\index{declare target directive@\code{declare}~\code{target} directive} +\index{target reverse offload!reverse_offload clause@\kcode{reverse_offload} clause} +\index{target reverse offload!requires directive@\kcode{requires} directive} +\index{requires directive@\kcode{requires} directive!reverse_offload clause@\kcode{reverse_offload} clause} +\index{directives!requires@\kcode{requires}} +\index{clauses!reverse_offload@\kcode{reverse_offload}} +\index{reverse_offload clause@\kcode{reverse_offload} clause} +\index{target construct@\kcode{target} construct!device clause@\kcode{device} clause} +\index{clauses!device@\kcode{device}} +\index{device clause@\kcode{device} clause!ancestor modifier@\kcode{ancestor} modifier} +\index{ancestor 
modifier@\kcode{ancestor} modifier} +\index{declare target directive@\kcode{declare target} directive!device_type clause@\kcode{device_type} clause} +\index{clauses!device_type@\kcode{device_type}} +\index{device_type clause@\kcode{device_type} clause} +\index{clauses!enter@\kcode{enter}} +\index{enter clause@\kcode{enter} clause} + +\index{directives!declare target@\kcode{declare target}} +\index{declare target directive@\kcode{declare target} directive} Beginning with OpenMP 5.0, implementations are allowed to offload back to the host (reverse offload). -In the example below the \plc{error\_handler} function +In the example below the \ucode{error_handler} function is executed back on the host, if an erroneous value is -detected in the \plc{A} array on the device. +detected in the \ucode{A} array on the device. This is accomplished by specifying the \plc{device-modifier} -\code{ancestor} modifier, along with a device number of \code{1}, +\kcode{ancestor} modifier, along with a device number of \ucode{1}, to indicate that the execution is to be performed on the immediate parent (\plc{1st ancestor})-- the host. -The \code{requires} directive (another 5.0 feature) -uses the \code{reverse\_offload} clause to guarantee +The \kcode{requires} directive (another 5.0 feature) +uses the \kcode{reverse_offload} clause to guarantee that the reverse offload is implemented. -Note that the \code{declare}~\code{target} directive uses the -\code{device\_type} clause (another 5.0 feature) to specify that -the \plc{error\_handler} function is compiled to +Note that the \kcode{declare target} directive uses the +\kcode{device_type} clause (another 5.0 feature) to specify that +the \ucode{error_handler} function is compiled to execute on the \plc{host} only. This ensures that no attempt will be made to create a device version of the function. 
This feature may be necessary if the function diff --git a/devices/target_associate_ptr.tex b/devices/target_associate_ptr.tex index b4470c5..1f066d4 100644 --- a/devices/target_associate_ptr.tex +++ b/devices/target_associate_ptr.tex @@ -2,45 +2,45 @@ \subsection{Device and Host Memory Association} \label{subsec:target_associate_ptr} \label{sec:target_associate_ptr} -\index{routines!omp_target_associate_ptr@\scode{omp_target_associate_ptr}} -\index{omp_target_associate_ptr routine@\scode{omp_target_associate_ptr} routine} +\index{routines!omp_target_associate_ptr@\kcode{omp_target_associate_ptr}} +\index{omp_target_associate_ptr routine@\kcode{omp_target_associate_ptr} routine} -\index{routines!omp_target_alloc@\scode{omp_target_alloc}} -\index{omp_target_alloc routine@\scode{omp_target_alloc} routine} +\index{routines!omp_target_alloc@\kcode{omp_target_alloc}} +\index{omp_target_alloc routine@\kcode{omp_target_alloc} routine} The association of device memory with host memory -can be established by calling the \scode{omp_target_associate_ptr} +can be established by calling the \kcode{omp_target_associate_ptr} API routine as part of the mapping. The following example shows the use of this routine -to associate device memory of size \splc{CS}, -allocated by the \scode{omp_target_alloc} routine and -pointed to by the device pointer \splc{dev_ptr}, -with a chunk of the host array \splc{arr} starting at index \splc{ioff}. -In Fortran, the intrinsic function \scode{c_loc} is called -to obtain the corresponding C pointer (\splc{h_ptr}) of \splc{arr(ioff)} +to associate device memory of size \ucode{CS}, +allocated by the \kcode{omp_target_alloc} routine and +pointed to by the device pointer \ucode{dev_ptr}, +with a chunk of the host array \ucode{arr} starting at index \ucode{ioff}. +In Fortran, the intrinsic function \vcode{c_loc} is called +to obtain the corresponding C pointer (\ucode{h_ptr}) of \ucode{arr(ioff)} for use in the call to the API routine. 
-\index{constructs!target update@\code{target}~\code{update}} -\index{target update construct@\code{target}~\code{update} construct} -\index{map clause@\code{map} clause!always modifier@\code{always} modifier} -\index{always modifier@\code{always} modifier} +\index{constructs!target update@\kcode{target update}} +\index{target update construct@\kcode{target update} construct} +\index{map clause@\kcode{map} clause!always modifier@\kcode{always} modifier} +\index{always modifier@\kcode{always} modifier} Since the reference count of the resulting mapping is infinite, -it is necessary to use the \scode{target}~\scode{update} directive (or -the \scode{always} modifier in a \scode{map} clause) to accomplish a +it is necessary to use the \kcode{target update} directive (or +the \kcode{always} modifier in a \kcode{map} clause) to accomplish a data transfer between host and device. -The explicit mapping of the array section \splc{arr[ioff:CS]} -(or \splc{arr(ioff:ioff+CS-1)} in Fortran) on the \scode{target} +The explicit mapping of the array section \ucode{arr[ioff:CS]} +(or \ucode{arr(ioff:ioff+CS-1)} in Fortran) on the \kcode{target} construct ensures that the allocated and associated device memory is used -when referencing the array \splc{arr} in the \scode{target} region. -The device pointer \splc{dev_ptr} cannot be accessed directly -after a call to the \scode{omp_target_associate_ptr} routine. +when referencing the array \ucode{arr} in the \kcode{target} region. +The device pointer \ucode{dev_ptr} cannot be accessed directly +after a call to the \kcode{omp_target_associate_ptr} routine. 
-\index{routines!omp_target_disassociate_ptr@\scode{omp_target_disassociate_ptr}} -\index{omp_target_disassociate_ptr routine@\scode{omp_target_disassociate_ptr} routine} -\index{routines!omp_target_free@\scode{omp_target_free}} -\index{omp_target_free routine@\scode{omp_target_free} routine} -After the \scode{target} region, the device pointer is disassociated from -the current chunk of the host memory by calling the \scode{omp_target_disassociate_ptr} routine before working on the next chunk. -The device memory is freed by calling the \scode{omp_target_free} +\index{routines!omp_target_disassociate_ptr@\kcode{omp_target_disassociate_ptr}} +\index{omp_target_disassociate_ptr routine@\kcode{omp_target_disassociate_ptr} routine} +\index{routines!omp_target_free@\kcode{omp_target_free}} +\index{omp_target_free routine@\kcode{omp_target_free} routine} +After the \kcode{target} region, the device pointer is disassociated from +the current chunk of the host memory by calling the \kcode{omp_target_disassociate_ptr} routine before working on the next chunk. +The device memory is freed by calling the \kcode{omp_target_free} routine at the end. \cexample[4.5]{target_associate_ptr}{1} diff --git a/devices/target_data.tex b/devices/target_data.tex index 5a50ad7..058c663 100644 --- a/devices/target_data.tex +++ b/devices/target_data.tex @@ -1,19 +1,19 @@ -\pagebreak -\section{\code{target} \code{data} Construct} +%\pagebreak +\section{\kcode{target data} Construct} \label{sec:target_data} -\subsection{Simple \code{target} \code{data} Construct} +\subsection{Simple \kcode{target data} Construct} \label{subsec:target_data_simple} -\index{constructs!target data@\code{target}~\code{data}} -\index{target data construct@\code{target}~\code{data} construct} - -This example shows how the \code{target} \code{data} construct maps variables -to a device data environment. 
The \code{target} \code{data} construct creates -a new device data environment and maps the variables \plc{v1}, \plc{v2}, and \plc{p} to the new device -data environment. The \code{target} construct enclosed in the \code{target} -\code{data} region creates a new device data environment, which inherits the -variables \plc{v1}, \plc{v2}, and \plc{p} from the enclosing device data environment. The variable -\plc{N} is mapped into the new device data environment from the encountering task's data +\index{constructs!target data@\kcode{target data}} +\index{target data construct@\kcode{target data} construct} + +This example shows how the \kcode{target data} construct maps variables +to a device data environment. The \kcode{target data} construct creates +a new device data environment and maps the variables \ucode{v1}, \ucode{v2}, and \ucode{p} to the new device +data environment. The \kcode{target} construct enclosed in the +\kcode{target data} region creates a new device data environment, which inherits the +variables \ucode{v1}, \ucode{v2}, and \ucode{p} from the enclosing device data environment. The variable +\ucode{N} is mapped into the new device data environment from the encountering task's data environment. \cexample[4.0]{target_data}{1} @@ -25,164 +25,163 @@ \subsection{Simple \code{target} \code{data} Construct} \ffreeexample[4.0]{target_data}{1} -\subsection{\code{target} \code{data} Region Enclosing Multiple \code{target} Regions} +\subsection{\kcode{target data} Region Enclosing Multiple \kcode{target} Regions} \label{subsec:target_data_multiregion} -The following examples show how the \code{target} \code{data} construct maps -variables to a device data environment of a \code{target} region. The \code{target} -\code{data} construct creates a device data environment and encloses \code{target} +The following examples show how the \kcode{target data} construct maps +variables to a device data environment of a \kcode{target} region. 
+The \kcode{target data} construct creates a device data environment and encloses \kcode{target} regions, which have their own device data environments. The device data environment -of the \code{target} \code{data} region is inherited by the device data environment -of an enclosed \code{target} region. The \code{target} \code{data} construct -is used to create variables that will persist throughout the \code{target} \code{data} +of the \kcode{target data} region is inherited by the device data environment +of an enclosed \kcode{target} region. The \kcode{target data} construct +is used to create variables that will persist throughout the \kcode{target data} region. -In the following example the variables \plc{v1} and \plc{v2} are mapped at each \code{target} -construct. Instead of mapping the variable \plc{p} twice, once at each \code{target} -construct, \plc{p} is mapped once by the \code{target} \code{data} construct. +In the following example the variables \ucode{v1} and \ucode{v2} are mapped at each \kcode{target} +construct. Instead of mapping the variable \ucode{p} twice, once at each \kcode{target} +construct, \ucode{p} is mapped once by the \kcode{target data} construct. \cexample[4.0]{target_data}{2} -The Fortran code uses reference and specifies the extent of the \plc{p}, \plc{v1} and \plc{v2} arrays. -No length information is necessary in the \code{map} clause, as is required with -C/C++ pointers. The arrays \plc{v1} and \plc{v2} are mapped at each \code{target} construct. -Instead of mapping the array \plc{p} twice, once at each target construct, \plc{p} is mapped -once by the \code{target} \code{data} construct. +The Fortran code uses reference and specifies the extent of the \ucode{p}, \ucode{v1} and \ucode{v2} arrays. +No length information is necessary in the \kcode{map} clause, as is required with +C/C++ pointers. The arrays \ucode{v1} and \ucode{v2} are mapped at each \kcode{target} construct. 
+Instead of mapping the array \ucode{p} twice, once at each target construct, \ucode{p} is mapped +once by the \kcode{target data} construct. \ffreeexample[4.0]{target_data}{2} -\index{target data construct@\code{target}~\code{data} construct!map clause@\code{map} clause} -\index{target construct@\code{target} construct!map clause@\code{map} clause} -\index{target construct@\code{target} construct!implicit mapping} -\index{map clause@\code{map} clause!tofrom map-type@\code{tofrom} map-type} -In the following example, the array \plc{Q} is mapped once at the enclosing -\code{target}~\code{data} region instead of at each \code{target} construct. -In OpenMP 4.0, a scalar variable is implicitly mapped with the \code{tofrom} map-type. -But since OpenMP 4.5, a scalar variable, such as the \plc{tmp} variable, has to be explicitly mapped with -the \code{tofrom} map-type at the first \code{target} construct in order to return +\index{target data construct@\kcode{target data} construct!map clause@\kcode{map} clause} +\index{target construct@\kcode{target} construct!map clause@\kcode{map} clause} +\index{target construct@\kcode{target} construct!implicit mapping} +\index{map clause@\kcode{map} clause!tofrom map-type@\kcode{tofrom} map-type} +In the following example, the array \ucode{Q} is mapped once at the enclosing +\kcode{target data} region instead of at each \kcode{target} construct. +In OpenMP 4.0, a scalar variable is implicitly mapped with the \kcode{tofrom} map-type. +But since OpenMP 4.5, a scalar variable, such as the \ucode{tmp} variable, has to be explicitly mapped with +the \kcode{tofrom} map-type at the first \kcode{target} construct in order to return its reduced value from the parallel loop construct to the host. -The variable defaults to firstprivate at the second \code{target} construct. +The variable defaults to firstprivate at the second \kcode{target} construct. 
\cexample[4.0]{target_data}{3} \ffreeexample[4.0]{target_data}{3} -\subsection{\code{target} \code{data} Construct with Orphaned Call} +\subsection{\kcode{target data} Construct with Orphaned Call} -The following two examples show how the \code{target} \code{data} construct -maps variables to a device data environment. The \code{target} \code{data} -construct's device data environment encloses the \code{target} construct's device -data environment in the function \code{vec\_mult()}. +The following two examples show how the \kcode{target data} construct +maps variables to a device data environment. The \kcode{target data} +construct's device data environment encloses the \kcode{target} construct's device +data environment in the function \ucode{vec_mult()}. -\index{map clause@\code{map} clause!alloc map-type@\code{alloc} map-type} +\index{map clause@\kcode{map} clause!alloc map-type@\kcode{alloc} map-type} When the type of the variable appearing in an array section is pointer, the pointer variable and the storage location of the corresponding array section are mapped to the device data environment. The pointer variable is treated as if it had appeared -in a \code{map} clause with a map-type of \code{alloc}. The array section's -storage location is mapped according to the map-type in the \code{map} clause -(the default map-type is \code{tofrom}). +in a \kcode{map} clause with a map-type of \kcode{alloc}. The array section's +storage location is mapped according to the map-type in the \kcode{map} clause +(the default map-type is \kcode{tofrom}). 
-The \code{target} construct's device data environment inherits the storage locations -of the array sections \plc{v1[0:N]}, \plc{v2[:n]}, and \plc{p0[0:N]} from the enclosing \code{target}~\code{data} +The \kcode{target} construct's device data environment inherits the storage locations +of the array sections \ucode{v1[0:N]}, \ucode{v2[:n]}, and \ucode{p0[0:N]} from the enclosing \kcode{target data} construct's device data environment. Neither initialization nor assignment is performed for the array sections in the new device data environment. -The pointer variables \plc{p1}, \plc{v3}, and \plc{v4} are mapped into the \code{target} construct's device +The pointer variables \ucode{p1}, \ucode{v3}, and \ucode{v4} are mapped into the \kcode{target} construct's device data environment with an implicit map-type of alloc and they are assigned the address of the storage location associated with their corresponding array sections. Note -that the following pairs of array section storage locations are equivalent (\plc{p0[:N]}, -\plc{p1[:N]}), (\plc{v1[:N]},\plc{v3[:N]}), and (\plc{v2[:N]},\plc{v4[:N]}). +that the following pairs of array section storage locations are equivalent (\ucode{p0[:N]}, +\ucode{p1[:N]}), (\ucode{v1[:N]},\ucode{v3[:N]}), and (\ucode{v2[:N]},\ucode{v4[:N]}). \cexample[4.0]{target_data}{4} The Fortran code maps the pointers and storage in an identical manner (same extent, -but uses indices from 1 to \plc{N}). +but uses indices from 1 to \ucode{N}). -The \code{target} construct's device data environment inherits the storage locations -of the arrays \plc{v1}, \plc{v2} and \plc{p0} from the enclosing \code{target} \code{data} constructs's +The \kcode{target} construct's device data environment inherits the storage locations +of the arrays \ucode{v1}, \ucode{v2} and \ucode{p0} from the enclosing \kcode{target data} construct's device data environment. However, in Fortran the associated data of the pointer is known, and the shape is not required. 
-The pointer variables \plc{p1}, \plc{v3}, and \plc{v4} are mapped into the \code{target} construct's -device data environment with an implicit map-type of \code{alloc} and they are +The pointer variables \ucode{p1}, \ucode{v3}, and \ucode{v4} are mapped into the \kcode{target} construct's +device data environment with an implicit map-type of \kcode{alloc} and they are assigned the address of the storage location associated with their corresponding array sections. Note that the following pair of array storage locations are equivalent -(\plc{p0},\plc{p1}), (\plc{v1},\plc{v3}), and (\plc{v2},\plc{v4}). +(\ucode{p0},\ucode{p1}), (\ucode{v1},\ucode{v3}), and (\ucode{v2},\ucode{v4}). \ffreeexample[4.0]{target_data}{4} -In the following example, the variables \plc{p1}, \plc{v3}, and \plc{v4} are references to the pointer -variables \plc{p0}, \plc{v1} and \plc{v2} respectively. The \code{target} construct's device data -environment inherits the pointer variables \plc{p0}, \plc{v1}, and \plc{v2} from the enclosing \code{target} -\code{data} construct's device data environment. Thus, \plc{p1}, \plc{v3}, and \plc{v4} are already +In the following example, the variables \ucode{p1}, \ucode{v3}, and \ucode{v4} are references to the pointer +variables \ucode{p0}, \ucode{v1} and \ucode{v2} respectively. The \kcode{target} construct's device data +environment inherits the pointer variables \ucode{p0}, \ucode{v1}, and \ucode{v2} from the enclosing +\kcode{target data} construct's device data environment. Thus, \ucode{p1}, \ucode{v3}, and \ucode{v4} are already present in the device data environment. \cppexample[4.0]{target_data}{5} In the following example, the usual Fortran approach is used for dynamic memory. -The \plc{p0}, \plc{v1}, and \plc{v2} arrays are allocated in the main program and passed as references -from one routine to another. In \code{vec\_mult}, \plc{p1}, \plc{v3} and \plc{v4} are references to the -\plc{p0}, \plc{v1}, and \plc{v2} arrays, respectively. 
The \code{target} construct's device data -environment inherits the arrays \plc{p0}, \plc{v1}, and \plc{v2} from the enclosing target data construct's -device data environment. Thus, \plc{p1}, \plc{v3}, and \plc{v4} are already present in the device +The \ucode{p0}, \ucode{v1}, and \ucode{v2} arrays are allocated in the main program and passed as references +from one routine to another. In \ucode{vec_mult}, \ucode{p1}, \ucode{v3} and \ucode{v4} are references to the +\ucode{p0}, \ucode{v1}, and \ucode{v2} arrays, respectively. The \kcode{target} construct's device data +environment inherits the arrays \ucode{p0}, \ucode{v1}, and \ucode{v2} from the enclosing \kcode{target data} construct's +device data environment. Thus, \ucode{p1}, \ucode{v3}, and \ucode{v4} are already present in the device data environment. \ffreeexample[4.0]{target_data}{5} -\subsection{\code{target} \code{data} Construct with \code{if} Clause} +\subsection{\kcode{target data} Construct with \kcode{if} Clause} \label{subsec:target_data_if} -\index{target data construct@\code{target}~\code{data} construct!if clause@\code{if} clause} -\index{clauses!if@\code{if}} -\index{if clause@\code{if} clause} +\index{target data construct@\kcode{target data} construct!if clause@\kcode{if} clause} +\index{clauses!if@\kcode{if}} +\index{if clause@\kcode{if} clause} -The following two examples show how the \code{target} \code{data} construct +The following two examples show how the \kcode{target data} construct maps variables to a device data environment. -In the following example, the if clause on the \code{target} \code{data} construct -indicates that if the variable \plc{N} is smaller than a given threshold, then the \code{target} -\code{data} construct will not create a device data environment. 
+In the following example, the \kcode{if} clause on the \kcode{target data} construct +indicates that if the variable \ucode{N} is smaller than a given threshold, then +the \kcode{target data} construct will not create a device data environment. -The \code{target} constructs enclosed in the \code{target} \code{data} region -must also use an \code{if} clause on the same condition, otherwise the pointer -variable \plc{p} is implicitly mapped with a map-type of \code{tofrom}, but the storage -location for the array section \plc{p[0:N]} will not be mapped in the device data environments -of the \code{target} constructs. +The \kcode{target} constructs enclosed in the \kcode{target data} region +must also use an \kcode{if} clause on the same condition, otherwise the pointer +variable \ucode{p} is implicitly mapped with a map-type of \kcode{tofrom}, but the storage +location for the array section \ucode{p[0:N]} will not be mapped in the device data environments +of the \kcode{target} constructs. \cexample[4.0]{target_data}{6} -\pagebreak -The \code{if} clauses work the same way for the following Fortran code. The \code{target} -constructs enclosed in the \code{target} \code{data} region should also use -an \code{if} clause with the same condition, so that the \code{target} \code{data} -region and the \code{target} region are either both created for the device, or +%\pagebreak +The \kcode{if} clauses work the same way for the following Fortran code. The \kcode{target} +constructs enclosed in the \kcode{target data} region should also use +an \kcode{if} clause with the same condition, so that the \kcode{target data} +region and the \kcode{target} region are either both created for the device, or are both ignored. \ffreeexample[4.0]{target_data}{6} -\pagebreak -In the following example, when the \code{if} clause conditional expression on -the \code{target} construct evaluates to \plc{false}, the target region will -execute on the host device. 
However, the \code{target} \code{data} construct -created an enclosing device data environment that mapped \plc{p[0:N]} to a device data -environment on the default device. At the end of the \code{target} \code{data} -region the array section \plc{p[0:N]} will be assigned from the device data environment +%\pagebreak +In the following example, when the \kcode{if} clause conditional expression on +the \kcode{target} construct evaluates to \vcode{false}, the target region will +execute on the host device. However, the \kcode{target data} construct +created an enclosing device data environment that mapped \ucode{p[0:N]} to a device data +environment on the default device. At the end of the \kcode{target data} +region the array section \ucode{p[0:N]} will be assigned from the device data environment to the corresponding variable in the data environment of the task that encountered -the \code{target} \code{data} construct, resulting in undefined values in \plc{p[0:N]}. +the \kcode{target data} construct, resulting in undefined values in \ucode{p[0:N]}. \cexample[4.0]{target_data}{7} -\pagebreak -The \code{if} clauses work the same way for the following Fortran code. When -the \code{if} clause conditional expression on the \code{target} construct -evaluates to \plc{false}, the \code{target} region will execute on the host -device. However, the \code{target} \code{data} construct created an enclosing -device data environment that mapped the \plc{p} array (and \plc{v1} and \plc{v2}) to a device data -environment on the default target device. At the end of the \code{target} \code{data} -region the \plc{p} array will be assigned from the device data environment to the corresponding -variable in the data environment of the task that encountered the \code{target} -\code{data} construct, resulting in undefined values in \plc{p}. +%\pagebreak +The \kcode{if} clauses work the same way for the following Fortran code. 
When +the \kcode{if} clause conditional expression on the \kcode{target} construct +evaluates to \vcode{false}, the \kcode{target} region will execute on the host +device. However, the \kcode{target data} construct created an enclosing +device data environment that mapped the \ucode{p} array (and \ucode{v1} and \ucode{v2}) to a device data +environment on the default target device. At the end of the \kcode{target data} +region the \ucode{p} array will be assigned from the device data environment to the corresponding +variable in the data environment of the task that encountered the \kcode{target data} construct, resulting in undefined values in \ucode{p}. \ffreeexample[4.0]{target_data}{7} diff --git a/devices/target_defaultmap.tex b/devices/target_defaultmap.tex index b268c50..34558dc 100644 --- a/devices/target_defaultmap.tex +++ b/devices/target_defaultmap.tex @@ -1,60 +1,68 @@ -\pagebreak -\section{\code{defaultmap} Clause} +%\pagebreak +\section{\kcode{defaultmap} Clause} \label{sec:defaultmap} -\index{target construct@\code{target} construct!defaultmap clause@\code{defaultmap} clause} -\index{clauses!defaultmap@\code{defaultmap}} -\index{defaultmap clause@\code{defaultmap} clause} -\index{defaultmap clause@\code{defaultmap} clause!implicit behavior} -\index{defaultmap clause@\code{defaultmap} clause!variable category} +\index{target construct@\kcode{target} construct!defaultmap clause@\kcode{defaultmap} clause} +\index{clauses!defaultmap@\kcode{defaultmap}} +\index{defaultmap clause@\kcode{defaultmap} clause} +\index{defaultmap clause@\kcode{defaultmap} clause!implicit behavior} +\index{defaultmap clause@\kcode{defaultmap} clause!variable category} The implicitly determined data-mapping and data-sharing attribute -rules of variables referenced in a \code{target} construct can be -changed by the \code{defaultmap} clause. +rules of variables referenced in a \kcode{target} construct can be +changed by the \kcode{defaultmap} clause. 
As of OpenMP 5.0, the implicit behavior is specified as -\code{alloc}, \code{to}, \code{from}, \code{tofrom}, -\code{firstprivate}, \code{none}, \code{default} or \code{present}, -and is optionally applied to a variable category specified as \code{scalar}, \code{aggregate}, \code{allocatable}, -or \code{pointer}. +\kcode{alloc}, \kcode{to}, \kcode{from}, \kcode{tofrom}, +\kcode{firstprivate}, \kcode{none}, \kcode{default} or \kcode{present}, +and is optionally applied to a variable category specified as \kcode{scalar}, \kcode{aggregate}, \kcode{allocatable}, +or \kcode{pointer}. -A referenced variable that is in a specified ``category`` is treated as having -the specified implicit behavior. In C/C++, \code{scalar} refers to +A referenced variable that is in a specified ``category'' is treated as having +the specified implicit behavior. In C/C++, \kcode{scalar} refers to base-language scalar variables, except pointers. In Fortran it refers to a scalar variable, as defined by the base language, of intrinsic type but -excluding the character type. The \code{aggregate} category refers to arrays and +excluding the character type. The \kcode{aggregate} category refers to arrays and structures (which includes variables of any derived type and of character type for Fortran). Fortran -has the additional category of \code{allocatable} for variables that have the -allocatable attribute. The \code{pointer} category refers to pointers, which +has the additional category of \kcode{allocatable} for variables that have the +allocatable attribute. The \kcode{pointer} category refers to pointers, which for Fortran are variables that have the pointer attribute. 
-In the example below, the first \code{target} construct uses \code{defaultmap} +In the example below, the first \kcode{target} construct uses \kcode{defaultmap} clauses to set data-mapping and possibly data-sharing attributes that reproduce the default rules for implicitly determined data-mapping and data-sharing -attributes for variables in the construct. That is, if the \code{defaultmap} -clauses were removed, the results would be identical. +attributes for variables in the construct. That is, if the \kcode{defaultmap} +clauses were removed, the results would be identical. As of OpenMP 5.2 +the same effect can now be achieved by \kcode{defaultmap(default)} with +the \kcode{target} construct. -In the second \code{target} construct all implicit behavior is removed -by specifying the \code{none} implicit behavior in the \code{defaultmap} clause. +In the second \kcode{target} construct all implicit behavior is removed +by specifying the \kcode{none} implicit behavior in the \kcode{defaultmap} clause. Hence, all variables that don't have predetermined attributes must be given an -explicit data-mapping or data-sharing attribute. A scalar (\texttt{s}), an -array (\texttt{A}) and a structure (\texttt{S} for the C/C++ example and -\texttt{D} for the Fortran example) are explicitly mapped with the -\code{tofrom} map type. - -The third \code{target} construct shows another usual case for using the -\code{defaultmap} clause. The default mapping for (non-pointer) scalar -variables is specified. Here, the default implicit mapping for \texttt{s3} is -\code{tofrom} as specified in the \code{defaultmap} clause, while \texttt{s1} -and \texttt{s2} are instead explicitly treated as \code{firstprivate}. - -In the fourth \code{target} construct all arrays and structures are given -\code{firstprivate} implicit behavior by default with the use of the -\code{aggregate} variable category. 
For the Fortran example, the -\code{allocatable} category is used in a separate \code{defaultmap} clause to -specify default \code{firstprivate} implicit behavior for referenced -allocatable variables (in this case, \texttt{H}). +explicit data-mapping or data-sharing attribute. A scalar (\ucode{s}), an +array (\ucode{A}) and a structure (\ucode{S} for the C/C++ example and +\ucode{D} for the Fortran example) are explicitly mapped with the +\kcode{tofrom} map type. + +The third \kcode{target} construct shows another usual case for using the +\kcode{defaultmap} clause. The default mapping for (non-pointer) scalar +variables is specified. Here, the default implicit mapping for \ucode{s3} is +\kcode{tofrom} as specified in the \kcode{defaultmap} clause, while \ucode{s1} +and \ucode{s2} are instead explicitly treated as \kcode{firstprivate}. + +In the fourth \kcode{target} construct all arrays and structures are given +\kcode{firstprivate} implicit behavior by default with the use of the +\kcode{aggregate} variable category. For the Fortran example, the +\kcode{allocatable} category is used in a separate \kcode{defaultmap} clause to +specify default \kcode{firstprivate} implicit behavior for referenced +allocatable variables (in this case, \ucode{H}). % (Common use cases for C/C++ heap storage can be found in % \specref{sec:pointer_mapping}.) -\cexample[5.0]{target_defaultmap}{1} +The fifth \kcode{target} construct shows a case for using the +\kcode{defaultmap} clause with the \kcode{all} variable category which was introduced in +OpenMP 5.2. The scalar variables \ucode{s1} and \ucode{s2} are mapped \kcode{to}. +\ucode{s3} is only mapped \kcode{from} due to the explicit map specified. 
+ +%\pagebreak +\cexample[5.2]{target_defaultmap}{1} -\ffreeexample[5.0]{target_defaultmap}{1} +\ffreeexample[5.2]{target_defaultmap}{1} diff --git a/devices/target_fort_allocatable_array_mapping.tex b/devices/target_fort_allocatable_array_mapping.tex index b792a3e..3a159e5 100644 --- a/devices/target_fort_allocatable_array_mapping.tex +++ b/devices/target_fort_allocatable_array_mapping.tex @@ -1,16 +1,16 @@ -\pagebreak +%\pagebreak \section{Fortran Allocatable Array Mapping} \label{sec:fort_allocatable_array_mapping} \index{mapping!allocatable array, Fortran} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -The following examples illustrate the use of Fortran allocatable arrays in \code{target} regions. +The following examples illustrate the use of Fortran allocatable arrays in \kcode{target} regions. -In the first example, allocatable variables (\plc{a} and \plc{b}) are first allocated +In the first example, allocatable variables (\ucode{a} and \ucode{b}) are first allocated on the host, and then mapped onto a device in the Target 1 and 2 sections, respectively. -For \plc{a} the map is implicit and for \plc{b} an explicit map is used. -Both are mapped with the default \code{tofrom} map type. +For \ucode{a} the map is implicit and for \ucode{b} an explicit map is used. +Both are mapped with the default \kcode{tofrom} map type. The user-level behavior is similar to non-allocatable arrays. However, the mapping operations include creation of the allocatable variable, creation of the allocated storage, setting the allocation status to allocated, @@ -18,17 +18,17 @@ \section{Fortran Allocatable Array Mapping} In Target 3 and 4 sections, allocatable variables are mapped in two different ways before they are allocated on the host and subsequently used on the device. 
-In one case, a \code{target}~\code{data} construct creates an enclosing region for +In one case, a \kcode{target data} construct creates an enclosing region for the allocatable variable to persist, and in the other case a -\code{declare}~\code{target} directive maps the allocation variable for all device executions. -In both cases the new array storage is mapped \code{tofrom} with the \code{always} modifier. -An explicit map is used here with an \code{always} modifier to ensure that the allocatable +\kcode{declare target} directive maps the allocatable variable for all device executions. +In both cases the new array storage is mapped \kcode{tofrom} with the \kcode{always} modifier. +An explicit map is used here with an \kcode{always} modifier to ensure that the allocatable variable status is updated on the device. -Note: OpenMP 5.1 specifies that an \code{always} map modifier guarantees the +Note: OpenMP 5.1 specifies that an \kcode{always} map modifier guarantees the allocation status update for an existing allocatable variable on the device. -In OpenMP 6.0, this restriction may be relaxed to also guarantee updates -without the \code{always} modifier. +%In OpenMP 6.0, this restriction may be relaxed to also guarantee updates +%without the \kcode{always} modifier. In Target 3 and 4 sections, the behavior of an allocatable variable is very much like a Fortran pointer, in which a pointer can be mapped to a device with an associated @@ -40,25 +40,26 @@ \section{Fortran Allocatable Array Mapping} \ffreeexample[5.1]{target_fort_allocatable_map}{1} Once an allocatable variable has been allocated on the host, -its allocation status may not be changed in a \code{target} region, either +its allocation status may not be changed in a \kcode{target} region, either explicitly or implicitly. The following example illustrates typical operations on allocatable variables that violate this restriction. 
Note, an assignment that reshapes or reassigns (causing a deallocation -and allocation) in a \code{target} region is not conforming. +and allocation) in a \kcode{target} region is not conforming. Also, an initial intrinsic assignment of an allocatable variable -requires deallocation before the \scode{target} region ends. +requires deallocation before the \kcode{target} region ends. \ffreeexample[5.1]{target_fort_allocatable_map}{2} +\newpage The next example illustrates a corner case of this restriction (allocatable status -change in a \code{target} region). -Two allocatable arrays are passed to a subroutine within a \code{target} -region. The dummy-variable arrays are declared allocatable. -Also, the \plc{ain} variable has the \plc{intent(in)} attribute, and \plc{bout} -has the \plc{intent(out)} attribute. -For the dummy argument with the attributes \plc{allocatable} and \plc{intent(out)}, +change in a \kcode{target} region). +Two allocatable arrays are passed to a subroutine within a \kcode{target} +region. The dummy-variable arrays are declared \bcode{allocatable}. +Also, the \ucode{ain} variable has the \bcode{intent(in)} attribute, and \ucode{bout} +has the \bcode{intent(out)} attribute. +For the dummy argument with the attributes \bcode{allocatable} and \bcode{intent(out)}, the compiler will deallocate the associated actual argument when the subroutine is invoked. (However, the allocation on procedure entry can be avoided by specifying the intent -as \plc{intent(inout)}, making the intended use conforming.) +as \bcode{intent(inout)}, making the intended use conforming.) 
\ffreeexample[5.1]{target_fort_allocatable_map}{3} diff --git a/devices/target_mapper.tex b/devices/target_mapper.tex index e1dd9ef..0179fd8 100644 --- a/devices/target_mapper.tex +++ b/devices/target_mapper.tex @@ -1,31 +1,31 @@ \pagebreak -\section{\code{declare mapper} Directive} +\section{\kcode{declare mapper} Directive} \label{sec:declare_mapper} -\index{directives!declare mapper@\code{declare}~\code{mapper}} -\index{declare mapper directive@\code{declare}~\code{mapper} directive} +\index{directives!declare mapper@\kcode{declare mapper}} +\index{declare mapper directive@\kcode{declare mapper} directive} -The following examples show how to use the \code{declare mapper} +The following examples show how to use the \kcode{declare mapper} directive to prescribe a map for later use. It is also quite useful for pre-defining partitioned and nested structure elements. -In the first example the \code{declare mapper} directive specifies -that any structure of type \plc{myvec\_t} for which implicit data-mapping -rules apply will be mapped according to its \code{map} clause. -The variable \plc{v} is used for referencing the structure and its -elements within the \code{map} clause. -Within the \code{map} clause the \plc{v} variable specifies that all +In the first example the \kcode{declare mapper} directive specifies +that any structure of type \ucode{myvec_t} for which implicit data-mapping +rules apply will be mapped according to its \kcode{map} clause. +The variable \ucode{v} is used for referencing the structure and its +elements within the \kcode{map} clause. +Within the \kcode{map} clause the \ucode{v} variable specifies that all elements of the structure are to be mapped. Additionally, the -array section \plc{v.data[0:v.len]} specifies that the dynamic +array section \ucode{v.data[0:v.len]} specifies that the dynamic storage for data is to be mapped. -Within the main program the \plc{s} variable is typed as \plc{myvec\_t}. 
-Since the variable is found within the target region and the type has a mapping prescribed by -a \code{declare mapper} directive, it will be automatically mapped according to its prescription: -full structure, plus the dynamic storage of the \plc{data} element. +Within the main program the \ucode{s} variable is typed as \ucode{myvec_t}. +Since the variable is found within the \kcode{target} region and the type has a mapping prescribed by +a \kcode{declare mapper} directive, it will be automatically mapped according to its prescription: +full structure, plus the dynamic storage of the \ucode{data} element. -%Note: By default the mapping is \code{tofrom}. -%The associated Fortran allocatable \plc{data} array is automatically mapped with the derived +%Note: By default the mapping is \kcode{tofrom}. +%The associated Fortran allocatable \ucode{data} array is automatically mapped with the derived %type, it does not require an array section as in the C/C++ example. \cexample[5.0]{target_mapper}{1} @@ -34,31 +34,31 @@ \section{\code{declare mapper} Directive} %\pagebreak \index{mapping!deep copy} -\index{map clause@\code{map} clause!mapper modifier@\code{mapper} modifier} -\index{mapper modifier@\code{mapper} modifier} +\index{map clause@\kcode{map} clause!mapper modifier@\kcode{mapper} modifier} +\index{mapper modifier@\kcode{mapper} modifier} The next example illustrates the use of the \plc{mapper-identifier} and deep copy within a structure. -The structure, \plc{dzmat\_t}, represents a complex matrix, -with separate real (\plc{r\_m}) and imaginary (\plc{i\_m}) elements. -Two map identifiers are created for partitioning the \plc{dzmat\_t} structure. +The structure, \ucode{dzmat_t}, represents a complex matrix, +with separate real (\ucode{r_m}) and imaginary (\ucode{i_m}) elements. +Two map identifiers are created for partitioning the \ucode{dzmat_t} structure. 
-For the C/C++ code the first identifier is named \plc{top\_id} and maps the top half of -two matrices of type \plc{dzmat\_t}; while the second identifier, \plc{bottom\_id}, +For the C/C++ code the first identifier is named \ucode{top_id} and maps the top half of +two matrices of type \ucode{dzmat_t}; while the second identifier, \ucode{bottom_id}, maps the lower half of two matrices. -Each identifier is applied to a different \code{target} construct, -as \code{map(mapper(top\_id), tofrom: a,b)} -and \code{map(mapper(bottom\_id), tofrom: a,b)}. +Each identifier is applied to a different \kcode{target} construct, +as \kcode{map(mapper(\ucode{top_id}), tofrom: \ucode{a,b})} +and \kcode{map(mapper(\ucode{bottom_id}), tofrom: \ucode{a,b})}. Each target offload is allowed to execute concurrently on two different devices -(\plc{0} and \plc{1}) through the \code{nowait} clause. -%The OpenMP 5.1 \code{parallel masked} construct creates a region of two threads -%for these \code{target} constructs, with a single thread (\plc{primary}) generator. +(\ucode{0} and \ucode{1}) through the \kcode{nowait} clause. +%The OpenMP 5.1 \kcode{parallel masked} construct creates a region of two threads +%for these \kcode{target} constructs, with a single thread (\plc{primary}) generator. -The Fortran code uses the \plc{left\_id} and \plc{right\_id} map identifiers in the -\code{map(mapper(left\_id),tofrom: a,b)} and \code{map(mapper(right\_id),tofrom: a,b)} map clauses. +The Fortran code uses the \ucode{left_id} and \ucode{right_id} map identifiers in the +\kcode{map(mapper(\ucode{left_id}),tofrom: \ucode{a,b})} and \kcode{map(mapper(\ucode{right_id}),tofrom: \ucode{a,b})} map clauses. The array sections for these left and right contiguous portions of the matrices -were defined previously in the \code{declare mapper} directive. +were defined previously in the \kcode{declare mapper} directive. 
-Note, the \plc{is} and \plc{ie} scalars are firstprivate -by default for a target region, but are declared firstprivate anyway +Note, the \ucode{is} and \ucode{ie} scalars are firstprivate +by default for a \kcode{target} region, but are declared firstprivate anyway to remind the user of important firstprivate data-sharing properties required here. \cexample[5.0]{target_mapper}{2} @@ -66,23 +66,23 @@ \section{\code{declare mapper} Directive} \ffreeexample[5.0]{target_mapper}{2} %\pagebreak -In the third example \plc{myvec} structures are -nested within a \plc{mypoints} structure. The \plc{myvec\_t} type is mapped -as in the first example. Following the \plc{mypoints} structure declaration, -the \plc{mypoints\_t} type is mapped by a \code{declare mapper} directive. -For this structure the \plc{hostonly\_data} element will not be mapped; -also the array section of \plc{x} (\plc{v.x[:1]}) and \plc{x} will be mapped; and -\plc{scratch} will be allocated and used as scratch storage on the device. -The default map-type mapping, \code{tofrom}, applies to the \plc{x} array section, -but not to \plc{scratch} which is explicitly mapped with the \code{alloc} map-type. -Note: the variable \plc{v} is not included in the map list (otherwise -the \plc{hostonly\_data} would be mapped)-- just the elements +In the third example \ucode{myvec} structures are +nested within a \ucode{mypoints} structure. The \ucode{myvec_t} type is mapped +as in the first example. Following the \ucode{mypoints} structure declaration, +the \ucode{mypoints_t} type is mapped by a \kcode{declare mapper} directive. +For this structure the \ucode{hostonly_data} element will not be mapped; +also the array section of \ucode{x} (\ucode{v.x[:1]}) and \ucode{x} will be mapped; and +\ucode{scratch} will be allocated and used as scratch storage on the device. 
+The default map-type mapping, \kcode{tofrom}, applies to the \ucode{x} array section, +but not to \ucode{scratch} which is explicitly mapped with the \kcode{alloc} map-type. +Note: the variable \ucode{v} is not included in the map list (otherwise +the \ucode{hostonly_data} would be mapped)-- just the elements to be mapped are listed. -The two mappers are combined when a \plc{mypoints\_t} structure type is mapped, -because the mapper \plc{myvec\_t} structure type is used within a \plc{mypoints\_t} +The two mappers are combined when a \ucode{mypoints_t} structure type is mapped, +because the mapper \ucode{myvec_t} structure type is used within a \ucode{mypoints_t} type structure. -%Note, in the main program \plc{P} is an array of \plc{mypoints\_t} type structures, +%Note, in the main program \ucode{P} is an array of \ucode{mypoints_t} type structures, %and hence every element of the array is mapped with the mapper prescription. \cexample[5.0]{target_mapper}{3} diff --git a/devices/target_pointer_mapping.tex b/devices/target_pointer_mapping.tex index 5bccfee..e06ff52 100644 --- a/devices/target_pointer_mapping.tex +++ b/devices/target_pointer_mapping.tex @@ -7,130 +7,131 @@ \section{Pointer Mapping} Pointers that contain host addresses require that those addresses are translated to device addresses for them to be useful in the context of a device data environment. Broadly speaking, there are two scenarios where this is important. -The first scenario is where the pointer is mapped to the device data environment, such that references to the pointer inside a \code{target} region are to the corresponding pointer. Pointer attachment ensures that the corresponding pointer will contain a device address when all of the following conditions are true: +The first scenario is where the pointer is mapped to the device data environment, such that references to the pointer inside a \kcode{target} region are to the corresponding pointer. 
Pointer \plc{attachment} ensures that the corresponding pointer will contain a device address when all of the following conditions are true: \begin{itemize} \item the pointer is mapped by directive $A$ to a device; \item a list item that uses the pointer as its base pointer (call it the \emph{pointee}) is mapped, to the same device, by directive $B$, which may be the same as $A$; \item the effect of directive $B$ is to create either the corresponding pointer or pointee in the device data environment of the device. \end{itemize} -Given the above conditions, pointer attachment is initiated as a result of directive $B$ and subsequent references to the pointee list item in a target region that use the pointer will access the corresponding pointee. The corresponding pointer remains in this \emph{attached} state until it is removed from the device data environment. +Given the above conditions, pointer attachment is initiated as a result of directive $B$ and subsequent references to the pointee list item in a target region that use the pointer will access the corresponding pointee. The corresponding pointer remains in this \plc{attached} state until it is removed from the device data environment. -The second scenario, which is only applicable for C/C++, is where the pointer is implicitly privatized inside a \code{target} construct when it appears as the base pointer to a list item on the construct and does not appear explicitly as a list item in a \code{map} clause, \code{is\_device\_ptr} clause, or data-sharing attribute clause. This scenario can be further split into two cases: the list item is a zero-length array section (e.g., \plc{p[:0]}) or it is not. 
+The second scenario, which is only applicable for C/C++, is where the pointer is implicitly privatized inside a \kcode{target} construct when it appears as the base pointer to a list item on the construct and does not appear explicitly as a list item in a \kcode{map} clause, \kcode{is_device_ptr} clause, or data-sharing attribute clause. This scenario can be further split into two cases: the list item is a zero-length array section (e.g., \ucode{p[:0]}) or it is not. -If it is a zero-length array section, this will trigger a runtime check on entry to the \code{target} region for a previously mapped list item where the value of the pointer falls within the range of its base address and ending address. If such a match is found the private pointer is initialized to the device address corresponding to the value of the original pointer, and otherwise it is initialized to NULL (or retains its original value if the \code{unified\_address} requirement is specified for that compilation unit). +If it is a zero-length array section, this will trigger a runtime check on entry to the \kcode{target} region for a previously mapped list item where the value of the pointer falls within the range of its base address and ending address. If such a match is found the private pointer is initialized to the device address corresponding to the value of the original pointer, and otherwise it is initialized to \bcode{NULL} (or retains its original value if the \kcode{unified_address} requirement is specified for that compilation unit). -If the list item (again, call it the \emph{pointee}) is not a zero-length array section, the private pointer will be initialized such that references in the \code{target} region to the pointee list item that use the pointer will access the corresponding pointee. 
+If the list item (again, call it the \emph{pointee}) is not a zero-length array section, the private pointer will be initialized such that references in the \kcode{target} region to the pointee list item that use the pointer will access the corresponding pointee. The following example shows the basics of mapping pointers with and without associated storage on the host. -Storage for pointers \plc{ptr1} and \plc{ptr2} is created on the host. +Storage for pointers \ucode{ptr1} and \ucode{ptr2} is created on the host. To map storage that is associated with a pointer on the host, the data can be explicitly mapped as an array section so that the compiler knows -the amount of data to be assigned in the device (to the ``corresponding'' data storage area). -On the \code{target} construct array sections are mapped; however, the pointer \plc{ptr1} -is mapped, while \plc{ptr2} is not. Since \plc{ptr2} is not explicitly mapped, it is +the amount of data to be assigned in the device (to the \plc{corresponding} data storage area). +On the \kcode{target} construct array sections are mapped; however, the pointer \ucode{ptr1} +is mapped, while \ucode{ptr2} is not. Since \ucode{ptr2} is not explicitly mapped, it is firstprivate. This creates a subtle difference in the way these pointers can be used. -As a firstprivate pointer, \plc{ptr2} can be manipulated on the device; +As a firstprivate pointer, \ucode{ptr2} can be manipulated on the device; however, as an explicitly mapped pointer, -\plc{ptr1} becomes an \emph{attached} pointer and cannot be manipulated. +\ucode{ptr1} becomes an \emph{attached} pointer and cannot be manipulated. In both cases the host pointer is not updated with the device pointer address---as one would expect for distributed memory. The storage data on the host is updated from the corresponding device -data at the end of the \code{target} region. +data at the end of the \kcode{target} region. 
-As a comparison, note that the \plc{aray} array is automatically mapped, +As a comparison, note that the \ucode{aray} array is automatically mapped, since the compiler knows the extent of the array. -The pointer \plc{ptr3} is used inside the \code{target} construct, but it does +The pointer \ucode{ptr3} is used inside the \kcode{target} construct, but it does not appear in a data-mapping or data-sharing clause. Nor is there a -\code{defaultmap} clause on the construct to indicate what its implicit -data-mapping or data-sharing attribute should be. For such a case, \plc{ptr3} +\kcode{defaultmap} clause on the construct to indicate what its implicit +data-mapping or data-sharing attribute should be. For such a case, \ucode{ptr3} will be implicitly privatized within the construct and there will be a runtime check to see if the host memory to which it is pointing has corresponding memory in the device data environment. If this runtime check passes, the private -\plc{ptr3} would be initialized to point to the corresponding memory. But in +\ucode{ptr3} would be initialized to point to the corresponding memory. But in this case the check does not pass and so it is initialized to null. -Since \plc{ptr3} is private, the value to which it is assigned in the -\code{target} region is not returned into the original \plc{ptr3} on the host. +Since \ucode{ptr3} is private, the value to which it is assigned in the +\kcode{target} region is not returned into the original \ucode{ptr3} on the host. \cexample[5.0]{target_ptr_map}{1} -\index{directives!begin declare target@\code{begin}~\code{declare}~\code{target}} -\index{begin declare target directive@\code{begin}~\code{declare}~\code{target} directive} +\index{directives!begin declare target@\kcode{begin declare target}} +\index{begin declare target directive@\kcode{begin declare target} directive} -In the following example the global pointer \plc{p} appears in a -declare target directive. 
Hence, the pointer \plc{p} will -persist on the device throughout executions in all \code{target} regions. +In the following example the global pointer \ucode{p} appears in a +declare target directive. Hence, the pointer \ucode{p} will +persist on the device throughout executions in all \kcode{target} regions. -The pointer is also used in an array section of a \code{map} clause on -a \code{target} construct. When the pointer of storage associated with +The pointer is also used in an array section of a \kcode{map} clause on +a \kcode{target} construct. When the pointer of storage associated with a declare target directive -is mapped, as for the array section \plc{p[:N]} in the -\code{target} construct, the array section on the device is \emph{attached} -to the device pointer \plc{p} on entry to the construct, and -the value of the device pointer \plc{p} becomes undefined on exit. +is mapped, as for the array section \ucode{p[:N]} in the +\kcode{target} construct, the array section on the device is \emph{attached} +to the device pointer \ucode{p} on entry to the construct, and +the value of the device pointer \ucode{p} becomes undefined on exit. (Of course, storage allocation for the array section on the device will occur before the -pointer on the device is \emph{attached}.) +pointer on the device is attached.) % For globals with declare target is there such a things a % original and corresponding? \cexample[5.1]{target_ptr_map}{2} -\index{directives!begin declare target@\code{begin}~\code{declare}~\code{target}} -\index{begin declare target directive@\code{begin}~\code{declare}~\code{target} directive} +\index{directives!begin declare target@\kcode{begin declare target}} +\index{begin declare target directive@\kcode{begin declare target} directive} The following two examples illustrate subtle differences in pointer attachment to device address because of the order of data mapping. 
-In example \plc{target\_ptr\_map.3a} -the global pointer \plc{p1} points to array \plc{x} and \plc{p2} points to -array \plc{y} on the host. -The array section \plc{x[:N]} is mapped by the \code{target}~\code{enter}~\code{data} directive while array \plc{y} is mapped -on the \code{target} construct. -Since the \code{begin}~\code{declare}~\code{target} directive is applied to the declaration -of \plc{p1}, \plc{p1} is a treated like a mapped variable on the \code{target} -construct and references to \plc{p1} inside the construct will be to the -corresponding \plc{p1} that exists on the device. However, the corresponding -\plc{p1} will be undefined since there is no pointer attachment for it. Pointer -attachment for \plc{p1} would require that (1) \plc{p1} (or an lvalue -expression that refers to the same storage as \plc{p1}) appears as a base -pointer to a list item in a \code{map} clause, and (2) the construct that has -the \code{map} clause causes the list item to transition from \emph{not mapped} +In example \example{target_ptr_map.3a} +the global pointer \ucode{p1} points to array \ucode{x} and \ucode{p2} points to +array \ucode{y} on the host. +The array section \ucode{x[:N]} is mapped by the \kcode{target enter data} directive while array \ucode{y} is mapped +on the \kcode{target} construct. +Since the \kcode{begin declare target} directive is applied to the declaration +of \ucode{p1}, \ucode{p1} is treated like a mapped variable on the \kcode{target} +construct and references to \ucode{p1} inside the construct will be to the +corresponding \ucode{p1} that exists on the device. However, the corresponding +\ucode{p1} will be undefined since there is no pointer attachment for it. 
Pointer +attachment for \ucode{p1} would require that (1) \ucode{p1} (or an lvalue +expression that refers to the same storage as \ucode{p1}) appears as a base +pointer to a list item in a \kcode{map} clause, and (2) the construct that has +the \kcode{map} clause causes the list item to transition from \emph{not mapped} to \emph{mapped}. The conditions are clearly not satisfied for this example. -The problem for \plc{p2} in this example is also subtle. It will be privatized -inside the \code{target} construct, with a runtime check for whether the memory +The problem for \ucode{p2} in this example is also subtle. It will be privatized +inside the \kcode{target} construct, with a runtime check for whether the memory to which it is pointing has corresponding memory that is accessible on the -device. If this check is successful, then the \plc{p2} inside the construct +device. If this check is successful, then the \ucode{p2} inside the construct would be appropriately initialized to point to that corresponding memory. -Unfortunately, despite there being an implicit map of the array \plc{y} (to -which \plc{p2} is pointing) on the construct, the order of this map relative to -the initialization of \plc{p2} is unspecified. Therefore, the initial value of -\plc{p2} will also be undefined. +Unfortunately, despite there being an implicit map of the array \ucode{y} (to +which \ucode{p2} is pointing) on the construct, the order of this map relative to +the initialization of \ucode{p2} is unspecified. Therefore, the initial value of +\ucode{p2} will also be undefined. -Thus, referencing values via either \plc{p1} or \plc{p2} inside -the \code{target} region would be invalid. +Thus, referencing values via either \ucode{p1} or \ucode{p2} inside +the \kcode{target} region would be invalid. \cexample[5.1]{target_ptr_map}{3a} -In example \plc{target\_ptr\_map.3b} the mapping orders for arrays \plc{x} -and \plc{y} were rearranged to allow proper pointer attachments. 
-On the \code{target} construct, the \code{map(x)} clause triggers pointer -attachment for \plc{p1} to the device address of \plc{x}. -Pointer \plc{p2} is assigned the device address of the previously mapped - array \plc{y}. -Referencing values via either \plc{p1} or \plc{p2} inside the \code{target} region is now valid. +In example \example{target_ptr_map.3b} the mapping orders for arrays \ucode{x} +and \ucode{y} were rearranged to allow proper pointer attachments. +On the \kcode{target} construct, the \kcode{map(\ucode{x})} clause triggers pointer +attachment for \ucode{p1} to the device address of \ucode{x}. +Pointer \ucode{p2} is assigned the device address of the previously mapped + array \ucode{y}. +Referencing values via either \ucode{p1} or \ucode{p2} inside the \kcode{target} region is now valid. \cexample[5.1]{target_ptr_map}{3b} %\clearpage -\index{routines!omp_target_is_accessible@\scode{omp_target_is_accessible}} -\index{omp_target_is_accessible routine@\scode{omp_target_is_accessible} routine} -In the following example, storage allocated on the host is not mapped in a \code{target} +\index{routines!omp_target_is_accessible@\kcode{omp_target_is_accessible}} +\index{omp_target_is_accessible routine@\kcode{omp_target_is_accessible} routine} + +In the following example, storage allocated on the host is not mapped in a \kcode{target} region if it is determined that the host memory is accessible from the device. On platforms that support host memory access from a target device, it may be more efficient to omit map clauses and avoid the potential memory allocation @@ -138,29 +139,29 @@ \section{Pointer Mapping} %For discrete memory storage on host and devices, explicit mapping may be required, whereas for %Unified Shared Memory platforms it may be optimal to avoid using map clauses, %because re-allocation of the space may occur when map clauses are present. 
-The \code{omp\_target\_is\_accessible} API routine is used to determine if the -host storage of size \plc{buf\_size} is accessible on the device, and a metadirective -is used to select the directive variant (a \code{target} with/without a \code{map} clause). +The \kcode{omp_target_is_accessible} API routine is used to determine if the +host storage of size \ucode{buf_size} is accessible on the device, and a metadirective +is used to select the directive variant (a \kcode{target} with/without a \kcode{map} clause). -The \code{omp\_target\_is\_accessible} routine will return true if the storage indicated +The \kcode{omp_target_is_accessible} routine will return true if the storage indicated by the first and second arguments is accessible on the target device. In this case, -the host pointer \plc{ptr} may be directly dereferenced in the subsequent -\code{target} region to access this storage, rather than mapping an array section based -off the pointer. By explicitly specifying the host pointer in a \code{firstprivate} -clause on the construct, its original value will be used directly in the \code{target} region. -In OpenMP 5.1, removing the \code{firstprivate} clause will result in an implicit presence -check of the storage to which \plc{ptr} points, and since this storage is not mapped by the -program, \plc{ptr} will be NULL-initialized in the \code{target} region. -In the next version of the OpenMP Specification, a false presence check without -the \code{firstprivate} clause will cause the pointer to retain its original value. +the host pointer \ucode{ptr} may be directly dereferenced in the subsequent +\kcode{target} region to access this storage, rather than mapping an array section based +off the pointer. By explicitly specifying the host pointer in a \kcode{firstprivate} +clause on the construct, its original value will be used directly in the \kcode{target} region. 
+In OpenMP 5.1, removing the \kcode{firstprivate} clause will result in an implicit presence +check of the storage to which \ucode{ptr} points, and since this storage is not mapped by the +program, \ucode{ptr} will be \bcode{NULL}-initialized in the \kcode{target} region. +In the OpenMP 5.2 Specification, a false presence check without +the \kcode{firstprivate} clause will cause the pointer to retain its original value. \cexample[5.2]{target_ptr_map}{4} \index{mapping!deep copy} -Similar to the previous example, the \code{omp\_target\_is\_accessible} routine is used to -discover if a deep copy is required for the platform. Here, the \plc{deep\_copy} map, -defined in the \code{declare}~\code{mapper} directive, is used if the host storage referenced by -\plc{s.ptr} (or \plc{s\%ptr} in Fortran) is not accessible from the device. +Similar to the previous example, the \kcode{omp_target_is_accessible} routine is used to +discover if a deep copy is required for the platform. Here, the \ucode{deep_copy} map, +defined in the \kcode{declare mapper} directive, is used if the host storage referenced by +\ucode{s.ptr} (or \ucode{s\%ptr} in Fortran) is not accessible from the device. 
\cexample[5.2]{target_ptr_map}{5} \ffreeexample[5.2]{target_ptr_map}{5} diff --git a/devices/target_structure_mapping.tex b/devices/target_structure_mapping.tex index 9beef2d..9ec9f94 100644 --- a/devices/target_structure_mapping.tex +++ b/devices/target_structure_mapping.tex @@ -3,23 +3,23 @@ \section{Structure Mapping} \label{sec:structure_mapping} \index{mapping!structure} -\index{directives!begin declare target@\code{begin}~\code{declare}~\code{target}} -\index{begin declare target directive@\code{begin}~\code{declare}~\code{target} directive} +\index{directives!begin declare target@\kcode{begin declare target}} +\index{begin declare target directive@\kcode{begin declare target} directive} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -In the example below, only structure elements \plc{S.a}, \plc{S.b} and \plc{S.p} -of the \plc{S} structure appear in \code{map} clauses of a \code{target} construct. +In the example below, only structure elements \ucode{S.a}, \ucode{S.b} and \ucode{S.p} +of the \ucode{S} structure appear in \kcode{map} clauses of a \kcode{target} construct. Only these components have corresponding variables and storage on the device. -Hence, the large arrays, \plc{S.buffera} and \plc{S.bufferb}, and the \plc{S.x} component have no storage +Hence, the large arrays, \ucode{S.buffera} and \ucode{S.bufferb}, and the \ucode{S.x} component have no storage on the device and cannot be accessed. -Also, since the pointer member \plc{S.p} is used in an array section of a -\code{map} clause, the array storage of the array section on the device, -\plc{S.p[:N]}, is \emph{attached} to the pointer member \plc{S.p} on the device. -Explicitly mapping the pointer member \plc{S.p} is optional in this case. 
+Also, since the pointer member \ucode{S.p} is used in an array section of a +\kcode{map} clause, the array storage of the array section on the device, +\ucode{S.p[:N]}, is \plc{attached} to the pointer member \ucode{S.p} on the device. +Explicitly mapping the pointer member \ucode{S.p} is optional in this case. -Note: The buffer arrays and the \plc{x} variable have been grouped together, so that +Note: The buffer arrays and the \ucode{x} variable have been grouped together, so that the components that will reside on the device are all together (without gaps). This allows the runtime to optimize the transfer and the storage footprint on the device. @@ -28,8 +28,8 @@ \section{Structure Mapping} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The following example is a slight modification of the above example for -a C++ class. In the member function \plc{SAXPY::driver} -the array section \plc{p[:N]} is \emph{attached} to the pointer member \plc{p} +a C++ class. In the member function \ucode{SAXPY::driver} +the array section \ucode{p[:N]} is attached to the pointer member \ucode{p} on the device. \cppexample[5.1]{target_struct_map}{2} @@ -37,8 +37,8 @@ \section{Structure Mapping} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %In this example a pointer, \plc{p}, is mapped in a -%\code{target}~\code{data} construct (\code{map(p)}) and remains -%persistent throughout the \code{target}~\code{data} region. The address stored +%\code{target data} construct (\code{map(p)}) and remains +%persistent throughout the \code{target data} region. The address stored %on the host is not assigned to the device pointer variable, and %the device value is not copied back to the host at the end of the %region (for a pointer, it is as though \code{map(alloc:p}) is effectively @@ -62,22 +62,22 @@ \section{Structure Mapping} The next example shows two ways in which the structure may be \emph{incorrectly} mapped. 
-In Case 1, the array section \plc{S1.p[:N]} is first mapped in an enclosing -\code{target}~\code{data} construct, and the \code{target} construct then -implicitly maps the structure \plc{S1}. The initial map of the array section -does not map the base pointer \plc{S1.p} -- it only maps the elements of the +In Case 1, the array section \ucode{S1.p[:N]} is first mapped in an enclosing +\kcode{target data} construct, and the \kcode{target} construct then +implicitly maps the structure \ucode{S1}. The initial map of the array section +does not map the base pointer \ucode{S1.p} -- it only maps the elements of the array section. Furthermore, the implicit map is not sufficient to ensure -pointer attachment for the structure member \plc{S1.p} (refer to the conditions +pointer attachment for the structure member \ucode{S1.p} (refer to the conditions for pointer attachment described in Section~\ref{sec:pointer_mapping}). -Consequentially, the dereference operation \plc{S1.p[i]} in the call to -\plc{saxpyfun} will probably fail because \plc{S1.p} contains a host address. +Consequently, the dereference operation \ucode{S1.p[i]} in the call to +\ucode{saxpyfun} will probably fail because \ucode{S1.p} contains a host address. In Case 2, again an array section is mapped on an enclosing -\code{target}~\code{data} construct. This time, the nested \code{target} -construct explicitly maps \plc{S2.p}, \plc{S2.a}, and \plc{S2.b}. But as in +\kcode{target data} construct. This time, the nested \kcode{target} +construct explicitly maps \ucode{S2.p}, \ucode{S2.a}, and \ucode{S2.b}. 
But as in Case 1, this does not satisfy the conditions for pointer attachment since the -construct must map a list item for which \splc{S2.p} is a base pointer, and it -must do so when the \splc{S2.p} is already present on the device or will be +construct must map a list item for which \ucode{S2.p} is a base pointer, and it +must do so when the \ucode{S2.p} is already present on the device or will be created on the device as a result of the same construct. @@ -88,37 +88,37 @@ \section{Structure Mapping} The following example correctly implements pointer attachment cases that involve implicit structure maps. -In Case 1, members \splc{p}, \splc{a}, and \splc{b} of the structure \splc{S1} -are explicitly mapped by the \code{target}~\code{data} construct, to avoid -mapping parts of \splc{S1} that aren't required on the device. The mapped -\splc{S1.p} is attached to the array section \splc{S1.p[:N]}, and remains +In Case 1, members \ucode{p}, \ucode{a}, and \ucode{b} of the structure \ucode{S1} +are explicitly mapped by the \kcode{target data} construct, to avoid +mapping parts of \ucode{S1} that aren't required on the device. The mapped +\ucode{S1.p} is attached to the array section \ucode{S1.p[:N]}, and remains attached while it exists on the device (for the duration of -\code{target}~\code{data} region). Due to the \splc{S1} reference inside the -nested \code{target} construct, the construct implicitly maps \splc{S1} so that +\kcode{target data} region). Due to the \ucode{S1} reference inside the +nested \kcode{target} construct, the construct implicitly maps \ucode{S1} so that the reference refers to the corresponding storage created by the enclosing -\code{target}~\code{data} region. Note that only the members \splc{a}, -\splc{b}, and \splc{p} may be accessed from this storage. - -In Case 2, only the storage for the array section \splc{S2.p[:N]} is mapped -by the \code{target}~\code{data} construct. 
The nested \code{target} -construct explicitly maps \splc{S2.a} and \splc{S2.b} and explicitly -maps an array section for which \splc{S2.p} is a base pointer. This satisfies -the conditions for \splc{S2.p} becoming an attached pointer. The array +\kcode{target data} region. Note that only the members \ucode{a}, +\ucode{b}, and \ucode{p} may be accessed from this storage. + +In Case 2, only the storage for the array section \ucode{S2.p[:N]} is mapped +by the \kcode{target data} construct. The nested \kcode{target} +construct explicitly maps \ucode{S2.a} and \ucode{S2.b} and explicitly +maps an array section for which \ucode{S2.p} is a base pointer. This satisfies +the conditions for \ucode{S2.p} becoming an attached pointer. The array section in this case is zero-length, but the effect would be the same if the -length was a positive integer less than or equal to \splc{N}. There is also an -implicit map of the containing structure \splc{S2}, again due to the reference -to \splc{S2} inside the construct. The effect of this implicit map permits -access only to members \splc{a}, \splc{b}, and \splc{p}, as for Case 1. - -In Case 3, there is no \code{target}~\code{data} construct. The \code{target} -construct explicitly maps \splc{S3.a} and \splc{S3.b} and explicitly -maps an array section for which \splc{S3.p} is a base pointer. Again, there is -an implicit map of the structure referenced in the construct, \splc{S3}. This -implicit map also causes \splc{S3.p} to be implicitly mapped, because no other -part of \splc{S3} is present prior to the construct being encountered. The -result is an attached pointer \splc{S3.p} on the device. As for Cases 1 and 2, -this implicit map only ensures that storage for the members \splc{a}, \splc{b}, -and \splc{p} are accessible within the corresponding \splc{S3} that is created +length was a positive integer less than or equal to \ucode{N}. 
There is also an +implicit map of the containing structure \ucode{S2}, again due to the reference +to \ucode{S2} inside the construct. The effect of this implicit map permits +access only to members \ucode{a}, \ucode{b}, and \ucode{p}, as for Case 1. + +In Case 3, there is no \kcode{target data} construct. The \kcode{target} +construct explicitly maps \ucode{S3.a} and \ucode{S3.b} and explicitly +maps an array section for which \ucode{S3.p} is a base pointer. Again, there is +an implicit map of the structure referenced in the construct, \ucode{S3}. This +implicit map also causes \ucode{S3.p} to be implicitly mapped, because no other +part of \ucode{S3} is present prior to the construct being encountered. The +result is an attached pointer \ucode{S3.p} on the device. As for Cases 1 and 2, +this implicit map only ensures that storage for the members \ucode{a}, \ucode{b}, +and \ucode{p} is accessible within the corresponding \ucode{S3} that is created on the device. \cexample[5.1]{target_struct_map}{4} diff --git a/devices/target_unstructured_data.tex b/devices/target_unstructured_data.tex index cf3915a..1df262e 100644 --- a/devices/target_unstructured_data.tex +++ b/devices/target_unstructured_data.tex @@ -1,56 +1,55 @@ -%begin -\pagebreak -\section{\code{target} \code{enter} \code{data} and \code{target} \code{exit} \code{data} Constructs} +%\pagebreak +\section{\kcode{target enter data} and \kcode{target exit data} Constructs} \label{sec:target_enter_exit_data} %\section{Simple target enter data and target exit data Constructs} -\index{constructs!target enter data@\code{target} \code{enter} \code{data}} -\index{constructs!target exit data@\code{target} \code{exit} \code{data}} -\index{target enter data construct@\code{target} \code{enter} \code{data} construct} -\index{target exit data construct@\code{target} \code{exit} \code{data} construct} +\index{constructs!target enter data@\kcode{target enter data}} +\index{constructs!target exit data@\kcode{target exit 
data}} +\index{target enter data construct@\kcode{target enter data} construct} +\index{target exit data construct@\kcode{target exit data} construct} -The structured data construct (\code{target}~\code{data}) provides persistent data on a -device for subsequent \code{target} constructs as shown in the -\code{target}~\code{data} examples above. This is accomplished by creating a single -\code{target}~\code{data} region containing \code{target} constructs. +The structured data construct (\kcode{target data}) provides persistent data on a +device for subsequent \kcode{target} constructs as shown in the +\kcode{target data} examples above. This is accomplished by creating a single +\kcode{target data} region containing \kcode{target} constructs. The unstructured data constructs allow the creation and deletion of data on the device at any appropriate point within the host code, as shown below -with the \code{target}~\code{enter}~\code{data} and \code{target}~\code{exit}~\code{data} constructs. +with the \kcode{target enter data} and \kcode{target exit data} constructs. -\index{map clause@\code{map} clause!alloc map-type@\code{alloc} map-type} -\index{map clause@\code{map} clause!delete map-type@\code{delete} map-type} -\index{alloc map-type@\code{alloc} map-type} -\index{delete map-type@\code{delete} map-type} +\index{map clause@\kcode{map} clause!alloc map-type@\kcode{alloc} map-type} +\index{map clause@\kcode{map} clause!delete map-type@\kcode{delete} map-type} +\index{alloc map-type@\kcode{alloc} map-type} +\index{delete map-type@\kcode{delete} map-type} The following C++ code creates/deletes a vector in a constructor/destructor -of a class. The constructor creates a vector with \code{target}~\code{enter}~\code{data} -and uses an \code{alloc} modifier in the \code{map} clause to avoid copying values -to the device. 
The destructor deletes the data (\code{target}~\code{exit}~\code{data}) -and uses the \code{delete} modifier in the \code{map} clause to avoid copying data -back to the host. Note, the stand-alone \code{target}~\code{enter}~\code{data} occurs -after the host vector is created, and the \code{target}~\code{exit}~\code{data} +of a class. The constructor creates a vector with \kcode{target enter data} +and uses an \kcode{alloc} modifier in the \kcode{map} clause to avoid copying values +to the device. The destructor deletes the data (\kcode{target exit data}) +and uses the \kcode{delete} modifier in the \kcode{map} clause to avoid copying data +back to the host. Note, the stand-alone \kcode{target enter data} occurs +after the host vector is created, and the \kcode{target exit data} construct occurs before the host data is deleted. \cppexample[4.5]{target_unstructured_data}{1} -\pagebreak -The following C code allocates and frees the data member of a Matrix structure. -The \code{init\_matrix} function allocates the memory used in the structure and -uses the \code{target}~\code{enter}~\code{data} directive to map it to the target device. The -\code{free\_matrix} function removes the mapped array from the target device +%\pagebreak +The following C code allocates and frees the data member of a \ucode{Matrix} structure. +The \ucode{init_matrix} function allocates the memory used in the structure and +uses the \kcode{target enter data} directive to map it to the target device. The +\ucode{free_matrix} function removes the mapped array from the target device and then frees the memory on the host. Note, the stand-alone -\code{target}~\code{enter}~\code{data} occurs after the host memory is allocated, and the -\code{target}~\code{exit}~\code{data} construct occurs before the host data is freed. +\kcode{target enter data} occurs after the host memory is allocated, and the +\kcode{target exit data} construct occurs before the host data is freed. 
\cexample[4.5]{target_unstructured_data}{1} -\pagebreak -The following Fortran code allocates and deallocates a module array. The -\code{initialize} subroutine allocates the module array and uses the -\code{target}~\code{enter}~\code{data} directive to map it to the target device. The -\code{finalize} subroutine removes the mapped array from the target device and +%\pagebreak +The following Fortran code allocates and deallocates a module array, \ucode{A}. The +\ucode{initialize} subroutine allocates the module array and uses the +\kcode{target enter data} directive to map it to the target device. The +\ucode{finalize} subroutine removes the mapped array from the target device and then deallocates the array on the host. Note, the stand-alone -\code{target}~\code{enter}~\code{data} occurs after the host memory is allocated, and the -\code{target}~\code{exit}~\code{data} construct occurs before the host data is deallocated. +\kcode{target enter data} occurs after the host memory is allocated, and the +\kcode{target exit data} construct occurs before the host data is deallocated. 
\ffreeexample[4.5]{target_unstructured_data}{1} %end diff --git a/devices/target_update.tex b/devices/target_update.tex index abc91af..e963084 100644 --- a/devices/target_update.tex +++ b/devices/target_update.tex @@ -1,70 +1,82 @@ -\pagebreak -\section{\code{target} \code{update} Construct} +%\pagebreak +\section{\kcode{target update} Construct} \label{sec:target_update} -\subsection{Simple \code{target} \code{data} and \code{target} \code{update} Constructs} +\subsection{Simple \kcode{target data} and \kcode{target update} Constructs} \label{subsec:target_data_and_update} -\index{constructs!target data@\code{target}~\code{data}} -\index{target data construct@\code{target}~\code{data} construct} -\index{constructs!target update@\code{target}~\code{update}} -\index{target update construct@\code{target}~\code{update} construct} -\index{target update construct@\code{target}~\code{update} construct!to clause@\code{to} clause} -\index{target update construct@\code{target}~\code{update} construct!from clause@\code{from} clause} -\index{target update construct@\code{target}~\code{update} construct!motion-clause@\plc{motion-clause}} +\index{constructs!target data@\kcode{target data}} +\index{target data construct@\kcode{target data} construct} +\index{constructs!target update@\kcode{target update}} +\index{target update construct@\kcode{target update} construct} +\index{target update construct@\kcode{target update} construct!to clause@\kcode{to} clause} +\index{target update construct@\kcode{target update} construct!from clause@\kcode{from} clause} +\index{target update construct@\kcode{target update} construct!motion-clause@\plc{motion-clause}} \index{clauses!motion-clause@\plc{motion-clause}} -\index{clauses!to@\code{to}} -\index{clauses!from@\code{from}} -\index{motion-clause@\plc{motion-clause}!to clause@\code{to} clause} -\index{motion-clause@\plc{motion-clause}!from clause@\code{from} clause} +\index{clauses!to@\kcode{to}} +\index{clauses!from@\kcode{from}} 
+\index{motion-clause@\plc{motion-clause}!to clause@\kcode{to} clause} +\index{motion-clause@\plc{motion-clause}!from clause@\kcode{from} clause} -The following example shows how the \code{target} \code{update} construct updates +The following example shows how the \kcode{target update} construct updates variables in a device data environment. -The \code{target} \code{data} construct maps array sections \plc{v1[:N]} and \plc{v2[:N]} -(arrays \plc{v1} and \plc{v2} in the Fortran code) into a device data environment. +The \kcode{target data} construct maps array sections \ucode{v1[:N]} and \ucode{v2[:N]} +(arrays \ucode{v1} and \ucode{v2} in the Fortran code) into a device data environment. -The task executing on the host device encounters the first \code{target} region +The task executing on the host device encounters the first \kcode{target} region and waits for the completion of the region. -After the execution of the first \code{target} region, the task executing on -the host device then assigns new values to \plc{v1[:N]} and \plc{v2[:N]} (\plc{v1} and \plc{v2} arrays -in Fortran code) in the task's data environment by calling the function \code{init\_again()}. +After the execution of the first \kcode{target} region, the task executing on +the host device then assigns new values to \ucode{v1[:N]} and \ucode{v2[:N]} (\ucode{v1} and \ucode{v2} arrays +in Fortran code) in the task's data environment by calling the function \ucode{init_again()}. -The \code{target} \code{update} construct assigns the new values of \plc{v1} and -\plc{v2} from the task's data environment to the corresponding mapped array sections -in the device data environment of the \code{target} \code{data} construct. +The \kcode{target update} construct assigns the new values of \ucode{v1} and +\ucode{v2} from the task's data environment to the corresponding mapped array sections +in the device data environment of the \kcode{target data} construct. 
-The task executing on the host device then encounters the second \code{target} +The task executing on the host device then encounters the second \kcode{target} region and waits for the completion of the region. -The second \code{target} region uses the updated values of \plc{v1[:N]} and \plc{v2[:N]}. +The second \kcode{target} region uses the updated values of \ucode{v1[:N]} and \ucode{v2[:N]}. \cexample[4.0]{target_update}{1} \ffreeexample[4.0]{target_update}{1} -\subsection{\code{target} \code{update} Construct with \code{if} Clause} +\subsection{\kcode{target update} Construct with \kcode{if} Clause} \label{subsec:target_update_if} -\index{target update construct@\code{target}~\code{update} construct!if clause@\code{if} clause} -\index{clauses!if@\code{if}} -\index{if clause@\code{if} clause} +\index{target update construct@\kcode{target update} construct!if clause@\kcode{if} clause} +\index{clauses!if@\kcode{if}} +\index{if clause@\kcode{if} clause} -The following example shows how the \code{target} \code{update} construct updates +The following example shows how the \kcode{target update} construct updates variables in a device data environment. -The \code{target} \code{data} construct maps array sections \plc{v1[:N]} and \plc{v2[:N]} -(arrays \plc{v1} and \plc{v2} in the Fortran code) into a device data environment. In between -the two \code{target} regions, the task executing on the host device conditionally -assigns new values to \plc{v1} and \plc{v2} in the task's data environment. The function \code{maybe\_init\_again()} -returns \plc{true} if new data is written. +The \kcode{target data} construct maps array sections \ucode{v1[:N]} and \ucode{v2[:N]} +(arrays \ucode{v1} and \ucode{v2} in the Fortran code) into a device data environment. In between +the two \kcode{target} regions, the task executing on the host device conditionally +assigns new values to \ucode{v1} and \ucode{v2} in the task's data environment. 
The function \ucode{maybe_init_again()} +returns \vcode{true} if new data is written. -When the conditional expression (the return value of \code{maybe\_init\_again()}) in the -\code{if} clause is \plc{true}, the \code{target} \code{update} construct -assigns the new values of \plc{v1} and \plc{v2} from the task's data environment to the corresponding -mapped array sections in the \code{target} \code{data} construct's device data +When the conditional expression (the return value of \ucode{maybe_init_again()}) in the +\kcode{if} clause is \vcode{true}, the \kcode{target update} construct +assigns the new values of \ucode{v1} and \ucode{v2} from the task's data environment to the corresponding +mapped array sections in the \kcode{target data} construct's device data environment. \cexample[4.0]{target_update}{2} \ffreeexample[4.0]{target_update}{2} +\subsection{\kcode{target update} Construct with Mapper} +\label{subsec:target_update_mapper} +\index{target update construct@\kcode{target update} construct!modifier@mapper} + +The following example shows how the \kcode{target update} construct can be used with a \kcode{mapper} (\ucode{custom}). +The \ucode{custom} mapper maps members of structure \ucode{T} with different map-type modifiers. Inside the +\kcode{target data} region, the \kcode{target update} with the \kcode{to} \plc{motion-clause} is equivalent to an update of \ucode{x} on the device. After the \kcode{target} region, the \kcode{target update} with the \kcode{from} \plc{motion-clause} is equivalent to an update of \ucode{y} on the host.
+ +\cexample[5.1]{target_update}{3} + +\ffreeexample[5.1]{target_update}{3} + diff --git a/devices/teams.tex b/devices/teams.tex index da340ec..669bea4 100644 --- a/devices/teams.tex +++ b/devices/teams.tex @@ -1,79 +1,80 @@ -\pagebreak -\section{\code{teams} Construct and Related Combined Constructs} +%\pagebreak +\section{\kcode{teams} Construct and Related Combined Constructs} \label{sec:teams} -\subsection{\code{target} and \code{teams} Constructs with \code{omp\_get\_num\_teams}\\ -and \code{omp\_get\_team\_num} Routines} +\subsection{\kcode{target} and \kcode{teams} Constructs with \kcode{omp_get_num_teams}\\ +and \kcode{omp_get_team_num} Routines} \label{subsec:teams_api} -\index{constructs!target@\code{target}} -\index{target construct@\code{target} construct} -\index{constructs!teams@\code{teams}} -\index{teams construct@\code{teams} construct} -\index{combined constructs!target teams@\code{target}~\code{teams}} -\index{teams construct@\code{teams} construct!num_teams clause@\scode{num_teams} clause} -\index{clauses!num_teams@\scode{num_teams}} -\index{num_teams clause@\scode{num_teams} clause} -\index{routines!omp_get_num_teams@\scode{omp_get_num_teams}} -\index{routines!omp_get_team_num@\scode{omp_get_team_num}} -\index{omp_get_num_teams routine@\scode{omp_get_num_teams} routine} -\index{omp_get_team_num routine@\scode{omp_get_team_num} routine} - -The following example shows how the \code{target} and \code{teams} constructs -are used to create a league of thread teams that execute a region. 
The \code{teams} +\index{constructs!target@\kcode{target}} +\index{target construct@\kcode{target} construct} +\index{constructs!teams@\kcode{teams}} +\index{teams construct@\kcode{teams} construct} +\index{combined constructs!target teams@\kcode{target teams}} +\index{teams construct@\kcode{teams} construct!num_teams clause@\kcode{num_teams} clause} +\index{clauses!num_teams@\kcode{num_teams}} +\index{num_teams clause@\kcode{num_teams} clause} +\index{routines!omp_get_num_teams@\kcode{omp_get_num_teams}} +\index{routines!omp_get_team_num@\kcode{omp_get_team_num}} +\index{omp_get_num_teams routine@\kcode{omp_get_num_teams} routine} +\index{omp_get_team_num routine@\kcode{omp_get_team_num} routine} + +The following example shows how the \kcode{target} and \kcode{teams} constructs +are used to create a \plc{league} of thread teams that execute a region. The \kcode{teams} construct creates a league of at most two teams where the primary thread of each -team executes the \code{teams} region. +team executes the \kcode{teams} region. -The \code{omp\_get\_num\_teams} routine returns the number of teams executing in a \code{teams} -region. The \code{omp\_get\_team\_num} routine returns the team number, which is an integer -between 0 and one less than the value returned by \code{omp\_get\_num\_teams}. The following +The \kcode{omp_get_num_teams} routine returns the number of teams executing in a \kcode{teams} +region. The \kcode{omp_get_team_num} routine returns the team number, which is an integer +between 0 and one less than the value returned by \kcode{omp_get_num_teams}. The following example manually distributes a loop across two teams. 
\cexample[4.0]{teams}{1} \ffreeexample[4.0]{teams}{1} -\subsection{\code{target}, \code{teams}, and \code{distribute} Constructs} +\subsection{\kcode{target}, \kcode{teams}, and \kcode{distribute} Constructs} \label{subsec:teams_distribute} -\index{constructs!distribute@\code{distribute}} -\index{distribute construct@\code{distribute} construct} +\index{constructs!distribute@\kcode{distribute}} +\index{distribute construct@\kcode{distribute} construct} -The following example shows how the \code{target}, \code{teams}, and \code{distribute} -constructs are used to execute a loop nest in a \code{target} region. The \code{teams} -construct creates a league and the primary thread of each team executes the \code{teams} -region. The \code{distribute} construct schedules the subsequent loop iterations +The following example shows how the \kcode{target}, \kcode{teams}, and \kcode{distribute} +constructs are used to execute a loop nest in a \kcode{target} region. The \kcode{teams} +construct creates a league and the primary thread of each team executes the \kcode{teams} +region. The \kcode{distribute} construct schedules the subsequent loop iterations across the primary threads of each team. -The number of teams in the league is less than or equal to the variable \plc{num\_blocks}. +The number of teams in the league is less than or equal to the variable \ucode{num_blocks}. Each team in the league has a number of threads less than or equal to the variable -\plc{block\_threads}. The iterations in the outer loop are distributed among the primary +\ucode{block_threads}. The iterations in the outer loop are distributed among the primary threads of each team. When a team's primary thread encounters the parallel loop construct before the inner -loop, the other threads in its team are activated. The team executes the \code{parallel} +loop, the other threads in its team are activated. The team executes the \kcode{parallel} region and then workshares the execution of the loop. 
-\index{reduction clause@\code{reduction} clause!on teams construct@on \code{teams} construct} -Each primary thread executing the \code{teams} region has a private copy of the -variable \plc{sum} that is created by the \code{reduction} clause on the \code{teams} construct. +\index{reduction clause@\kcode{reduction} clause!on teams construct@on \kcode{teams} construct} + +Each primary thread executing the \kcode{teams} region has a private copy of the +variable \ucode{sum} that is created by the \kcode{reduction} clause on the \kcode{teams} construct. The primary thread and all threads in its team have a private copy of the variable -\plc{sum} that is created by the \code{reduction} clause on the parallel loop construct. -The second private \plc{sum} is reduced into the primary thread's private copy of \plc{sum} -created by the \code{teams} construct. At the end of the \code{teams} region, -each primary thread's private copy of \plc{sum} is reduced into the final \plc{sum} that is -implicitly mapped into the \code{target} region. +\ucode{sum} that is created by the \kcode{reduction} clause on the parallel loop construct. +The second private \ucode{sum} is reduced into the primary thread's private copy of \ucode{sum} +created by the \kcode{teams} construct. At the end of the \kcode{teams} region, +each primary thread's private copy of \ucode{sum} is reduced into the final \ucode{sum} that is +implicitly mapped into the \kcode{target} region. \cexample[4.0]{teams}{2} \clearpage \ffreeexample[4.0]{teams}{2} -\subsection{\code{target} \code{teams}, and Distribute Parallel Loop Constructs} +\subsection{\kcode{target teams} and Distribute Parallel Loop Constructs} \label{subsec:teams_distribute_parallel} -The following example shows how the \code{target} \code{teams} and distribute -parallel loop constructs are used to execute a \code{target} region.
The \code{target} -\code{teams} construct creates a league of teams where the primary thread of each -team executes the \code{teams} region. +The following example shows how the \kcode{target teams} and distribute +parallel loop constructs are used to execute a \kcode{target} region. +The \kcode{target teams} construct creates a league of teams where the primary thread of each +team executes the \kcode{teams} region. The distribute parallel loop construct schedules the loop iterations across the primary threads of each team and then across the threads of each team. @@ -82,59 +83,58 @@ \subsection{\code{target} \code{teams}, and Distribute Parallel Loop Constructs} \ffreeexample[4.5]{teams}{3} -\subsection{\code{target} \code{teams} and Distribute Parallel Loop +\subsection{\kcode{target teams} and Distribute Parallel Loop Constructs with Scheduling Clauses} \label{subsec:teams_distribute_parallel_schedule} -\index{distribute construct@\code{distribute} construct!dist_schedule clause@\scode{dist_schedule} clause} -\index{clauses!dist_schedule@\scode{dist_schedule}} -\index{dist_schedule clause@\scode{dist_schedule} clause} -\index{worksharing-loop constructs!schedule clause@\code{schedule} clause} -\index{clauses!schedule@\code{schedule}} -\index{schedule clause@\code{schedule} clause} - -The following example shows how the \code{target} \code{teams} and distribute -parallel loop constructs are used to execute a \code{target} region. 
The \code{teams} +\index{distribute construct@\kcode{distribute} construct!dist_schedule clause@\kcode{dist_schedule} clause} +\index{clauses!dist_schedule@\kcode{dist_schedule}} +\index{dist_schedule clause@\kcode{dist_schedule} clause} +\index{worksharing-loop constructs!schedule clause@\kcode{schedule} clause} +\index{clauses!schedule@\kcode{schedule}} +\index{schedule clause@\kcode{schedule} clause} + +The following example shows how the \kcode{target teams} and distribute parallel loop +constructs are used to execute a \kcode{target} region. The \kcode{teams} construct creates a league of at most eight teams where the primary thread of each -team executes the \code{teams} region. The number of threads in each team is -less than or equal to 16. +team executes the \kcode{teams} region. The number of threads in each team is +less than or equal to \ucode{16}. -The \code{distribute} parallel loop construct schedules the subsequent loop iterations +The \kcode{distribute} parallel loop construct schedules the subsequent loop iterations across the primary threads of each team and then across the threads of each team. -The \code{dist\_schedule} clause on the distribute parallel loop construct indicates +The \kcode{dist_schedule} clause on the distribute parallel loop construct indicates that loop iterations are distributed to the primary thread of each team in chunks -of 1024 iterations. +of \ucode{1024} iterations. -The \code{schedule} clause indicates that the 1024 iterations distributed to +The \kcode{schedule} clause indicates that the \ucode{1024} iterations distributed to a primary thread are then assigned to the threads in its associated team in chunks -of 64 iterations. +of \ucode{64} iterations.
\cexample[4.0]{teams}{4} \ffreeexample[4.0]{teams}{4} -\subsection{\code{target} \code{teams} and \code{distribute} \code{simd} Constructs} +\subsection{\kcode{target teams} and \kcode{distribute simd} Constructs} \label{subsec:teams_distribute_simd} -The following example shows how the \code{target} \code{teams} and \code{distribute} -\code{simd} constructs are used to execute a loop in a \code{target} region. -The \code{target} \code{teams} construct creates a league of teams where the -primary thread of each team executes the \code{teams} region. +The following example shows how the \kcode{target teams} and \kcode{distribute simd} constructs are used to execute a loop in a \kcode{target} region. +The \kcode{target teams} construct creates a league of teams where the +primary thread of each team executes the \kcode{teams} region. -The \code{distribute} \code{simd} construct schedules the loop iterations across +The \kcode{distribute simd} construct schedules the loop iterations across the primary thread of each team and then uses SIMD parallelism to execute the iterations. \cexample[4.0]{teams}{5} \ffreeexample[4.0]{teams}{5} -\subsection{\code{target} \code{teams} and Distribute Parallel Loop SIMD Constructs} +\subsection{\kcode{target teams} and Distribute Parallel Loop SIMD Constructs} \label{subsec:teams_distribute_parallel_simd} -The following example shows how the \code{target} \code{teams} and the distribute -parallel loop SIMD constructs are used to execute a loop in a \code{target} \code{teams} -region. The \code{target} \code{teams} construct creates a league of teams -where the primary thread of each team executes the \code{teams} region. +The following example shows how the \kcode{target teams} and the distribute +parallel loop SIMD constructs are used to execute a loop in a \kcode{target teams} +region. The \kcode{target teams} construct creates a league of teams +where the primary thread of each team executes the \kcode{teams} region. 
The distribute parallel loop SIMD construct schedules the loop iterations across the primary thread of each team and then across the threads of each team where each diff --git a/devices/usm.tex b/devices/usm.tex new file mode 100644 index 0000000..56399cc --- /dev/null +++ b/devices/usm.tex @@ -0,0 +1,39 @@ +\pagebreak +\section{Unified Shared Memory} +\label{sec:usm} + +\index{requires directive@\kcode{requires} directive} +\index{directives!requires@\kcode{requires}} + +\index{unified_shared_memory clause@\kcode{unified_shared_memory} clause} +\index{clauses!unified_shared_memory@\kcode{unified_shared_memory}} + +The following examples show the behavior of scalars, pointers, references (C++) and +associate names (Fortran) in \kcode{target} constructs when +unified shared memory (USM) is required throughout the scope of the program by the +\kcode{unified_shared_memory} clause in a \kcode{requires} directive. +USM assumes a unified address space. + +In the C++ code of the first example, a scalar (\ucode{x}), a pointer +(\ucode{ptr}), and a reference (\ucode{ref}) are used in a \kcode{target} construct in Cases 1, 2 and 3, respectively. +For the scalar variable \ucode{x}, the predetermined data-sharing attribute is still +firstprivate under the USM requirement and, hence, no modification of the local variable on the device is reflected on the host. +With the USM requirement, pointers always refer to the same location in memory on the host and devices. +Hence, the value of \ucode{x} (in the host data environment) can be modified with \ucode{ptr} in +the \kcode{target} construct, as seen in Case 2. +For the reference \ucode{ref}, the object to which it refers is mapped for +the \kcode{target} construct, as seen in Case 3. + +In Case 1 of the Fortran example, the scalar \ucode{x} is firstprivate under the USM requirement +in the \kcode{target} construct, and modifications of the local variable on the device are +never propagated to the host data environment.
+Also, in Case 2, the use of \ucode{ax}, which is associated with \ucode{x}, +will update the \ucode{x} value in the host data environment. +In Case 3, the Fortran pointer \ucode{ptr} and its target \ucode{y} are not firstprivate, +but implicitly mapped. Hence, updates to the value of \ucode{y} appear in the host data environment. + +%Hence, updates to \ucode{y} in the \kcode{target} construct appear in the data environment of the host. + +%\pagebreak +\cppexample[5.2]{usm_scalar_ptr_ref_asc}{1} +\ffreeexample[5.2]{usm_scalar_ptr_ref_asc}{1} diff --git a/directives/attributes.tex b/directives/attributes.tex index e57ea96..6d09b48 100644 --- a/directives/attributes.tex +++ b/directives/attributes.tex @@ -5,18 +5,18 @@ \section{C++ Attributes} OpenMP directives for C++ can also be specified with %the implementation-defined -the \code{directive} extension for the C++11 standard \plc{attributes}. +the \kcode{directive} extension for the C++11 standard \plc{attributes}. %https://en.cppreference.com/w/cpp/language/attributes -The C++ example below shows two ways to parallelize a \code{for} loop using the \code{\#pragma} syntax. -The first pragma uses the combined \code{parallel}~\code{for} directive, and the second -applies the uncombined closely nested directives, \code{parallel} and \code{for}, directly to the same statement. +The C++ example below shows two ways to parallelize a \bcode{for} loop using the \kcode{\#pragma} syntax. +The first pragma uses the combined \kcode{parallel for} directive, and the second +applies the uncombined closely nested directives, \kcode{parallel} and \kcode{for}, directly to the same statement. These are labeled PRAG 1-3. Using the attribute syntax, the same construct in PRAG 1 is applied two different ways in attribute form, as shown in the ATTR 1 and ATTR 2 sections. -In ATTR 1 the attribute syntax is used with the \code{omp ::} namespace form. -In ATTR 2 the attribute syntax is used with the \code{using omp :} namespace form. 
+In ATTR 1 the attribute syntax is used with the \kcode{omp ::} namespace form. +In ATTR 2 the attribute syntax is used with the \kcode{using omp :} namespace form. Next, parallelization is attempted by applying directives using two different syntaxes. For ATTR 3 and PRAG 4, the loop parallelization will fail to compile because multiple directives that @@ -25,32 +25,32 @@ \section{C++ Attributes} While multiple attributes may be applied to the same statement, compilation may fail if the ordering of the directive matters. -For the ATTR 4-5 loop parallelization, the \code{parallel} directive precedes -the \code{for} directive, but the compiler may reorder consecutive attributes. +For the ATTR 4-5 loop parallelization, the \kcode{parallel} directive precedes +the \kcode{for} directive, but the compiler may reorder consecutive attributes. If the directives are reversed, compilation will fail. The attribute directive of the ATTR 6 section resolves the previous problem (in ATTR 4-5). -Here, the \code{sequence} attribute is used to apply ordering to the -directives of ATTR 4-5, using the \code{omp}~\code{::} namespace qualifier. (The -\code{using omp :} namespace form is not available for the \code{sequence} attribute.) -Note, for the \code{sequence} attribute a comma must separate the \code{directive} extensions. +Here, the \kcode{sequence} attribute is used to apply ordering to the +directives of ATTR 4-5, using the \kcode{omp ::} namespace qualifier. (The +\kcode{using omp :} namespace form is not available for the \kcode{sequence} attribute.) +Note, for the \kcode{sequence} attribute a comma must separate the \kcode{directive} extensions. The last 3 pairs of sections (PRAG DECL 1-2, 3-4, and 5-6) show cases where -directive ordering does not matter for \code{declare}~\code{simd} directives. +directive ordering does not matter for \kcode{declare simd} directives. 
-In section PRAG DECL 1-2, the two loops use different SIMD forms of the \plc{P} function -(one with \code{simdlen(4)} and the other with \code{simdlen(8)}), -as prescribed by the two different \code{declare}~\code{simd} directives -applied to the \plc{P} function definitions (at the beginning of the code). +In section PRAG DECL 1-2, the two loops use different SIMD forms of the \ucode{P} function +(one with \kcode{simdlen(\ucode{4})} and the other with \kcode{simdlen(\ucode{8})}), +as prescribed by the two different \kcode{declare simd} directives +applied to the \ucode{P} function definitions (at the beginning of the code). The directives use the pragma syntax, and order is not important. For the next set of loops -(PRAG DECL 3-4) that use the \plc{Q} function, the attribute syntax is -used for the \code{declare}~\code{simd} directives. +(PRAG DECL 3-4) that use the \ucode{Q} function, the attribute syntax is +used for the \kcode{declare simd} directives. The result is compliant code since directive order is irrelevant. Sections ATTR DECL 5-6 are included for completeness. Here, the attribute -form of the \code{simd} directive is used for loops calling the \plc{Q} function, -in combination with the attribute form of the \code{declare}~\code{simd} -directives declaring the variants for \plc{Q}. +form of the \kcode{simd} directive is used for loops calling the \ucode{Q} function, +in combination with the attribute form of the \kcode{declare simd} +directives declaring the variants for \ucode{Q}. 
\cppexample[5.1]{directive_syntax_attribute}{1} diff --git a/directives/fixed_format_comments.tex b/directives/fixed_format_comments.tex index b270c1f..10cff43 100644 --- a/directives/fixed_format_comments.tex +++ b/directives/fixed_format_comments.tex @@ -5,16 +5,16 @@ \section{Fortran Comments (Fixed Source Form)} \index{fixed form syntax, Fortran} OpenMP directives in Fortran codes with fixed source form are specified as comments with one of the -\code{!\$omp}, \code{c\$omp}, and \code{*\$omp} sentinels, followed by a +\scode{!$omp}, \scode{c$omp}, and \scode{*$omp} sentinels, followed by a directive name, and required and optional clauses. The sentinel must begin in column 1. In the example below the first directive (DIR 1) specifies the %parallel work-sharing -\code{parallel}~\code{do} combined directive, with a \code{num\_threads} clause, and a comment. +\kcode{parallel do} combined directive, with a \kcode{num_threads} clause, and a comment. The second directive (DIR 2) shows the same directive split across two lines. The next nested directives (DIR 3 and 4) show the previous combined directive as two separate directives. -Here, an \code{end} directive (\code{end}~\code{parallel}) must be specified to demarcate the range (region) -of the \code{parallel} directive. +Here, an \kcode{end} directive (\kcode{end parallel}) must be specified to demarcate the range (region) +of the \kcode{parallel} directive. \fexample{directive_syntax_F_fixed_comment}{1} \clearpage diff --git a/directives/free_format_comments.tex b/directives/free_format_comments.tex index 32e88aa..2c84578 100644 --- a/directives/free_format_comments.tex +++ b/directives/free_format_comments.tex @@ -5,47 +5,47 @@ \section{Fortran Comments (Free Source Form)} \index{free form syntax, Fortran} OpenMP directives in Fortran codes with free source form are specified as comments -that use the \code{!\$omp} sentinel, followed by the -directive name, and required and optional clauses. 
Lines are continued with an ending ampersand (\code{\&}), -and the continued line begins with \code{!\$omp} or \code{!\$omp\&}. Comments may appear on the +that use the \scode{!$omp} sentinel, followed by the +directive name, and required and optional clauses. Lines are continued with an ending ampersand (\scode{&}), +and the continued line begins with \scode{!$omp} or \scode{!$omp&}. Comments may appear on the same line as the directive. Directives are case insensitive. In the example below the first directive (DIR 1) specifies the %parallel work-sharing -\code{parallel}~\code{do} combined directive, with a \code{num\_threads} clause, and a comment. +\kcode{parallel do} combined directive, with a \kcode{num_threads} clause, and a comment. The second directive (DIR 2) shows the same directive split across two lines. The next nested directives (DIR 3 and 4) show the previous combined directive as two separate directives. -Here, an \code{end} directive (\code{end}~\code{parallel}) must be specified to demarcate the range (region) -of the \code{parallel} directive. +Here, an \kcode{end} directive (\kcode{end parallel}) must be specified to demarcate the range (region) +of the \kcode{parallel} directive. \ffreeexample{directive_syntax_F_free_comment}{1} \clearpage -As of OpenMP 5.1, \code{block} and \code{end}~\code{block} statements can be used to designate -a structured block for an OpenMP region, and any paired OpenMP \code{end} directive becomes optional, -as shown in the next example. Note, the variables \plc{i} and \plc{thrd\_no} are declared within the +As of OpenMP 5.1, \bcode{block} and \bcode{end block} statements can be used to designate +a structured block for an OpenMP region, and any paired OpenMP \kcode{end} directive becomes optional, +as shown in the next example. Note, the variables \ucode{i} and \ucode{thrd_no} are declared within the block structure and are hence private. 
-It was necessary to explicitly declare the \plc{i} variable, due to the \code{implicit none} statement; +It was necessary to explicitly declare the \ucode{i} variable, due to the \bcode{implicit none} statement; it could have also been declared outside the structured block. \ffreeexample[5.1]{directive_syntax_F_block}{1} -A Fortran BLOCK construct may eliminate the need for a paired \scode{end} directive for an OpenMP construct, +A Fortran \bcode{BLOCK} construct may eliminate the need for a paired \kcode{end} directive for an OpenMP construct, as illustrated in the following example. -The first \code{parallel} construct is specified with an OpenMP loosely structured block -(where the first executable construct is not a Fortran 2008 BLOCK construct). -A paired \scode{end} directive must end the OpenMP construct. -The second \code{parallel} construct is specified with an OpenMP strictly structured block -(consists only of a single Fortran BLOCK construct). -The paired \scode{end} directive is optional in this case, and is not used here. - -The next two \code{parallel} directives form an enclosing outer \code{parallel} construct -and a nested inner \code{parallel} construct. The first \code{end}~\code{parallel} directive -that subsequently appears terminates the inner \code{parallel} construct, -because a paired \scode{end} directive immediately following a BLOCK construct that is +The first \kcode{parallel} construct is specified with an OpenMP loosely structured block +(where the first executable construct is not a Fortran 2008 \bcode{BLOCK} construct). +A paired \kcode{end} directive must end the OpenMP construct. +The second \kcode{parallel} construct is specified with an OpenMP strictly structured block +(consists only of a single Fortran \bcode{BLOCK} construct). +The paired \kcode{end} directive is optional in this case, and is not used here. 
+ +The next two \kcode{parallel} directives form an enclosing outer \kcode{parallel} construct +and a nested inner \kcode{parallel} construct. The first \kcode{end parallel} directive +that subsequently appears terminates the inner \kcode{parallel} construct, +because a paired \kcode{end} directive immediately following a \bcode{BLOCK} construct that is a strictly structured block of an OpenMP construct is treated as the terminating end directive of that construct. -The next \code{end}~\code{parallel} directive is required to terminate the outer \code{parallel} construct. +The next \kcode{end parallel} directive is required to terminate the outer \kcode{parallel} construct. \ffreeexample[5.1]{directive_syntax_F_block}{2} diff --git a/directives/pragmas.tex b/directives/pragmas.tex index 9bf91af..1f63eeb 100644 --- a/directives/pragmas.tex +++ b/directives/pragmas.tex @@ -4,20 +4,20 @@ \section{C/C++ Pragmas} \index{directive syntax!pragma, C/C++} \index{pragma syntax, C/C++} -OpenMP C and C++ directives can be specified with the C/C++ \code{\#pragma} directive. -An OpenMP directive begins with \code{\#pragma}~\code{omp} and is followed by the +OpenMP C and C++ directives can be specified with the C/C++ \kcode{\#pragma} directive. +An OpenMP directive begins with \kcode{\#pragma omp} and is followed by the OpenMP directive name, and required and optional clauses. Lines are continued in the usual manner, and comments may be included at the end. Directives are case sensitive. The example below illustrates the use of the OpenMP pragma form. -The first pragma (PRAG 1) specifies a combined \code{parallel}~\code{for} -directive, with a \code{num\_threads} clause, and a comment. +The first pragma (PRAG 1) specifies a combined \kcode{parallel for} +directive, with a \kcode{num_threads} clause, and a comment. %The NT macro is expanded in the clause. The second pragma (PRAG 2) shows the same directive split across two lines. 
The next nested pragmas (PRAG 3 and 4) show the previous combined directive as two separate directives. The executable directives above all apply to the next -statement. The \code{parallel} directive can be applied to a \plc{structured}~\plc{block} +statement. The \kcode{parallel} directive can be applied to a \plc{structured block} as shown in PRAG 5. \cexample{directive_syntax_pragma}{1} diff --git a/introduction/Examples.tex b/introduction/Examples.tex index 724259a..259fa2f 100644 --- a/introduction/Examples.tex +++ b/introduction/Examples.tex @@ -2,8 +2,8 @@ \section{Examples Organization} \label{chap:examples} \label{sec:examples} \index{example label} -\index{example label!omp_verno@\scode{omp_}\plc{verno}} -\index{omp_verno@\scode{omp_}\plc{verno}} +\index{example label!omp_verno@\kcode{omp_\plc{verno}}} +\index{omp_verno@\kcode{omp_\plc{verno}}} This document includes examples of the OpenMP API directives, constructs, and routines. @@ -21,11 +21,11 @@ \section{Examples Organization} Example labels include version information of the form \verlabel{\plc{verno}} to indicate features that are illustrated by an example for a specific OpenMP version, such as -``\plc{scan.1.c} \;\verlabel{5.0}.'' +``\example{scan.1.c} \;\verlabel{5.0}.'' Some of the example labels include version information of the form \verlabel[pre\_]{3.0} to indicate features that are specified prior to OpenMP version 3.0, such as -``\plc{ploop.1.c} \;\verlabel[pre\_]{3.0}.'' +``\example{ploop.1.c} \;\verlabel[pre\_]{3.0}.'' Language markers may be used to indicate text or codes that are specific to a particular base language. @@ -37,3 +37,8 @@ \section{Examples Organization} \fortranspecificstart This is Fortran specific... \fortranspecificend + +Throughout the examples document we assume that the number of threads +used for a \kcode{parallel} region is the same as +the number of threads requested, unless explicitly specified otherwise. 
+ diff --git a/loop_transformations/partial_tile.tex b/loop_transformations/partial_tile.tex index c243a41..3626bb8 100644 --- a/loop_transformations/partial_tile.tex +++ b/loop_transformations/partial_tile.tex @@ -25,13 +25,13 @@ \section{Incomplete Tiles} \caption{Tiling illustrations} \end{figure} -In the following example, function \plc{func1} uses the \code{tile} construct -with a \code{sizes(4,16)} tiling clause. Because the second tile dimension of -16 does not evenly divide into the iteration count of the j-loop, the -iterations corresponding to the remainder for the j-loop correspond to partial +In the following example, function \ucode{func1} uses the \kcode{tile} construct +with a \kcode{sizes(\ucode{4,16})} tiling clause. Because the second tile dimension of +16 does not evenly divide into the iteration count of the \ucode{j}-loop, the +iterations corresponding to the remainder for the \ucode{j}-loop correspond to partial tiles as shown in Figure~\ref{fig:Example_tile2}. Each remaining function illustrates a code implementation that a compiler may generate to implement the -\code{tile} construct in \plc{func1}. +\kcode{tile} construct in \ucode{func1}. %Iterations with the tiles can be executed in a any order, ignoring partial tile boundaries. % Deepak: I don't think this first sentence is true for iterations in a partial tile. @@ -41,35 +41,35 @@ \section{Incomplete Tiles} Implementations must ensure that dependencies that are valid with any tile size need to be preserved (including tile size of 1 and tiles as large as the iteration space). -Functions \plc{func2} through \plc{func6} are valid implementations of \plc{func1}. -In \splc{func2} the unrolling is illustrated as a pair of nested loops with a simple -adjustment in the size of the final iteration block in the \splc{j2} iteration space +Functions \ucode{func2} through \ucode{func6} are valid implementations of \ucode{func1}. 
+In \ucode{func2} the unrolling is illustrated as a pair of nested loops with a simple +adjustment in the size of the final iteration block in the \ucode{j2} iteration space for the partial tile. Performance of the implementation depends on the hardware architecture, the instruction set and compiler optimization goals. -Functions \plc{func3}, \plc{func4}, and \plc{func5} have the advantage that +Functions \ucode{func3}, \ucode{func4}, and \ucode{func5} have the advantage that the innermost loop for the complete tile is a constant size and can be replaced with SIMD instructions. If the target platform has masked SIMD instructions with no overhead, then avoiding the construction of a -remainder loop, as in \plc{func5}, might be the best option. -Another option is to use a remainder loop without tiling, as shown in \plc{func6}, to reduce control-flow overhead. +remainder loop, as in \ucode{func5}, might be the best option. +Another option is to use a remainder loop without tiling, as shown in \ucode{func6}, to reduce control-flow overhead. \cexample[5.1]{partial_tile}{1} \ffreeexample[5.1]{partial_tile}{1} -In the following example, function \plc{func7} tiles nested loops with a size of (4,16), -resulting in partial tiles that cover the last 4 iterations of the j-loop, as +In the following example, function \ucode{func7} tiles nested loops with a size of (\ucode{4,16}), +resulting in partial tiles that cover the last 4 iterations of the \ucode{j}-loop, as in the previous example. However, the outer loop is parallelized with a -\code{parallel} worksharing-loop construct. +\kcode{parallel} worksharing-loop construct. -Functions \plc{func8} and \plc{func9} illustrate two implementations of the tiling -with \code{parallel} and worksharing-loop directives. Function \plc{func8} uses a single outer loop, with a \plc{min} function -to accommodate the partial tiles. 
Function \plc{func9} +Functions \ucode{func8} and \ucode{func9} illustrate two implementations of the tiling +with \kcode{parallel} and worksharing-loop directives. Function \ucode{func8} uses a single outer loop, with a \ucode{min} function +to accommodate the partial tiles. Function \ucode{func9} uses two sets of nested loops, the first iterates over the complete tiles and the second covers iterations from the partial tiles. When fissioning loops that -are in a \code{parallel} worksharing-loop region, each iteration of each workshared loop -must be executed on the same thread as in an un-fissioned loop. The \code{schedule(static)} clause in \plc{func7} -forces the implementation to use static scheduling and allows the fission in function \plc{func8}. +are in a \kcode{parallel} worksharing-loop region, each iteration of each workshared loop +must be executed on the same thread as in an un-fissioned loop. The \kcode{schedule(static)} clause in \ucode{func7} +forces the implementation to use static scheduling and allows the fission in function \ucode{func9}. When dynamic scheduling is prescribed, fissioning is not allowed. When no scheduling is specified, the compiler implementation will select a scheduling \plc{kind} and adhere to its restrictions. 
diff --git a/loop_transformations/tile.tex b/loop_transformations/tile.tex index 3d1be5a..4c9f4b6 100644 --- a/loop_transformations/tile.tex +++ b/loop_transformations/tile.tex @@ -1,44 +1,44 @@ %\pagebreak -\section{\code{tile} Construct} +\section{\kcode{tile} Construct} \label{sec:tile} -\index{constructs!tile@\code{tile}} -\index{tile construct@\code{tile} construct} -\index{tile construct@\code{tile} construct!sizes clause@\code{sizes} clause} -\index{sizes clause@\code{sizes} clause} -\index{clauses!sizes@\code{sizes}} +\index{constructs!tile@\kcode{tile}} +\index{tile construct@\kcode{tile} construct} +\index{tile construct@\kcode{tile} construct!sizes clause@\kcode{sizes} clause} +\index{sizes clause@\kcode{sizes} clause} +\index{clauses!sizes@\kcode{sizes}} -In the following example a \code{tile} construct transforms two nested loops -within the \texttt{func1} function into four nested loops. -The tile sizes in the \code{sizes} clause are applied from outermost +In the following example a \kcode{tile} construct transforms two nested loops +within the \ucode{func1} function into four nested loops. +The tile sizes in the \kcode{sizes} clause are applied from outermost to innermost loops (left-to-right). The effective tiling operation is illustrated in -the \texttt{func2} function. +the \ucode{func2} function. (For easier illustration, tile sizes for all examples in this section evenly divide the iteration counts so that there are no remainders.) In the following C/C++ code the inner loop traverses columns and the outer loop traverses the rows of a 100x128 (row x column) matrix. -The \code{sizes(5,16)} clause of the \code{tile} construct specifies +The \kcode{sizes(\ucode{5,16})} clause of the \kcode{tile} construct specifies a 5x16 blocking, applied to the outer (row) and inner (column) loops. -The worksharing-loop construct before the \code{tile} +The worksharing-loop construct before the \kcode{tile} construct is applied after the transform. 
\cexample[5.1]{tile}{1} In the following Fortran code the inner loop traverses rows and the outer loop traverses the columns of a 128x100 (row x column) matrix. -The \code{sizes(5,16)} clause of the \code{tile} construct specifies +The \kcode{sizes(\ucode{5,16})} clause of the \kcode{tile} construct specifies a 5x16 blocking, applied to the outer (column) and inner (row) loops. -The worksharing-loop construct before the \code{tile} +The worksharing-loop construct before the \kcode{tile} construct is applied after the transform. \ffreeexample[5.1]{tile}{1} \clearpage This example illustrates transformation nesting. -Here, a 4x4 ``outer'' \code{tile} construct is applied to the ``inner'' tile transform shown in the example above. -The effect of the inner loop is shown in \texttt{func2} (cf.\ \texttt{func2} in tile.1.c). -The outer \code{tile} construct's \code{sizes(4,4)} clause applies a 4x4 tile upon the resulting -blocks of the inner transform. The effective looping is shown in \texttt{func3}. +Here, a 4x4 ``outer'' \kcode{tile} construct is applied to the ``inner'' tile transform shown in the example above. +The effect of the inner loop is shown in \ucode{func2} (cf.\ \ucode{func2} in \example{tile.1.c}). +The outer \kcode{tile} construct's \kcode{sizes(\ucode{4,4})} clause applies a 4x4 tile upon the resulting +blocks of the inner transform. The effective looping is shown in \ucode{func3}. 
\cexample[5.1]{tile}{2} \ffreeexample[5.1]{tile}{2} diff --git a/loop_transformations/unroll.tex b/loop_transformations/unroll.tex index fbc67fe..37aceaf 100644 --- a/loop_transformations/unroll.tex +++ b/loop_transformations/unroll.tex @@ -1,20 +1,20 @@ \pagebreak -\section{\code{unroll} Construct} +\section{\kcode{unroll} Construct} \label{sec:unroll} -\index{constructs!unroll@\code{unroll}} -\index{unroll construct@\code{unroll} construct} -\index{unroll construct@\code{unroll} construct!full clause@\code{full} clause} -\index{full clause@\code{full} clause} -\index{clauses!full@\code{full}} -\index{unroll construct@\code{unroll} construct!partial clause@\code{partial} clause} -\index{partial clause@\code{partial} clause} -\index{clauses!partial@\code{partial}} +\index{constructs!unroll@\kcode{unroll}} +\index{unroll construct@\kcode{unroll} construct} +\index{unroll construct@\kcode{unroll} construct!full clause@\kcode{full} clause} +\index{full clause@\kcode{full} clause} +\index{clauses!full@\kcode{full}} +\index{unroll construct@\kcode{unroll} construct!partial clause@\kcode{partial} clause} +\index{partial clause@\kcode{partial} clause} +\index{clauses!partial@\kcode{partial}} -The \code{unroll} construct is a loop transformation that increases the +The \kcode{unroll} construct is a loop transformation that increases the number of loop blocks in a loop, while reducing the number of iterations. -The \code{full} clause specifies that the loop is to be completely unrolled. +The \kcode{full} clause specifies that the loop is to be completely unrolled. That is, a loop block for each iteration is created, and the loop is removed. -A \code{partial} clause with a \plc{unroll-factor} specifies that the number of +A \kcode{partial} clause with an \plc{unroll-factor} specifies that the number of iterations will be reduced multiplicatively by the factor while the number of blocks will be increased by the same factor. 
Operationally, the loop is tiled by the factor, and the tiled loop is @@ -27,14 +27,14 @@ \section{\code{unroll} Construct} Unrolling a loop does not change the code's semantics. Also, compilers may unroll loops without explicit directives, at various optimization levels. -In the example below, the \code{unroll} construct is used without any clause, and then -with a \code{full} clause, in the first two functions, respectively. +In the example below, the \kcode{unroll} construct is used without any clause, and then +with a \kcode{full} clause, in the first two functions, respectively. When no clause is used, it is up to the implementation (compiler) to decide if and how the loop is to be unrolled. The iteration count can have a run time value. -In the second function, the \code{unroll} construct uses a \code{full} clause +In the second function, the \kcode{unroll} construct uses a \kcode{full} clause to completely unroll the loop. A compile-time constant is required for the iteration count. -The statements in the third function (\plc{unroll\_full\_equivalent}) illustrates +The statements in the third function (\ucode{unroll_full_equivalent}) illustrate equivalent code for the full unrolling in the second function. \cexample[5.1]{unroll}{1} @@ -47,12 +47,12 @@ \section{\code{unroll} Construct} \ffreeexample[5.1]{unroll}{2} In many cases, when the iteration count is large and/or dynamic, it is -reasonable to partially unroll a loop by including a \code{partial} clause. -In the \plc{unroll3\_partial} function below, the \plc{unroll-factor} value +reasonable to partially unroll a loop by including a \kcode{partial} clause. +In the \ucode{unroll3_partial} function below, the \plc{unroll-factor} value of 4 is used to create a tile size of 4 that is unrolled to create 4 unrolled statements. The equivalent ``hand unrolled'' loop code is presented in the -\plc{unroll3\_partial\_equivalent} function. 
-If the \plc{unroll-factor} is omitted, as in the \plc{unroll3\_partial\_nofactor} +\ucode{unroll3_partial_equivalent} function. +If the \plc{unroll-factor} is omitted, as in the \ucode{unroll3_partial_nofactor} function, the implementation may optimally select a factor from 1 (no unrolling) to the iteration count (full unrolling). In the latter case the construct generates a loop with a single iteration. @@ -63,22 +63,22 @@ \section{\code{unroll} Construct} When the iteration count is not a multiple of the \plc{unroll-factor}, iterations that should not produce executions must be conditionally protected from execution. In this example, the first function -unrolls a loop that has a variable iteration count. Since the \code{unroll} -construct uses a \code{partial(}~\plc{4}~\code{)} clause, the compiler will need to +unrolls a loop that has a variable iteration count. Since the \kcode{unroll} +construct uses a \kcode{partial(\ucode{4})} clause, the compiler will need to create code that can account for cases when the iteration count is not a multiple of 4. A brute-force, simple-to-understand approach for implementing -the conditionals is shown in the \plc{unroll\_partial\_remainder\_option1} function. +the conditionals is shown in the \ucode{unroll_partial_remainder_option1} function. The remaining two functions show more optimal algorithms the compiler may select to implement the transformation. Optimal approaches may reduce the number of conditionals as shown in -\plc{unroll\_partial\_remainder\_option2}, and +\ucode{unroll_partial_remainder_option2}, and may eliminate conditionals completely by peeling off a ``remainder'' -into a separate loop as in \plc{unroll\_partial\_remainder\_option3}. +into a separate loop as in \ucode{unroll_partial_remainder_option3}. Regardless of the optimization, implementations must ensure that the semantics remain the same, especially when additional directives are applied to -the unrolled loop. 
For the case in the \plc{unroll\_partial\_remainder\_option3} +the unrolled loop. For the case in the \ucode{unroll_partial_remainder_option3} function, the fission of the worksharing-loop construct may result in a different distribution of threads to the iterations. Since no reproducible scheduling is specified on the work-sharing construct, the worksharing-loop and unrolling are compliant. diff --git a/memory_model/allocators.tex b/memory_model/allocators.tex index 3f91577..754f14f 100644 --- a/memory_model/allocators.tex +++ b/memory_model/allocators.tex @@ -1,49 +1,49 @@ -\pagebreak +%\pagebreak \section{Memory Allocators} \label{sec:allocators} \index{memory allocators!allocator traits} \index{memory allocators!memory space} -\index{memory allocators!omp_alloc routine@\scode{omp_alloc} routine} -\index{memory allocators!allocators directive@\scode{allocators} directive} +\index{memory allocators!omp_alloc routine@\kcode{omp_alloc} routine} +\index{memory allocators!allocators directive@\kcode{allocators} directive} -\index{omp_alloc routine@\scode{omp_alloc} routine} -\index{routines!omp_alloc@\scode{omp_alloc}} +\index{omp_alloc routine@\kcode{omp_alloc} routine} +\index{routines!omp_alloc@\kcode{omp_alloc}} -\index{directives!allocators@\code{allocators}} -\index{allocators directive@\code{allocators} directive} -\index{allocators directive@\code{allocators} directive!allocator clause@\code{allocator} clause} +\index{directives!allocators@\kcode{allocators}} +\index{allocators directive@\kcode{allocators} directive} +\index{allocators directive@\kcode{allocators} directive!allocator clause@\kcode{allocator} clause} -\index{clauses!allocator@\code{allocator}} -\index{allocator clause@\code{allocator} clause} -\index{omp_init_allocator routine@\scode{omp_init_allocator} routine} -\index{routines!omp_init_allocator@\scode{omp_init_allocator}} +\index{clauses!allocator@\kcode{allocator}} +\index{allocator clause@\kcode{allocator} clause} 
+\index{omp_init_allocator routine@\kcode{omp_init_allocator} routine} +\index{routines!omp_init_allocator@\kcode{omp_init_allocator}} OpenMP memory allocators can be used to allocate memory with specific allocator traits. In the following example an OpenMP allocator is used to -specify an alignment for arrays \plc{x} and \plc{y}. The +specify an alignment for arrays \ucode{x} and \ucode{y}. The general approach for attributing traits to variables allocated by OpenMP is to create or specify a pre-defined \plc{memory space}, create an array of \plc{traits}, and then form an \plc{allocator} from the memory space and trait. The allocator is then specified -in an OpenMP allocation (using an API \plc{omp\_alloc()} function -for C/C++ code and an \code{allocators} directive for Fortran code -in the \splc{allocators.1} example). +in an OpenMP allocation (using an API \kcode{omp_alloc()} function +for C/C++ code and an \kcode{allocators} directive for Fortran code +in the \example{allocators.1} example). -In the example below the \plc{xy\_memspace} variable is declared -and assigned the default memory space (\plc{omp\_default\_mem\_space}). +In the example below the \ucode{xy_memspace} variable is declared +and assigned the default memory space (\kcode{omp_default_mem_space}). Next, an array for \plc{traits} is created. Since only one -trait will be used, the array size is \plc{1}. +trait will be used, the array size is \ucode{1}. A trait is a structure in C/C++ and a derived type in Fortran, containing 2 components: a key and a corresponding value (key-value pair). -The trait key used here is \plc{omp\_atk\_alignment} (an enum for C/C++ +The trait key used here is \kcode{omp_atk_alignment} (an enum for C/C++ and a parameter for Fortran) -and the trait value of 64 is specified in the \plc{xy\_traits} declaration. +and the trait value of 64 is specified in the \ucode{xy_traits} declaration. 
These declarations are followed by a call to the -\plc{omp\_init\_allocator()} function to combine the memory -space (\plc{xy\_memspace}) and the traits (\plc{xy\_traits}) -to form an allocator (\plc{xy\_alloc}). +\kcode{omp_init_allocator()} function to combine the memory +space (\ucode{xy_memspace}) and the traits (\ucode{xy_traits}) +to form an allocator (\ucode{xy_alloc}). %In the C/C++ code the API \plc{omp\_allocate()} function is used %to allocate space, similar to \plc{malloc}, except that the allocator @@ -54,17 +54,17 @@ \section{Memory Allocators} %with an \code{allocator} clause (specifying the \plc{xy\_alloc} as the allocator) %for the following Fortran \plc{allocate} statement. -In the C/C++ code the API \plc{omp\_allocate()} function is used -to allocate space, similar to \plc{malloc}, except that the allocator +In the C/C++ code the API \kcode{omp_alloc()} function is used +to allocate space, similar to \bcode{malloc}, except that the allocator is specified as the second argument. -In Fortran an \code{allocators} directive is used to specify an allocator -for the following Fortran \plc{allocate} statement. -A variable list in the \scode{allocate} clause may be supplied if the allocator +In Fortran an \kcode{allocators} directive is used to specify an allocator +for the following Fortran \bcode{allocate} statement. +A variable list in the \kcode{allocate} clause may be supplied if the allocator is to be applied to a subset of variables in the Fortran allocate statement. -Here, the \plc{xy\_alloc} allocator is specified -in the modifier of the \code{allocator} clause, -and the set of all variables used in the \plc{allocate} statement is specified in the list. +Here, the \ucode{xy_alloc} allocator is specified +in the \kcode{allocator} modifier of the \kcode{allocate} clause, +and the set of all variables used in the \bcode{allocate} statement is specified in the list. 
%"for a following Fortran allocation statement" (no using "immediately" here) % it looks like if you have a list, the allocation statement does not need @@ -72,118 +72,116 @@ \section{Memory Allocators} % spec5.0 157:19-20 The allocate directive must appear in the same scope as % the declarations of each of its list items and must follow all such declarations. -%\pagebreak - \cexample[5.0]{allocators}{1} \ffreeexample[5.2]{allocators}{1} -When using the \scode{allocators} construct with optional clauses in Fortran code, +When using the \kcode{allocators} construct with optional clauses in Fortran code, users should be aware of the behavior of a reallocation. -In the following example, the \splc{a} variable is allocated with 64-byte -alignment through the \scode{align} clause of the \scode{allocators} construct. +In the following example, the \ucode{a} variable is allocated with 64-byte +alignment through the \kcode{align} clause of the \kcode{allocators} construct. %The alignment of the newly allocated object, \splc{a}, in the (reallocation) %assignment \splc{a = b} may not be the same as before. -The alignment of the newly allocated object, \splc{a}, in the (reallocation) -assignment \splc{a = b} will not be reallocated with the 64-byte alignment, but -with the 32-byte alignment prescribed by the trait of the \splc{my_alloctr} +The alignment of the newly allocated object, \ucode{a}, in the (reallocation) +assignment \ucode{a = b} will not be reallocated with the 64-byte alignment, but +with the 32-byte alignment prescribed by the trait of the \ucode{my_alloctr} allocator. It is best to avoid this problem by constructing and using an -allocator (not the \scode{align} clause) with the required alignment in -the \scode{allocators} construct. +allocator (not the \kcode{align} clause) with the required alignment in +the \kcode{allocators} construct. 
Note that in the subsequent -deallocation of \splc{a} the deallocation must precede the destruction -of the allocator used in the allocation of \splc{a}. +deallocation of \ucode{a} the deallocation must precede the destruction +of the allocator used in the allocation of \ucode{a}. \ffreeexample[5.2]{allocators}{2} -When creating and using an \scode{allocators} construct within a Fortran procedure +When creating and using an \kcode{allocators} construct within a Fortran procedure for allocating storage (and subsequently freeing the allocator storage with an -\scode{omp_destroy_allocator} construct), users should be aware of the necessity +\kcode{omp_destroy_allocator} routine), users should be aware of the necessity of using an explicit Fortran deallocation instead of relying on auto-deallocation. In the following example, a user-defined allocator is used in the allocation -of the \splc{c} variable, and then the allocator is destroyed. -Auto-deallocation at the end of the \splc{broken_auto_deallocation} procedure +of the \ucode{c} variable, and then the allocator is destroyed. +Auto-deallocation at the end of the \ucode{broken_auto_deallocation} procedure will fail without the allocator, hence an explicit deallocation should be used -(before the \scode{omp_destroy_allocator} construct). -Note that an allocator may be specified directly in the \scode{allocate} clause -without using the \scode{allocator} complex modifier, so long as no other modifier +(before the call to the \kcode{omp_destroy_allocator} routine). +Note that an allocator may be specified directly in the \kcode{allocate} clause +without using the \kcode{allocator} complex modifier, so long as no other modifier is specified in the clause. 
\ffreeexample[5.2]{allocators}{3} +\pagebreak -\index{directives!allocate@\code{allocate}} -\index{allocate directive@\code{allocate} directive} -\index{allocate directive@\code{allocate} directive!allocator clause@\code{allocator} clause} +\index{directives!allocate@\kcode{allocate}} +\index{allocate directive@\kcode{allocate} directive} +\index{allocate directive@\kcode{allocate} directive!allocator clause@\kcode{allocator} clause} -The \scode{allocate} directive is a convenient way to apply an OpenMP +The \kcode{allocate} directive is a convenient way to apply an OpenMP allocator to the allocation of declared variables. This example illustrates the allocation of specific types of storage in a program for use in libraries, privatized variables, and with offloading. -Two groups of variables, \{\plc{v1, v2}\} and \{\plc{v3, v4}\}, are used with the \scode{allocate} -directive, and the \{\plc{v5, v6}\} pair is used with the \scode{allocate} clause. -Here we explicitly use predefined allocators \scode{omp_high_bw_mem_alloc} and \scode{omp_default_mem_alloc} -with the \scode{allocate} directive in CASE 1. Similar effects are achieved for private variables of a task -by using the \scode{allocate} clause, as shown in CASE 2. +Two groups of variables, \{\ucode{v1, v2}\} and \{\ucode{v3, v4}\}, are used with the \kcode{allocate} +directive, and the \{\ucode{v5, v6}\} pair is used with the \kcode{allocate} clause. +Here we explicitly use predefined allocators \kcode{omp_high_bw_mem_alloc} and \kcode{omp_default_mem_alloc} +with the \kcode{allocate} directive in CASE 1. Similar effects are achieved for private variables of a task +by using the \kcode{allocate} clause, as shown in CASE 2. 
-Note, when the \scode{allocate} directive does not specify an \scode{allocator} clause, an -implementation-defined default, stored in the \splc{def-allocator-var} ICV, is used +Note, when the \kcode{allocate} directive does not specify an \kcode{allocator} clause, an +implementation-defined default, stored in the \plc{def-allocator-var} ICV, is used (not illustrated here). -Users can set and get the default allocator with the \scode{omp_set_default_allocator} -and \scode{omp_get_default_allocator} API routines. +Users can set and get the default allocator with the \kcode{omp_set_default_allocator} +and \kcode{omp_get_default_allocator} API routines. \cexample[5.1]{allocators}{4} \ffreeexample[5.1]{allocators}{4} -\pagebreak -\index{uses_allocators clause@\scode{uses_allocators} clause} -\index{clauses!uses_allocators@\scode{uses_allocators}} +\index{uses_allocators clause@\kcode{uses_allocators} clause} +\index{clauses!uses_allocators@\kcode{uses_allocators}} -The use of allocators in \scode{target} regions is facilitated by the -\scode{uses_allocators} clause as shown in the cases below. +The use of allocators in \kcode{target} regions is facilitated by the +\kcode{uses_allocators} clause as shown in the cases below. -In CASE 1, the predefined \scode{omp_cgroup_mem_alloc} allocator is made available on the -device in the first \scode{target} construct as specified in the \scode{uses_allocators} clause. -The allocator is then used in the \scode{allocate} -clause of the \scode{teams} construct to allocate a private array for each -team (contention group). The private \splc{xbuf} arrays that are filled by each -team are reduced as specified in the \scode{reduction} clause on the \scode{teams} construct. +In CASE 1, the predefined \kcode{omp_cgroup_mem_alloc} allocator is made available on the +device in the first \kcode{target} construct as specified in the \kcode{uses_allocators} clause. 
+The allocator is then used in the \kcode{allocate} +clause of the \kcode{teams} construct to allocate a private array for each +team (contention group). The private \ucode{xbuf} arrays that are filled by each +team are reduced as specified in the \kcode{reduction} clause on the \kcode{teams} construct. -In CASE 2, user-defined traits are specified in the \splc{cgroup_traits} variable. -An allocator is initialized for the \scode{target} region in the \scode{uses_allocators} clause, -and the traits specified in \splc{cgroup_traits} are included by the \scode{traits} modifier. +In CASE 2, user-defined traits are specified in the \ucode{cgroup_traits} variable. +An allocator is initialized for the \kcode{target} region in the \kcode{uses_allocators} clause, +and the traits specified in \ucode{cgroup_traits} are included by the \kcode{traits} modifier. -In CASE 3, the \splc{cgroup_alloc} variable is initialized on the host with traits -and a memory space. However, these are ignored by the \scode{uses_allocators} clause -and a new allocator for the \scode{target} region is initialized with default traits. +In CASE 3, the \ucode{cgroup_alloc} variable is initialized on the host with traits +and a memory space. However, these are ignored by the \kcode{uses_allocators} clause +and a new allocator for the \kcode{target} region is initialized with default traits. \cexample[5.2]{allocators}{5} \ffreeexample[5.2]{allocators}{5} -\index{dynamic_allocators clause@\scode{dynamic_allocators} clause} -\index{clauses!dynamic_allocators@\scode{dynamic_allocators}} +\index{dynamic_allocators clause@\kcode{dynamic_allocators} clause} +\index{clauses!dynamic_allocators@\kcode{dynamic_allocators}} -The following example shows how to make an allocator available in a \scode{target} region -without specifying a \scode{uses_allocators} clause. +The following example shows how to make an allocator available in a \kcode{target} region +without specifying a \kcode{uses_allocators} clause. 
-In CASE 1, the predefined \scode{omp_cgroup_mem_alloc} allocator is used in the \scode{target} -region as in CASE 1 of the previous example, but without specifying a \scode{uses_allocators} clause. -This is accomplished by specifying the \scode{requires} directive with a -\scode{dynamic_allocators} clause in the same compilation unit, to remove -restrictions on allocator usage in \scode{target} regions. +In CASE 1, the predefined \kcode{omp_cgroup_mem_alloc} allocator is used in the \kcode{target} +region as in CASE 1 of the previous example, but without specifying a \kcode{uses_allocators} clause. +This is accomplished by specifying the \kcode{requires} directive with a +\kcode{dynamic_allocators} clause in the same compilation unit, to remove +restrictions on allocator usage in \kcode{target} regions. -CASE 2 also uses the \scode{dynamic_allocators} clause to remove allocator -restrictions in \scode{target} regions. Here, an allocator is initialized -by calling the \scode{omp_init_allocator} routine in the \code{target} region. -The allocator is then used for the allocations of array \plc{xbuf} in -an \scode{allocate} clause of the \code{target}~\code{teams} construct +CASE 2 also uses the \kcode{dynamic_allocators} clause to remove allocator +restrictions in \kcode{target} regions. Here, an allocator is initialized +by calling the \kcode{omp_init_allocator} routine in the \kcode{target} region. +The allocator is then used for the allocations of array \ucode{xbuf} in +an \kcode{allocate} clause of the \kcode{target teams} construct for each team and destroyed after its use. -The use of separate \code{target} regions is needed here since -no statement is allowed between a \code{target} directive and -its nested \code{teams} construct. +The use of separate \kcode{target} regions is needed here since +no statement is allowed between a \kcode{target} directive and +its nested \kcode{teams} construct. 
\cexample[5.2]{allocators}{6} \ffreeexample[5.2]{allocators}{6} diff --git a/memory_model/fort_race.tex b/memory_model/fort_race.tex index e7f26b6..97e914e 100644 --- a/memory_model/fort_race.tex +++ b/memory_model/fort_race.tex @@ -1,4 +1,4 @@ -\pagebreak +%\pagebreak \section{Race Conditions Caused by Implied Copies of Shared Variables in Fortran} \fortranspecificstart \label{sec:fort_race} @@ -9,7 +9,7 @@ \section{Race Conditions Caused by Implied Copies of Shared Variables in Fortran array as its dummy argument. The subroutine call passing an array section argument may cause the compiler to copy the argument into a temporary location prior to the call and copy from the temporary location into the original variable when the -subroutine returns. This copying would cause races in the \code{parallel} region. +subroutine returns. This copying would cause races in the \kcode{parallel} region. \ffreenexample{fort_race}{1} \fortranspecificend diff --git a/memory_model/mem_model.tex b/memory_model/mem_model.tex index 9756485..2effd4c 100644 --- a/memory_model/mem_model.tex +++ b/memory_model/mem_model.tex @@ -7,19 +7,19 @@ \section{OpenMP Memory Model} execution: ordering of thread execution and memory accesses that may or may not lead to race conditions. -In the following example, at Print 1, the value of \code{xval} could be either 2 -or 5, depending on the timing of the threads. The \code{atomic} directives are -necessary for the accesses to \code{x} by threads 1 and 2 to avoid a data race. +In the following example, at Print 1, the value of \ucode{xval} could be either 2 +or 5, depending on the timing of the threads. The \kcode{atomic} directives are +necessary for the accesses to \ucode{x} by threads 1 and 2 to avoid a data race. If the atomic write completes before the atomic read, thread 1 is guaranteed to -see 5 in \code{xval}. Otherwise, thread 1 is guaranteed to see 2 in \code{xval}. +see 5 in \ucode{xval}. 
Otherwise, thread 1 is guaranteed to see 2 in \ucode{xval}. \index{flushes!implicit} -\index{atomic construct@\code{atomic} construct} -\index{constructs!atomic@\code{atomic}} +\index{atomic construct@\kcode{atomic} construct} +\index{constructs!atomic@\kcode{atomic}} The barrier after Print 1 contains implicit flushes on all threads, as well as a thread synchronization, so the programmer is guaranteed that the value 5 will be printed by both Print 2 and Print 3. Since neither Print 2 or Print 3 are modifying -\code{x}, they may concurrently access \code{x} without requiring \code{atomic} +\ucode{x}, they may concurrently access \ucode{x} without requiring \kcode{atomic} directives to avoid a data race. \cexample[3.1]{mem_model}{1} @@ -27,16 +27,16 @@ \section{OpenMP Memory Model} \ffreeexample[3.1]{mem_model}{1} \pagebreak -\index{flushes!flush construct@\code{flush} construct} -\index{flush construct@\code{flush} construct} -\index{constructs!flush@\code{flush}} +\index{flushes!flush construct@\kcode{flush} construct} +\index{flush construct@\kcode{flush} construct} +\index{constructs!flush@\kcode{flush}} The following example demonstrates why synchronization is difficult to perform -correctly through variables. The write to \code{flag} on thread 0 and the read -from \code{flag} in the loop on thread 1 must be atomic to avoid a data race. -When thread 1 breaks out of the loop, \code{flag} will have the value of 1. -However, \code{data} will still be undefined at the first print statement. Only -after the flush of both \code{flag} and \code{data} after the first print -statement will \code{data} have the well-defined value of 42. +correctly through variables. The write to \ucode{flag} on thread 0 and the read +from \ucode{flag} in the loop on thread 1 must be atomic to avoid a data race. +When thread 1 breaks out of the loop, \ucode{flag} will have the value of 1. +However, \ucode{data} will still be undefined at the first print statement. 
Only +after the flush of both \ucode{flag} and \ucode{data} after the first print +statement will \ucode{data} have the well-defined value of 42. \cexample[3.1]{mem_model}{2} @@ -46,13 +46,13 @@ \section{OpenMP Memory Model} \index{flushes!flush with a list} The next example demonstrates why synchronization is difficult to perform correctly through variables. As in the preceding example, the updates to -\code{flag} and the reading of \code{flag} in the loops on threads 1 and 2 are -performed atomically to avoid data races on \code{flag}. However, the code still +\ucode{flag} and the reading of \ucode{flag} in the loops on threads 1 and 2 are +performed atomically to avoid data races on \ucode{flag}. However, the code still contains data race due to the incorrect use of ``flush with a list'' after the -assignment to \code{data1} on thread 1. By not including \code{flag} in the -flush-set of that \code{flush} directive, the assignment can be reordered with -respect to the subsequent atomic update to \code{flag}. Consequentially, -\code{data1} is undefined at the print statement on thread 2. +assignment to \ucode{data1} on thread 1. By not including \ucode{flag} in the +flush-set of that \kcode{flush} directive, the assignment can be reordered with +respect to the subsequent atomic update to \ucode{flag}. Consequently, +\ucode{data1} is undefined at the print statement on thread 2. \cexample[3.1]{mem_model}{3} @@ -62,20 +62,20 @@ \section{OpenMP Memory Model} The following two examples illustrate the ordering properties of the \plc{flush} operation. The \plc{flush} operations are strong flushes that are applied to the specified flush lists. -However, use of a \code{flush} construct with a list is extremely error +However, use of a \kcode{flush} construct with a list is extremely error prone and users are strongly discouraged from attempting it. In the codes the programmer intends to prevent simultaneous execution of the protected section by the two threads.
The atomic directives in the codes ensure that the accesses to shared -variables \plc{a} and \plc{b} are atomic write and atomic read operations. Otherwise both examples would contain data races and automatically result +variables \ucode{a} and \ucode{b} are atomic write and atomic read operations. Otherwise both examples would contain data races and automatically result in unspecified behavior. -In the following incorrect code example, operations on variables \plc{a} and -\plc{b} are not ordered with respect to each other. For instance, nothing -prevents the compiler from moving the flush of \plc{b} on thread 0 or the -flush of \plc{a} on thread 1 to a position completely after the protected +In the following incorrect code example, operations on variables \ucode{a} and +\ucode{b} are not ordered with respect to each other. For instance, nothing +prevents the compiler from moving the flush of \ucode{b} on thread 0 or the +flush of \ucode{a} on thread 1 to a position completely after the protected section (assuming that the protected section on thread 0 does not reference -\plc{b} and the protected section on thread 1 does not reference \plc{a}). +\ucode{b} and the protected section on thread 1 does not reference \ucode{a}). If either re-ordering happens, both threads can simultaneously execute the protected section. Any shared data accessed in the protected section is not guaranteed to @@ -88,11 +88,11 @@ \section{OpenMP Memory Model} The following code example correctly ensures that the protected section is executed by only one thread at a time. Execution of the protected section by neither thread is considered correct in this example. This occurs if both -flushes complete prior to either thread executing its \code{if} statement +flushes complete prior to either thread executing its \bcode{if} statement for the protected section. 
The compiler is prohibited from moving the flush at all for either thread, ensuring that the respective assignment is complete and the data is flushed -before the \code{if} statement is executed. +before the \bcode{if} statement is executed. \cexample[3.1]{mem_model}{4b} \ffreeexample[3.1]{mem_model}{4b} diff --git a/memory_model/sources/allocators.1.c b/memory_model/sources/allocators.1.c index a10782e..a4a958c 100644 --- a/memory_model/sources/allocators.1.c +++ b/memory_model/sources/allocators.1.c @@ -21,12 +21,14 @@ int main() omp_allocator_handle_t xy_alloc = omp_init_allocator(xy_memspace,1,xy_traits); - x=(float *)omp_alloc(N*sizeof(float), xy_alloc); y=(float *)omp_alloc(N*sizeof(float), xy_alloc); if( ((intptr_t)(y))%64 != 0 || ((intptr_t)(x))%64 != 0 ) - { printf("ERROR: x|y not 64-Byte aligned\n"); exit(1); } + { + printf("ERROR: x|y not 64-Byte aligned\n"); + exit(1); + } #pragma omp parallel { diff --git a/omp_copyright.txt b/omp_copyright.txt index 8f3e7cd..cd2cf97 100644 --- a/omp_copyright.txt +++ b/omp_copyright.txt @@ -1,4 +1,4 @@ -Copyright (c) 1997-2022 OpenMP Architecture Review Board. +Copyright (c) 1997-2024 OpenMP Architecture Review Board. All rights reserved. Permission to redistribute and use without fee all or part of the source diff --git a/ompt_interface/ompt_start.tex b/ompt_interface/ompt_start.tex index 2c09c66..fea9efe 100644 --- a/ompt_interface/ompt_start.tex +++ b/ompt_interface/ompt_start.tex @@ -6,48 +6,48 @@ \section{OMPT Start} This section explains how the tool and an OpenMP implementation interact to accomplish tool activation. 
\index{OMPT interface!activating} -\index{OMPT interface!ompt_start_tool routine@\scode{ompt_start_tool} routine} -\index{routines!ompt_start_tool@\scode{ompt_start_tool}} -\index{ompt_start_tool routine@\scode{ompt_start_tool} routine} +\index{OMPT interface!ompt_start_tool routine@\kcode{ompt_start_tool} routine} +\index{routines!ompt_start_tool@\kcode{ompt_start_tool}} +\index{ompt_start_tool routine@\kcode{ompt_start_tool} routine} Step 1. \emph{Determine Whether to Initialize} \begin{adjustwidth}{2.5em}{0pt} -A tool is activated by the OMPT interface when it returns a non-NULL pointer to an \code{ompt\_start\_tool\_result\_t} structure on a call to \code{ompt\_start\_tool} by the OpenMP implementation. -There are three ways that a tool can provide a definition of \code{ompt\_start\_tool} to an OpenMP implementation: -(1) Statically linking the tool's definition of \code{ompt\_start\_tool} into an OpenMP application. +A tool is activated by the OMPT interface when it returns a non-\bcode{NULL} pointer to an \kcode{ompt_start_tool_result_t} structure on a call to \kcode{ompt_start_tool} by the OpenMP implementation. +There are three ways that a tool can provide a definition of \kcode{ompt_start_tool} to an OpenMP implementation: +(1) Statically linking the tool's definition of \kcode{ompt_start_tool} into an OpenMP application. (2) Introducing a dynamically linked library that includes the tool's definition of -\code{ompt\_start\_tool} into the application's address space. +\kcode{ompt_start_tool} into the application's address space. (3) Providing the name of a dynamically linked library appropriate for the architecture and operating system used by the application in the \plc{tool-libraries-var} ICV. \end{adjustwidth} Step 2. 
\emph{Initializing a First-Party tool} \begin{adjustwidth}{2.5em}{0pt} -If a tool-provided implementation of \code{ompt\_start\_tool} returns a non-NULL pointer -to an \code{ompt\_start\_tool\_result\_t} structure, the OpenMP implementation will invoke +If a tool-provided implementation of \kcode{ompt_start_tool} returns a non-\bcode{NULL} pointer +to an \kcode{ompt_start_tool_result_t} structure, the OpenMP implementation will invoke the tool initializer specified in this structure prior to the occurrence of any OpenMP event. \end{adjustwidth} -\index{OMPT interface!ompt_set_callback routine@\scode{ompt_set_callback} routine} -\index{routines!ompt_set_callback@\scode{ompt_set_callback}} -\index{ompt_set_callback routine@\scode{ompt_set_callback} routine} +\index{OMPT interface!ompt_set_callback routine@\kcode{ompt_set_callback} routine} +\index{routines!ompt_set_callback@\kcode{ompt_set_callback}} +\index{ompt_set_callback routine@\kcode{ompt_set_callback} routine} Step 3. \emph{Monitoring Activity on the Host} \begin{adjustwidth}{2.5em}{0pt} To monitor execution of an OpenMP program on the host device, a tool's initializer must register to receive notification of events that occur as an OpenMP program executes. A tool can register callbacks for OpenMP events using the runtime entry point known -as \code{ompt\_set\_callback}, which has the following possible return codes: \hfill \break - \code{ompt\_set\_error}, - \code{ompt\_set\_never}, - \code{ompt\_set\_impossible}, - \code{ompt\_set\_sometimes}, - \code{ompt\_set\_sometimes\_paired}, - \code{ompt\_set\_always}. +as \kcode{ompt_set_callback}, which has the following possible return codes: \hfill \break + \kcode{ompt_set_error}, + \kcode{ompt_set_never}, + \kcode{ompt_set_impossible}, + \kcode{ompt_set_sometimes}, + \kcode{ompt_set_sometimes_paired}, + \kcode{ompt_set_always}. 
-If the \code{ompt\_set\_callback} runtime entry point is called outside a tool's initializer, -registration of supported callbacks may fail with a return code of \code{ompt\_set\_error}. -All callbacks registered with \code{ompt\_set\_callback} or returned by \code{ompt\_get\_callback} -use the dummy type signature \code{ompt\_callback\_t}. While this is a compromise, it is +If the \kcode{ompt_set_callback} runtime entry point is called outside a tool's initializer, +registration of supported callbacks may fail with a return code of \kcode{ompt_set_error}. +All callbacks registered with \kcode{ompt_set_callback} or returned by \kcode{ompt_get_callback} +use the dummy type signature \kcode{ompt_callback_t}. While this is a compromise, it is better than providing unique runtime entry points with precise type signatures to set and get the callback for each unique runtime entry point type signature. \end{adjustwidth} @@ -55,22 +55,22 @@ \section{OMPT Start} ---------------- To use the OMPT interface a tool must provide a globally-visible implementation -of the \code{ompt\_start\_tool} function. -The function returns a pointer to an \code{ompt\_start\_tool\_result\_t} structure +of the \kcode{ompt_start_tool} function. +The function returns a pointer to an \kcode{ompt_start_tool_result_t} structure that contains callback pointers for tool initialization and finalization as well -as a data word, \plc{tool\_data}, that is to be passed by reference to these callbacks. -A \code{NULL} return indicates the tool will not use the OMPT interface. -The runtime execution of \code{ompt\_start\_tool} is triggered by the first OpenMP +as a data word, \ucode{tool_data}, that is to be passed by reference to these callbacks. +A \bcode{NULL} return indicates the tool will not use the OMPT interface. +The runtime execution of \kcode{ompt_start_tool} is triggered by the first OpenMP directive or OpenMP API routine call. 
-In the example below, the user-provided \code{ompt\_start\_tool} function +In the example below, the user-provided \kcode{ompt_start_tool} function performs a check to make sure the runtime OpenMP version that OMPT supports -(provided by the \texttt{omp\_version} argument) is identical to the +(provided by the \ucode{omp_version} argument) is identical to the OpenMP implementation (compile-time) version. -Also, a \code{NULL} is returned to indicate that the OMPT interface is not +Also, a \bcode{NULL} is returned to indicate that the OMPT interface is not used (no callbacks and tool data are specified). -\emph{Note}: The \texttt{omp-tools.h} file is included. +\emph{Note}: The \plc{omp-tools.h} file is included. \cexample[5.0]{ompt_start}{1} diff --git a/openmp-examples.tex b/openmp-examples.tex index 284ce18..c1ece0c 100644 --- a/openmp-examples.tex +++ b/openmp-examples.tex @@ -46,15 +46,12 @@ % The following says letter size, but the style sheet may change the size \documentclass[10pt,letterpaper,twoside,makeidx,hidelinks]{scrreprt} -% Text to appear in the footer on even-numbered pages: -\newcommand{\VER}{5.2.1} % Examples Document Version -\newcommand{\PVER}{5.2} % Supported Spec Version -\newcommand{\VERDATE}{November 2022} -\newcommand{\footerText}{OpenMP Examples Version \VER{} - \VERDATE} - % input a generated file with additional definitions \input{generated-include} +% Text to appear in the footer on even-numbered pages: +\newcommand{\footerText}{OpenMP Examples Version \VER{} - \VERDATE} + % Unified style sheet for OpenMP documents: \input{openmp.sty} diff --git a/openmp.sty b/openmp.sty index 26a281e..0f7ac76 100644 --- a/openmp.sty +++ b/openmp.sty @@ -302,16 +302,43 @@ % Enable \alltt{} for formatting blocks of code: \usepackage{alltt} +\usepackage{toolbox} % for \toolboxMakeSplit % This sets the default \code{} font to tt (monospace) and bold: \newcommand\code[1]{\texttt{\textbf{#1}}} 
\newcommand\scode[1]{\protect\textbf{\protect\texttt{\protect\detokenize{#1}}}} -\newcommand\nspace[1]{\textrm{\textmd{ }}} % This defines the \plc{} placeholder font to be tt normal slanted: \newcommand\plc[1]{\textrm{\textmd{\itshape{#1}}}} \newcommand\splc[1]{\protect\textit{\protect\textrm{\protect\detokenize{#1}}}} +% This is an updated set of macros for code style work +% kcode - keywords, vcode - value, bcode - base language, +% pvar - variables, pout - program outputs +\toolboxMakeSplit*{ }{DoSplitS}\toolboxMakeSplit*{_}{DoSplitU} +\protected\def\DoReplaceU#1{\DoSplitU{#1}\leftutext\rightutext + \leftutext% + \ifthenelse{\isundefined{\rightutext}}{}% + {\_\expandafter\DoReplaceU\expandafter{\rightutext}}} +\protected\def\DoReplaceS#1{\DoSplitS{#1}\leftstext\rightstext + \expandafter\DoReplaceU\expandafter{\leftstext}% + \ifthenelse{\isundefined{\rightstext}}{}% + {\textrm{~}\expandafter\DoReplaceS\expandafter{\rightstext}}} +\newcommand{\myreplacedmt}[1]{\protect\DoReplaceS{#1}} +\newcommand\kcode[1]{\texttt{\bfseries\upshape\myreplacedmt{#1}}} +\newcommand\bcode[1]{\texttt{\mdseries\upshape\myreplacedmt{#1}}} +\newcommand\vcode[1]{\bcode{#1}} +\newcommand\ucode[1]{\texttt{\mdseries\slshape\myreplacedmt{#1}}} +\newcommand\pvar[1]{\ucode{#1}} +\newcommand\pout[1]{\vcode{#1}} +\newcommand\docref[1]{\textrm{\mdseries\itshape{#1}}} +\newcommand\example[1]{\splc{#1}} + +\newcommand\examplesrepo{https://github.com/OpenMP/Examples} +\newcommand\examplestree[2]{\href{\examplesrepo/tree/v#1}{#2}} +\newcommand\examplesref[1]{\examplestree{#1}{#1}} +\newcommand\examplesblob[1]{\href{\examplesrepo/blob/#1}{#1}} + % Environment for a paragraph of literal code, single-spaced, no outline, no indenting: \newenvironment{codepar}[1] {\begin{alltt}\bfseries #1} @@ -417,6 +444,12 @@ % convenience macro for formatting the word "Note:" at the beginning of note blocks: \newcommand{\noteheader}{{\textrm{\textsf{\textbf\textup\normalsize{{{{Note }}}}}}}} +% blue line floater at top 
of a page for "Name (cont.)" +\newcommand{\topmarker}[1]{% + \begin{figure}[t!] + \linewitharrows{-1}{dashed}{#1 (cont.)}{8em} + \end{figure}} + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Glossary formatting @@ -471,19 +504,11 @@ % Thanks to Jin, Haoqiang H. for the original definitions of the following: \usepackage{color,fancyvrb} % for \VerbatimInput -\usepackage{toolbox} % for \toolboxMakeSplit \usepackage{xargs} % for optional args \renewcommand\theFancyVerbLine{\normalfont\footnotesize\sffamily S-\arabic{FancyVerbLine}} -\newcommand{\myreplace}[3]{\bgroup\toolboxMakeSplit*{#1}{DoSplit}% - \long\def\DoReplace##1{\DoSplit{##1}\lefttext\righttext - \lefttext - \toolboxIfElse{\ifx\righttext\undefined}{}% - {#2\expandafter\DoReplace\expandafter{\righttext}}}% - \DoReplace{#3}\egroup} - -\newcommand{\escstr}[1]{\myreplace{_}{\_}{#1}} +\newcommand{\escstr}[1]{\DoReplaceU{#1}} \newcommandx*\verlabel[2][1=]{(\code{\small{}#1omp\_#2})} \newcommand{\exampleheader}[6]{% diff --git a/parallel_execution/collapse.tex b/parallel_execution/collapse.tex index b93c33c..36d52b2 100644 --- a/parallel_execution/collapse.tex +++ b/parallel_execution/collapse.tex @@ -1,86 +1,87 @@ -\pagebreak -\section{\code{collapse} Clause} +%\pagebreak +\section{\kcode{collapse} Clause} \label{sec:collapse} -\index{clauses!collapse@\code{collapse}} -\index{collapse clause@\code{collapse} clause} +\index{clauses!collapse@\kcode{collapse}} +\index{collapse clause@\kcode{collapse} clause} -In the following example, the \code{k} and \code{j} loops are associated with -the loop construct. So the iterations of the \code{k} and \code{j} loops are +In the following example, the \ucode{k} and \ucode{j} loops are associated with +the worksharing-loop construct. So the iterations of the \ucode{k} and \ucode{j} loops are collapsed into one loop with a larger iteration space, and that loop is then divided -among the threads in the current team. 
Since the \code{i} loop is not associated -with the loop construct, it is not collapsed, and the \code{i} loop is executed -sequentially in its entirety in every iteration of the collapsed \code{k} and -\code{j} loop. +among the threads in the current team. Since the \ucode{i} loop is not associated +with the worksharing-loop construct, it is not collapsed, and the \ucode{i} loop is executed +sequentially in its entirety in every iteration of the collapsed \ucode{k} and +\ucode{j} loop. -The variable \code{j} can be omitted from the \code{private} clause when the -\code{collapse} clause is used since it is implicitly private. However, if the -\code{collapse} clause is omitted then \code{j} will be shared if it is omitted -from the \code{private} clause. In either case, \code{k} is implicitly private -and could be omitted from the \code{private} clause. +The variable \ucode{j} can be omitted from the \kcode{private} clause when the +\kcode{collapse} clause is used since it is implicitly private. However, if the +\kcode{collapse} clause is omitted then \ucode{j} will be shared if it is omitted +from the \kcode{private} clause. In either case, \ucode{k} is implicitly private +and could be omitted from the \kcode{private} clause. \cexample[3.0]{collapse}{1} \fexample[3.0]{collapse}{1} -In the next example, the \code{k} and \code{j} loops are associated with the -loop construct. So the iterations of the \code{k} and \code{j} loops are collapsed +In the next example, the \ucode{k} and \ucode{j} loops are associated with the +worksharing-loop construct. So the iterations of the \ucode{k} and \ucode{j} loops are collapsed into one loop with a larger iteration space, and that loop is then divided among the threads in the current team. -The sequential execution of the iterations in the \code{k} and \code{j} loops +The sequential execution of the iterations in the \ucode{k} and \ucode{j} loops determines the order of the iterations in the collapsed iteration space. 
This implies -that in the sequentially last iteration of the collapsed iteration space, \code{k} -will have the value \code{2} and \code{j} will have the value \code{3}. Since -\code{klast} and \code{jlast} are \code{lastprivate}, their values are assigned -by the sequentially last iteration of the collapsed \code{k} and \code{j} loop. -This example prints: \code{2 3}. +that in the sequentially last iteration of the collapsed iteration space, \ucode{k} +will have the value \ucode{2} and \ucode{j} will have the value \ucode{3}. Since +\ucode{klast} and \ucode{jlast} are \kcode{lastprivate}, their values are assigned +by the sequentially last iteration of the collapsed \ucode{k} and \ucode{j} loop. +This example prints: \ucode{2 3}. \cexample[3.0]{collapse}{2} \fexample[3.0]{collapse}{2} -\index{clauses!collapse@\code{collapse}} -\index{collapse clause@\code{collapse} clause} -\index{clauses!ordered@\code{ordered}} -\index{ordered clause@\code{ordered} clause} -The next example illustrates the interaction of the \code{collapse} and \code{ordered} +\index{clauses!collapse@\kcode{collapse}} +\index{collapse clause@\kcode{collapse} clause} +\index{clauses!ordered@\kcode{ordered}} +\index{ordered clause@\kcode{ordered} clause} +The next example illustrates the interaction of the \kcode{collapse} and \kcode{ordered} clauses. -In the example, the loop construct has both a \code{collapse} clause and an \code{ordered} -clause. The \code{collapse} clause causes the iterations of the \code{k} and -\code{j} loops to be collapsed into one loop with a larger iteration space, and -that loop is divided among the threads in the current team. An \code{ordered} -clause is added to the loop construct because an ordered region binds to the loop -region arising from the loop construct. +In the example, the worksharing-loop construct has both a \kcode{collapse} clause and an \kcode{ordered} +clause. 
The \kcode{collapse} clause causes the iterations of the \ucode{k} and +\ucode{j} loops to be collapsed into one loop with a larger iteration space, and +that loop is divided among the threads in the current team. An \kcode{ordered} +clause is added to the worksharing-loop construct because an ordered region binds to the loop +region arising from the worksharing-loop construct. -According to Section 2.12.8 of the OpenMP 4.0 specification, +According to the \docref{\kcode{ordered} Construct} section of the OpenMP 4.0 specification, a thread must not execute more than one ordered region that binds -to the same loop region. So the \code{collapse} clause is required for the example -to be conforming. With the \code{collapse} clause, the iterations of the \code{k} -and \code{j} loops are collapsed into one loop, and therefore only one ordered -region will bind to the collapsed \code{k} and \code{j} loop. Without the \code{collapse} -clause, there would be two ordered regions that bind to each iteration of the \code{k} -loop (one arising from the first iteration of the \code{j} loop, and the other -arising from the second iteration of the \code{j} loop). +to the same loop region. So the \kcode{collapse} clause is required for the example +to be conforming. With the \kcode{collapse} clause, the iterations of the \ucode{k} +and \ucode{j} loops are collapsed into one loop, and therefore only one ordered +region will bind to the collapsed \ucode{k} and \ucode{j} loop. Without the \kcode{collapse} +clause, there would be two ordered regions that bind to each iteration of the \ucode{k} +loop (one arising from the first iteration of the \ucode{j} loop, and the other +arising from the second iteration of the \ucode{j} loop). 
+\pagebreak The code prints -\code{0 1 1} +\pout{0 1 1} \\ -\code{0 1 2} +\pout{0 1 2} \\ -\code{0 2 1} +\pout{0 2 1} \\ -\code{1 2 2} +\pout{1 2 2} \\ -\code{1 3 1} +\pout{1 3 1} \\ -\code{1 3 2} +\pout{1 3 2} \cexample[3.0]{collapse}{3} \fexample[3.0]{collapse}{3} -\clearpage +%\clearpage \index{non-rectangular loop nest} diff --git a/parallel_execution/fort_do.tex b/parallel_execution/fort_do.tex index 07b80d4..9ef9bab 100644 --- a/parallel_execution/fort_do.tex +++ b/parallel_execution/fort_do.tex @@ -1,19 +1,20 @@ \pagebreak -\section{Fortran Restrictions on the \code{do} Construct} +\section{Fortran Restrictions on the \kcode{do} Construct} \label{sec:fort_do} -\index{constructs!do@\code{do}} -\index{do construct@\code{do} construct} +\index{constructs!do@\kcode{do}} +\index{do construct@\kcode{do} construct} \fortranspecificstart -If an \code{end do} directive follows a \plc{do-construct} in which several -\code{DO} statements share a \code{DO} termination statement, then a \code{do} -directive can only be specified for the outermost of these \code{DO} statements. -The following example contains correct usages of loop constructs: +If an \kcode{end do} directive follows a \plc{do-construct} in which several +\bcode{DO} statements share a \bcode{DO} termination statement, then a \kcode{do} +directive can only be specified for the outermost of these \bcode{DO} statements. 
+The following example contains correct usages of +\kcode{do} constructs: \fnexample{fort_do}{1} -The following example is non-conforming because the matching \code{do} directive -for the \code{end do} does not precede the outermost loop: +The following example is non-conforming because the matching \kcode{do} directive +for the \kcode{end do} does not precede the outermost loop: \fnexample{fort_do}{2} \fortranspecificend diff --git a/parallel_execution/fpriv_sections.tex b/parallel_execution/fpriv_sections.tex index eb864c9..27c8d77 100644 --- a/parallel_execution/fpriv_sections.tex +++ b/parallel_execution/fpriv_sections.tex @@ -1,17 +1,17 @@ \pagebreak -\section{\code{firstprivate} Clause and \code{sections} Construct} +\section{\kcode{firstprivate} Clause and \kcode{sections} Construct} \label{sec:fpriv_sections} -\index{constructs!sections@\code{sections}} -\index{sections construct@\code{sections} construct} -\index{constructs!section@\code{section}} -\index{section construct@\code{section} construct} -\index{clauses!firstprivate@\code{firstprivate}} -\index{firstprivate clause@\code{firstprivate} clause} +\index{constructs!sections@\kcode{sections}} +\index{sections construct@\kcode{sections} construct} +\index{constructs!section@\kcode{section}} +\index{section construct@\kcode{section} construct} +\index{clauses!firstprivate@\kcode{firstprivate}} +\index{firstprivate clause@\kcode{firstprivate} clause} -In the following example of the \code{sections} construct the \code{firstprivate} -clause is used to initialize the private copy of \code{section\_count} of each -thread. The problem is that the \code{section} constructs modify \code{section\_count}, -which breaks the independence of the \code{section} constructs. When different +In the following example of the \kcode{sections} construct the \kcode{firstprivate} +clause is used to initialize the private copy of \ucode{section_count} of each +thread. 
The problem is that the \kcode{section} constructs modify \ucode{section_count}, +which breaks the independence of the \kcode{section} constructs. When different threads execute each section, both sections will print the value 1. When the same thread executes the two sections, one section will print the value 1 and the other will print the value 2. Since the order of execution of the two sections in this diff --git a/parallel_execution/get_nthrs.tex b/parallel_execution/get_nthrs.tex index c837d8d..8f5916e 100644 --- a/parallel_execution/get_nthrs.tex +++ b/parallel_execution/get_nthrs.tex @@ -1,13 +1,13 @@ \pagebreak -\section{\code{omp\_get\_num\_threads} Routine} +\section{\kcode{omp_get_num_threads} Routine} \label{sec:get_nthrs} -\index{routines!omp_get_num_threads@\scode{omp_get_num_threads}} -\index{omp_get_num_threads routine@\scode{omp_get_num_threads} routine} +\index{routines!omp_get_num_threads@\kcode{omp_get_num_threads}} +\index{omp_get_num_threads routine@\kcode{omp_get_num_threads} routine} -In the following example, the \code{omp\_get\_num\_threads} call returns 1 in -the sequential part of the code, so \code{np} will always be equal to 1. To determine -the number of threads that will be deployed for the \code{parallel} region, the -call should be inside the \code{parallel} region. +In the following example, the \kcode{omp_get_num_threads} call returns 1 in +the sequential part of the code, so \ucode{np} will always be equal to 1. To determine +the number of threads that will be deployed for the \kcode{parallel} region, the +call should be inside the \kcode{parallel} region. 
\cexample{get_nthrs}{1} diff --git a/parallel_execution/host_teams.tex b/parallel_execution/host_teams.tex index 9ddc968..e912a2a 100644 --- a/parallel_execution/host_teams.tex +++ b/parallel_execution/host_teams.tex @@ -1,24 +1,24 @@ \pagebreak -\section{\code{teams} Construct on Host} +\section{\kcode{teams} Construct on Host} \label{sec:host_teams} -\index{constructs!teams@\code{teams}} -\index{teams construct@\code{teams} construct} +\index{constructs!teams@\kcode{teams}} +\index{teams construct@\kcode{teams} construct} %{\color{blue} ... } {\color{violet} ... } -Originally the \code{teams} construct was created for devices (such as GPUs) +Originally the \kcode{teams} construct was created for devices (such as GPUs) for independent executions of a structured block by teams within a league (on SMs). -It was only available through offloading with the \code{target} construct, -and the execution of a \code{teams} region could only be directed to host -execution by various means such as \code{if} and \code{device} clauses, -and the \code{OMP\_TARGET\_OFFLOAD} environment variable. +It was only available through offloading with the \kcode{target} construct, +and the execution of a \kcode{teams} region could only be directed to host +execution by various means such as \kcode{if} and \kcode{device} clauses, +and the \kcode{OMP_TARGET_OFFLOAD} environment variable. -In OpenMP 5.0 the \code{teams} construct was extended to enable the host -to execute a \code{teams} region (without an associated \code{target} construct), +In OpenMP 5.0 the \kcode{teams} construct was extended to enable the host +to execute a \kcode{teams} region (without an associated \kcode{target} construct), with anticipation of further affinity and threading controls in future OpenMP releases. %With additional affinity controls, a team could be %assigned to execute on a socket or use only a specified number of threads. 
-In the example below the \code{teams} construct is used to create two +In the example below the \kcode{teams} construct is used to create two teams, one to execute single precision code, and the other to execute double precision code. Two teams are required, and the thread limit for each team is set to 1/2 of the number of diff --git a/parallel_execution/linear_in_loop.tex b/parallel_execution/linear_in_loop.tex index 1ef0522..2a1a772 100644 --- a/parallel_execution/linear_in_loop.tex +++ b/parallel_execution/linear_in_loop.tex @@ -1,13 +1,13 @@ -\section{\code{linear} Clause in Loop Constructs} +\section{\kcode{linear} Clause in Loop Constructs} \label{sec:linear_in_loop} -\index{clauses!linear@\code{linear}} -\index{linear clause@\code{linear} clause} +\index{clauses!linear@\kcode{linear}} +\index{linear clause@\kcode{linear} clause} -The following example shows the use of the \code{linear} clause in a loop +The following example shows the use of the \kcode{linear} clause in a worksharing-loop construct to allow the proper parallelization of a loop that contains -an induction variable (\plc{j}). At the end of the execution of -the loop construct, the original variable \plc{j} is updated with -the value \plc{N/2} from the last iteration of the loop. +an induction variable (\ucode{j}). At the end of the execution of +the worksharing-loop construct, the original variable \ucode{j} is updated with +the value \ucode{N/2} from the last iteration of the loop. 
\cexample[4.5]{linear_in_loop}{1} diff --git a/parallel_execution/loop.tex b/parallel_execution/loop.tex index bc8a3fd..d8940bc 100644 --- a/parallel_execution/loop.tex +++ b/parallel_execution/loop.tex @@ -1,15 +1,40 @@ -\pagebreak -\section{\code{loop} Construct} +%\pagebreak +\section{\kcode{loop} Construct} \label{sec:loop} -\index{constructs!loop@\code{loop}} -\index{loop construct@\code{loop} construct} +\index{constructs!loop@\kcode{loop}} +\index{loop construct@\kcode{loop} construct} -The following example illustrates the use of the OpenMP 5.0 \code{loop} +The following example illustrates the use of the OpenMP 5.0 \kcode{loop} construct for the execution of a loop. -The \code{loop} construct asserts to the compiler that the iterations +The \kcode{loop} construct asserts to the compiler that the iterations of the loop are free of data dependencies and may be executed concurrently. It allows the compiler to use heuristics to select the parallelization scheme and compiler-level optimizations for the concurrency. \cexample[5.0]{loop}{1} \ffreeexample[5.0]{loop}{1} + +The following example shows the use of the orphaned \kcode{loop} construct. Since the +function \ucode{foo()} is not lexically nested inside of the \kcode{teams} region it needs to specify +the \kcode{bind} clause. The first \kcode{loop} construct binds to the \kcode{teams} region +from where the function \ucode{foo} is called. Binding to \kcode{teams} allows thread-level +parallelism to be available for the second \kcode{loop} construct. +The loop iterations can be executed concurrently, +thus allowing implementations to perform various loop nest optimizations including +reordering of the \ucode{i} and \ucode{j} loops. The \kcode{loop} construct can be +implemented using any parallelism-generating mechanism, which allows better use +of hardware resources while also allowing sequential optimizations, reordering, +tiling etc. 
+ +For example, the first \kcode{loop} construct could be implemented as if it was specified as +\kcode{distribute parallel for} and the second \kcode{loop} construct as if it was specified as +\kcode{simd} if the hardware can support SIMD operations. +% \\ +% OR +% \\ +% For example, the first \kcode{loop} construct could be implemented as +% \kcode{distribute parallel for} and the second \kcode{loop} construct as \kcode{simd} +% if the hardware can support SIMD operations. + +\cexample[5.0]{loop}{2} +\ffreeexample[5.0]{loop}{2} diff --git a/parallel_execution/masked.tex b/parallel_execution/masked.tex index 3cd14c2..713a161 100644 --- a/parallel_execution/masked.tex +++ b/parallel_execution/masked.tex @@ -1,21 +1,21 @@ \pagebreak -\section{\code{masked} Construct} +\section{\kcode{masked} Construct} \label{sec:masked} -\index{constructs!masked@\code{masked}} -\index{masked construct@\code{masked} construct} -\index{masked construct@\code{masked} construct!filter clause@\code{filter} clause} -\index{clauses!filter@\code{filter}} -\index{filter clause@\code{filter} clause} +\index{constructs!masked@\kcode{masked}} +\index{masked construct@\kcode{masked} construct} +\index{masked construct@\kcode{masked} construct!filter clause@\kcode{filter} clause} +\index{clauses!filter@\kcode{filter}} +\index{filter clause@\kcode{filter} clause} -The following example demonstrates the \code{masked} construct. +The following example demonstrates the \kcode{masked} construct. In the example, the primary thread (thread number 0) keeps track of how many iterations have been executed and prints out a progress report in the iteration loop. -The other threads skip the \code{masked} region without waiting. -The \code{filter} clause can be used to specify a thread number other +The other threads skip the \kcode{masked} region without waiting. 
+The \kcode{filter} clause can be used to specify a thread number other than the primary thread to execute a structured block, as illustrated by -the second \code{masked} construct after the iteration loop. -If the thread specified in a \scode{filter} clause does not exist +the second \kcode{masked} construct after the iteration loop. +If the thread specified in a \kcode{filter} clause does not exist in the team then the structured block is not executed by any thread. \cexample[5.1]{masked}{1} diff --git a/parallel_execution/nowait.tex b/parallel_execution/nowait.tex index c9f753c..3d6b985 100644 --- a/parallel_execution/nowait.tex +++ b/parallel_execution/nowait.tex @@ -1,12 +1,12 @@ -\pagebreak -\section{\code{nowait} Clause} +%\pagebreak +\section{\kcode{nowait} Clause} \label{sec:nowait} -\index{clauses!nowait@\code{nowait}} -\index{nowait clause@\code{nowait} clause} +\index{clauses!nowait@\kcode{nowait}} +\index{nowait clause@\kcode{nowait} clause} -If there are multiple independent loops within a \code{parallel} region, you -can use the \code{nowait} clause to avoid the implied barrier at the end of the -loop construct, as follows: +If there are multiple independent loops within a \kcode{parallel} region, you +can use the \kcode{nowait} clause to avoid the implied barrier at the end of +the worksharing-loop construct, as follows: \cexample{nowait}{1} @@ -15,16 +15,16 @@ \section{\code{nowait} Clause} \index{loop scheduling!static} \index{static scheduling} In the following example, static scheduling distributes the same logical iteration -numbers to the threads that execute the three loop regions. This allows the \code{nowait} +numbers to the threads that execute the three loop regions. This allows the \kcode{nowait} clause to be used, even though there is a data dependence between the loops. The dependence is satisfied as long as the same thread executes the same logical iteration numbers in each loop.
Note that the iteration count of the loops must be the same. The example satisfies -this requirement, since the iteration space of the first two loops is from \code{0} -to \code{n-1} (from \code{1} to \code{N} in the Fortran version), while the -iteration space of the last loop is from \code{1} to \code{n} (\code{2} to -\code{N+1} in the Fortran version). +this requirement, since the iteration space of the first two loops is from \ucode{0} +to \ucode{n-1} (from \ucode{1} to \ucode{N} in the Fortran version), while the +iteration space of the last loop is from \ucode{1} to \ucode{n} (\ucode{2} to +\ucode{N+1} in the Fortran version). \cexample{nowait}{2} diff --git a/parallel_execution/nthrs_dynamic.tex b/parallel_execution/nthrs_dynamic.tex index 98fc374..027304f 100644 --- a/parallel_execution/nthrs_dynamic.tex +++ b/parallel_execution/nthrs_dynamic.tex @@ -1,17 +1,17 @@ -\pagebreak -\section{Interaction Between the \code{num\_threads} Clause and \code{omp\_set\_dynamic}} +%\pagebreak +\section{Interaction Between the \kcode{num_threads} Clause and \kcode{omp_set_dynamic}} \label{sec:nthrs_dynamic} -\index{clauses!num_threads@\scode{num_threads}} -\index{num_threads clause@\scode{num_threads} clause} -\index{routines!omp_set_dynamic@\scode{omp_set_dynamic}} -\index{omp_set_dynamic routine@\scode{omp_set_dynamic} routine} +\index{clauses!num_threads@\kcode{num_threads}} +\index{num_threads clause@\kcode{num_threads} clause} +\index{routines!omp_set_dynamic@\kcode{omp_set_dynamic}} +\index{omp_set_dynamic routine@\kcode{omp_set_dynamic} routine} -The following example demonstrates the \code{num\_threads} clause and the effect +The following example demonstrates the \kcode{num_threads} clause and the effect of the \\ -\code{omp\_set\_dynamic} routine on it. +\kcode{omp_set_dynamic} routine on it. 
-The call to the \code{omp\_set\_dynamic} routine with argument \code{0} in -C/C++, or \code{.FALSE.} in Fortran, disables the dynamic adjustment of the number +The call to the \kcode{omp_set_dynamic} routine with argument \ucode{0} in +C/C++, or \ucode{.FALSE.} in Fortran, disables the dynamic adjustment of the number of threads in OpenMP implementations that support it. In this case, 10 threads are provided. Note that in case of an error the OpenMP implementation is free to abort the program or to supply any number of threads available. @@ -20,16 +20,16 @@ \section{Interaction Between the \code{num\_threads} Clause and \code{omp\_set\_ \fexample{nthrs_dynamic}{1} -\pagebreak -The call to the \code{omp\_set\_dynamic} routine with a non-zero argument in -C/C++, or \code{.TRUE.} in Fortran, allows the OpenMP implementation to choose +%\pagebreak +The call to the \kcode{omp_set_dynamic} routine with a non-zero argument in +C/C++, or \ucode{.TRUE.} in Fortran, allows the OpenMP implementation to choose any number of threads between 1 and 10. \cexample{nthrs_dynamic}{2} \fexample{nthrs_dynamic}{2} -It is good practice to set the \plc{dyn-var} ICV explicitly by calling the \code{omp\_set\_dynamic} +It is good practice to set the \plc{dyn-var} ICV explicitly by calling the \kcode{omp_set_dynamic} routine, as its default setting is implementation defined. 
diff --git a/parallel_execution/nthrs_nesting.tex b/parallel_execution/nthrs_nesting.tex index a1b5481..5b624d4 100644 --- a/parallel_execution/nthrs_nesting.tex +++ b/parallel_execution/nthrs_nesting.tex @@ -1,10 +1,10 @@ -\pagebreak +%\pagebreak \section{Controlling the Number of Threads on Multiple Nesting Levels} \label{sec:nthrs_nesting} -\index{environment variables!OMP_NUM_THREADS@\scode{OMP_NUM_THREADS}} -\index{OMP_NUM_THREADS@\scode{OMP_NUM_THREADS}} +\index{environment variables!OMP_NUM_THREADS@\kcode{OMP_NUM_THREADS}} +\index{OMP_NUM_THREADS@\kcode{OMP_NUM_THREADS}} -The following examples demonstrate how to use the \code{OMP\_NUM\_THREADS} environment +The following examples demonstrate how to use the \kcode{OMP_NUM_THREADS} environment variable to control the number of threads on multiple nesting levels: \cexample{nthrs_nesting}{1}[1] diff --git a/parallel_execution/parallel.tex b/parallel_execution/parallel.tex index 449b564..cd7868a 100644 --- a/parallel_execution/parallel.tex +++ b/parallel_execution/parallel.tex @@ -1,12 +1,12 @@ \pagebreak -\section{\code{parallel} Construct} +\section{\kcode{parallel} Construct} \label{sec:parallel} -\index{constructs!parallel@\code{parallel}} -\index{parallel construct@\code{parallel} construct} +\index{constructs!parallel@\kcode{parallel}} +\index{parallel construct@\kcode{parallel} construct} -The \code{parallel} construct can be used in coarse-grain parallel programs. -In the following example, each thread in the \code{parallel} region decides what -part of the global array \plc{x} to work on, based on the thread number: +The \kcode{parallel} construct can be used in coarse-grain parallel programs. 
+In the following example, each thread in the \kcode{parallel} region decides what +part of the global array \ucode{x} to work on, based on the thread number: \cexample{parallel}{1} diff --git a/parallel_execution/ploop.tex b/parallel_execution/ploop.tex index ca447f9..ed6c350 100644 --- a/parallel_execution/ploop.tex +++ b/parallel_execution/ploop.tex @@ -2,19 +2,19 @@ \section{A Simple Parallel Loop} \label{sec:ploop} \index{combined constructs!parallel worksharing-loop} -\index{constructs!parallel@\code{parallel}} -\index{parallel construct@\code{parallel} construct} -\index{worksharing-loop constructs!for@\code{for}} -\index{worksharing-loop constructs!do@\code{do}} -\index{constructs!for@\code{for}} -\index{constructs!do@\code{do}} -\index{for construct@\code{for} construct} -\index{do construct@\code{do} construct} +\index{constructs!parallel@\kcode{parallel}} +\index{parallel construct@\kcode{parallel} construct} +\index{worksharing-loop constructs!for@\kcode{for}} +\index{worksharing-loop constructs!do@\kcode{do}} +\index{constructs!for@\kcode{for}} +\index{constructs!do@\kcode{do}} +\index{for construct@\kcode{for} construct} +\index{do construct@\kcode{do} construct} The following example demonstrates how to parallelize a simple loop -using the parallel worksharing-loop +using the \kcode{parallel} worksharing-loop construct. The loop iteration variable is private by default, so it is not -necessary to specify it explicitly in a \code{private} clause. +necessary to specify it explicitly in a \kcode{private} clause. 
\cexample{ploop}{1} diff --git a/parallel_execution/pra_iterator.tex b/parallel_execution/pra_iterator.tex index c71f9c5..fb215a6 100644 --- a/parallel_execution/pra_iterator.tex +++ b/parallel_execution/pra_iterator.tex @@ -1,4 +1,4 @@ -\pagebreak +%\pagebreak \section{Parallel Random Access Iterator Loop} \cppspecificstart \label{sec:pra_iterator} diff --git a/parallel_execution/psections.tex b/parallel_execution/psections.tex index 690133a..efe69fa 100644 --- a/parallel_execution/psections.tex +++ b/parallel_execution/psections.tex @@ -1,13 +1,13 @@ \pagebreak -\section{\code{parallel} \code{sections} Construct} +\section{\kcode{parallel sections} Construct} \label{sec:psections} -\index{combined constructs!parallel sections@\code{parallel}~\code{sections}} -\index{parallel sections construct@\code{parallel}~\code{sections} construct} +\index{combined constructs!parallel sections@\kcode{parallel sections}} +\index{parallel sections construct@\kcode{parallel sections} construct} -In the following example routines \code{XAXIS}, \code{YAXIS}, and \code{ZAXIS} can -be executed concurrently. The first \code{section} directive is optional. Note -that all \code{section} directives need to appear in the -\code{parallel}~\code{sections} construct. +In the following example routines \ucode{XAXIS}, \ucode{YAXIS}, and \ucode{ZAXIS} can +be executed concurrently. The first \kcode{section} directive is optional. Note +that all \kcode{section} directives need to appear in the +\kcode{parallel sections} construct. 
\cexample{psections}{1} diff --git a/parallel_execution/set_dynamic_nthrs.tex b/parallel_execution/set_dynamic_nthrs.tex index 370cbd6..5deb508 100644 --- a/parallel_execution/set_dynamic_nthrs.tex +++ b/parallel_execution/set_dynamic_nthrs.tex @@ -1,24 +1,24 @@ -\pagebreak -\section{\code{omp\_set\_dynamic} and \\ -\code{omp\_set\_num\_threads} Routines} +%\pagebreak +\section{\kcode{omp_set_dynamic} and \\ +\kcode{omp_set_num_threads} Routines} \label{sec:set_dynamic_nthrs} -\index{routines!omp_set_dynamic@\scode{omp_set_dynamic}} -\index{omp_set_dynamic routine@\scode{omp_set_dynamic} routine} -\index{routines!omp_set_num_threads@\scode{omp_set_num_threads}} -\index{omp_set_num_threads routine@\scode{omp_set_num_threads} routine} +\index{routines!omp_set_dynamic@\kcode{omp_set_dynamic}} +\index{omp_set_dynamic routine@\kcode{omp_set_dynamic} routine} +\index{routines!omp_set_num_threads@\kcode{omp_set_num_threads}} +\index{omp_set_num_threads routine@\kcode{omp_set_num_threads} routine} -Some programs rely on a fixed, prespecified number of threads to execute correctly. +Some programs rely on a fixed, pre-specified number of threads to execute correctly. Because the default setting for the dynamic adjustment of the number of threads is implementation defined, such programs can choose to turn off the dynamic threads capability and set the number of threads explicitly to ensure portability. The -following example shows how to do this using \code{omp\_set\_dynamic}, and \code{omp\_set\_num\_threads}. +following example shows how to do this using \kcode{omp_set_dynamic}, and \kcode{omp_set_num_threads}. In this example, the program executes correctly only if it is executed by 16 threads. If the implementation is not capable of supporting 16 threads, the behavior of this example is implementation defined. 
Note that the number of threads executing -a \code{parallel} region remains constant during the region, regardless of the +a \kcode{parallel} region remains constant during the region, regardless of the dynamic threads setting. The dynamic threads mechanism determines the number of -threads to use at the start of the \code{parallel} region and keeps it constant +threads to use at the start of the \kcode{parallel} region and keeps it constant for the duration of the region. \cexample{set_dynamic_nthrs}{1} diff --git a/parallel_execution/single.tex b/parallel_execution/single.tex index 4605908..f1a80a5 100644 --- a/parallel_execution/single.tex +++ b/parallel_execution/single.tex @@ -1,17 +1,17 @@ -\pagebreak -\section{\code{single} Construct} +%\pagebreak +\section{\kcode{single} Construct} \label{sec:single} -\index{constructs!single@\code{single}} -\index{single construct@\code{single} construct} +\index{constructs!single@\kcode{single}} +\index{single construct@\kcode{single} construct} -The following example demonstrates the \code{single} construct. In the example, +The following example demonstrates the \kcode{single} construct. In the example, only one thread prints each of the progress messages. All other threads will skip -the \code{single} region and stop at the barrier at the end of the \code{single} +the \kcode{single} region and stop at the barrier at the end of the \kcode{single} construct until all threads in the team have reached the barrier. If other threads -can proceed without waiting for the thread executing the \code{single} region, -a \code{nowait} clause can be specified, as is done in the third \code{single} +can proceed without waiting for the thread executing the \kcode{single} region, +a \kcode{nowait} clause can be specified, as is done in the third \kcode{single} construct in this example. The user must not make any assumptions as to which thread -will execute a \code{single} region. +will execute a \kcode{single} region. 
\cexample{single}{1} diff --git a/parallel_execution/sources/loop.2.c b/parallel_execution/sources/loop.2.c new file mode 100644 index 0000000..6b72470 --- /dev/null +++ b/parallel_execution/sources/loop.2.c @@ -0,0 +1,58 @@ +/* +* @@name: loop.2 +* @@type: C +* @@operation: run +* @@expect: success +* @@version: omp_5.0 +*/ +#include +#define N 1024 + +int x[N][N]; +int y[N], z[N]; + +void foo() { +// i-loop distributed across encountering league of teams +#pragma omp loop bind(teams) + for (int i = 0; i < N; i++) { + // this loop has an implicit bind(thread) + #pragma omp loop + for (int j = 0; j < N; j++) { + x[i][j] += y[i]*z[i]; + } + } +} + +int main(){ + int error = 0; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + x[i][j] = 0; + } + } + + for (int i = 0; i < N; i++) { + y[i] = i; + z[i] = i+1; + } + +#pragma omp teams num_teams(4) + { + foo(); + } + +//check values + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + if( x[i][j] != i * (i+1)) + error++; + } + } + if(error) { + printf("FAILED\n"); + return 1; + } + printf("PASSED\n"); + return 0; +} diff --git a/parallel_execution/sources/loop.2.f90 b/parallel_execution/sources/loop.2.f90 new file mode 100644 index 0000000..ca220b8 --- /dev/null +++ b/parallel_execution/sources/loop.2.f90 @@ -0,0 +1,63 @@ +! @@name: loop.2 +! @@type: F-free +! @@operation: run +! @@expect: success +! @@version: omp_5.0 +module xyz_data + integer, parameter :: N=1024 + integer :: x(N,N) + integer :: y(N), z(N) + +contains + subroutine foo() + integer :: i, j + + !! i-loop distributed across encountering league of + !! teams + !$omp loop bind(teams) + do i = 1, N + !! 
this loop has an implicit bind(thread) + !$omp loop + do j = 1, N + x(j,i) = x(j,i) + y(i)*z(i) + end do + end do + end subroutine +end module + +program main + use xyz_data + integer :: error = 0 + + do i = 1, N + do j = 1, N + x(j,i) = 0 + end do + end do + + do i = 1, N + y(i) = i + z(i) = i + 1 + end do + + !$omp teams num_teams(4) + call foo() + !$omp end teams + +!!check values + do i = 1, N + do j = 1, N + if( x(j,i) /= i * (i+1) ) then + error = error + 1 + endif + enddo + enddo + + if(error .gt. 0) then + print*, "FAILED" + stop 1 + end if + + print*, "PASSED" + +end program diff --git a/parallel_execution/workshare.tex b/parallel_execution/workshare.tex index 2e5af8e..45ae15d 100644 --- a/parallel_execution/workshare.tex +++ b/parallel_execution/workshare.tex @@ -1,73 +1,68 @@ -\pagebreak -\section{\code{workshare} Construct} +%\pagebreak +\section{\kcode{workshare} Construct} \fortranspecificstart \label{sec:workshare} -\index{constructs!workshare@\code{workshare}} -\index{workshare construct@\code{workshare} construct} +\index{constructs!workshare@\kcode{workshare}} +\index{workshare construct@\kcode{workshare} construct} -The following are examples of the \code{workshare} construct. +The following are examples of the \kcode{workshare} construct. -In the following example, \code{workshare} spreads work across the threads executing -the \code{parallel} region, and there is a barrier after the last statement. -Implementations must enforce Fortran execution rules inside of the \code{workshare} +In the following example, \kcode{workshare} spreads work across the threads executing +the \kcode{parallel} region, and there is a barrier after the last statement. +Implementations must enforce Fortran execution rules inside of the \kcode{workshare} block. \fnexample{workshare}{1} -In the following example, the barrier at the end of the first \code{workshare} -region is eliminated with a \code{nowait} clause. 
Threads doing \code{CC = -DD} immediately begin work on \code{EE = FF} when they are done with \code{CC +In the following example, the barrier at the end of the first \kcode{workshare} +region is eliminated with a \kcode{nowait} clause. Threads doing \ucode{CC = +DD} immediately begin work on \ucode{EE = FF} when they are done with \ucode{CC = DD}. +\pagebreak \fnexample{workshare}{2} -% blue line floater at top of this page for "Fortran, cont." -\begin{figure}[t!] -\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em} -\end{figure} +\topmarker{Fortran} -The following example shows the use of an \code{atomic} directive inside a \code{workshare} -construct. The computation of \code{SUM(AA)} is workshared, but the update to -\code{R} is atomic. +The following example shows the use of an \kcode{atomic} directive inside a \kcode{workshare} +construct. The computation of \ucode{SUM(AA)} is workshared, but the update to +\ucode{R} is atomic. \fnexample{workshare}{3} -Fortran \code{WHERE} and \code{FORALL} statements are \emph{compound statements}, -made up of a \emph{control} part and a \emph{statement} part. When \code{workshare} +Fortran \bcode{WHERE} and \bcode{FORALL} statements are \emph{compound statements}, +made up of a \emph{control} part and a \emph{statement} part. When \kcode{workshare} is applied to one of these compound statements, both the control and the statement -parts are workshared. The following example shows the use of a \code{WHERE} statement -in a \code{workshare} construct. +parts are workshared. The following example shows the use of a \bcode{WHERE} statement +in a \kcode{workshare} construct. Each task gets worked on in order by the threads: -\code{AA = BB} then +\ucode{AA = BB} then \\ -\code{CC = DD} then +\ucode{CC = DD} then \\ -\code{EE .ne. 0} then +\ucode{EE .ne. 
0} then \\ -\code{FF = 1 / EE} then +\ucode{FF = 1 / EE} then \\ -\code{GG = HH} +\ucode{GG = HH} \fnexample{workshare}{4} -% blue line floater at top of this page for "Fortran, cont." -\begin{figure}[t!] -\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em} -\end{figure} +\topmarker{Fortran} In the following example, an assignment to a shared scalar variable is performed -by one thread in a \code{workshare} while all other threads in the team wait. +by one thread in a \kcode{workshare} while all other threads in the team wait. \fnexample{workshare}{5} The following example contains an assignment to a private scalar variable, which -is performed by one thread in a \code{workshare} while all other threads wait. +is performed by one thread in a \kcode{workshare} while all other threads wait. It is non-conforming because the private scalar variable is undefined after the assignment statement. \fnexample{workshare}{6} -Fortran execution rules must be enforced inside a \code{workshare} construct. +Fortran execution rules must be enforced inside a \kcode{workshare} construct. 
In the following example, the same result is produced in the following program fragment regardless of whether the code is executed sequentially or inside an OpenMP program with multiple threads: diff --git a/program_control/assumption.tex b/program_control/assumption.tex new file mode 100644 index 0000000..f535ab8 --- /dev/null +++ b/program_control/assumption.tex @@ -0,0 +1,34 @@ +%\pagebreak +\section{Assumption Directives} +\label{sec:assumption} +%\index{assumption!assume directive@\kcode{assume} directive} +%\index{assumption!assumes directive@\kcode{assumes} directive} +\index{directives!assume@\kcode{assume}} +\index{directives!assumes@\kcode{assumes}} +\index{assume directive@\kcode{assume} directive} +\index{assumes directive@\kcode{assumes} directive} + +\index{directives!begin assumes@\kcode{begin assumes}} +\index{begin assumes directive@\kcode{begin assumes} directive} + +\index{no_parallelism clause@\kcode{no_parallelism} clause} +\index{clauses!no_parallelism@\kcode{no_parallelism}} +\index{holds clause@\kcode{holds} clause} +\index{clauses!holds@\kcode{holds}} + +Assumption directives provide additional information about the expected properties of +the program that may be used by an implementation for optimization. +Ignoring this information should not alter the behavior of the program. The C/C++ example +shows the use of delimited scope (Case 1) and block-associated (Case 2) assumption directives. +A similar effect is shown for Fortran where the \kcode{assumes} directive is used in the module (Case 1) +and the block-associated directive uses an \kcode{end assume} termination (Case 2). +The function \ucode{fun} is annotated with the \kcode{no_parallelism} clause, using the \kcode{begin assumes} +(C) or \kcode{assumes} (Fortran) directive, to indicate that no implicit/explicit tasks are generated and no +SIMD constructs are encountered during execution of the function. 
If the function \ucode{fun} contains +task-generating or SIMD constructs then the behavior would be undefined. The block-associated +\kcode{assume} directive is used to indicate that \ucode{N} is a multiple of 8 and +will always be equal to or greater than 1. This information, if used for optimization, +could eliminate additional checks. + +\cexample[5.1]{assumption}{1} +\ffreeexample[5.1]{assumption}{1} diff --git a/program_control/cancellation.tex b/program_control/cancellation.tex index 1daebf9..d999344 100644 --- a/program_control/cancellation.tex +++ b/program_control/cancellation.tex @@ -1,46 +1,44 @@ -\pagebreak +%\pagebreak \section{Cancellation Constructs} \label{sec:cancellation} -\index{cancellation!cancel construct@\code{cancel} construct} -\index{constructs!cancel@\code{cancel}} -\index{cancel construct@\code{cancel} construct} +\index{cancellation!cancel construct@\kcode{cancel} construct} +\index{constructs!cancel@\kcode{cancel}} +\index{cancel construct@\kcode{cancel} construct} -\index{cancellation!for parallel region@for \code{parallel} region} +\index{cancellation!for parallel region@for \kcode{parallel} region} \index{cancellation!for worksharing region} -The following example shows how the \code{cancel} directive can be used to terminate -an OpenMP region. Although the \code{cancel} construct terminates the OpenMP +The following example shows how the \kcode{cancel} directive can be used to terminate +an OpenMP region. Although the \kcode{cancel} construct terminates the OpenMP worksharing region, programmers must still track the exception through the pointer -ex and issue a cancellation for the \code{parallel} region if an exception has +\ucode{ex} and issue a cancellation for the \kcode{parallel} region if an exception has been raised. The primary thread checks the exception pointer to make sure that the -exception is properly handled in the sequential part. 
If cancellation of the \code{parallel} -region has been requested, some threads might have executed \code{phase\_1()}. -However, it is guaranteed that none of the threads executed \code{phase\_2()}. +exception is properly handled in the sequential part. If cancellation of the \kcode{parallel} +region has been requested, some threads might have executed \ucode{phase_1()}. +However, it is guaranteed that none of the threads executed \ucode{phase_2()}. \cppexample[4.0]{cancellation}{1} -\index{cancellation!cancellation point construct@\code{cancellation}~\code{point} construct} -\index{constructs!cancellation point@\code{cancellation}~\code{point}} -\index{cancellation point construct@\code{cancellation}~\code{point} construct} -The following example illustrates the use of the \code{cancel} construct in error -handling. If there is an error condition from the \code{allocate} statement, +\index{cancellation!cancellation point construct@\kcode{cancellation point} construct} +\index{constructs!cancellation point@\kcode{cancellation point}} +\index{cancellation point construct@\kcode{cancellation point} construct} +The following example illustrates the use of the \kcode{cancel} construct in error +handling. If there is an error condition from the \bcode{allocate} statement, the cancellation is activated. The encountering thread sets the shared variable -\code{err} and other threads of the binding thread set proceed to the end of +\ucode{err} and other threads of the binding thread set proceed to the end of the worksharing construct after the cancellation has been activated. \ffreeexample[4.0]{cancellation}{1} -\clearpage - -\index{cancellation!for taskgroup region@for \code{taskgroup} region} +\index{cancellation!for taskgroup region@for \kcode{taskgroup} region} The following example shows how to cancel a parallel search on a binary tree as soon as the search value has been detected. The code creates a task to descend into the child nodes of the current tree node. 
If the search value has been found, -the code remembers the tree node with the found value through an \code{atomic} -write to the result variable and then cancels execution of all search tasks. The -function \code{search\_tree\_parallel} groups all search tasks into a single -task group to control the effect of the \code{cancel taskgroup} directive. The -\plc{level} argument is used to create undeferred tasks after the first ten +the code remembers the tree node with the found value through an \kcode{atomic} +write to the result variable (\ucode{found}) and then cancels execution of all search tasks. The +function \ucode{search_tree_parallel} groups all search tasks into a single +task group to control the effect of the \kcode{cancel taskgroup} directive. The +\ucode{level} argument is used to create undeferred tasks after the first ten levels of the tree. \cexample[5.1]{cancellation}{2} diff --git a/program_control/cond_comp.tex b/program_control/cond_comp.tex index 727b7f1..79e8290 100644 --- a/program_control/cond_comp.tex +++ b/program_control/cond_comp.tex @@ -1,12 +1,12 @@ -\pagebreak +%\pagebreak \section{Conditional Compilation} \label{sec:cond_comp} -\index{conditional compilation!_OPENMP macro@\scode{_OPENMP} macro} +\index{conditional compilation!_OPENMP macro@\kcode{_OPENMP} macro} \index{conditional compilation!sentinel} \ccppspecificstart The following example illustrates the use of conditional compilation using the -OpenMP macro \code{\_OPENMP}. With OpenMP compilation, the \code{\_OPENMP} +OpenMP macro \kcode{_OPENMP}. With OpenMP compilation, the \kcode{_OPENMP} macro becomes defined. \cnexample{cond_comp}{1} @@ -14,7 +14,7 @@ \section{Conditional Compilation} \fortranspecificstart The following example illustrates the use of the conditional compilation sentinel. 
-With OpenMP compilation, the conditional compilation sentinel \code{!\$} is recognized +With OpenMP compilation, the conditional compilation sentinel \scode{!$} is recognized and treated as two spaces. In fixed form source, statements guarded by the sentinel must start after column 6. diff --git a/program_control/context_based_variants.tex b/program_control/context_based_variants.tex new file mode 100644 index 0000000..58227c9 --- /dev/null +++ b/program_control/context_based_variants.tex @@ -0,0 +1,393 @@ +%\pagebreak +\section{Context-based Variant Selection} +\label{sec:context_based_variants} + +\index{directives!declare variant@\kcode{declare variant}} +\index{directives!metadirective@\kcode{metadirective}} + +\index{OpenMP context@OpenMP context} +\index{context selector@context selector} +\index{trait selector set@trait selector set} +\index{trait selector@trait selector} +\index{trait property@trait property} + +Certain directives, including \kcode{declare variant}, +\kcode{begin declare variant}, and \kcode{metadirective} +directives, specify function or directive variants for callsite or directive +substitution. They use \plc{context selectors} to specify the contexts in which +the variant may be selected for substitution. A context selector specifies +various \plc{trait selectors}, grouped into \plc{trait selector sets}. A trait +selector, for a given trait selector set, identifies a corresponding trait +(and, in some cases, its trait properties) that may or may not be active in an +\plc{OpenMP context}. A context selector is considered to be \plc{compatible} +with a given OpenMP context if all traits and trait properties corresponding to +trait selectors are active in that context. + +Each context selector is a comma-separated list of trait selector sets and each +trait selector set has the form \plc{trait-selector-name}~\kcode{=\{}~ +\plc{trait-selector-list}~\kcode{\}}, where \plc{trait-selector-list} is a +comma-separated list of trait selectors. 
Some trait selectors may in turn +specify one or more \plc{trait properties}. Additionally, a trait selector may +optionally specify a \plc{trait score} for explicit control over variant +selection. + +Consider this context selector: \kcode{construct=\{teams,parallel,for\},} +\kcode{device=\{arch(nvptx)\},} \kcode{user=\{condition(\ucode{N>32})\}}. + +The context selector specifies three distinct trait selector sets, a +\kcode{construct} trait selector set, a \kcode{device} trait selector set, and +a \kcode{user} trait selector set. The \kcode{construct} trait selector set +specifies three trait selectors: \kcode{teams}, \kcode{parallel}, and +\kcode{for}. The \kcode{device} trait selector set specifies one trait +selector: \kcode{arch(nvptx)}. And the \kcode{user} trait selector set +specifies one trait selector: \kcode{condition(\ucode{N>32})}. + +The \kcode{teams}, \kcode{parallel}, and \kcode{for} trait selectors +respectively require that the \plc{teams}, \plc{parallel}, and \plc{for} traits +are active in the \plc{construct} trait set of the OpenMP context (i.e., the +\kcode{teams}, \kcode{parallel}, and \kcode{for} constructs are enclosing +constructs that do not appear outside any enclosing \kcode{target} construct at +the program point of interest). The \kcode{arch} trait selector specifies the +\kcode{nvptx} trait property, requiring that \plc{nvptx} is one of the +supported architectures per the \plc{arch} trait of the \plc{device} trait set +of the OpenMP context. Finally, the \kcode{condition} trait selector specifies +the \ucode{N>32} expression as a trait property, requiring that \ucode{N>32} +evaluates to \plc{true} in the OpenMP context. + +The remainder of this section presents examples that make use of context +selectors for function and directive variant selection. Sections +\ref{subsec:declare_variant} and \ref{subsec:metadirective} cover cases where +only one context selector is compatible. 
+Section \ref{subsec:context_selector_scoring} covers cases where +multiple compatible context selectors exist and a scoring algorithm +determines which one of the variants is selected. + +\subsection{\kcode{declare variant} Directive} +\label{subsec:declare_variant} +\index{directives!declare variant@\kcode{declare variant}} +\index{declare variant directive@\kcode{declare variant} directive} +\index{declare variant directive@\kcode{declare variant} directive!match clause@\kcode{match} clause} +\index{clauses!match@\kcode{match}} +\index{match clause@\kcode{match} clause} + +A \kcode{declare variant} directive specifies an alternate function, +\plc{function variant}, to be used in place of the \plc{base function} +%when the trait within the \kcode{match} clause has a valid context. +when the trait within the \kcode{match} clause matches the OpenMP context at a given callsite. +The base function follows the directive in the C and C++ languages. +In Fortran, either a subroutine or function may be used as the base function, +and the \kcode{declare variant} directive must be in the specification +part of a subroutine or function (unless a \plc{base-proc-name} +modifier is used, as in the case of a procedure declaration statement). See +the OpenMP 5.0 Specification for details on the modifier. + +When multiple \kcode{declare variant} directives are used +a function variant becomes a candidate for replacing the base function if the +%base function call context matches the traits of all selectors in the \kcode{match} clause. +context at the base function call matches the traits of all selectors in the \kcode{match} clause. +If there are multiple candidates, a score is assigned with rules for each +of the selector traits. +See Section \ref{subsec:context_selector_scoring} for details. +%The scoring algorithm can be found in the OpenMP 5.0 Specification. 
+ +In the first example the \ucode{vxv()} function is called within a \kcode{parallel} region, +a \kcode{target} region, and in a sequential part of the program. Two function variants, \ucode{p_vxv()} and \ucode{t_vxv()}, +are defined for the first two regions by using \kcode{parallel} and \kcode{target} selectors (within +the \plc{construct} trait set) in a \kcode{match} clause. The \ucode{p_vxv()} function variant includes +a \kcode{for} construct (\kcode{do} construct for Fortran) for the \kcode{parallel} region, +while \ucode{t_vxv()} includes a \kcode{distribute simd} construct for the \kcode{target} region. +The \ucode{t_vxv()} function is explicitly compiled for the device using a declare target directive. + +Since the two \kcode{declare variant} directives have no selectors that match traits for the context +of the base function call in the sequential part of the program, the base \ucode{vxv()} function is used there, +as expected. +(The vectors in the \ucode{p_vxv} and \ucode{t_vxv} functions have been multiplied +by 3 and 2, respectively, for checking the validity of the replacement. Normally +the purpose of a function variant is to produce the same results by a different method.) + +%Note: a \code{target teams} construct is used to direct execution onto a device, with a +%\code{distribute simd} construct in the function variant. As of the OpenMP 5.0 implementation +%no intervening code is allowed between a \code{target} and \code{teams} construct. So +%using a \code{target} construct to direct execution onto a device, and including +%\code{teams distribute simd} in the variant function would produce non conforming code. + +\cexample[5.1]{declare_variant}{1} + +\ffreeexample[5.0]{declare_variant}{1} + +In this example, traits from the \plc{device} set are used to select a function variant. 
+In the \kcode{declare variant} directive, an \kcode{isa} trait selector +specifies that if the implementation of the ``\vcode{core-avx512}'' +instruction set is detected at compile time the \ucode{avx512_saxpy()} +variant function is used for the call to \ucode{base_saxpy()}. + +A compilation of \ucode{avx512_saxpy()} is aware of +the AVX-512 instruction set that supports 512-bit vector extensions. +Within \ucode{avx512_saxpy()}, the \kcode{parallel for simd} construct performs parallel execution, and +takes advantage of 64-byte data alignment. +When the \ucode{avx512_saxpy()} function variant is not selected, the base \ucode{base_saxpy()} function variant +containing only a basic \kcode{parallel for} construct is used for the call to \ucode{base_saxpy()}. + +%Note: +%An allocator is used to set the alignment to 64 bytes when an OpenMP compilation is performed. +%Details about allocator variable declarations and functions +%can be found in the allocator example of the Memory Management Chapter. + +\cexample[5.0]{declare_variant}{2} + +\ffreeexample[5.0]{declare_variant}{2} + +The \kcode{begin declare variant} with a paired \kcode{end declare variant} directive was introduced +for C/C++ in OpenMP 5.1 to allow nesting of declare variant directives. +This example shows a practical situation where nested declare variant directives can be used +to include the correct specialized user function based on the underlying vendor \kcode{isa} trait. +The function name \ucode{my_fun()} is identical in all the header files and the version called will +differ based on the calling context. The example assumes that either NVIDIA or AMD target devices are used.
+ +\index{directives!begin declare variant@\kcode{begin declare variant}} +\index{begin declare variant directive@\kcode{begin declare variant} directive} + +\cexample[5.1]{declare_variant}{3} + +%%%%%%%%%%%%% +\subsection{Metadirectives} +\label{subsec:metadirective} +\index{directives!metadirective@\kcode{metadirective}} +\index{metadirective directive@\kcode{metadirective} directive} + +\index{metadirective directive@\kcode{metadirective} directive!when clause@\kcode{when} clause} +\index{metadirective directive@\kcode{metadirective} directive!otherwise clause@\kcode{otherwise} clause} +\index{clauses!when@\kcode{when}} +\index{when clause@\kcode{when} clause} +\index{clauses!otherwise@\kcode{otherwise}} +\index{otherwise clause@\kcode{otherwise} clause} +A \kcode{metadirective} directive provides a mechanism to select a directive in +a \kcode{when} clause to be used, depending upon one or more contexts: +implementation, available devices and the present enclosing construct. +The directive in an \kcode{otherwise} clause is used when a directive of the +\kcode{when} clause is not selected. + +\index{context selector!construct@\plc{construct}} +In the \kcode{when} clause the \plc{context selector} (or just \plc{selector}) defines traits that are +evaluated for selection of the directive that follows the selector. +This ``selectables'' directive is called a \plc{directive variant}. +%Traits are grouped by \plc{construct}, \plc{implementation} and +%\plc{device} \plc{sets} to be used by a selector of the same name. + +\index{context selector!device@\plc{device}} +In the first example the \plc{arch} trait of the +\kcode{device} selector set specifies that if an \ucode{nvptx} architecture is +active in the OpenMP context, then the \kcode{teams loop} +directive variant is selected as the directive; otherwise, the \kcode{parallel loop} +directive variant of the \kcode{otherwise} clause is selected as the directive. 
+That is, if a device of \ucode{nvptx} architecture is supported by the implementation within +the enclosing \kcode{target} construct, its directive variant is selected. +The architecture names, such as \ucode{nvptx}, are implementation defined. +Also, note that the \kcode{device} clause specified in a \kcode{target} construct specifies +a device number, while \kcode{device}, as used in the \kcode{metadirective} +directive as selector set, has traits of \plc{kind}, \plc{isa} and \plc{arch}. + + +\cexample[5.2]{metadirective}{1} + +\ffreeexample[5.2]{metadirective}{1} +\pagebreak + +\index{context selector!implementation@\plc{implementation}} +In the second example, the \kcode{implementation} selector set is specified +in the \kcode{when} clause to distinguish between platforms. +Additionally, specific architectures are specified with the \kcode{device} +selector set. + +In the code, different \kcode{teams} constructs are employed as determined +by the \kcode{metadirective} directive. +The number of teams is restricted by a \kcode{num_teams} clause +and a thread limit is also set by a \kcode{thread_limit} clause for +vendor platforms and specific architecture +traits. Otherwise, just the \kcode{teams} construct is used without +any clauses, as prescribed by the \kcode{otherwise} clause. + + +\cexample[5.2]{metadirective}{2} + +\ffreeexample[5.2]{metadirective}{2} + +\index{context selector!construct@\plc{construct}} + +\index{directives!declare target@\kcode{declare target}} +\index{declare target directive@\kcode{declare target} directive} + +\index{directives!begin declare target@\kcode{begin declare target}} +\index{begin declare target directive@\kcode{begin declare target} directive} + +In the third example, a \kcode{construct} selector set is specified in the \kcode{when} clause. +Here, a \kcode{metadirective} directive is used within a function that is also +compiled as a function for a target device as directed by a declare target directive. 
+The \kcode{target} directive name of the \kcode{construct} selector ensures that the +\kcode{distribute parallel for/do} construct is employed for the target compilation. +Otherwise, for the host-compiled version the \kcode{parallel for/do simd} construct is used. + +In the first call to the \ucode{exp_pi_diff()} routine the context is a +\kcode{target teams} construct and the \kcode{distribute parallel for/do} +construct version of the function is invoked, +while in the second call the \kcode{parallel for/do simd} construct version is used. + +%%%%%%%% +This case illustrates an important point for users that may want to hoist the +\kcode{target} directive out of a function that contains the usual +\kcode{target teams distribute parallel for/do} construct +(for providing alternate constructs through the \kcode{metadirective} directive as here). +While this combined construct can be decomposed into a \kcode{target} and +\kcode{teams distribute parallel for/do} constructs, the OpenMP 5.0 specification has the restriction: +``If a \kcode{teams} construct is nested within a \kcode{target} construct, that \kcode{target} construct must +contain no statements, declarations or directives outside of the \kcode{teams} construct''. +So, the \kcode{teams} construct must immediately follow the \kcode{target} construct without any intervening +code statements (which includes function calls). +Since the \kcode{target} construct alone cannot be hoisted out of a function, +the \kcode{target teams} construct has been hoisted out of the function, and +the \kcode{distribute parallel for/do} construct is used +as the variant directive of the \kcode{metadirective} directive within the function. 
+%%%%%%%% + +\cexample[5.2]{metadirective}{3} + +\ffreeexample[5.2]{metadirective}{3} +\pagebreak + +\index{context selector!user@\plc{user}} +\index{context selector!condition selector@\kcode{condition} selector} + +The \kcode{user} selector set can be used in a \kcode{metadirective} +to select directives at execution time when the +\kcode{condition( \plc{boolean-expr} )} selector expression is not a constant expression. +In this case it is a \plc{dynamic} trait set, and the selection is made at run time, rather +than at compile time. + +In the following example the \ucode{foo} function employs the \kcode{condition} +selector to choose a device for execution at run time. +In the \ucode{bar} routine metadirectives are nested. +At the outer level a selection between serial and parallel execution is performed +at run time, followed by another run time selection on the schedule kind in the inner +level when the active \plc{construct} trait is \kcode{parallel}. + +(Note, the variable \ucode{b} in two of the ``selected'' constructs is declared private for the sole purpose +of detecting and reporting that the construct is used. Since the variable is private, its value +is unchanged outside of the construct region, whereas it is changed if the ``unselected'' construct +is used.) + +%(Note: The value of \plc{b} after the \code{parallel} region remains 0 for the +%\code{guided} scheduling case, because its \code{parallel} construct also contains +%the \code{private(}~\plc{b}~\code{)} clause. +%The variable \plc{b} is employed for the sole purpose of distinguishing which +%\code{parallel} construct is selected-- for testing.) + +%While there might be other ways to make these decisions at run time, such as using +%an \code{if} clause on a \code{parallel} construct, this mechanism is much more general. +%For instance, an input ``gpu\_type'' string could be used and tested in boolean expressions +%to select from one of several possible \code{target} constructs.
+%Also, setting the scheduling variable (\plc{unbalanced}) within the execution through a +%``work balance'' function might be a more practical approach for setting the schedule kind. + + +\cexample[5.2]{metadirective}{4} + +\ffreeexample[5.2]{metadirective}{4} +\pagebreak + +Metadirectives can be used in conjunction with templates as shown in the C++ code below. +Here the template definition generates two versions of the Fibonacci function. +The \ucode{tasking} boolean is used in the \kcode{condition} selector to enable tasking. +The true form implements a parallel version with \kcode{task} and \kcode{taskwait} +constructs as in the \example{tasking.4.c} code in Section~\ref{sec:task_taskwait}. +The false form implements a serial version without any tasking constructs. +Note that the serial version is used in the parallel function for optimally +processing numbers less than 8. + +\cppexample[5.0]{metadirective}{5} + +%\pagebreak +\subsection{Context Selector Scoring} + +\label{subsec:context_selector_scoring} +\index{context selector scoring@context selector scoring} + +Each context selector for which all specified traits are active in the current +\plc{OpenMP context} is a \plc{compatible context selector}, and the associated +function variant or directive variant for such a context selector is a +\plc{replacement candidate}. The final \plc{score} of each of the compatible +context selectors determines which of the replacement candidates is selected for +substitution. + +For a given compatible context selector, the score is calculated according +to the specified trait selectors and their corresponding traits. If the trait +selectors are a strict subset of the trait selectors specified by another +compatible context selector then the score of the context selector is zero. +Otherwise, the final score is one plus the sum of the score values of each +specified trait selector.
+ +A replacement candidate is selected if no other candidate has a higher scoring +context selector. If multiple replacement candidates have a context selector +with the same highest score, the one specified first on the metadirective is +selected. If multiple function variants are replacement candidates that have +context selectors with the same highest score, the one that is selected is +implementation defined. + +If a \kcode{construct} selector set is specified in the context selector, each +active construct trait that is named in that selector set contributes a score +of $2^{p-1}$, where $p$ is the position of that trait in the current +\plc{construct} trait set (the set of traits in the OpenMP context). If a +\kcode{device} or \kcode{target_device} selector set is specified in the +selector, then an active \plc{kind}, \plc{arch}, or \plc{isa} trait that is +named in the selector set contributes a score of $2^l$, $2^{l+1}$, and +$2^{l+2}$, respectively, where $l$ is the number of traits in the +\plc{construct} trait set. For any other active traits that are named in the +context selector that are not implementation-defined extensions, the +contributed score, by default, is zero. + +The default score for any active traits other than \plc{construct} traits and +the \plc{kind}, \plc{arch}, or \plc{isa} traits may be overridden with an +explicit score expression. Specifying an explicit score is only recommended +for prioritizing replacement candidates for which a selection is not +dependent on construct traits. That is, none of the compatible context +selectors specify a \kcode{construct} trait selector or a \kcode{kind}, +\kcode{arch}, or \kcode{isa} trait selector. + +In the following example, four function variants are declared for the procedure +\ucode{f}: \ucode{fx1}, \ucode{fx2}, \ucode{fx3}, and \ucode{fx4}. 
Suppose that +the target device for the \kcode{target} region has the \plc{gpu} device kind, +has the \plc{nvptx} architecture, and supports the \splc{sm_70} instruction set +architecture. Hence, the context selectors for all function variants are +compatible with the context at the callsite for \ucode{f} inside the +\kcode{target} region. The \plc{construct} trait set at the callsite, +consisting of all enclosing constructs and having a count of \plc{l=6}, is: +\{\plc{target}, \plc{teams}, \plc{distribute}, \plc{parallel}, +\plc{for}/\plc{do}, \plc{task}\}. Note that only \plc{context-matching} +constructs, which do not include \kcode{distribute} or \kcode{task}, may be +named by a \kcode{construct} trait selector as of OpenMP 5.2. The score for +\ucode{fx1} is $1+2^0=2$, for \ucode{fx2} is $1+2^1+2^3+2^4=27$, +for \ucode{fx3} is $1+2^6+2^8=321$, and for \ucode{fx4} is $1+2^7+2^8=385$. +Since \ucode{fx4} is the function variant that has the highest scoring +selector, it is selected by the implementation at the callsite. + +\cexample[5.0]{selector_scoring}{1} + +\ffreeexample[5.0]{selector_scoring}{1} + +In the next example, three function variants are declared for the procedure +\ucode{kernel}: \ucode{kernel_target_ua}, \ucode{kernel_target_usm}, and +\ucode{kernel_target_usm_v2}. Suppose that the implementation supports the +\splc{unified_address} and \splc{unified_shared_memory} requirements, so that +the context selectors for all function variants are compatible. The score for +\ucode{kernel_target_ua} is 1, which is one plus the zero score associated with +the active \splc{unified_address} requirement. The score for +\ucode{kernel_target_usm} is 0, as the selector is a strict subset of the +selector for \ucode{kernel_target_usm_v2}. The score for +\ucode{kernel_target_usm_v2} is 2, which is one plus the explicit score of 1 +for the \plc{condition} trait and the zero score associated with the active +\splc{unified_shared_memory} requirement.
Since \ucode{kernel_target_usm_v2} +is the function variant that has the highest scoring selector, it is selected +by the implementation at the callsite. + +\cexample[5.0]{selector_scoring}{2} + +\ffreeexample[5.0]{selector_scoring}{2} diff --git a/program_control/icv.tex b/program_control/icv.tex index 4802728..7c6b4f5 100644 --- a/program_control/icv.tex +++ b/program_control/icv.tex @@ -1,9 +1,9 @@ -\pagebreak +%\pagebreak \section{Internal Control Variables (ICVs)} \label{sec:icv} \index{internal control variables} -According to Section 2.3 of the OpenMP 4.0 specification, an OpenMP implementation must act as if there are ICVs that control +According to the \docref{Internal Control Variables} section of the OpenMP 4.0 specification, an OpenMP implementation must act as if there are ICVs that control the behavior of the program. This example illustrates two ICVs, \plc{nthreads-var} and \plc{max-active-levels-var}. The \plc{nthreads-var} ICV controls the number of threads requested for encountered parallel regions; there is one copy @@ -13,42 +13,42 @@ \section{Internal Control Variables (ICVs)} In the following example, the \plc{nest-var}, \plc{max-active-levels-var}, \plc{dyn-var}, and \plc{nthreads-var} ICVs are modified through calls to -the runtime library routines \code{omp\_set\_nested},\\ \code{omp\_set\_max\_active\_levels},\code{ -omp\_set\_dynamic}, and \code{omp\_set\_num\_threads} respectively. These ICVs -affect the operation of \code{parallel} regions. Each implicit task generated -by a \code{parallel} region has its own copy of the \plc{nest-var, dyn-var}, +the runtime library routines \kcode{omp_set_nested},\\ \kcode{omp_set_max_active_levels}, \kcode{omp_set_dynamic}, +and \kcode{omp_set_num_threads} respectively. These ICVs +affect the operation of \kcode{parallel} regions. Each implicit task generated +by a \kcode{parallel} region has its own copy of the \plc{nest-var}, \plc{dyn-var}, and \plc{nthreads-var} ICVs. 
In the following example, the new value of \plc{nthreads-var} applies only to -the implicit tasks that execute the call to \code{omp\_set\_num\_threads}. There +the implicit tasks that execute the call to \kcode{omp_set_num_threads}. There is one copy of the \plc{max-active-levels-var} ICV for the whole program and its value is the same for all tasks. This example assumes that nested parallelism is supported. -The outer \code{parallel} region creates a team of two threads; each of the threads -will execute one of the two implicit tasks generated by the outer \code{parallel} +The outer \kcode{parallel} region creates a team of two threads; each of the threads +will execute one of the two implicit tasks generated by the outer \kcode{parallel} region. -Each implicit task generated by the outer \code{parallel} region calls \code{omp\_set\_num\_threads(3)}, +Each implicit task generated by the outer \kcode{parallel} region calls \kcode{omp_set_num_threads(\ucode{3})}, assigning the value 3 to its respective copy of \plc{nthreads-var}. Then each -implicit task encounters an inner \code{parallel} region that creates a team +implicit task encounters an inner \kcode{parallel} region that creates a team of three threads; each of the threads will execute one of the three implicit tasks -generated by that inner \code{parallel} region. +generated by that inner \kcode{parallel} region. -Since the outer \code{parallel} region is executed by 2 threads, and the inner -by 3, there will be a total of 6 implicit tasks generated by the two inner \code{parallel} +Since the outer \kcode{parallel} region is executed by 2 threads, and the inner +by 3, there will be a total of 6 implicit tasks generated by the two inner \kcode{parallel} regions. 
-Each implicit task generated by an inner \code{parallel} region will execute -the call to\\ \code{omp\_set\_num\_threads(4)}, assigning the value 4 to its respective +Each implicit task generated by an inner \kcode{parallel} region will execute +the call to\\ \kcode{omp_set_num_threads(\ucode{4})}, assigning the value 4 to its respective copy of \plc{nthreads-var}. -The print statement in the outer \code{parallel} region is executed by only one +The print statement in the outer \kcode{parallel} region is executed by only one of the threads in the team. So it will be executed only once. -The print statement in an inner \code{parallel} region is also executed by only -one of the threads in the team. Since we have a total of two inner \code{parallel} -regions, the print statement will be executed twice -- once per inner \code{parallel} +The print statement in an inner \kcode{parallel} region is also executed by only +one of the threads in the team. Since we have a total of two inner \kcode{parallel} +regions, the print statement will be executed twice -- once per inner \kcode{parallel} region. \pagebreak diff --git a/program_control/interop.tex b/program_control/interop.tex index f42b503..1c73004 100644 --- a/program_control/interop.tex +++ b/program_control/interop.tex @@ -1,43 +1,43 @@ -\pagebreak -\section{\code{interop} Construct} +%\pagebreak +\section{\kcode{interop} Construct} \label{sec:interop} -\index{constructs!interop@\code{interop}} -\index{interop construct@\code{interop} construct} +\index{constructs!interop@\kcode{interop}} +\index{interop construct@\kcode{interop} construct} -The \scode{interop} construct allows OpenMP to interoperate with foreign runtime environments. -In the example below, asynchronous cuda memory copies and a \splc{cublasDaxpy} routine are executed -in a cuda stream. 
Also, an asynchronous target task execution (having a \scode{nowait} clause) +The \kcode{interop} construct allows OpenMP to interoperate with foreign runtime environments. +In the example below, asynchronous cuda memory copies and a \ucode{cublasDaxpy} routine are executed +in a cuda stream. Also, an asynchronous target task execution (having a \kcode{nowait} clause) and two explicit tasks are executed through OpenMP directives. Scheduling dependences (synchronization) are -imposed on the foreign stream and the OpenMP tasks through \scode{depend} clauses. +imposed on the foreign stream and the OpenMP tasks through \kcode{depend} clauses. -\index{interop construct@\code{interop} construct!init clause@\code{init} clause} -\index{init clause@\code{init} clause} -\index{clauses!init@\code{init}} -\index{interop construct@\code{interop} construct!depend clause@\code{depend} clause} -\index{depend clause@\code{depend} clause} -\index{clauses!depend@\code{depend}} -First, an interop object, \splc{obj}, is initialized for synchronization by including the -\scode{targetsync} \splc{interop-type} in the interop \scode{init} clause -(\scode{init(}~\scode{targetsync,obj}~\scode{)}). +\index{interop construct@\kcode{interop} construct!init clause@\kcode{init} clause} +\index{init clause@\kcode{init} clause} +\index{clauses!init@\kcode{init}} +\index{interop construct@\kcode{interop} construct!depend clause@\kcode{depend} clause} +\index{depend clause@\kcode{depend} clause} +\index{clauses!depend@\kcode{depend}} +First, an interop object, \ucode{obj}, is initialized for synchronization by including the +\kcode{targetsync} \plc{interop-type} in the interop \kcode{init} clause +(\kcode{init(targetsync, \ucode{obj})}). The object provides access to the foreign runtime. -The \scode{depend} clause provides a dependence behavior +The \kcode{depend} clause provides a dependence behavior for foreign tasks associated with a valid object. 
-\index{routines!omp_get_interop_int@\scode{omp_get_interop_int}} -\index{omp_get_interop_int routine@\scode{omp_get_interop_int} routine} -Next, the \scode{omp_get_interop_int} routine is used to extract the foreign -runtime id (\scode{omp_ipr_fr_id}), and a test in the next statement ensures -that the cuda runtime (\scode{omp_ifr_cuda}) is available. +\index{routines!omp_get_interop_int@\kcode{omp_get_interop_int}} +\index{omp_get_interop_int routine@\kcode{omp_get_interop_int} routine} +Next, the \kcode{omp_get_interop_int} routine is used to extract the foreign +runtime id (\kcode{omp_ipr_fr_id}), and a test in the next statement ensures +that the cuda runtime (\kcode{omp_ifr_cuda}) is available. -\index{routines!omp_get_interop_ptr@\scode{omp_get_interop_ptr}} -\index{omp_get_interop_ptr routine@\scode{omp_get_interop_ptr} routine} -\index{interop construct@\code{interop} construct!destroy clause@\code{destroy} clause} -\index{destroy clause@\code{destroy} clause} -\index{clauses!destroy@\code{destroy}} -Within the block for executing the \splc{cublasDaxpy} routine, a stream is acquired -with the \scode{omp_get_interop_ptr} routine, which returns a cuda stream (\splc{s}). +\index{routines!omp_get_interop_ptr@\kcode{omp_get_interop_ptr}} +\index{omp_get_interop_ptr routine@\kcode{omp_get_interop_ptr} routine} +\index{interop construct@\kcode{interop} construct!destroy clause@\kcode{destroy} clause} +\index{destroy clause@\kcode{destroy} clause} +\index{clauses!destroy@\kcode{destroy}} +Within the block for executing the \ucode{cublasDaxpy} routine, a stream is acquired +with the \kcode{omp_get_interop_ptr} routine, which returns a cuda stream (\ucode{s}). The stream is included in the cublas handle, and used directly in the asynchronous memory -routines. The following \scode{interop} construct, with the \scode{destroy} clause, +routines. The following \kcode{interop} construct, with the \kcode{destroy} clause, ensures that the foreign tasks have completed. 
\cexample[5.1]{interop}{1} diff --git a/program_control/metadirective.tex b/program_control/metadirective.tex deleted file mode 100644 index 28d6ca5..0000000 --- a/program_control/metadirective.tex +++ /dev/null @@ -1,154 +0,0 @@ -\pagebreak -\section{Metadirectives} -\label{sec:metadirective} -\index{directives!metadirective@\code{metadirective}} -\index{metadirective directive@\code{metadirective} directive} - -\index{metadirective directive@\code{metadirective} directive!when clause@\code{when} clause} -\index{metadirective directive@\code{metadirective} directive!otherwise clause@\code{otherwise} clause} -\index{clauses!when@\code{when}} -\index{when clause@\code{when} clause} -\index{clauses!otherwise@\code{otherwise}} -\index{otherwise clause@\code{otherwise} clause} -A \code{metadirective} directive provides a mechanism to select a directive in -a \code{when} clause to be used, depending upon one or more contexts: -implementation, available devices and the present enclosing construct. -The directive in an \code{otherwise} clause is used when a directive of the -\code{when} clause is not selected. - -\index{context selector!construct@\plc{construct}} -In the \code{when} clause the \plc{context selector} (or just \plc{selector}) defines traits that are -evaluated for selection of the directive that follows the selector. -This ``selectables'' directive is called a \plc{directive variant}. -Traits are grouped by \plc{construct}, \plc{implementation} and -\plc{device} \plc{sets} to be used by a selector of the same name. - -\index{context selector!device@\plc{device}} -In the first example the architecture trait \plc{arch} of the -\plc{device} selector set specifies that if an \plc{nvptx} architecture is -active in the OpenMP context, then the \code{teams}~\code{loop} -\plc{directive variant} is selected as the directive; otherwise, the \code{parallel}~\code{loop} -\plc{directive variant} of the \code{otherwise} clause is selected as the directive. 
-That is, if a \plc{device} of \plc{nvptx} architecture is supported by the implementation within -the enclosing \code{target} construct, its \plc{directive variant} is selected. -The architecture names, such as \plc{nvptx}, are implementation defined. -Also, note that \plc{device} as used in a \code{target} construct specifies -a device number, while \plc{device}, as used in the \code{metadirective} -directive as selector set, has traits of \plc{kind}, \plc{isa} and \plc{arch}. - - -\cexample[5.2]{metadirective}{1} - -\ffreeexample[5.2]{metadirective}{1} - -%\pagebreak -\index{context selector!implementation@\plc{implementation}} -In the second example, the \plc{implementation} selector set is specified -in the \code{when} clause to distinguish between platforms. -Additionally, specific architectures are specified with the \plc{device} -selector set. - -In the code, different \code{teams} constructs are employed as determined -by the \code{metadirective} directive. -The number of teams is restricted by a \code{num\_teams} clause -and a thread limit is also set by a \code{thread\_limit} clause for -\plc{vendor} platforms and specific architecture -traits. Otherwise, just the \code{teams} construct is used without -any clauses, as prescribed by the \code{otherwise} clause. - - -\cexample[5.2]{metadirective}{2} - -\ffreeexample[5.2]{metadirective}{2} -\clearpage - -\index{context selector!construct@\plc{construct}} - -\index{directives!declare target@\code{declare}~\code{target}} -\index{declare target directive@\code{declare}~\code{target} directive} - -\index{directives!begin declare target@\code{begin}~\code{declare}~\code{target}} -\index{begin declare target directive@\code{begin}~\code{declare}~\code{target} directive} - -In the third example, a \plc{construct} selector set is specified in the \code{when} clause. 
-Here, a \code{metadirective} directive is used within a function that is also -compiled as a function for a target device as directed by a declare target directive. -The \plc{target} directive name of the \code{construct} selector ensures that the -\code{distribute}~\code{parallel}~\code{for/do} construct is employed for the target compilation. -Otherwise, for the host-compiled version the \code{parallel}~\code{for/do}~\code{simd} construct is used. - -In the first call to the \plc{exp\_pi\_diff()} routine the context is a -\code{target}~\code{teams} construct and the \code{distribute}~\code{parallel}~\code{for/do} -construct version of the function is invoked, -while in the second call the \code{parallel}~\code{for/do}~\code{simd} construct version is used. - -%%%%%%%% -This case illustrates an important point for users that may want to hoist the -\code{target} directive out of a function that contains the usual -\code{target}~\code{teams}~\code{distribute}~\code{parallel}~\code{for/do} construct -(for providing alternate constructs through the \code{metadirective} directive as here). -While this combined construct can be decomposed into a \code{target} and -\code{teams distribute parallel for/do} constructs, the OpenMP 5.0 specification has the restriction: -``If a \code{teams} construct is nested within a \code{target} construct, that \code{target} construct must -contain no statements, declarations or directives outside of the \code{teams} construct''. -So, the \code{teams} construct must immediately follow the \code{target} construct without any intervening -code statements (which includes function calls). -Since the \code{target} construct alone cannot be hoisted out of a function, -the \code{target}~\code{teams} construct has been hoisted out of the function, and the -\code{distribute}~\code{parallel}~\code{for/do} construct is used -as the \plc{variant} directive of the \code{metadirective} directive within the function. 
-%%%%%%%% - -\cexample[5.2]{metadirective}{3} - -\ffreeexample[5.2]{metadirective}{3} - -\index{context selector!user@\plc{user}} -\index{context selector!condition selector@\code{condition} selector} -The \code{user} selector set can be used in a metadirective -to select directives at execution time when the -\code{condition(}~\plc{boolean-expr}~\code{)} selector expression is not a constant expression. -In this case it is a \plc{dynamic} trait set, and the selection is made at run time, rather -than at compile time. - -In the following example the \plc{foo} function employs the \code{condition} -selector to choose a device for execution at run time. -In the \plc{bar} routine metadirectives are nested. -At the outer level a selection between serial and parallel execution in performed -at run time, followed by another run time selection on the schedule kind in the inner -level when the active \plc{construct} trait is \code{parallel}. - -(Note, the variable \plc{b} in two of the ``selected'' constructs is declared private for the sole purpose -of detecting and reporting that the construct is used. Since the variable is private, its value -is unchanged outside of the construct region, whereas it is changed if the ``unselected'' construct -is used.) - -%(Note: The value of \plc{b} after the \code{parallel} region remains 0 for the -%\code{guided} scheduling case, because its \code{parallel} construct also contains -%the \code{private(}~\plc{b}~\code{)} clause. -%The variable \plc{b} is employed for the sole purpose of distinguishing which -%\code{parallel} construct is selected-- for testing.) - -%While there might be other ways to make these decisions at run time, such as using -%an \code{if} clause on a \code{parallel} construct, this mechanism is much more general. -%For instance, an input ``gpu\_type'' string could be used and tested in boolean expressions -%to select from one of several possible \code{target} constructs. 
-%Also, setting the scheduling variable (\plc{unbalanced}) within the execution through a -%``work balance'' function might be a more practical approach for setting the schedule kind. - - -\cexample[5.2]{metadirective}{4} - -\ffreeexample[5.2]{metadirective}{4} - -Metadirectives can be used in conjunction with templates as shown in the C++ code below. -Here the template definition generates two versions of the Fibonacci function. -The \splc{tasking} boolean is used in the \scode{condition} selector to enable tasking. -The true form implements a parallel version with \scode{task} and \scode{taskwait} -constructs as in the \splc{tasking.4.c} code in Section~\ref{sec:task_taskwait}. -The false form implements a serial version without any tasking constructs. -Note that the serial version is used in the parallel function for optimally -processing numbers less than 8. - -\cppexample[5.0]{metadirective}{5} - diff --git a/program_control/nested_loop.tex b/program_control/nested_loop.tex index 2fb9270..2f9e1d3 100644 --- a/program_control/nested_loop.tex +++ b/program_control/nested_loop.tex @@ -1,10 +1,10 @@ -\pagebreak +%\pagebreak \section{Nested Loop Constructs} \label{sec:nested_loop} \index{nested loop constructs} The following example of loop construct nesting is conforming because the inner -and outer loop regions bind to different \code{parallel} regions: +and outer loop regions bind to different \kcode{parallel} regions: \cexample{nested_loop}{1} diff --git a/program_control/nesting_restrict.tex b/program_control/nesting_restrict.tex index c276254..7c2ef54 100644 --- a/program_control/nesting_restrict.tex +++ b/program_control/nesting_restrict.tex @@ -1,4 +1,4 @@ -\pagebreak +%\pagebreak \section{Restrictions on Nesting of Regions} \label{sec:nesting_restrict} @@ -18,32 +18,33 @@ \section{Restrictions on Nesting of Regions} \fexample{nesting_restrict}{2} -The following example is non-conforming because the loop and \code{single} regions +The following example is 
non-conforming because the loop and \kcode{single} regions are closely nested: \cexample{nesting_restrict}{3} +\pagebreak \fexample{nesting_restrict}{3} -The following example is non-conforming because a \code{barrier} region cannot +The following example is non-conforming because a \kcode{barrier} region cannot be closely nested inside a loop region: \cexample{nesting_restrict}{4} \fexample{nesting_restrict}{4} -The following example is non-conforming because the \code{barrier} region cannot -be closely nested inside the \code{critical} region. If this were permitted, +The following example is non-conforming because the \kcode{barrier} region cannot +be closely nested inside the \kcode{critical} region. If this were permitted, it would result in deadlock due to the fact that only one thread at a time can -enter the \code{critical} region: +enter the \kcode{critical} region: \cexample{nesting_restrict}{5} \fexample{nesting_restrict}{5} -The following example is non-conforming because the \code{barrier} region cannot -be closely nested inside the \code{single} region. If this were permitted, it -would result in deadlock due to the fact that only one thread executes the \code{single} +The following example is non-conforming because the \kcode{barrier} region cannot +be closely nested inside the \kcode{single} region. 
If this were permitted, it +would result in deadlock due to the fact that only one thread executes the \kcode{single} region: \cexample{nesting_restrict}{6} diff --git a/program_control/pause_resource.tex b/program_control/pause_resource.tex index ffacd73..4509fb1 100644 --- a/program_control/pause_resource.tex +++ b/program_control/pause_resource.tex @@ -1,43 +1,44 @@ -\pagebreak -\section{\scode{omp_pause_resource} and \\ - \scode{omp_pause_resource_all} Routines} +%\pagebreak +\section{\kcode{omp_pause_resource} and \\ + \kcode{omp_pause_resource_all} Routines} \label{sec:pause_resource} -\index{routines!omp_pause_resource@\scode{omp_pause_resource}} -\index{omp_pause_resource routine@\scode{omp_pause_resource} routine} -\index{routines!omp_get_max_threads@\scode{omp_get_max_threads}} -\index{omp_get_max_threads routine@\scode{omp_get_max_threadsi} routine} -\index{routines!omp_get_initial_device@\scode{omp_get_initial_device}} -\index{omp_get_initial_device routine@\scode{omp_get_initial_device} routine} +\index{routines!omp_pause_resource@\kcode{omp_pause_resource}} +\index{omp_pause_resource routine@\kcode{omp_pause_resource} routine} +\index{routines!omp_get_max_threads@\kcode{omp_get_max_threads}} +\index{omp_get_max_threads routine@\kcode{omp_get_max_threads} routine} +\index{routines!omp_get_initial_device@\kcode{omp_get_initial_device}} +\index{omp_get_initial_device routine@\kcode{omp_get_initial_device} routine} Sometimes, it is necessary to relinquish resources created or allocated for the OpenMP runtime environment to avoid interference with subsequent actions as illustrated by the following example.
In the beginning -either a call to the \scode{omp_get_max_threads} routine -or the subsequent \code{parallel} construct may trigger resource allocation +either a call to the \kcode{omp_get_max_threads} routine +or the subsequent \kcode{parallel} construct may trigger resource allocation by the OpenMP runtime, which may cause unexpected side effects -for the subsequent \plc{fork} call. +for the subsequent \ucode{fork} call. It is desirable to relinquish OpenMP resources allocated before -the fork by using the \scode{omp_pause_resource} routine for a given +the fork by using the \kcode{omp_pause_resource} routine for a given device, in this case the host device. The host device number is returned by -the \scode{omp_get_initial_device} routine. -The \scode{omp_pause_hard} value is used here to free as many +the \kcode{omp_get_initial_device} routine. +The \kcode{omp_pause_hard} value is used here to free as many OpenMP resources as possible. After the fork, the child process will initialize its OpenMP runtime -environment when encountering the \code{parallel} construct. +environment when encountering the \kcode{parallel} construct. \cexample[5.0]{pause_resource}{1} +\pagebreak -\index{routines!omp_pause_resource_all@\scode{omp_pause_resource_all}} -\index{omp_pause_resource_all routine@\scode{omp_pause_resource_all} routine} +\index{routines!omp_pause_resource_all@\kcode{omp_pause_resource_all}} +\index{omp_pause_resource_all routine@\kcode{omp_pause_resource_all} routine} The following example illustrates a different use case. After executing the first parallel code (parallel region 1), -the \plc{relinquish} program switches to executing an external parallel program -(called \plc{subprogram}, which is compiled from \splc{pause_resource.2b}). +the \ucode{relinquish} program switches to executing an external parallel program +(called \ucode{subprogram}, which is compiled from \example{pause_resource.2b}). 
In order to make resources available for the external -subprogram, \plc{relinquish} calls \scode{omp_pause_resource_all} +subprogram, \ucode{relinquish} calls \kcode{omp_pause_resource_all} to relinquish OpenMP resources used by the current program before -calling \scode{execute_command_line} to execute \plc{subprogram}. -The \scode{omp_pause_soft} value is used here to allow subsequent +calling \ucode{execute_command_line} to execute \ucode{subprogram}. +The \kcode{omp_pause_soft} value is used here to allow subsequent OpenMP regions (parallel region 2) to restart more quickly. \ffreeexample[5.0]{pause_resource}{2a} diff --git a/program_control/reproducible.tex b/program_control/reproducible.tex index 1e05921..731cfa8 100644 --- a/program_control/reproducible.tex +++ b/program_control/reproducible.tex @@ -1,59 +1,59 @@ -\pagebreak +%\pagebreak \section{Controlling Concurrency and Reproducibility with -the \code{order} Clause} +the \kcode{order} Clause} \label{sec:reproducible_modifier} -\index{clauses!order(concurrent)@\code{order(concurrent)}} -\index{order(concurrent) clause@\code{order(concurrent)} clause} +\index{clauses!order(concurrent)@\kcode{order(concurrent)}} +\index{order(concurrent) clause@\kcode{order(concurrent)} clause} -The \code{order} clause is used for controlling the parallel execution of +The \kcode{order} clause is used for controlling the parallel execution of loop iterations for one or more loops that are associated with a directive. It is specified with a clause argument and optional modifier. The only supported argument, introduced in OpenMP 5.0, is the keyword -\code{concurrent} which indicates that the loop iterations may execute +\kcode{concurrent} which indicates that the loop iterations may execute concurrently, including iterations in the same chunk per the loop schedule. 
-Because of the relaxed execution permitted with an \code{order(concurrent)} +Because of the relaxed execution permitted with an \kcode{order(concurrent)} clause, codes must not assume that any cross-iteration data dependences would be preserved or that any two iterations may execute on the same thread. The following example in this section demonstrates the use of -the \code{order(concurrent)} clause, without any modifiers, for controlling +the \kcode{order(concurrent)} clause, without any modifiers, for controlling the parallel execution of loop iterations. -The \code{order(concurrent)} clause cannot be used for the second and third -\code{parallel}~\code{for}/\code{do} constructs because of either having +The \kcode{order(concurrent)} clause cannot be used for the second and third +\kcode{parallel for}/\kcode{do} constructs because of either having data dependences or accessing threadprivate variables. \cexample[5.0]{reproducible}{1} \ffreeexample[5.0]{reproducible}{1} -\index{order(concurrent) clause@\code{order(concurrent)} clause!reproducible modifier@\code{reproducible} modifier} -\index{order(concurrent) clause@\code{order(concurrent)} clause!unconstrained modifier@\code{unconstrained} modifier} -Modifiers to the \code{order} clause, introduced in OpenMP 5.1, may be +\index{order(concurrent) clause@\kcode{order(concurrent)} clause!reproducible modifier@\kcode{reproducible} modifier} +\index{order(concurrent) clause@\kcode{order(concurrent)} clause!unconstrained modifier@\kcode{unconstrained} modifier} +Modifiers to the \kcode{order} clause, introduced in OpenMP 5.1, may be specified to control the reproducibility of the loop schedule for the associated loop(s). A reproducible loop schedule will consistently yield the same mapping of iterations to threads (or SIMD lanes) if the directive name, loop schedule, iteration space, and binding region remain -the same. 
The \code{reproducible} modifier indicates the loop schedule must -be reproducible, while the \code{unconstrained} modifier indicates that +the same. The \kcode{reproducible} modifier indicates the loop schedule must +be reproducible, while the \kcode{unconstrained} modifier indicates that the loop schedule is not reproducible. -If a modifier is not specified, then the \code{order} clause does not affect +If a modifier is not specified, then the \kcode{order} clause does not affect the reproducibility of the loop schedule. -The next example demonstrates the use of the \code{order(concurrent)} clause +The next example demonstrates the use of the \kcode{order(concurrent)} clause with modifiers for additionally controlling the reproducibility of a loop's schedule. -The two worksharing-loop constructs in the first \code{parallel} construct -specify that the loops have reproducible schedules, thus memory effects from iteration \plc{i} from the first loop will be observable to iteration \plc{i} +The two worksharing-loop constructs in the first \kcode{parallel} construct +specify that the loops have reproducible schedules, thus memory effects from iteration \ucode{i} from the first loop will be observable to iteration \ucode{i} in the second loop. -In the second \code{parallel} construct, the \code{order} clause does not +In the second \kcode{parallel} construct, the \kcode{order} clause does not control reproducibility for the loop schedules. However, since both loops specify the same static schedules, the schedules are reproducible and the data dependences between the loops are preserved by the execution. -In the third \code{parallel} construct, the \code{order} clause indicates +In the third \kcode{parallel} construct, the \kcode{order} clause indicates that the loops are not reproducible, overriding the default reproducibility prescribed by the specified static schedule. 
Consequentially, -the \code{nowait} clause on the first worksharing-loop construct should not +the \kcode{nowait} clause on the first worksharing-loop construct should not be used to ensure that the data dependences are preserved by the execution. \cexample[5.1]{reproducible}{2} diff --git a/program_control/requires.tex b/program_control/requires.tex index 2c8b6c3..042b463 100644 --- a/program_control/requires.tex +++ b/program_control/requires.tex @@ -1,29 +1,29 @@ -\pagebreak -\section{\code{requires} Directive} +%\pagebreak +\section{\kcode{requires} Directive} \label{sec:requires} -\index{directives!requires@\code{requires}} -\index{requires directive@\code{requires} directive} +\index{directives!requires@\kcode{requires}} +\index{requires directive@\kcode{requires} directive} -The declarative \code{requires} directive can be used to +The declarative \kcode{requires} directive can be used to specify features that an implementation must provide to compile and execute correctly. -\index{requires directive@\code{requires} directive!unified_shared_memory clause@\scode{unified_shared_memory} clause} -\index{clauses!unified_shared_memory@\scode{unified_shared_memory}} -\index{unified_shared_memory clause@\scode{unified_shared_memory} clause} -In the following example the \code{unified\_shared\_memory} clause -of the \code{requires} directive ensures that the host and all +\index{requires directive@\kcode{requires} directive!unified_shared_memory clause@\kcode{unified_shared_memory} clause} +\index{clauses!unified_shared_memory@\kcode{unified_shared_memory}} +\index{unified_shared_memory clause@\kcode{unified_shared_memory} clause} +In the following example the \kcode{unified_shared_memory} clause +of the \kcode{requires} directive ensures that the host and all devices accessible through OpenMP provide a \plc{unified address} space for memory that is shared by all devices. 
-The example illustrates the use of the \code{requires} directive specifying +The example illustrates the use of the \kcode{requires} directive specifying \plc{unified shared memory} in file scope, before any device -directives or device routines. No \code{map} clause is needed for -the \plc{p} structure on the device (and its address \plc{\&p}, for the C++ code, +directives or device routines. No \kcode{map} clause is needed for +the \ucode{p} structure on the device (and its address \ucode{\&p}, for the C++ code, is the same address on the host and device). -However, scalar variables referenced within the \code{target} -construct still have a default data-sharing attribute of firstprivate. -The \plc{q} scalar is incremented on the device, and its change is +However, scalar variables referenced within the \kcode{target} +construct still have a default data-sharing attribute of \kcode{firstprivate}. +The \ucode{q} scalar is incremented on the device, and its change is not updated on the host. % will defaultmap(toform:scalar) make q use shared address space? %Or will it be ignored at this point. 
diff --git a/program_control/sources/assumption.1.c b/program_control/sources/assumption.1.c new file mode 100644 index 0000000..e75dcda --- /dev/null +++ b/program_control/sources/assumption.1.c @@ -0,0 +1,47 @@ +/* +* @@name: assumption.1 +* @@type: C +* @@operation: compile +* @@expect: success +* @@version: omp_5.1 +*/ + +#include +#include + +#pragma omp declare target +int N; +#pragma omp end declare target + +// Case 1: Delimited scope +#pragma omp begin assumes no_parallelism +extern void fun(int *A); +#pragma omp end assumes + +int main() { + int *A, *B; + N = (rand() % 5 + 1) * 16; + A = (int *) malloc(sizeof(int) * N); + B = (int *) malloc(sizeof(int) * N); + + for(int i = 0; i < N; i++){ + A[i] = 0; + B[i] = i; + } + + #pragma omp target teams distribute parallel for map(tofrom: A[0:N]) + for(int i = 0; i < N; i++){ + fun(A); + } + +// Case 2: Block associated + + #pragma omp assume holds (N % 8 == 0 && N > 0) + #pragma omp simd + for (int i = 0; i < N; ++i){ + A[i] += B[i]; + } + + return 0; +} + diff --git a/program_control/sources/assumption.1.f90 b/program_control/sources/assumption.1.f90 new file mode 100644 index 0000000..0dfaff7 --- /dev/null +++ b/program_control/sources/assumption.1.f90 @@ -0,0 +1,50 @@ +! @@name: assumption.1 +! @@type: F-free +! @@operation: compile +! @@expect: success +! @@version: omp_5.1 + +module m + !$omp assumes no_parallelism + interface + subroutine fun(A, i) + implicit none + integer :: A(*),i + + end subroutine + end interface + +end module + +program main + use m + implicit none + integer,allocatable :: A(:), B(:) + integer :: i, N + real :: rand_no + + call random_number(rand_no) !! create random, + N = (int(rand_no*5)+1)*16 !! runtime number multiple of 16 + + allocate(A(N),B(N)) !! alloc space & initialize + do i = 1, N + A(i) = 0; B(i) = i + end do + +!! Case 1: Delimited scope, see module interface + + !$omp target teams distribute parallel do map(tofrom: A) + do i = 1, N + call fun(A,i) + end do + +!! 
Case 2: Block associated + + !$omp assume holds (8*(N/8) == N .and. N>0) !! N is multiple of 8 + !$omp simd + do i = 1, N + A(i) = A(i) + B(i) + end do + !$omp end assume + +end program diff --git a/program_control/sources/declare_variant.3.c b/program_control/sources/declare_variant.3.c new file mode 100644 index 0000000..01d5a8c --- /dev/null +++ b/program_control/sources/declare_variant.3.c @@ -0,0 +1,58 @@ +/* +* @@name: declare_variant.3 +* @@type: C +* @@operation: view +* @@expect: +* @@version: omp_5.1 +*/ + +#include +#include +#include +#include + +#ifdef _OPENMP +#pragma omp begin declare variant match(device={kind(nohost)}) + + #pragma omp begin declare variant match(implementation={vendor(nvidia)}) + + #pragma omp begin declare variant match(device={isa(sm_70)}) + #include "sm70/my_cuda_fun.h" /* only included if isa is sm70 */ + #pragma omp end declare variant + + #pragma omp begin declare variant match(device={isa(sm_80)}) + #include "sm80/my_cuda_fun.h" /* only included if isa is sm80 */ + #pragma omp end declare variant + + #pragma omp end declare variant + + #pragma omp begin declare variant match(implementation={vendor(amd)}) + #include "amdgpu/my_hip_fun.h" /* only included for AMD */ + #pragma omp end declare variant + +#pragma omp end declare variant + +#pragma omp begin declare variant match(device={kind(host)}) + #include "openmp_host/my_fun.h" +#pragma omp end declare variant +#else + #include "generic/my_fun.h" +#endif + +#define N 64 +double array[N]; +#pragma omp target + +int main() { + // Array initialization + for (int i = 0; i < N; ++i) { + array[i] = 0.0; + } + +#pragma omp target map(tofrom: array[0:N]) + for (int i = 0; i < N; ++i) { + array[i] = my_fun(i); + } + return 0; + +} diff --git a/program_control/sources/metadirective.3.f90 b/program_control/sources/metadirective.3.f90 index df91d8f..ddafb70 100644 --- a/program_control/sources/metadirective.3.f90 +++ b/program_control/sources/metadirective.3.f90 @@ -9,7 +9,6 @@ module 
params ! 3.1415926535897932_8 end module - subroutine exp_pi_diff(d, my_pi) use params implicit none @@ -20,7 +19,6 @@ subroutine exp_pi_diff(d, my_pi) !$omp metadirective & !$omp& when( construct={target}: distribute parallel do ) & !$omp& otherwise( parallel do simd ) - do i = 1,size(d) d(i) = exp( (M_PI-my_pi)*i ) end do diff --git a/program_control/sources/metadirective.4.f90 b/program_control/sources/metadirective.4.f90 index 1ee6a3b..8b59151 100644 --- a/program_control/sources/metadirective.4.f90 +++ b/program_control/sources/metadirective.4.f90 @@ -11,7 +11,7 @@ subroutine foo(a, n, use_gpu) !$omp metadirective & !$omp& when(user={condition(use_gpu)}: & - !$omp& target teams distribute parallel for & + !$omp& target teams distribute parallel do & !$omp& private(b) map(from:a(1:n)) ) & !$omp& otherwise( & !$omp& parallel do) @@ -33,8 +33,8 @@ subroutine bar (a, n, run_parallel, unbalanced) !$omp metadirective & !$omp& when(construct={parallel}, user={condition(unbalanced)}: & - !$omp& for schedule(guided) private(b)) & - !$omp& when(construct={parallel}: for schedule(static)) + !$omp& do schedule(guided) private(b)) & + !$omp& when(construct={parallel}: do schedule(static)) do i = 1,n; a(i)=i; if(i==n) b=1; end do !$omp end metadirective diff --git a/program_control/sources/nested_loop.2.c b/program_control/sources/nested_loop.2.c index 9966ebf..cc7b4da 100644 --- a/program_control/sources/nested_loop.2.c +++ b/program_control/sources/nested_loop.2.c @@ -7,7 +7,6 @@ */ void work(int i, int j) {} - void work1(int i, int n) { int j; @@ -19,7 +18,6 @@ void work1(int i, int n) } } - void good_nesting2(int n) { int i; diff --git a/program_control/sources/pause_resource.1.c b/program_control/sources/pause_resource.1.c index 4579e57..71b8111 100644 --- a/program_control/sources/pause_resource.1.c +++ b/program_control/sources/pause_resource.1.c @@ -38,7 +38,6 @@ int main() int myid = omp_get_thread_num(); printf("child: myid %d of %d\n", myid, nt); } - exit(0); } 
else { /* parent process */ diff --git a/program_control/sources/pause_resource.2a.f90 b/program_control/sources/pause_resource.2a.f90 index 58bbbc4..d45b129 100644 --- a/program_control/sources/pause_resource.2a.f90 +++ b/program_control/sources/pause_resource.2a.f90 @@ -10,9 +10,9 @@ program relinquish write (*,*) 'In relinquish' -!$omp parallel + !$omp parallel write (*,*) 'In parallel region 1' -!$omp end parallel + !$omp end parallel err = omp_pause_resource_all(omp_pause_soft) @@ -21,7 +21,8 @@ program relinquish cmdstat=err) if (err /= 0) write (*,*) 'Warning: subprogram failed to execute' -!$omp parallel + !$omp parallel write (*,*) 'In parallel region 2' -!$omp end parallel + !$omp end parallel + end program relinquish diff --git a/program_control/sources/selector_scoring.1.c b/program_control/sources/selector_scoring.1.c new file mode 100644 index 0000000..742b84f --- /dev/null +++ b/program_control/sources/selector_scoring.1.c @@ -0,0 +1,62 @@ +/* +* @@name: selector_scoring.1 +* @@type: C +* @@operation: run +* @@expect: success +* @@version: omp_5.0 +*/ + +#include +#include + +#pragma omp declare variant(fx1) match(construct={target}) +#pragma omp declare variant(fx2) match(construct={teams,parallel,for}) +#pragma omp declare variant(fx3) match(device={kind(gpu),isa(sm_70)}) +#pragma omp declare variant(fx4) match(device={arch(nvptx),isa(sm_70)}) +void f(int *a, int i) +{ + *a = i; +} + +void fx1(int *a, int i) +{ + *a = i; +} + +void fx2(int *a, int i) +{ + *a = 2*i; +} + +void fx3(int *a, int i) +{ + *a = 3*i; +} + +void fx4(int *a, int i) +{ + *a = 4*i; +} + +int main() +{ + #define N 4 + int a[N]; + #pragma omp target teams distribute parallel for map(a[:N]) + for (int i = 0; i < N; i++) { + #pragma omp task + { + f(&a[i], i); + } + } + + for (int i = 0; i < N; i++) { + if (a[i] != 4*i) { + printf("Failed\n"); + return 1; + } + } + + printf("Passed\n"); + return 0; +} diff --git a/program_control/sources/selector_scoring.1.f90 
b/program_control/sources/selector_scoring.1.f90 new file mode 100644 index 0000000..af27160 --- /dev/null +++ b/program_control/sources/selector_scoring.1.f90 @@ -0,0 +1,68 @@ +! @@name: selector_scoring.1 +! @@type: F-free +! @@operation: run +! @@expect: success +! @@version: omp_5.0 +module subs +contains + + subroutine f(a,i) + !$omp declare variant(fx1) match(construct={target}) + !$omp declare variant(fx2) match(construct={teams,parallel,do}) + !$omp declare variant(fx3) match(device={kind(gpu),isa(sm_70)}) + !$omp declare variant(fx4) match(device={arch(nvptx),isa(sm_70)}) + integer, intent(out) :: a + integer, value :: i + a = i + end subroutine + + subroutine fx1(a,i) + integer, intent(out) :: a + integer, value :: i + a = i + end subroutine + + subroutine fx2(a,i) + integer, intent(out) :: a + integer, value :: i + a = 2*i + end subroutine + + subroutine fx3(a,i) + integer, intent(out) :: a + integer, value :: i + a = 3*i + end subroutine + + subroutine fx4(a,i) + integer, intent(out) :: a + integer, value :: i + a = 4*i + end subroutine + +end module subs + + +program main + use subs + integer, parameter :: N = 4 + integer :: a(N) + integer :: i + + !$omp target teams distribute parallel do map(a) + do i = 1, N + !$omp task + call f(a(i),i) + !$omp end task + end do + + do i = 1, N + if (a(i) /= 4*i ) then + print *, "Failed" + error stop + end if + end do + + print *, "Passed" + +end program diff --git a/program_control/sources/selector_scoring.2.c b/program_control/sources/selector_scoring.2.c new file mode 100644 index 0000000..9f5964d --- /dev/null +++ b/program_control/sources/selector_scoring.2.c @@ -0,0 +1,71 @@ +/* +* @@name: selector_scoring.2 +* @@type: C +* @@operation: run +* @@expect: success +* @@version: omp_5.0 +*/ + +#include <stdio.h> + +// The unified_address and/or unified_shared_memory requirements may be +// implicitly enforced by the implementation via compiler flags. 
+ +const int version = 2; + +#pragma omp declare variant(kernel_target_ua) \ + match(implementation={requires(unified_address)}) +#pragma omp declare variant(kernel_target_usm) \ + match(implementation={requires(unified_shared_memory)}) +#pragma omp declare variant(kernel_target_usm_v2) \ + match(implementation={requires(unified_shared_memory)}, \ + user={condition(score(1): version==2)}) +void kernel(int *a, int n) +{ + #pragma omp parallel for + for (int i = 0; i < n; i++) { + a[i] = i*i; + } +} + +void kernel_target_ua(int *a, int n) +{ + #pragma omp target data map(a[:n]) use_device_ptr(a) + #pragma omp target parallel for + for (int i = 0; i < n; i++) { + a[i] = 2*i*i; + } +} + +void kernel_target_usm(int *a, int n) +{ + #pragma omp target parallel for + for (int i = 0; i < n; i++) { + a[i] = 3*i*i; + } +} + +void kernel_target_usm_v2(int *a, int n) +{ + #pragma omp target teams loop + for (int i = 0; i < n; i++) { + a[i] = 4*i*i; + } +} + +int main() +{ + int a[1000]; + + kernel(a, 1000); + + for (int i = 0; i < 1000; i++) { + if (a[i] != 4*i*i) { + printf("Failed\n"); + return 1; + } + } + + printf("Passed\n"); + return 0; +} diff --git a/program_control/sources/selector_scoring.2.f90 b/program_control/sources/selector_scoring.2.f90 new file mode 100644 index 0000000..4239275 --- /dev/null +++ b/program_control/sources/selector_scoring.2.f90 @@ -0,0 +1,90 @@ +! @@name: selector_scoring.2 +! @@type: F-free +! @@operation: run +! @@expect: success +! @@version: omp_5.0 +module subs + ! The unified_address and/or unified_shared_memory requirements may be + ! implicitly enforced by the implementation via compiler flags. 
+ + integer, parameter :: version = 2 +contains + + subroutine kernel(a, n) + !$omp declare variant(kernel_target_ua) & + !$omp match(implementation={requires(unified_address)}) + + !$omp declare variant(kernel_target_usm) & + !$omp match(implementation={requires(unified_shared_memory)}) + + !$omp declare variant(kernel_target_usm_v2) & + !$omp match(implementation={requires(unified_shared_memory)}, & + !$omp user={condition(score(1): version==2)}) + + integer :: a(*) + integer, value :: n + integer :: i + !$omp parallel do + do i = 1, n + a(i) = i*i + end do + end subroutine + + subroutine kernel_target_ua(a, n) + use iso_c_binding + integer, target :: a(*) + integer, value :: n + type(c_ptr) :: c_ap + integer, pointer :: ap(:) + integer :: i + c_ap = c_loc(a) + !$omp target data map(a(:n)) use_device_ptr(c_ap) + !$omp target + call c_f_pointer(c_ap, ap) + !$omp parallel do + do i = 1, n + ap(i) = 2*i*i + end do + !$omp end target + !$omp end target data + end subroutine + + subroutine kernel_target_usm(a, n) + integer :: a(*) + integer, value :: n + integer :: i + !$omp target parallel do + do i = 1, n + a(i) = 3*i*i + end do + end subroutine + + subroutine kernel_target_usm_v2(a, n) + integer :: a(*) + integer, value :: n + integer :: i + !$omp target teams loop + do i = 1, n + a(i) = 4*i*i + end do + end subroutine + +end module subs + + +program main + use subs + integer, target :: a(1000) + + call kernel(a, 1000) + + do i = 1, 1000 + if (a(i) /= 4*i*i ) then + print *, "Failed" + error stop + end if + end do + + print *, "Passed" + +end program diff --git a/program_control/standalone.tex b/program_control/standalone.tex index fb89ea2..65e6f0c 100644 --- a/program_control/standalone.tex +++ b/program_control/standalone.tex @@ -1,40 +1,39 @@ -\pagebreak -\section{Placement of \code{flush}, \code{barrier}, \code{taskwait} -and \code{taskyield} Directives} +%\pagebreak +\section{Placement of \kcode{flush}, \kcode{barrier}, \kcode{taskwait} +and \kcode{taskyield} 
Directives} \label{sec:standalone} \index{standalone directive placement} -\index{constructs!flush@\code{flush}} -\index{constructs!barrier@\code{barrier}} -\index{constructs!taskwait@\code{taskwait}} -\index{constructs!taskyield@\code{taskyield}} -\index{flush construct@\code{flush} construct} -\index{barrier construct@\code{barrier} construct} -\index{taskwait construct@\code{taskwait} construct} -\index{taskyield construct@\code{taskyield} construct} - -The following example is non-conforming, because the \code{flush}, \code{barrier}, -\code{taskwait}, and \code{taskyield} directives are stand-alone directives -and cannot be the immediate substatement of an \code{if} statement. +\index{constructs!flush@\kcode{flush}} +\index{constructs!barrier@\kcode{barrier}} +\index{constructs!taskwait@\kcode{taskwait}} +\index{constructs!taskyield@\kcode{taskyield}} +\index{flush construct@\kcode{flush} construct} +\index{barrier construct@\kcode{barrier} construct} +\index{taskwait construct@\kcode{taskwait} construct} +\index{taskyield construct@\kcode{taskyield} construct} + +The following example is non-conforming, because the \kcode{flush}, \kcode{barrier}, +\kcode{taskwait}, and \kcode{taskyield} directives are stand-alone directives +and cannot be the immediate substatement of an \bcode{if} statement. \cexample[3.1]{standalone}{1} -\pagebreak -The following example is non-conforming, because the \code{flush}, \code{barrier}, -\code{taskwait}, and \code{taskyield} directives are stand-alone directives -and cannot be the action statement of an \code{if} statement or a labeled branch +The following example is non-conforming, because the \kcode{flush}, \kcode{barrier}, +\kcode{taskwait}, and \kcode{taskyield} directives are stand-alone directives +and cannot be the action statement of an \bcode{if} statement or a labeled branch target. 
\ffreeexample[3.1]{standalone}{1} -The following version of the above example is conforming because the \code{flush}, -\code{barrier}, \code{taskwait}, and \code{taskyield} directives are enclosed +The following version of the above example is conforming because the \kcode{flush}, +\kcode{barrier}, \kcode{taskwait}, and \kcode{taskyield} directives are enclosed in a compound statement. +\pagebreak \cexample[3.1]{standalone}{2} -\pagebreak -The following example is conforming because the \code{flush}, \code{barrier}, -\code{taskwait}, and \code{taskyield} directives are enclosed in an \code{if} +The following example is conforming because the \kcode{flush}, \kcode{barrier}, +\kcode{taskwait}, and \kcode{taskyield} directives are enclosed in an \bcode{if} construct or follow the labeled branch target. \ffreeexample[3.1]{standalone}{2} diff --git a/program_control/target_offload.tex b/program_control/target_offload.tex index 39a4817..58e3863 100644 --- a/program_control/target_offload.tex +++ b/program_control/target_offload.tex @@ -1,39 +1,39 @@ -\pagebreak +%\pagebreak \section{Target Offload} \label{sec:target_offload} -\index{environment variables!OMP_TARGET_OFFLOAD@\scode{OMP_TARGET_OFFLOAD}} -\index{OMP_TARGET_OFFLOAD@\scode{OMP_TARGET_OFFLOAD}} +\index{environment variables!OMP_TARGET_OFFLOAD@\kcode{OMP_TARGET_OFFLOAD}} +\index{OMP_TARGET_OFFLOAD@\kcode{OMP_TARGET_OFFLOAD}} -In the OpenMP 5.0 implementation the \code{OMP\_TARGET\_OFFLOAD} -environment variable was defined to change \plc{default} offload behavior. -By \plc{default} the target code (region) is executed on the host if the target device +In the OpenMP 5.0 implementation the \kcode{OMP_TARGET_OFFLOAD} +environment variable was defined to change default offload behavior. +By default the target code (region) is executed on the host if the target device does not exist or the implementation does not support the target device. %Last sentence uses words of the 5.0 spec pg. 
21 lines 7-8 In an OpenMP 5.0 compliant implementation, setting the -\code{OMP\_TARGET\_OFFLOAD} variable to \code{MANDATORY} will -force the program to terminate execution when a \code{target} +\kcode{OMP_TARGET_OFFLOAD} variable to \vcode{MANDATORY} will +force the program to terminate execution when a \kcode{target} construct is encountered and the target device is not supported or is not available. -With a value \code{DEFAULT} the target region will execute on a device if the +With a value \vcode{DEFAULT} the target region will execute on a device if the device exists and is supported by the implementation, otherwise it will execute on the host. -Support for the \code{DISABLED} +Support for the \vcode{DISABLED} value is optional; when it is supported the behavior is as if only the host device exists (other devices are considered non-existent to the runtime), and target regions are executed on the host. The following example reports execution behavior for different -values of the \code{OMP\_TARGET\_OFFLOAD} variable. A handy routine -for extracting the \code{OMP\_TARGET\_OFFLOAD} environment variable +values of the \kcode{OMP_TARGET_OFFLOAD} variable. A handy routine +for extracting the \kcode{OMP_TARGET_OFFLOAD} environment variable value is deployed here, because the OpenMP API does not have a routine for obtaining the value. %(\texit{yet}). Note: The example issues a warning when a pre-5.0 implementation is used, -indicating that the \code{OMP\_TARGET\_OFFLOAD} is ignored. -The value of the \code{OMP\_TARGET\_OFFLOAD} variable is reported -when the \code{OMP\_DISPLAY\_ENV} -environment variable is set to \code{TRUE} or \code{VERBOSE}. +indicating that the \kcode{OMP_TARGET_OFFLOAD} is ignored. +The value of the \kcode{OMP_TARGET_OFFLOAD} variable is reported +when the \kcode{OMP_DISPLAY_ENV} +environment variable is set to \vcode{TRUE} or \vcode{VERBOSE}. 
%\pagebreak \cexample[5.0]{target_offload_control}{1}[1] diff --git a/program_control/utilities.tex b/program_control/utilities.tex index 6d3b65c..b59e862 100644 --- a/program_control/utilities.tex +++ b/program_control/utilities.tex @@ -6,16 +6,16 @@ \section{Utilities} %--------------------------- \subsection{Timing Routines} \label{subsec:get_wtime} -\index{routines!omp_get_wtime@\scode{omp_get_wtime}} -\index{omp_get_wtime routine@\scode{omp_get_wtime} routine} -\index{routines!omp_get_wtick@\scode{omp_get_wtick}} -\index{omp_get_wtick routine@\scode{omp_get_wtick} routine} +\index{routines!omp_get_wtime@\kcode{omp_get_wtime}} +\index{omp_get_wtime routine@\kcode{omp_get_wtime} routine} +\index{routines!omp_get_wtick@\kcode{omp_get_wtick}} +\index{omp_get_wtick routine@\kcode{omp_get_wtick} routine} -The \scode{omp_get_wtime} routine can be used to measure the elapsed wall +The \kcode{omp_get_wtime} routine can be used to measure the elapsed wall clock time (in seconds) of code execution in a program. The routine is thread safe and can be executed by multiple threads concurrently. The precision of the timer can be obtained by a call to -the \scode{omp_get_wtick} routine. The following example shows a use case. +the \kcode{omp_get_wtick} routine. The following example shows a use case. 
\cexample{get_wtime}{1} @@ -25,35 +25,35 @@ \subsection{Timing Routines} %--------------------------- \subsection{Environment Display} \label{subsec:display_env} -\index{environment display!OMP_DISPLAY_ENV@\scode{OMP_DISPLAY_ENV}} -\index{environment variables!OMP_DISPLAY_ENV@\scode{OMP_DISPLAY_ENV}} -\index{OMP_DISPLAY_ENV@\scode{OMP_DISPLAY_ENV}} -\index{environment display!omp_display_env routine@\scode{omp_display_env} routine} -\index{routines!omp_display_env@\scode{omp_display_env}} -\index{omp_display_env routine@\scode{omp_display_env} routine} +\index{environment display!OMP_DISPLAY_ENV@\kcode{OMP_DISPLAY_ENV}} +\index{environment variables!OMP_DISPLAY_ENV@\kcode{OMP_DISPLAY_ENV}} +\index{OMP_DISPLAY_ENV@\kcode{OMP_DISPLAY_ENV}} +\index{environment display!omp_display_env routine@\kcode{omp_display_env} routine} +\index{routines!omp_display_env@\kcode{omp_display_env}} +\index{omp_display_env routine@\kcode{omp_display_env} routine} The OpenMP version number and the values of ICVs associated with the relevant environment variables can be displayed at runtime by setting -the \scode{OMP_DISPLAY_ENV} environment variable to either -\code{TRUE} or \code{VERBOSE}. +the \kcode{OMP_DISPLAY_ENV} environment variable to either +\vcode{TRUE} or \vcode{VERBOSE}. The information is displayed once by the runtime. A more flexible or controllable approach is to call -the \scode{omp_display_env} API routine at any desired +the \kcode{omp_display_env} API routine at any desired point of a code to display the same information. -This OpenMP 5.1 API routine takes a single \plc{verbose} argument. -A value of 0 or .false. (for C/C++ or Fortran) indicates +This OpenMP 5.1 API routine takes a single \ucode{verbose} argument. +A value of 0 or \bcode{.false.} (for C/C++ or Fortran) indicates the required OpenMP ICVs associated with environment variables be displayed, -and a value of 1 or .true. 
(for C/C++ or Fortran) will include +and a value of 1 or \bcode{.true.} (for C/C++ or Fortran) will include vendor-specific ICVs that can be modified by environment variables. The following example illustrates the conditional execution of the API -\scode{omp_display_env} routine. Typically it would be invoked in +\kcode{omp_display_env} routine. Typically it would be invoked in various debug modes of an application. An important use case is to have a single MPI process (e.g., rank = 0) of a hybrid (MPI+OpenMP) code execute the routine, instead of all MPI processes, as would be done by -setting the \scode{OMP_DISPLAY_ENV} to \code{TRUE} or \code{VERBOSE}. +setting the \kcode{OMP_DISPLAY_ENV} to \vcode{TRUE} or \vcode{VERBOSE}. \cexample[5.1]{display_env}{1} @@ -88,22 +88,22 @@ \subsection{Environment Display} %--------------------------- -\subsection{\code{error} Directive} +\subsection{\kcode{error} Directive} \label{subsec:error} -\index{directives!error@\code{error}} -\index{error directive@\code{error} directive} -\index{error directive@\code{error} directive!at clause@\code{at} clause} -\index{clauses!at@\code{at}} -\index{at clause@\code{at} clause} -\index{error directive@\code{error} directive!severity clause@\code{severity} clause} -\index{clauses!severity@\code{severity}} -\index{severity clause@\code{severity} clause} - -The \code{error} directive provides a consistent method for C, C++, and Fortran to emit a \plc{fatal} or -\plc{warning} message at \plc{compilation} or \plc{execution} time, as determined by a \code{severity} -or an \code{at} clause, respectively. When \code{severity(fatal)} is present, the compilation -or execution is aborted. Without any clauses the default behavior is as if \code{at(compilation)} -and \code{severity(fatal)} were specified. 
+\index{directives!error@\kcode{error}} +\index{error directive@\kcode{error} directive} +\index{error directive@\kcode{error} directive!at clause@\kcode{at} clause} +\index{clauses!at@\kcode{at}} +\index{at clause@\kcode{at} clause} +\index{error directive@\kcode{error} directive!severity clause@\kcode{severity} clause} +\index{clauses!severity@\kcode{severity}} +\index{severity clause@\kcode{severity} clause} + +The \kcode{error} directive provides a consistent method for C, C++, and Fortran to emit a \kcode{fatal} or +\kcode{warning} message at \kcode{compilation} or \kcode{execution} time, as determined by a \kcode{severity} +or an \kcode{at} clause, respectively. When \kcode{severity(fatal)} is present, the compilation +or execution is aborted. Without any clauses the default behavior is as if \kcode{at(compilation)} +and \kcode{severity(fatal)} were specified. The C, C++, and Fortran examples below show all the cases for reporting messages. diff --git a/program_control/variant.tex b/program_control/variant.tex deleted file mode 100644 index 31d7542..0000000 --- a/program_control/variant.tex +++ /dev/null @@ -1,88 +0,0 @@ -\pagebreak -\section{\code{declare}~\code{variant} Directive} -\label{sec:declare_variant} -\index{directives!declare variant@\code{declare}~\code{variant}} -\index{declare variant directive@\code{declare}~\code{variant} directive} -\index{declare variant directive@\code{declare}~\code{variant} directive!match clause@\code{match} clause} -\index{clauses!match@\code{match}} -\index{match clause@\code{match} clause} - -\index{directives!declare target@\code{declare}~\code{target}} -\index{declare target directive@\code{declare}~\code{target} directive} - -\index{directives!begin declare target@\code{begin}~\code{declare}~\code{target}} -\index{begin declare target directive@\code{begin}~\code{declare}~\code{target} directive} - -%A \code{declare variant} directive specifies that the following function is an alternate function, -%a 
\plc{function variant}, to be used in place of the specified \plc{base function} -%when the trait within the \code{match} clause has a valid context. - -A \code{declare}~\code{variant} directive specifies an alternate function, -\plc{function variant}, to be used in place of the \plc{base function} -%when the trait within the \code{match} clause has a valid context. -when the trait within the \code{match} clause matches the OpenMP context at a given call site. -The base function follows the directive in the C and C++ languages. -In Fortran, either a subroutine or function may be used as the \plc{base function}, -and the \code{declare}~\code{variant} directive must be in the specification -part of a subroutine or function (unless a \plc{base-proc-name} -modifier is used, as in the case of a procedure declaration statement). See -the OpenMP 5.0 Specification for details on the modifier. - -When multiple \code{declare}~\code{variant} directives are used -a function variant becomes a candidate for replacing the base function if the -%base function call context matches the traits of all selectors in the \code{match} clause. -context at the base function call matches the traits of all selectors in the \code{match} clause. -If there are multiple candidates, a score is assigned with rules for each -of the selector traits. The scoring algorithm can be found in the OpenMP 5.0 Specification. - -In the first example the \plc{vxv()} function is called within a \code{parallel} region, -a \code{target} region, and in a sequential part of the program. Two function variants, \plc{p\_vxv()} and \plc{t\_vxv()}, -are defined for the first two regions by using \plc{parallel} and \plc{target} selectors (within -the \plc{construct} trait set) in a \code{match} clause. 
The \plc{p\_vxv()} function variant includes -a \code{for} construct (\code{do} construct for Fortran) for the \code{parallel} region, -while \plc{t\_vxv()} includes a \code{distribute}~\code{simd} construct for the \code{target} region. -The \plc{t\_vxv()} function is explicitly compiled for the device using a declare target directive. - -Since the two \code{declare}~\code{variant} directives have no selectors that match traits for the context -of the base function call in the sequential part of the program, the base \plc{vxv()} function is used there, -as expected. -(The vectors in the \plc{p\_vxv} and \plc{t\_vxv} functions have been multiplied -by 3 and 2, respectively, for checking the validity of the replacement. Normally -the purpose of a function variant is to produce the same results by a different method.) - -%Note: a \code{target teams} construct is used to direct execution onto a device, with a -%\code{distribute simd} construct in the function variant. As of the OpenMP 5.0 implementation -%no intervening code is allowed between a \code{target} and \code{teams} construct. So -%using a \code{target} construct to direct execution onto a device, and including -%\code{teams distribute simd} in the variant function would produce non conforming code. - -%\pagebreak -\cexample[5.1]{declare_variant}{1} - -\ffreeexample[5.0]{declare_variant}{1} - - -%\pagebreak - -In this example, traits from the \plc{device} set are used to select a function variant. -In the \code{declare}~\code{variant} directive, an \plc{isa} selector -specifies that if the implementation of the ``\plc{core-avx512}'' -instruction set is detected at compile time the \plc{avx512\_saxpy()} -variant function is used for the call to \plc{base\_saxpy()}. - -A compilation of \plc{avx512\_saxpy()} is aware of -the AVX-512 instruction set that supports 512-bit vector extensions (for Xeon or Xeon Phi architectures). 
-Within \plc{avx512\_saxpy()}, the \code{parallel}~\code{for}~\code{simd} construct performs parallel execution, and -takes advantage of 64-byte data alignment. -When the \plc{avx512\_saxpy()} function variant is not selected, the base \plc{base\_saxpy()} function variant -containing only a basic \code{parallel}~\code{for} construct is used for the call to \plc{base\_saxpy()}. - -%Note: -%An allocator is used to set the alignment to 64 bytes when an OpenMP compilation is performed. -%Details about allocator variable declarations and functions -%can be found in the allocator example of the Memory Management Chapter. - -%\pagebreak -\cexample[5.0]{declare_variant}{2} - -\ffreeexample[5.0]{declare_variant}{2} diff --git a/sources/check_tags b/sources/check_tags index ff9e135..e9d9cd3 100755 --- a/sources/check_tags +++ b/sources/check_tags @@ -44,16 +44,20 @@ done # check all files echo " >>> Checking version tags in tex files" | tee -a $log_file +echo "$chk_tags -vtag $BASE_DIR/*/*.tex" | tee -a $log_file $chk_tags -vtag $BASE_DIR/*/*.tex 2>&1 | tee -a $log_file echo " >>> Checking tags and line length of source files" | tee -a $log_file +echo "$chk_tags -sc $BASE_DIR/*/sources/*" | tee -a $log_file $chk_tags -sc $BASE_DIR/*/sources/* 2>&1 | tee -a $log_file +echo "$chk_tags -clen $BASE_DIR/*/sources/*" | tee -a $log_file $chk_tags -clen $BASE_DIR/*/sources/* 2>&1 | tee -a $log_file echo " >>> Checking if all source files are referenced in tex files" | tee -a $log_file $chk_tags -v -list -vtag $BASE_DIR/*/*.tex | sort > msrc_in_tex.list ls $BASE_DIR/*/sources/* | sort > msrc_files.list +echo "diff -p msrc_in_tex.list msrc_files.list" | tee -a $log_file diff -p msrc_in_tex.list msrc_files.list 2>&1 | tee -a $log_file \rm -f msrc_in_tex.list msrc_files.list diff --git a/synchronization/acquire_release.tex b/synchronization/acquire_release.tex index 6817bdd..0da2b34 100644 --- a/synchronization/acquire_release.tex +++ b/synchronization/acquire_release.tex @@ -1,11 +1,11 
@@ -\pagebreak +%\pagebreak \section{Synchronization Based on Acquire/Release Semantics} \label{sec:acquire_and_release_semantics} %OpenMP 5.0 introduced ``release/acquire'' memory ordering semantics to the %specification. The memory ordering behavior of OpenMP constructs and routines %that permit two threads to synchronize with each other are defined in terms of -%\textit{release flushes} and \textit{acquire flushes}, where a release flush +%\emph{release flushes} and \emph{acquire flushes}, where a release flush %must occur at the source of the synchronization and an acquire flush must occur %at the sink of the synchronization. Flushes resulting from a \code{flush} %directive without a list may function as a release flush, an acquire flush, or @@ -21,107 +21,107 @@ \section{Synchronization Based on Acquire/Release Semantics} \index{flushes!acquire} \index{flushes!release} \index{clauses!memory ordering clauses} -\index{memory ordering clauses!acquire@\code{acquire}} -\index{acquire clause@\code{acquire} clause} -\index{memory ordering clauses!release@\code{release}} -\index{release clause@\code{release} clause} -\index{memory ordering clauses!acq_rel@\scode{acq_rel}} -\index{acq_rel clause@\scode{acq_rel} clause} -\index{flush construct@\code{flush} construct} -\index{atomic construct@\code{atomic} construct} -\index{clauses!acquire@\code{acquire}} -\index{clauses!release@\code{release}} -\index{clauses!acq_rel@\scode{acq_rel}} +\index{memory ordering clauses!acquire@\kcode{acquire}} +\index{acquire clause@\kcode{acquire} clause} +\index{memory ordering clauses!release@\kcode{release}} +\index{release clause@\kcode{release} clause} +\index{memory ordering clauses!acq_rel@\kcode{acq_rel}} +\index{acq_rel clause@\kcode{acq_rel} clause} +\index{flush construct@\kcode{flush} construct} +\index{atomic construct@\kcode{atomic} construct} +\index{clauses!acquire@\kcode{acquire}} +\index{clauses!release@\kcode{release}} +\index{clauses!acq_rel@\kcode{acq_rel}} As 
explained in the Memory Model chapter of this document, a flush operation -may be an \emph{acquire flush} and/or a \emph{release flush}, and OpenMP 5.0 +may be an \plc{acquire flush} and/or a \plc{release flush}, and OpenMP 5.0 defines acquire/release semantics in terms of these fundamental flush operations. For any synchronization between two threads that is specified by OpenMP, a release flush logically occurs at the source of the synchronization and an acquire flush logically occurs at the sink of the synchronization. -OpenMP 5.0 added memory ordering clauses -- \code{acquire}, \code{release}, and -\code{acq\_rel} -- to the \code{flush} and \code{atomic} constructs for +OpenMP 5.0 added memory ordering clauses -- \kcode{acquire}, \kcode{release}, and +\kcode{acq_rel} -- to the \kcode{flush} and \kcode{atomic} constructs for explicitly requesting acquire/release semantics. Furthermore, implicit flushes for all OpenMP constructs and runtime routines that synchronize OpenMP threads in some manner were redefined in terms of synchronizing release and acquire -flushes to avoid the requirement of strong memory fences (see the \plc{Flush -Synchronization and Happens Before} and \plc{Implicit Flushes} sections of the +flushes to avoid the requirement of strong memory fences (see the \docref{Flush +Synchronization and Happens Before} and \docref{Implicit Flushes} sections of the OpenMP Specifications document). The examples that follow in this section illustrate how acquire and release flushes may be employed, implicitly or explicitly, for synchronizing threads. A -\code{flush} directive without a list and without any memory ordering clause +\kcode{flush} directive without a list and without any memory ordering clause can also function as both an acquire and release flush for facilitating thread synchronization. 
Flushes implied on entry to, or exit from, an atomic -operation (specified by an \code{atomic} construct) may function as an acquire +operation (specified by an \kcode{atomic} construct) may function as an acquire flush or a release flush if a memory ordering clause appears on the construct. -On entry to and exit from a \code{critical} construct there is now an implicit +On entry to and exit from a \kcode{critical} construct there is now an implicit acquire flush and release flush, respectively. %%%%%%%%%%%%%%%%%% -\index{constructs!critical@\code{critical}} -\index{critical construct@\code{critical} construct} +\index{constructs!critical@\kcode{critical}} +\index{critical construct@\kcode{critical} construct} \index{flushes!implicit} The first example illustrates how the release and acquire flushes implied by a -\code{critical} region guarantee a value written by the first thread is visible -to a read of the value on the second thread. Thread 0 writes to \plc{x} and -then executes a \code{critical} region in which it writes to \plc{y}; the write -to \plc{x} happens before the execution of the \code{critical} region, +\kcode{critical} region guarantee a value written by the first thread is visible +to a read of the value on the second thread. Thread 0 writes to \ucode{x} and +then executes a \kcode{critical} region in which it writes to \ucode{y}; the write +to \ucode{x} happens before the execution of the \kcode{critical} region, consistent with the program order of the thread. Meanwhile, thread 1 executes a -\code{critical} region in a loop until it reads a non-zero value from -\plc{y} in the \code{critical} region, after which it prints the value of -\plc{x}; again, the execution of the \code{critical} regions happen before the -read from \plc{x} based on the program order of the thread. 
The \code{critical} +\kcode{critical} region in a loop until it reads a non-zero value from +\ucode{y} in the \kcode{critical} region, after which it prints the value of +\ucode{x}; again, the execution of the \kcode{critical} regions happens before the +read from \ucode{x} based on the program order of the thread. The \kcode{critical} regions executed by the two threads execute in a serial manner, with a -pairwise synchronization from the exit of one \code{critical} region to the -entry to the next \code{critical} region. These pairwise synchronizations +pair-wise synchronization from the exit of one \kcode{critical} region to the +entry to the next \kcode{critical} region. These pair-wise synchronizations result from the implicit release flushes that occur on exit from -\code{critical} regions and the implicit acquire flushes that occur on entry to -\code{critical} regions; hence, the execution of each \code{critical} region in -the sequence happens before the execution of the next \code{critical} region. -A ``happens before'' order is therefore established between the assignment to \plc{x} -by thread 0 and the read from \plc{x} by thread 1, and so thread 1 must see that -\plc{x} equals 10. +\kcode{critical} regions and the implicit acquire flushes that occur on entry to +\kcode{critical} regions; hence, the execution of each \kcode{critical} region in +the sequence happens before the execution of the next \kcode{critical} region. +A ``happens before'' order is therefore established between the assignment to \ucode{x} +by thread 0 and the read from \ucode{x} by thread 1, and so thread 1 must see that +\ucode{x} equals 10. 
-\pagebreak +%\pagebreak \cexample[5.0]{acquire_release}{1} \ffreeexample[5.0]{acquire_release}{1} -\index{constructs!atomic@\code{atomic}} -\index{atomic construct@\code{atomic} construct} -\index{atomic construct@\code{atomic} construct!write clause@\code{write} clause} -\index{atomic construct@\code{atomic} construct!read clause@\code{read} clause} -\index{atomic construct@\code{atomic} construct!memory ordering clauses} -\index{write clause@\code{write} clause} -\index{read clause@\code{read} clause} -\index{clauses!write@\code{write}} -\index{clauses!read@\code{read}} -\index{memory ordering clauses!seq_cst@\scode{seq_cst}} -\index{seq_cst clause@\scode{seq_cst} clause} -\index{clauses!seq_cst@\scode{seq_cst}} -In the second example, the \code{critical} constructs are exchanged with -\code{atomic} constructs that have \textit{explicit} memory ordering specified. When the -atomic read operation on thread 1 reads a non-zero value from \plc{y}, this +\index{constructs!atomic@\kcode{atomic}} +\index{atomic construct@\kcode{atomic} construct} +\index{atomic construct@\kcode{atomic} construct!write clause@\kcode{write} clause} +\index{atomic construct@\kcode{atomic} construct!read clause@\kcode{read} clause} +\index{atomic construct@\kcode{atomic} construct!memory ordering clauses} +\index{write clause@\kcode{write} clause} +\index{read clause@\kcode{read} clause} +\index{clauses!write@\kcode{write}} +\index{clauses!read@\kcode{read}} +\index{memory ordering clauses!seq_cst@\kcode{seq_cst}} +\index{seq_cst clause@\kcode{seq_cst} clause} +\index{clauses!seq_cst@\kcode{seq_cst}} +In the second example, the \kcode{critical} constructs are exchanged with +\kcode{atomic} constructs that have \emph{explicit} memory ordering specified. 
When the +\plc{atomic read} operation on thread 1 reads a non-zero value from \ucode{y}, this results in a release/acquire synchronization that in turn implies that the -assignment to \plc{x} on thread 0 happens before the read of \plc{x} on thread +assignment to \ucode{x} on thread 0 happens before the read of \ucode{x} on thread 1. Therefore, thread 1 will print ``x = 10''. \cexample[5.0]{acquire_release}{2} \ffreeexample[5.0]{acquire_release}{2} \pagebreak -\index{constructs!atomic@\code{atomic}} -\index{atomic construct@\code{atomic} construct!relaxed atomic operations} -\index{flush construct@\code{flush} construct} -In the third example, \code{atomic} constructs that specify relaxed atomic -operations are used with explicit \code{flush} directives to enforce memory -ordering between the two threads. The explicit \code{flush} directive on thread -0 must specify a release flush and the explicit \code{flush} directive on +\index{constructs!atomic@\kcode{atomic}} +\index{atomic construct@\kcode{atomic} construct!relaxed atomic operations} +\index{flush construct@\kcode{flush} construct} +In the third example, \kcode{atomic} constructs that specify relaxed atomic +operations are used with explicit \kcode{flush} directives to enforce memory +ordering between the two threads. The explicit \kcode{flush} directive on thread +0 must specify a release flush and the explicit \kcode{flush} directive on thread 1 must specify an acquire flush to establish a release/acquire -synchronization between the two threads. The \code{flush} and \code{atomic} -constructs encountered by thread 0 can be replaced by the \code{atomic} construct used in -Example 2 for thread 0, and similarly the \code{flush} and \code{atomic} -constructs encountered by thread 1 can be replaced by the \code{atomic} +synchronization between the two threads. 
The \kcode{flush} and \kcode{atomic} +constructs encountered by thread 0 can be replaced by the \kcode{atomic} construct used in +Example 2 for thread 0, and similarly the \kcode{flush} and \kcode{atomic} +constructs encountered by thread 1 can be replaced by the \kcode{atomic} construct used in Example 2 for thread 1. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%3 @@ -140,21 +140,21 @@ \section{Synchronization Based on Acquire/Release Semantics} \cexample[5.0]{acquire_release}{3} \ffreeexample[5.0]{acquire_release}{3} -Example 4 will fail to order the write to \plc{x} on thread 0 before the read -from \plc{x} on thread 1. Importantly, the implicit release flush on exit from -the \code{critical} region will not synchronize with the acquire flush that -occurs on the atomic read operation performed by thread 1. This is because +Example 4 will fail to order the write to \ucode{x} on thread 0 before the read +from \ucode{x} on thread 1. Importantly, the implicit release flush on exit from +the \kcode{critical} region will not synchronize with the acquire flush that +occurs on the \plc{atomic read} operation performed by thread 1. This is because implicit release flushes that occur on a given construct may only synchronize with implicit acquire flushes on a compatible construct (and vice-versa) that internally makes use of the same synchronization variable. For a -\code{critical} construct, this might correspond to a \plc{lock} object that +\kcode{critical} construct, this might correspond to a \plc{lock} object that is used by a given implementation (for the synchronization semantics of other -constructs due to implicit release and acquire flushes, refer to the \plc{Implicit -Flushes} section of the OpenMP Specifications document). Either an explicit \code{flush} +constructs due to implicit release and acquire flushes, refer to the \docref{Implicit +Flushes} section of the OpenMP Specifications document). 
Either an explicit \kcode{flush} directive that provides a release flush (i.e., a flush without a list that does -not have the \code{acquire} clause) must be specified between the -\code{critical} construct and the atomic write, or an atomic operation that -modifies \plc{y} and provides release semantics must be specified. +not have the \kcode{acquire} clause) must be specified between the +\kcode{critical} construct and the \plc{atomic write}, or an atomic operation that +modifies \ucode{y} and provides release semantics must be specified. %{\color{violet} %In the following example synchronization between the acquire flush of the atomic read diff --git a/synchronization/atomic.tex b/synchronization/atomic.tex index e373dbc..8cf6809 100644 --- a/synchronization/atomic.tex +++ b/synchronization/atomic.tex @@ -1,36 +1,36 @@ -\pagebreak -\section{\code{atomic} Construct} +%\pagebreak +\section{\kcode{atomic} Construct} \label{sec:atomic} -\index{constructs!atomic@\code{atomic}} -\index{atomic construct@\code{atomic} construct} -\index{atomic construct@\code{atomic} construct!update clause@\code{update} clause} -\index{clauses!update@\code{update}} -\index{update clause@\code{update} clause} +\index{constructs!atomic@\kcode{atomic}} +\index{atomic construct@\kcode{atomic} construct} +\index{atomic construct@\kcode{atomic} construct!update clause@\kcode{update} clause} +\index{clauses!update@\kcode{update}} +\index{update clause@\kcode{update} clause} The following example avoids race conditions (simultaneous updates of an element -of \plc{x} by multiple threads) by using the \code{atomic} construct . +of \ucode{x} by multiple threads) by using the \kcode{atomic} construct. -The advantage of using the \code{atomic} construct in this example is that it -allows updates of two different elements of \plc{x} to occur in parallel.
If -a \code{critical} construct were used instead, then all updates to elements of -\plc{x} would be executed serially (though not in any guaranteed order). +The advantage of using the \kcode{atomic} construct in this example is that it +allows updates of two different elements of \ucode{x} to occur in parallel. If +a \kcode{critical} construct were used instead, then all updates to elements of +\ucode{x} would be executed serially (though not in any guaranteed order). -Note that the \code{atomic} directive applies only to the statement immediately -following it. As a result, elements of \plc{y} are not updated atomically in +Note that the \kcode{atomic} directive applies only to the statement immediately +following it. As a result, elements of \ucode{y} are not updated atomically in this example. \cexample[3.1]{atomic}{1} \fexample[3.1]{atomic}{1} -\index{atomic construct@\code{atomic} construct!write clause@\code{write} clause} -\index{atomic construct@\code{atomic} construct!read clause@\code{read} clause} -\index{write clause@\code{write} clause} -\index{clauses!write@\code{write}} -\index{read clause@\code{read} clause} -\index{clauses!read@\code{read}} -The following example illustrates the \code{read} and \code{write} clauses -for the \code{atomic} directive. These clauses ensure that the given variable +\index{atomic construct@\kcode{atomic} construct!write clause@\kcode{write} clause} +\index{atomic construct@\kcode{atomic} construct!read clause@\kcode{read} clause} +\index{write clause@\kcode{write} clause} +\index{clauses!write@\kcode{write}} +\index{read clause@\kcode{read} clause} +\index{clauses!read@\kcode{read}} +The following example illustrates the \kcode{read} and \kcode{write} clauses +for the \kcode{atomic} directive. These clauses ensure that the given variable is read or written, respectively, as a whole. 
Otherwise, some other thread might read or write part of the variable while the current thread was reading or writing another part of the variable. Note that most hardware provides atomic reads and @@ -41,15 +41,15 @@ \section{\code{atomic} Construct} \fexample[3.1]{atomic}{2} -\index{atomic construct@\code{atomic} construct!capture clause@\code{capture} clause} -\index{capture clause@\code{capture} clause} -\index{clauses!capture@\code{capture}} -The following example illustrates the \code{capture} clause for the \code{atomic} +\index{atomic construct@\kcode{atomic} construct!capture clause@\kcode{capture} clause} +\index{capture clause@\kcode{capture} clause} +\index{clauses!capture@\kcode{capture}} +The following example illustrates the \kcode{capture} clause for the \kcode{atomic} directive. In this case the value of a variable is captured, and then the variable is incremented. These operations occur atomically. This example could -be implemented using the fetch-and-add instruction available on many kinds of hardware. -The example also shows a way to implement a spin lock using the \code{capture} - and \code{read} clauses. +be implemented using the \emph{fetch-and-add} instruction available on many kinds of hardware. +The example also shows a way to implement a spin lock using the \kcode{capture} + and \kcode{read} clauses. 
\cexample[3.1]{atomic}{3} diff --git a/synchronization/atomic_cas.tex b/synchronization/atomic_cas.tex index 1a4350e..c1315bd 100644 --- a/synchronization/atomic_cas.tex +++ b/synchronization/atomic_cas.tex @@ -1,41 +1,41 @@ -\pagebreak +%\pagebreak \section{Atomic Compare} \label{sec:cas} -\index{constructs!atomic@\code{atomic}} -\index{atomic construct@\code{atomic} construct} -\index{clauses!capture@\code{capture}} -\index{clauses!compare@\code{compare}} -\index{capture clause@\code{capture} clause} -\index{compare clause@\code{compare} clause} +\index{constructs!atomic@\kcode{atomic}} +\index{atomic construct@\kcode{atomic} construct} +\index{clauses!capture@\kcode{capture}} +\index{clauses!compare@\kcode{compare}} +\index{capture clause@\kcode{capture} clause} +\index{compare clause@\kcode{compare} clause} -In OpenMP 5.1 the \scode{compare} clause was added to the extended-atomic clauses. -The \scode{compare} clause extends the semantics to perform the \scode{atomic} +In OpenMP 5.1 the \kcode{compare} clause was added to the \plc{extended-atomic} clauses. +The \kcode{compare} clause extends the semantics to perform the \kcode{atomic} update conditionally. In the following C/C++ example, two formats of structured blocks -are shown for associated \scode{atomic} constructs with the \scode{compare} clause. -In the first \scode{atomic} construct, the format forms a conditional update statement. -In the second \scode{atomic} construct the format forms a conditional expression statement. -The ``greater than'' and ``less than'' forms are not available with the Fortran \scode{compare} -clause. One can use the \splc{max} and \splc{min} functions with the \scode{atomic}~\scode{update} +are shown for associated \kcode{atomic} constructs with the \kcode{compare} clause. +In the first \kcode{atomic} construct, the format forms a \plc{conditional update} statement. +In the second \kcode{atomic} construct the format forms a \plc{conditional expression} statement. 
+The ``greater than'' and ``less than'' forms are not available with the Fortran \kcode{compare} +clause. One can use the \vcode{max} and \vcode{min} functions with the \kcode{atomic update} construct to perform the C/C++ example operations. \cexample[5.1]{cas}{1} %\ffreeexample[5.1]{cas}{1} -In OpenMP 5.1 the \scode{compare} clause was also added to support Compare And -Swap (CAS) semantics. In the following example the \splc{enqueue} routine +In OpenMP 5.1 the \kcode{compare} clause was also added to support \emph{Compare And +Swap} (CAS) semantics. In the following example the \ucode{enqueue} routine (a naive implementation of a Michael and Scott enqueue function), uses the -\scode{compare} clause, with the \scode{capture} clause, to perform and compare -(\splc{q->head == node->next}) and swap (\splc{if-else} assignments) of the +\kcode{compare} clause, with the \kcode{capture} clause, to perform a compare +(\ucode{q->head == node->next}) and swap (\plc{if-else} assignments) of the form: \begin{description}[noitemsep,labelindent=5mm,widest=f90] \item \splc{{ r = x == e; if(r) { x = d; } else { v = x; } }}. \end{description} -The example program concurrently enqueues nodes from an array of nodes (\splc{nodes[N]}). +The example program concurrently enqueues nodes from an array of nodes (\ucode{nodes[N]}). Since the equivalence of Fortran pointers can be determined only with a function (such as associated), -no Fortran version is provided here. The use of the associated function in an atomic compare syntax is +no Fortran version is provided here. The use of the associated function in an \kcode{atomic compare} syntax is being considered in a future release.
\cexample[5.1]{cas}{2} diff --git a/synchronization/atomic_hint.tex b/synchronization/atomic_hint.tex new file mode 100644 index 0000000..a83239d --- /dev/null +++ b/synchronization/atomic_hint.tex @@ -0,0 +1,31 @@ +%\pagebreak +\section{Atomic Hint} +\label{sec:atomic_hint} +\index{constructs!atomic@\kcode{atomic}} +\index{atomic construct@\kcode{atomic} construct} +\index{atomic construct@\kcode{atomic} construct!hint clause@\kcode{hint} clause} +\index{clauses!hint@\kcode{hint}} +\index{hint clause@\kcode{hint} clause} + +The atomic \kcode{hint} clause can be used to specify the +expected access to an atomic operation, thereby providing a hint +to be used for optimizing the synchronization of the atomic operation. + +In the example below the \kcode{omp_sync_hint_uncontended} constant +in the \kcode{hint} clause specifies that few threads are expected +to attempt to perform the atomic operation at the same time. +This is justified in this case if \ucode{calc_vals} takes considerably +more time than the atomic operations, and the subsequent time of +arrival to execute the \kcode{atomic} region varies about a mean time +by amounts (much) greater than the execution time of the atomic +operation. + +In the case where the execution time for \ucode{calc_vals} is short +compared to the atomic operation time, the \kcode{omp_sync_hint_contended} +hint parameter might be used.
+ +\cexample[5.0]{atomic}{4} + +\ffreeexample[5.0]{atomic}{4} + + diff --git a/synchronization/atomic_restrict.tex b/synchronization/atomic_restrict.tex index c5d6f9c..50efed8 100644 --- a/synchronization/atomic_restrict.tex +++ b/synchronization/atomic_restrict.tex @@ -1,10 +1,10 @@ -\pagebreak -\section{Restrictions on the \code{atomic} Construct} +%\pagebreak +\section{Restrictions on the \kcode{atomic} Construct} \label{sec:atomic_restrict} -\index{constructs!atomic@\code{atomic}} -\index{atomic construct@\code{atomic} construct} +\index{constructs!atomic@\kcode{atomic}} +\index{atomic construct@\kcode{atomic} construct} -The following non-conforming examples illustrate the restrictions on the \code{atomic} +The following non-conforming examples illustrate the restrictions on the \kcode{atomic} construct. \cexample[3.1]{atomic_restrict}{1} @@ -14,7 +14,7 @@ \section{Restrictions on the \code{atomic} Construct} \cexample[3.1]{atomic_restrict}{2} \fortranspecificstart -The following example is non-conforming because \code{I} and \code{R} reference +The following example is non-conforming because \ucode{I} and \ucode{R} reference the same location but have different types. \fnexample[3.1]{atomic_restrict}{2} diff --git a/synchronization/barrier_regions.tex b/synchronization/barrier_regions.tex index a975d38..d0607ab 100644 --- a/synchronization/barrier_regions.tex +++ b/synchronization/barrier_regions.tex @@ -1,22 +1,22 @@ -\pagebreak -\section{Binding of \code{barrier} Regions} +%\pagebreak +\section{Binding of \kcode{barrier} Regions} \label{sec:barrier_regions} -\index{binding!barrier regions@\code{barrier} regions} +\index{binding!barrier regions@\kcode{barrier} regions} -The binding rules call for a \code{barrier} region to bind to the closest enclosing -\code{parallel} region. +The binding rules call for a \kcode{barrier} region to bind to the closest enclosing +\kcode{parallel} region. 
-In the following example, the call from the main program to \plc{sub2} is conforming -because the \code{barrier} region (in \plc{sub3}) binds to the \code{parallel} -region in \plc{sub2}. The call from the main program to \plc{sub1} is conforming -because the \code{barrier} region binds to the \code{parallel} region in subroutine -\plc{sub2}. +In the following example, the call from the main program to \ucode{sub2} is conforming +because the \kcode{barrier} region (in \ucode{sub3}) binds to the \kcode{parallel} +region in \ucode{sub2}. The call from the main program to \ucode{sub1} is conforming +because the \kcode{barrier} region binds to the \kcode{parallel} region in subroutine +\ucode{sub2}. -The call from the main program to \plc{sub3} is conforming because the \code{barrier} -region binds to the implicit inactive \code{parallel} region enclosing the sequential -part. Also note that the \code{barrier} region in \plc{sub3} when called from -\plc{sub2} only synchronizes the team of threads in the enclosing \code{parallel} -region and not all the threads created in \plc{sub1}. +The call from the main program to \ucode{sub3} is conforming because the \kcode{barrier} +region binds to the implicit inactive \kcode{parallel} region enclosing the sequential +part. Also note that the \kcode{barrier} region in \ucode{sub3} when called from +\ucode{sub2} only synchronizes the team of threads in the enclosing \kcode{parallel} +region and not all the threads created in \ucode{sub1}. 
\cexample{barrier_regions}{1} diff --git a/synchronization/critical.tex b/synchronization/critical.tex index cd6fc7b..5bbac5e 100644 --- a/synchronization/critical.tex +++ b/synchronization/critical.tex @@ -1,24 +1,24 @@ -\pagebreak -\section{\code{critical} Construct} +%\pagebreak +\section{\kcode{critical} Construct} \label{sec:critical} -\index{constructs!critical@\code{critical}} -\index{critical construct@\code{critical} construct} -\index{critical construct@\code{critical} construct!hint clause@\code{hint} clause} -\index{clauses!hint@\code{hint}} -\index{hint clause@\code{hint} clause} +\index{constructs!critical@\kcode{critical}} +\index{critical construct@\kcode{critical} construct} +\index{critical construct@\kcode{critical} construct!hint clause@\kcode{hint} clause} +\index{clauses!hint@\kcode{hint}} +\index{hint clause@\kcode{hint} clause} -The following example includes several \code{critical} constructs. The example +The following example includes several \kcode{critical} constructs. The example illustrates a queuing model in which a task is dequeued and worked on. To guard against multiple threads dequeuing the same task, the dequeuing operation must -be in a \code{critical} region. Because the two queues in this example are independent, -they are protected by \code{critical} constructs with different names, \plc{xaxis} -and \plc{yaxis}. +be in a \kcode{critical} region. Because the two queues in this example are independent, +they are protected by \kcode{critical} constructs with different names, \ucode{xaxis} +and \ucode{yaxis}. \cexample{critical}{1} \fexample{critical}{1} -The following example extends the previous example by adding the \code{hint} clause to the \code{critical} constructs. +The following example extends the previous example by adding the \kcode{hint} clause to the \kcode{critical} constructs. 
\cexample[5.0]{critical}{2} diff --git a/synchronization/depobj.tex b/synchronization/depobj.tex index 7b8183a..79bb573 100644 --- a/synchronization/depobj.tex +++ b/synchronization/depobj.tex @@ -1,59 +1,59 @@ -\pagebreak -\section{\code{depobj} Construct} +%\pagebreak +\section{\kcode{depobj} Construct} \label{sec:depobj} -\index{constructs!depobj@\code{depobj}} -\index{depobj construct@\code{depobj} construct} -\index{depobj construct@\code{depobj} construct!depend clause@\code{depend} clause} -\index{depend clause@\code{depend} clause} -\index{clauses!depend@\code{depend}} +\index{constructs!depobj@\kcode{depobj}} +\index{depobj construct@\kcode{depobj} construct} +\index{depobj construct@\kcode{depobj} construct!depend clause@\kcode{depend} clause} +\index{depend clause@\kcode{depend} clause} +\index{clauses!depend@\kcode{depend}} -The stand-alone \code{depobj} construct provides a mechanism +The stand-alone \kcode{depobj} construct provides a mechanism to create a \plc{depend object} that expresses a dependence to be -used subsequently in the \code{depend} clause of another construct. -The dependence is created from a dependence type and a storage location, -within a \code{depend} clause of an \code{depobj} construct; +used subsequently in the \kcode{depend} clause of another construct. +Dependence information is created from a dependence type and a storage location +that are specified in the \kcode{depend} clause of a \kcode{depobj} construct, %just as one would find directly on a \code{task} construct. and it is stored in the depend object. -The depend object is represented by a variable of type \code{omp\_depend\_t} -in C/C++ (by a scalar variable of integer kind \code{omp\_depend\_kind} in Fortran).
- -\index{depobj construct@\code{depobj} construct!update clause@\code{update} clause} -\index{update clause@\code{update} clause} -\index{clauses!update@\code{update}} -\index{depobj construct@\code{depobj} construct!destroy clause@\code{destroy} clause} -\index{destroy clause@\code{destroy} clause} -\index{clauses!destroy@\code{destroy}} -In the example below the stand-alone \code{depobj} construct uses the -\code{depend}, \code{update} and \code{destroy} clauses to +The depend object is represented by a variable of type \kcode{omp_depend_t} +in C/C++ and by a scalar variable of integer kind \kcode{omp_depend_kind} in Fortran. + +\index{depobj construct@\kcode{depobj} construct!update clause@\kcode{update} clause} +\index{update clause@\kcode{update} clause} +\index{clauses!update@\kcode{update}} +\index{depobj construct@\kcode{depobj} construct!destroy clause@\kcode{destroy} clause} +\index{destroy clause@\kcode{destroy} clause} +\index{clauses!destroy@\kcode{destroy}} +In the example below the stand-alone \kcode{depobj} construct uses the +\kcode{depend}, \kcode{update} and \kcode{destroy} clauses to \plc{initialize}, \plc{update} and \plc{uninitialize} -a depend object (\code{obj}). +a depend object (\ucode{obj}). -The first \code{depobj} construct initializes the \code{obj} +The first \kcode{depobj} construct initializes the \ucode{obj} depend object with -an \code{inout} dependence type with a storage -location defined by variable \code{a}. -This dependence is passed into the \plc{driver} -routine via the \code{obj} depend object. - -In the first \plc{driver} routine call, \emph{Task 1} uses -the dependence of the object (\code{inout}), -while \emph{Task 2} uses an \code{in} dependence specified -directly in a \code{depend} clause. +an \kcode{inout} dependence type and with a storage +location defined by variable \ucode{a}. +This dependence is passed into the \ucode{driver} +routine via the \ucode{obj} depend object. 
+ +In the first \ucode{driver} routine call, \emph{Task 1} uses +the dependence of the object (\kcode{inout}), +while \emph{Task 2} uses an \kcode{in} dependence specified +directly in a \kcode{depend} clause. For these task dependences \emph{Task 1} must execute and complete before \emph{Task 2} begins. -Before the second call to \plc{driver}, \code{obj} is updated -using the \code{depobj} construct to represent an \code{in} dependence. -Hence, in the second call to \plc{driver}, \emph{Task 1} -will have an \code{in} dependence; and \emph{Task 1} and -\emph{Task 2} can execute simultaneously. Note: in an \code{update} +Before the second call to \ucode{driver}, \ucode{obj} is updated +using the \kcode{depobj} construct to represent an \kcode{in} dependence. +Hence, in the second call to \ucode{driver}, \emph{Task 1} +will have an \kcode{in} dependence; and \emph{Task 1} and +\emph{Task 2} can execute simultaneously. Note: in an \kcode{update} clause, only the dependence type can be (is) updated. -The third \code{depobj} construct uses the \code{destroy} clause. -It frees resources as it puts the depend object in an uninitialized state-- +The third \kcode{depobj} construct uses the \kcode{destroy} clause. +It frees resources as it puts the depend object in an uninitialized state -- effectively destroying the depend object. After an object has been uninitialized it can be initialized again -with a new dependence type \emph{and} a new variable. +with a new dependence type and a new variable. 
\cexample[5.2]{depobj}{1} diff --git a/synchronization/doacross.tex b/synchronization/doacross.tex index 7abe1ff..36c44d4 100644 --- a/synchronization/doacross.tex +++ b/synchronization/doacross.tex @@ -1,64 +1,65 @@ -\pagebreak +%\pagebreak \section{Doacross Loop Nest} \label{sec:doacross} \index{dependences!doacross loop nest} -\index{doacross loop nest!ordered construct@\code{ordered} construct} -\index{ordered construct@\code{ordered} construct!doacross loop nest} -\index{doacross loop nest!doacross clause@\code{doacross} clause} -\index{constructs!ordered@\code{ordered}} -\index{clauses!doacross@\code{doacross}} -\index{doacross clause@\code{doacross} clause} +\index{doacross loop nest!ordered construct@\kcode{ordered} construct} +\index{ordered construct@\kcode{ordered} construct!doacross loop nest} +\index{doacross loop nest!doacross clause@\kcode{doacross} clause} +\index{constructs!ordered@\kcode{ordered}} +\index{clauses!doacross@\kcode{doacross}} +\index{doacross clause@\kcode{doacross} clause} -An \code{ordered} clause can be used on a loop construct with an integer +An \kcode{ordered} clause can be used on a worksharing-loop construct with an integer parameter argument to define the number of associated loops within a \plc{doacross loop nest} where cross-iteration dependences exist. -A \code{doacross} clause on an \code{ordered} construct within an ordered +A \kcode{doacross} clause on an \kcode{ordered} construct within an \plc{ordered} loop describes the dependences of the \plc{doacross} loops. -In the code below, the \code{doacross(sink:i-1)} clause defines an \plc{i-1} -to \plc{i} cross-iteration dependence that specifies a wait point for -the completion of computation from iteration \plc{i-1} before proceeding -to the subsequent statements. 
The \scode{doacross(source:omp_cur_iteration)} -or \scode{doacross(source:)} clause indicates -the completion of computation from the current iteration (\plc{i}) +In the code below, the \kcode{doacross(sink:\ucode{i-1})} clause defines an \ucode{i-1} +to \ucode{i} cross-iteration dependence that specifies a wait point for +the completion of computation from iteration \ucode{i-1} before proceeding +to the subsequent statements. The \kcode{doacross(source:omp_cur_iteration)} +or \kcode{doacross(source:)} clause indicates +the completion of computation from the current iteration (\ucode{i}) to satisfy the cross-iteration dependence that arises from the iteration. -The \scode{omp_cur_iteration} keyword is optional for the \scode{source} +The \kcode{omp_cur_iteration} keyword is optional for the \kcode{source} dependence type. For this example the same sequential ordering could have been achieved -with an \code{ordered} clause without a parameter, on the loop directive, -and a single \code{ordered} directive without the \code{doacross} clause -specified for the statement executing the \plc{bar} function. +with an \kcode{ordered} clause without a parameter on the worksharing-loop directive, +and a single \kcode{ordered} directive without the \kcode{doacross} clause +specified for the statement executing the \ucode{bar} function. \cexample[5.2]{doacross}{1} \ffreeexample[5.2]{doacross}{1} +\pagebreak The following code is similar to the previous example but with -\plc{doacross loop nest} extended to two nested loops, \plc{i} and \plc{j}, -as specified by the \code{ordered(2)} clause on the loop directive. -In the C/C++ code, the \plc{i} and \plc{j} loops are the first and +the \plc{doacross loop nest} extended to two nested loops, \ucode{i} and \ucode{j}, +as specified by the \kcode{ordered(\ucode{2})} clause on the worksharing-loop directive. 
+In the C/C++ code, the \ucode{i} and \ucode{j} loops are the first and second associated loops, respectively, whereas -in the Fortran code, the \plc{j} and \plc{i} loops are the first and +in the Fortran code, the \ucode{j} and \ucode{i} loops are the first and second associated loops, respectively. -The \code{doacross(sink:i-1,j)} and \code{doacross(sink:i,j-1)} clauses in +The \kcode{doacross(sink:\ucode{i-1,j})} and \kcode{doacross(sink:\ucode{i,j-1})} clauses in the C/C++ code define cross-iteration dependences in two dimensions from -iterations (\plc{i-1, j}) and (\plc{i, j-1}) to iteration (\plc{i, j}). -Likewise, the \code{doacross(sink:j-1,i)} and \code{doacross(sink:j,i-1)} clauses +iterations (\ucode{i-1, j}) and (\ucode{i, j-1}) to iteration (\ucode{i, j}). +Likewise, the \kcode{doacross(sink:\ucode{j-1,i})} and \kcode{doacross(sink:\ucode{j,i-1})} clauses in the Fortran code define cross-iteration dependences from iterations -(\plc{j-1, i}) and (\plc{j, i-1}) to iteration (\plc{j, i}). +(\ucode{j-1, i}) and (\ucode{j, i-1}) to iteration (\ucode{j, i}). \cexample[5.2]{doacross}{2} \ffreeexample[5.2]{doacross}{2} -The following example shows the incorrect use of the \code{ordered} -directive with a \code{doacross} clause. There are two issues with the code. -The first issue is a missing \code{ordered}~\code{doacross(source:)} directive, +The following example shows an incorrect use of the \kcode{ordered} +directive with a \kcode{doacross} clause. There are two issues with the code. +The first issue is a missing \kcode{ordered doacross(source:)} directive, which could cause a deadlock. 
-The second issue is the \code{doacross(sink:i+1,j)} and \code{doacross(sink:i,j+1)} +The second issue is that the \kcode{doacross(sink:\ucode{i+1,j})} and \kcode{doacross(sink:\ucode{i,j+1})} clauses define dependences on lexicographically later -source iterations (\plc{i+1, j}) and (\plc{i, j+1}), which could cause +source iterations (\ucode{i+1, j}) and (\ucode{i, j+1}), which could cause a deadlock as well since they may not start to execute until the current iteration completes. \cexample[5.2]{doacross}{3} @@ -66,12 +67,13 @@ \section{Doacross Loop Nest} \ffreeexample[5.2]{doacross}{3} -The following example illustrates the use of the \code{collapse} clause for -a \plc{doacross loop nest}. The \plc{i} and \plc{j} loops are the associated +The following example illustrates the use of the \kcode{collapse} clause for +a \plc{doacross loop nest}. The \ucode{i} and \ucode{j} loops are the associated loops for the collapsed loop as well as for the \plc{doacross loop nest}. -The example also shows a compliant usage of the dependence source -directive placed before the corresponding sink directive. -Checking the completion of computation from previous iterations at the sink point can occur after the source statement. +The example also shows a conforming usage of the \kcode{ordered} directive specifying a cross-iteration source +that is placed before a corresponding \kcode{ordered} directive specifying a +cross-iteration sink. There is no requirement that the source specification +must follow the sink specification in a given iteration.
\cexample[5.2]{doacross}{4} diff --git a/synchronization/flush_nolist.tex b/synchronization/flush_nolist.tex deleted file mode 100644 index f9e1d8f..0000000 --- a/synchronization/flush_nolist.tex +++ /dev/null @@ -1,15 +0,0 @@ -\pagebreak -\section{\code{flush} Construct without a List} -\label{sec:flush_nolist} -\index{constructs!flush@\code{flush}} -\index{flush construct@\code{flush} construct} -\index{flushes!flush without a list} - -The following example distinguishes the shared variables affected by a \code{flush} -construct with no list from the shared objects that are not affected: - -\cexample{flush_nolist}{1} - -\fexample{flush_nolist}{1} - - diff --git a/synchronization/init_lock.tex b/synchronization/init_lock.tex index 0836067..25ec36d 100644 --- a/synchronization/init_lock.tex +++ b/synchronization/init_lock.tex @@ -1,10 +1,10 @@ -\subsection{\code{omp\_init\_lock} Routine} +\subsection{\kcode{omp_init_lock} Routine} \label{subsec:init_lock} -\index{routines!omp_init_lock@\scode{omp_init_lock}} -\index{omp_init_lock routine@\scode{omp_init_lock} routine} -The following example demonstrates how to initialize an array of locks in a \code{parallel} -region by using \code{omp\_init\_lock}. +\index{routines!omp_init_lock@\kcode{omp_init_lock}} +\index{omp_init_lock routine@\kcode{omp_init_lock} routine} +The following example demonstrates how to initialize an array of locks in a \kcode{parallel} +region by using \kcode{omp_init_lock}. 
\cppexample{init_lock}{1} diff --git a/synchronization/init_lock_with_hint.tex b/synchronization/init_lock_with_hint.tex index 9055c9d..04112a7 100644 --- a/synchronization/init_lock_with_hint.tex +++ b/synchronization/init_lock_with_hint.tex @@ -1,11 +1,11 @@ %\pagebreak -\subsection{\code{omp\_init\_lock\_with\_hint} Routine} +\subsection{\kcode{omp_init_lock_with_hint} Routine} \label{subsec:init_lock_with_hint} -\index{routines!omp_init_lock_with_hint@\scode{omp_init_lock_with_hint}} -\index{omp_init_lock_with_hint routine@\scode{omp_init_lock_with_hint} routine} -The following example demonstrates how to initialize an array of locks in a \code{parallel} region by using \code{omp\_init\_lock\_with\_hint}. -Note, hints are combined with an \code{|} or \code{+} operator in C/C++ and a \code{+} operator in Fortran. +\index{routines!omp_init_lock_with_hint@\kcode{omp_init_lock_with_hint}} +\index{omp_init_lock_with_hint routine@\kcode{omp_init_lock_with_hint} routine} +The following example demonstrates how to initialize an array of locks in a \kcode{parallel} region by using \kcode{omp_init_lock_with_hint}. +Note, hints are combined with an \bcode{|} or \bcode{+} operator in C/C++ and a \bcode{+} operator in Fortran. \cppexample[5.0]{init_lock_with_hint}{1} diff --git a/synchronization/lock_owner.tex b/synchronization/lock_owner.tex index d96428d..c515a69 100644 --- a/synchronization/lock_owner.tex +++ b/synchronization/lock_owner.tex @@ -1,20 +1,20 @@ \subsection{Ownership of Locks} \label{subsec:lock_owner} -\index{routines!omp_unset_lock@\scode{omp_unset_lock}} -\index{omp_unset_lock routine@\scode{omp_unset_lock} routine} +\index{routines!omp_unset_lock@\kcode{omp_unset_lock}} +\index{omp_unset_lock routine@\kcode{omp_unset_lock} routine} Ownership of locks has changed since OpenMP 2.5. 
In OpenMP 2.5, locks are owned -by threads; so a lock released by the \code{omp\_unset\_lock} routine must be +by threads; so a lock released by the \kcode{omp_unset_lock} routine must be owned by the same thread executing the routine. Beginning with OpenMP 3.0, locks are owned -by task regions; so a lock released by the \code{omp\_unset\_lock} routine in -a task region must be owned by the same task region. +by tasks; so a lock released by the \kcode{omp_unset_lock} routine in +a task must be owned by the same task. This change in ownership requires extra care when using locks. The following program -is conforming in OpenMP 2.5 because the thread that releases the lock \code{lck} -in the parallel region is the same thread that acquired the lock in the sequential -part of the program (primary thread of parallel region and the initial thread are +is conforming in OpenMP 2.5 because the thread that releases the lock \ucode{lck} +in the \kcode{parallel} region is the same thread that acquired the lock in the sequential +part of the program (primary thread of \kcode{parallel} region and the initial thread are the same). However, it is not conforming beginning with OpenMP 3.0, because the task -region that releases the lock \code{lck} is different from the task region that +region that releases the lock \ucode{lck} is different from the task region that acquires the lock. 
\cexample[5.1]{lock_owner}{1} diff --git a/synchronization/locks.tex b/synchronization/locks.tex index a79b58f..4af4684 100644 --- a/synchronization/locks.tex +++ b/synchronization/locks.tex @@ -1,4 +1,4 @@ -\pagebreak +%\pagebreak \section{Lock Routines} \label{sec:locks} diff --git a/synchronization/ordered.tex b/synchronization/ordered.tex index 2272f50..b599a3a 100644 --- a/synchronization/ordered.tex +++ b/synchronization/ordered.tex @@ -1,12 +1,12 @@ \pagebreak -\section{\code{ordered} Clause and \code{ordered} Construct} +\section{\kcode{ordered} Clause and \kcode{ordered} Construct} \label{sec:ordered} -\index{clauses!ordered@\code{ordered}} -\index{ordered clause@\code{ordered} clause} -\index{constructs!ordered@\code{ordered}} -\index{ordered construct@\code{ordered} construct} +\index{clauses!ordered@\kcode{ordered}} +\index{ordered clause@\kcode{ordered} clause} +\index{constructs!ordered@\kcode{ordered}} +\index{ordered construct@\kcode{ordered} construct} -Ordered constructs are useful for sequentially ordering the output from work that +The \kcode{ordered} constructs are useful for sequentially ordering the output from work that is done in parallel. The following program prints out the indices in sequential order: @@ -14,17 +14,17 @@ \section{\code{ordered} Clause and \code{ordered} Construct} \fexample{ordered}{1} -It is possible to have multiple \code{ordered} constructs within a loop region -with the \code{ordered} clause specified. The first example is non-conforming -because all iterations execute two \code{ordered} regions. An iteration of a -loop must not execute more than one \code{ordered} region: +It is possible to have multiple \kcode{ordered} constructs within a loop region +with the \kcode{ordered} clause specified. The first example is non-conforming +because all iterations execute two \kcode{ordered} regions. 
An iteration of a +loop must not execute more than one \kcode{ordered} region: \cexample{ordered}{2} \fexample{ordered}{2} -The following is a conforming example with more than one \code{ordered} construct. -Each iteration will execute only one \code{ordered} region: +The following is a conforming example with more than one \kcode{ordered} construct. +Each iteration will execute only one \kcode{ordered} region: \cexample{ordered}{3} diff --git a/synchronization/simple_lock.tex b/synchronization/simple_lock.tex index bcd088a..e8c29fe 100644 --- a/synchronization/simple_lock.tex +++ b/synchronization/simple_lock.tex @@ -1,18 +1,18 @@ \subsection{Simple Lock Routines} \label{subsec:simple_lock} -\index{routines!omp_set_lock@\scode{omp_set_lock}} -\index{omp_set_lock routine@\scode{omp_set_lock} routine} -\index{routines!omp_test_lock@\scode{omp_test_lock}} -\index{omp_test_lock routine@\scode{omp_test_lock} routine} +\index{routines!omp_set_lock@\kcode{omp_set_lock}} +\index{omp_set_lock routine@\kcode{omp_set_lock} routine} +\index{routines!omp_test_lock@\kcode{omp_test_lock}} +\index{omp_test_lock routine@\kcode{omp_test_lock} routine} In the following example, the lock routines cause the threads to be idle while waiting for entry to the first critical section, but to do other work while waiting -for entry to the second. The \code{omp\_set\_lock} function blocks, but the \scode{omp_test_lock} -function does not, allowing the work in \code{skip} to be done. +for entry to the second. The \kcode{omp_set_lock} function blocks, but the \kcode{omp_test_lock} +function does not, allowing the work in \ucode{skip} to be done. Note that the argument to the lock routines should have type -\scode{omp_lock_t} (or \scode{omp_lock_kind} in Fortran), -and that there is no need to flush the lock variable (\plc{lck}). +\kcode{omp_lock_t} (or \kcode{omp_lock_kind} in Fortran), +and that there is no need to flush the lock variable (\ucode{lck}). 
\cexample{simple_lock}{1} diff --git a/synchronization/sources/atomic.4.c b/synchronization/sources/atomic.4.c new file mode 100644 index 0000000..7232040 --- /dev/null +++ b/synchronization/sources/atomic.4.c @@ -0,0 +1,20 @@ +/* +* @@name: atomic.4 +* @@type: C +* @@operation: compile +* @@expect: success +* @@version: omp_5.0 +*/ + +void calc_val(float *val); + +void boxster(float *box_totals, float *vals, int *box, int N) +{ + #pragma omp parallel for + for(int idx=0; idx<=N; idx++) + { + calc_val(&vals[idx]); + #pragma omp atomic hint(omp_sync_hint_uncontended) + box_totals[ box[idx] ] = box_totals[ box[idx] ] + vals[idx]; + } +} diff --git a/synchronization/sources/atomic.4.f90 b/synchronization/sources/atomic.4.f90 new file mode 100644 index 0000000..7c4f354 --- /dev/null +++ b/synchronization/sources/atomic.4.f90 @@ -0,0 +1,21 @@ +! @@name: atomic.4 +! @@type: F-free +! @@operation: compile +! @@expect: success +! @@version: omp_5.0 + +subroutine boxster(box_totals, vals, box, N) + external calc_val + real, intent(inout) :: box_totals(:) + real, intent(in) :: vals(:) + integer, intent(in) :: box(:) + integer :: N, idx + + !$omp parallel do + do idx=1,N + call calc_val(vals(idx)) + !$omp atomic hint(omp_sync_hint_uncontended) + box_totals( box(idx) ) = box_totals( box(idx) ) + vals(idx) + enddo + +end subroutine \ No newline at end of file diff --git a/synchronization/sources/flush_nolist.1.c b/synchronization/sources/flush_nolist.1.c deleted file mode 100644 index 68d8f02..0000000 --- a/synchronization/sources/flush_nolist.1.c +++ /dev/null @@ -1,58 +0,0 @@ -/* -* @@name: flush_nolist.1 -* @@type: C -* @@operation: link -* @@expect: success -* @@version: pre_omp_3.0 -*/ -int x, *p = &x; - -void f1(int *q) -{ - *q = 1; - #pragma omp flush - /* x, p, and *q are flushed */ - /* because they are shared and accessible */ - /* q is not flushed because it is not shared. 
*/ -} - -void f2(int *q) -{ - #pragma omp barrier - *q = 2; - #pragma omp barrier - - /* a barrier implies a flush */ - /* x, p, and *q are flushed */ - /* because they are shared and accessible */ - /* q is not flushed because it is not shared. */ -} - -int g(int n) -{ - int i = 1, j, sum = 0; - *p = 1; - #pragma omp parallel reduction(+: sum) num_threads(10) - { - f1(&j); - - /* i, n and sum were not flushed */ - /* because they were not accessible in f1 */ - /* j was flushed because it was accessible */ - sum += j; - - f2(&j); - - /* i, n, and sum were not flushed */ - /* because they were not accessible in f2 */ - /* j was flushed because it was accessible */ - sum += i + j + *p + n; - } - return sum; -} - -int main() -{ - int result = g(7); - return result; -} diff --git a/synchronization/sources/flush_nolist.1.f b/synchronization/sources/flush_nolist.1.f deleted file mode 100644 index 8eb3a31..0000000 --- a/synchronization/sources/flush_nolist.1.f +++ /dev/null @@ -1,68 +0,0 @@ -! @@name: flush_nolist.1 -! @@type: F-fixed -! @@operation: link -! @@expect: success -! @@version: pre_omp_3.0 - SUBROUTINE F1(Q) - COMMON /DATA/ X, P - INTEGER, TARGET :: X - INTEGER, POINTER :: P - INTEGER Q - - Q = 1 -!$OMP FLUSH - ! X, P and Q are flushed - ! because they are shared and accessible - END SUBROUTINE F1 - - SUBROUTINE F2(Q) - COMMON /DATA/ X, P - INTEGER, TARGET :: X - INTEGER, POINTER :: P - INTEGER Q - -!$OMP BARRIER - Q = 2 -!$OMP BARRIER - ! a barrier implies a flush - ! X, P and Q are flushed - ! because they are shared and accessible - END SUBROUTINE F2 - - INTEGER FUNCTION G(N) - COMMON /DATA/ X, P - INTEGER, TARGET :: X - INTEGER, POINTER :: P - INTEGER N - INTEGER I, J, SUM - - I = 1 - SUM = 0 - P = 1 -!$OMP PARALLEL REDUCTION(+: SUM) NUM_THREADS(10) - CALL F1(J) - ! I, N and SUM were not flushed - ! because they were not accessible in F1 - ! J was flushed because it was accessible - SUM = SUM + J - - CALL F2(J) - ! I, N, and SUM were not flushed - ! 
because they were not accessible in f2 - ! J was flushed because it was accessible - SUM = SUM + I + J + P + N -!$OMP END PARALLEL - - G = SUM - END FUNCTION G - - PROGRAM FLUSH_NOLIST - COMMON /DATA/ X, P - INTEGER, TARGET :: X - INTEGER, POINTER :: P - INTEGER RESULT, G - - P => X - RESULT = G(7) - PRINT *, RESULT - END PROGRAM FLUSH_NOLIST diff --git a/synchronization/worksharing_critical.tex b/synchronization/worksharing_critical.tex index b51bd4c..b5c7df9 100644 --- a/synchronization/worksharing_critical.tex +++ b/synchronization/worksharing_critical.tex @@ -1,18 +1,18 @@ -\pagebreak -\section{Worksharing Constructs Inside a \code{critical} Construct} +%\pagebreak +\section{Worksharing Constructs Inside a \kcode{critical} Construct} \label{sec:worksharing_critical} \index{constructs!worksharing} -\index{constructs!critical@\code{critical}} -\index{critical construct@\code{critical} construct} +\index{constructs!critical@\kcode{critical}} +\index{critical construct@\kcode{critical} construct} -The following example demonstrates using a worksharing construct inside a \code{critical} -construct. This example is conforming because the worksharing \code{single} -region is not closely nested inside the \code{critical} region. A single thread -executes the one and only section in the \code{sections} region, and executes -the \code{critical} region. The same thread encounters the nested \code{parallel} +The following example demonstrates using a worksharing construct inside a \kcode{critical} +construct. This example is conforming because the worksharing \kcode{single} +region is not closely nested inside the \kcode{critical} region. A single thread +executes the one and only section in the \kcode{sections} region, and executes +the \kcode{critical} region. The same thread encounters the nested \kcode{parallel} region, creates a new team of threads, and becomes the primary thread of the new team. 
-One of the threads in the new team enters the \code{single} region and increments -\code{i} by \code{1}. At the end of this example \code{i} is equal to \code{2}. +One of the threads in the new team enters the \kcode{single} region and increments +\ucode{i} by 1. At the end of this example \ucode{i} is equal to 2. \cexample{worksharing_critical}{1} diff --git a/tasking/parallel_masked_taskloop.tex b/tasking/parallel_masked_taskloop.tex index 2549c97..9c9388c 100644 --- a/tasking/parallel_masked_taskloop.tex +++ b/tasking/parallel_masked_taskloop.tex @@ -1,39 +1,39 @@ -\pagebreak -\section{Combined \code{parallel} \code{masked} and \code{taskloop} Constructs} +%\pagebreak +\section{Combined \kcode{parallel masked} and \kcode{taskloop} Constructs} \label{sec:parallel_masked_taskloop} -\index{combined constructs!parallel masked taskloop@\code{parallel} \code{masked} \code{taskloop}} -\index{combined constructs!parallel masked taskloop simd@\code{parallel} \code{masked} \code{taskloop} \code{simd}} -\index{constructs!parallel@\code{parallel}} -\index{constructs!masked@\code{masked}} -\index{constructs!taskloop@\code{taskloop}} -\index{constructs!simd@\code{simd}} -\index{parallel construct@\code{parallel} construct} -\index{masked construct@\code{masked} construct} -\index{taskloop construct@\code{taskloop} construct} -\index{simd construct@\code{simd} construct} +\index{combined constructs!parallel masked taskloop@\kcode{parallel masked taskloop}} +\index{combined constructs!parallel masked taskloop simd@\kcode{parallel masked taskloop simd}} +\index{constructs!parallel@\kcode{parallel}} +\index{constructs!masked@\kcode{masked}} +\index{constructs!taskloop@\kcode{taskloop}} +\index{constructs!simd@\kcode{simd}} +\index{parallel construct@\kcode{parallel} construct} +\index{masked construct@\kcode{masked} construct} +\index{taskloop construct@\kcode{taskloop} construct} +\index{simd construct@\kcode{simd} construct} -Just as the \code{for} and \code{do} constructs 
were combined -with the \code{parallel} construct for convenience, so too, the combined -\code{parallel}~\code{masked}~\code{taskloop} and -\code{parallel}~\code{masked}~\code{taskloop}~\code{simd} +Just as the \kcode{for} and \kcode{do} constructs were combined +with the \kcode{parallel} construct for convenience, so too, the combined +\kcode{parallel masked taskloop} and +\kcode{parallel masked taskloop simd} constructs have been created for convenience when using the -\code{taskloop} construct. +\kcode{taskloop} construct. -In the following example the first \code{taskloop} construct is enclosed -by the usual \code{parallel} and \code{masked} constructs to form +In the following example the first \kcode{taskloop} construct is enclosed +by the usual \kcode{parallel} and \kcode{masked} constructs to form a team of threads, and a single task generator (primary thread) for -the \code{taskloop} construct. +the \kcode{taskloop} construct. The same OpenMP operations for the first taskloop are accomplished by the second -taskloop with the \code{parallel}~\code{masked}~\code{taskloop} +taskloop with the \kcode{parallel masked taskloop} combined construct. -The third taskloop uses the combined \code{parallel}~\code{masked}~\code{taskloop}~\code{simd} -construct to accomplish the same behavior as closely nested \code{parallel masked}, -and \code{taskloop simd} constructs. +The third taskloop uses the combined \kcode{parallel masked taskloop simd} +construct to accomplish the same behavior as closely nested \kcode{parallel masked}, +and \kcode{taskloop simd} constructs. As with any combined construct the clauses of the components may be used -with appropriate restrictions. The combination of the \code{parallel}~\code{masked} construct -with the \code{taskloop} or \code{taskloop}~\code{simd} construct produces no additional +with appropriate restrictions. 
The combination of the \kcode{parallel masked} construct +with the \kcode{taskloop} or \kcode{taskloop simd} construct produces no additional restrictions. \cexample[5.1]{parallel_masked_taskloop}{1} diff --git a/tasking/task_dep.tex b/tasking/task_dep.tex index d7507df..70549ac 100644 --- a/tasking/task_dep.tex +++ b/tasking/task_dep.tex @@ -1,4 +1,4 @@ -\pagebreak +%\pagebreak \section{Task Dependences} \label{sec:task_depend} \index{dependences!task dependences} @@ -6,21 +6,21 @@ \section{Task Dependences} \subsection{Flow Dependence} \label{subsec:task_flow_depend} \index{task dependences!flow dependence} -\index{task construct@\code{task} construct!depend clause@\code{depend} clause} -\index{task construct@\code{task} construct} -\index{constructs!task@\code{task}} -\index{depend clause@\code{depend} clause} -\index{clauses!depend@\code{depend}} +\index{task construct@\kcode{task} construct!depend clause@\kcode{depend} clause} +\index{task construct@\kcode{task} construct} +\index{constructs!task@\kcode{task}} +\index{depend clause@\kcode{depend} clause} +\index{clauses!depend@\kcode{depend}} -This example shows a simple flow dependence using a \code{depend} -clause on the \code{task} construct. +This example shows a simple flow dependence using a \kcode{depend} +clause on the \kcode{task} construct. \cexample[4.0]{task_dep}{1} \ffreeexample[4.0]{task_dep}{1} -The program will always print ``x = 2'', because the \code{depend} -clauses enforce the ordering of the tasks. If the \code{depend} clauses had been +The program will always print \pout{x = 2}, because the \kcode{depend} +clauses enforce the ordering of the tasks. If the \kcode{depend} clauses had been omitted, then the tasks could execute in any order and the program would have a race condition. 
@@ -28,15 +28,15 @@ \subsection{Anti-dependence} \label{subsec:task_anti_depend} \index{task dependences!anti dependence} -This example shows an anti-dependence using the \code{depend} -clause on the \code{task} construct. +This example shows an anti-dependence using the \kcode{depend} +clause on the \kcode{task} construct. \cexample[4.0]{task_dep}{2} \ffreeexample[4.0]{task_dep}{2} -The program will always print ``x = 1'', because the \code{depend} -clauses enforce the ordering of the tasks. If the \code{depend} clauses had been +The program will always print \pout{x = 1}, because the \kcode{depend} +clauses enforce the ordering of the tasks. If the \kcode{depend} clauses had been omitted, then the tasks could execute in any order and the program would have a race condition. @@ -44,15 +44,15 @@ \subsection{Output Dependence} \label{subsec:task_out_depend} \index{task dependences!output dependence} -This example shows an output dependence using the \code{depend} -clause on the \code{task} construct. +This example shows an output dependence using the \kcode{depend} +clause on the \kcode{task} construct. \cexample[4.0]{task_dep}{3} \ffreeexample[4.0]{task_dep}{3} -The program will always print ``x = 2'', because the \code{depend} -clauses enforce the ordering of the tasks. If the \code{depend} clauses had been +The program will always print \pout{x = 2}, because the \kcode{depend} +clauses enforce the ordering of the tasks. If the \kcode{depend} clauses had been omitted, then the tasks could execute in any order and the program would have a race condition. @@ -62,28 +62,28 @@ \subsection{Concurrent Execution with Dependences} \index{task dependences!concurrent execution with} In this example we show potentially concurrent execution of tasks using multiple -flow dependences expressed using the \code{depend} clause on the \code{task} +flow dependences expressed using the \kcode{depend} clause on the \kcode{task} construct. 
The last two tasks are dependent on the first task. However, there is no dependence between the last two tasks, which may execute in any order (or concurrently if -more than one thread is available). Thus, the possible outputs are ``x -+ 1 = 3. x + 2 = 4.'' and ``x + 2 = 4. x + 1 = 3.''. -If the \code{depend} clauses had been omitted, then all of the tasks could execute +more than one thread is available). Thus, the possible outputs are +\pout{x + 1 = 3. x + 2 = 4.} and \pout{x + 2 = 4. x + 1 = 3.}. +If the \kcode{depend} clauses had been omitted, then all of the tasks could execute in any order and the program would have a race condition. \cexample[4.0]{task_dep}{4} \ffreeexample[4.0]{task_dep}{4} -The following example illustrates the semantic difference between \scode{inout} -and \scode{inoutset} dependence types. In CASE 1, tasks generated at T1 +The following example illustrates the semantic difference between \kcode{inout} +and \kcode{inoutset} dependence types. In Case 1, tasks generated at T1 inside the loop have dependences among themselves due to -the \scode{inout} dependence type and with task T2. +the \kcode{inout} dependence type and with task T2. As a result, these tasks are executed sequentially before the print statement from task T2. -In CASE 2, tasks generated at T3 inside the loop have no dependences -among themselves from the \scode{inoutset} dependence type, but have +In Case 2, tasks generated at T3 inside the loop have no dependences +among themselves from the \kcode{inoutset} dependence type, but have dependences with task T4. As a result, these tasks are executed concurrently before the print statement from task T4. @@ -97,61 +97,62 @@ \subsection{Matrix multiplication} \index{task dependences!matrix multiplication} This example shows a task-based blocked matrix multiplication. Matrices are of -NxN elements, and the multiplication is implemented using blocks of BSxBS elements. 
+\ucode{N}x\ucode{N} elements, and the multiplication is implemented using blocks +of \ucode{BS}x\ucode{BS} elements. \cexample[4.0]{task_dep}{5} \ffreeexample[4.0]{task_dep}{5} -\subsection{\code{taskwait} with Dependences} +\subsection{\kcode{taskwait} with Dependences} \label{subsec:taskwait_depend} -\index{task dependences!taskwait construct with@\code{taskwait} construct with} -\index{taskwait construct@\code{taskwait} construct} -\index{constructs!taskwait@\code{taskwait}} -\index{taskwait construct@\code{taskwait} construct!depend clause@\code{depend} clause} -\index{depend clause@\code{depend} clause} -\index{clauses!depend@\code{depend}} +\index{task dependences!taskwait construct with@\kcode{taskwait} construct with} +\index{taskwait construct@\kcode{taskwait} construct} +\index{constructs!taskwait@\kcode{taskwait}} +\index{taskwait construct@\kcode{taskwait} construct!depend clause@\kcode{depend} clause} +\index{depend clause@\kcode{depend} clause} +\index{clauses!depend@\kcode{depend}} In this subsection three examples illustrate how the -\code{depend} clause can be applied to a \code{taskwait} construct to make the +\kcode{depend} clause can be applied to a \kcode{taskwait} construct to make the generating task wait for specific child tasks to complete. This is an OpenMP 5.0 feature. In the same manner that -dependences can order executions among child tasks with \code{depend} clauses on -\code{task} constructs, the generating task can be scheduled to wait on child tasks -at a \code{taskwait} before it can proceed. +dependences can order executions among child tasks with \kcode{depend} clauses on +\kcode{task} constructs, the generating task can be scheduled to wait on child tasks +at a \kcode{taskwait} before it can proceed. 
-Note: Since the \code{depend} clause on a \code{taskwait} construct relaxes the +Note: Since the \kcode{depend} clause on a \kcode{taskwait} construct relaxes the default synchronization behavior (waiting for all children to finish), it is important to -realize that child tasks that are not predecessor tasks, as determined by the \code{depend} -clause of the \code{taskwait} construct, may be running concurrently while the +realize that child tasks that are not predecessor tasks, as determined by the \kcode{depend} +clause of the \kcode{taskwait} construct, may be running concurrently while the generating task is executing after the taskwait. -In the first example the generating task waits at the \code{taskwait} construct +In the first example the generating task waits at the \kcode{taskwait} construct for the completion of the first child task because a dependence on the first task -is produced by \plc{x} with an \code{in} dependence type within the \code{depend} -clause of the \code{taskwait} construct. -Immediately after the first \code{taskwait} construct it is safe to access the -\plc{x} variable by the generating task, as shown in the print statement. +is produced by \ucode{x} with an \kcode{in} dependence type within the \kcode{depend} +clause of the \kcode{taskwait} construct. +Immediately after the first \kcode{taskwait} construct it is safe to access the +\ucode{x} variable by the generating task, as shown in the print statement. There is no completion restraint on the second child task. -Hence, immediately after the first \code{taskwait} it is unsafe to access the -\plc{y} variable since the second child task may still be executing. -The second \code{taskwait} ensures that the second child task has completed; hence -it is safe to access the \plc{y} variable in the following print statement. +Hence, immediately after the first \kcode{taskwait} it is unsafe to access the +\ucode{y} variable since the second child task may still be executing. 
+The second \kcode{taskwait} ensures that the second child task has completed; hence +it is safe to access the \ucode{y} variable in the following print statement. \cexample[5.0]{task_dep}{6} \ffreeexample[5.0]{task_dep}{6} In this example the first two tasks are serialized, because a dependence on -the first child is produced by \plc{x} with the \code{in} dependence type -in the \code{depend} clause of the second task. -However, the generating task at the first \code{taskwait} waits only on the +the first child is produced by \ucode{x} with the \kcode{in} dependence type +in the \kcode{depend} clause of the second task. +However, the generating task at the first \kcode{taskwait} waits only on the first child task to complete, because a dependence on only the first child task -is produced by \plc{x} with an \code{in} dependence type within the -\code{depend} clause of the \code{taskwait} construct. -The second \code{taskwait} (without a \code{depend} clause) is included -to guarantee completion of the second task before \plc{y} is accessed. -(While unnecessary, the \code{depend(inout:} \code{y)} clause on the 2nd child task is +is produced by \ucode{x} with an \kcode{in} dependence type within the +\kcode{depend} clause of the \kcode{taskwait} construct. +The second \kcode{taskwait} (without a \kcode{depend} clause) is included +to guarantee completion of the second task before \ucode{y} is accessed. +(While unnecessary, the \kcode{depend(inout: \ucode{y})} clause on the 2nd child task is included to illustrate how the child task dependences can be completely annotated in a data-flow model.) @@ -165,21 +166,21 @@ \subsection{\code{taskwait} with Dependences} This example is similar to the previous one, except the generating task is directed to also wait for completion of the second task. -The \code{depend} clause of the \code{taskwait} construct now includes an -\code{in} dependence type for \plc{y}. 
Hence the generating task must now -wait on completion of any child task having \plc{y} with an \code{out} -(here \code{inout}) dependence type in its \code{depend} clause. -So, the \code{depend} clause of the \code{taskwait} construct now constrains -the second task to complete at the \code{taskwait}, too. +The \kcode{depend} clause of the \kcode{taskwait} construct now includes an +\kcode{in} dependence type for \ucode{y}. Hence the generating task must now +wait on completion of any child task having \ucode{y} with an \kcode{out} +(here \kcode{inout}) dependence type in its \kcode{depend} clause. +So, the \kcode{depend} clause of the \kcode{taskwait} construct now constrains +the second task to complete at the taskwait, too. %--both tasks must now complete execution at the \code{taskwait}. -(This change makes the second \code{taskwait} of the previous example unnecessary-- +(This change makes the second taskwait of the previous example unnecessary-- it has been removed in this example.) -Note: While a taskwait construct ensures that all child tasks have completed; a depend clause on a taskwait +Note: While a \kcode{taskwait} construct ensures that all child tasks have completed, a \kcode{depend} clause on a \kcode{taskwait} construct only waits for specific child tasks (prescribed by the dependence type and list -items in the \code{taskwait}'s \code{depend} clause). +items in the \kcode{taskwait}'s \kcode{depend} clause). This and the previous example illustrate the need to carefully determine -the dependence type of variables in the \code{taskwait} \code{depend} clause +the dependence type of variables in the \kcode{depend} clause of the \kcode{taskwait} construct when selecting child tasks that the generating task must wait on, so that its execution after the taskwait does not produce race conditions on variables accessed by non-completed child tasks. 
@@ -193,13 +194,13 @@ \subsection{Mutually Exclusive Execution with Dependences} \index{task dependences!mutually exclusive execution} In this example we show a series of tasks, including mutually exclusive -tasks, expressing dependences using the \code{depend} clause on the -\code{task} construct. +tasks, expressing dependences using the \kcode{depend} clause on the +\kcode{task} construct. -The program will always print~6. Tasks T1, T2 and T3 will be scheduled first, +The program will always print \pout{6}. Tasks T1, T2 and T3 will be scheduled first, in any order. Task T4 will be scheduled after tasks T1 and T2 are completed. T5 will be scheduled after tasks T1 and T3 are completed. Due -to the \code{mutexinoutset} dependence type on \code{c}, T4 and T5 may be +to the \kcode{mutexinoutset} dependence type on \ucode{c}, T4 and T5 may be scheduled in any order with respect to each other, but not at the same time. Tasks T6 will be scheduled after both T4 and T5 are completed. @@ -207,10 +208,10 @@ \subsection{Mutually Exclusive Execution with Dependences} \ffreeexample[5.0]{task_dep}{9} -The following example demonstrates a situation where the \code{mutexinoutset} -dependence type is advantageous. If \code{shortTaskB} completes -before \code{longTaskA}, the runtime can take advantage of this by -scheduling \code{longTaskBC} before \code{shortTaskAC}. +The following example demonstrates a situation where the \kcode{mutexinoutset} +dependence type is advantageous. If \ucode{shortTaskB} completes +before \ucode{longTaskA}, the runtime can take advantage of this by +scheduling \ucode{longTaskBC} before \ucode{shortTaskAC}. 
\cexample[5.0]{task_dep}{10} @@ -219,35 +220,35 @@ \subsection{Mutually Exclusive Execution with Dependences} \subsection{Multidependences Using Iterators} \label{subsec:depend_iterator} \index{task dependences!using iterators} -\index{depend clause@\code{depend} clause!iterator modifier@\code{iterator} modifier} -\index{iterator modifier@\code{iterator} modifier} +\index{depend clause@\kcode{depend} clause!iterator modifier@\kcode{iterator} modifier} +\index{iterator modifier@\kcode{iterator} modifier} The following example uses an iterator to define a dynamic number of dependences. -In the \code{single} construct of a parallel region a loop generates n tasks -and each task has an \code{out} dependence specified through an element of -the \plc{v} array. This is followed by a single task that defines an \code{in} +In the \kcode{single} construct of a parallel region a loop generates \ucode{n} tasks +and each task has an \kcode{out} dependence specified through an element of +the \ucode{v} array. This is followed by a single task that defines an \kcode{in} dependence on each element of the array. This is accomplished by -using the \code{iterator} modifier in the \code{depend} clause, supporting a dynamic number -of dependences (\plc{n} here). +using the \kcode{iterator} modifier in the \kcode{depend} clause, supporting a dynamic number +of dependences (\ucode{n} here). -The task for the \plc{print\_all\_elements} function is not executed until all dependences +The task for the \ucode{print_all_elements} procedure is not executed until all dependences prescribed (or registered) by the iterator are fulfilled; that is, after all the tasks generated by the loop have completed. 
-Note, one cannot simply use an array section in the \code{depend} clause -of the second task construct because this would violate the \code{depend} clause restriction: +Note, one cannot simply use an array section in the \kcode{depend} clause +of the second task construct because this would violate the \kcode{depend} clause restriction: -``List items used in \code{depend} clauses of the same task or sibling tasks +``List items used in \kcode{depend} clauses of the same task or sibling tasks must indicate identical storage locations or disjoint storage locations''. In this case each of the loop tasks use a single disjoint (different storage) -element in their \code{depend} clause; however, +element in their \kcode{depend} clause; however, the array-section storage area prescribed in the commented directive is neither identical nor disjoint to the storage prescribed by the elements of the loop tasks. The iterator overcomes this restriction by effectively -creating n disjoint storage areas. +creating \ucode{n} disjoint storage areas. \cexample[5.0]{task_dep}{11} @@ -258,20 +259,20 @@ \subsection{Dependence for Undeferred Tasks} \index{task dependences!undeferred tasks} In the following example, we show that even if a task is undeferred as specified -by an \code{if} clause that evaluates to \plc{false}, task dependences are +by an \kcode{if} clause that evaluates to \vcode{false}, task dependences are still honored. -The \code{depend} clauses of the first and second explicit tasks specify that +The \kcode{depend} clauses of the first and second explicit tasks specify that the first task is completed before the second task. -The second explicit task has an \code{if} clause that evaluates to \plc{false}. +The second explicit task has an \kcode{if} clause that evaluates to \vcode{false}. 
This means that the execution of the generating task (the implicit task of -the \code{single} region) must be suspended until the second explicit task +the \kcode{single} region) must be suspended until the second explicit task is completed. But, because of the dependence, the first explicit task must complete first, then the second explicit task can execute and complete, and only then the generating task can resume to the print statement. -Thus, the program will always print ``\texttt{x = 2}''. +Thus, the program will always print \pout{x = 2}. \cexample[4.0]{task_dep}{12} \clearpage @@ -279,11 +280,11 @@ \subsection{Dependence for Undeferred Tasks} \ffreeexample[4.0]{task_dep}{12} -In OpenMP 5.1 the \scode{omp_all_memory} \splc{reserved locator} was introduced +In OpenMP 5.1 the \kcode{omp_all_memory} \plc{reserved locator} was introduced to specify storage of all objects in memory. In the following example, it is used in Task 4 as a convenient way to specify that the locator (list item) denotes the storage of all objects (locations) in memory, and -will therefore match the \splc{a} and \splc{d} locators of Task 2, Task 3 and Task 6. +will therefore match the \ucode{a} and \ucode{d} locators of Task 2, Task 3 and Task 6. The dependences guarantee the ordered execution of Tasks 2 and 3 before 4, and Task 4 before Task 6. 
Since there are no dependences imposed on Task 1 and Task 5, they can be diff --git a/tasking/task_detach.tex b/tasking/task_detach.tex index cc21b5e..5d478c7 100644 --- a/tasking/task_detach.tex +++ b/tasking/task_detach.tex @@ -1,23 +1,23 @@ \pagebreak \section{Task Detachment} \label{sec:task_detachment} -\index{task construct@\code{task} construct!detach clause@\code{detach} clause} -\index{detach clause@\code{detach} clause} -\index{clauses!detach@\code{detach}} -\index{routines!omp_fulfill_event@\scode{omp_fulfill_event}} -\index{omp_fulfill_event routine@\scode{omp_fulfill_event} routine} +\index{task construct@\kcode{task} construct!detach clause@\kcode{detach} clause} +\index{detach clause@\kcode{detach} clause} +\index{clauses!detach@\kcode{detach}} +\index{routines!omp_fulfill_event@\kcode{omp_fulfill_event}} +\index{omp_fulfill_event routine@\kcode{omp_fulfill_event} routine} % if used, then generated task must be completed. % No definition of a detachable task -The \code{detach} clause on a \code{task} construct provides a mechanism for an asynchronous +The \kcode{detach} clause on a \kcode{task} construct provides a mechanism for an asynchronous routine to be called within a task block, and for the routine's callback to signal completion to the OpenMP runtime, through an -event fulfillment, triggered by a call to the \code{omp\_fulfill\_event} routine. -When a \code{detach} clause is used on a task construct, -completion of the \emph{detachable} task occurs when the task's structured +event fulfillment, triggered by a call to the \kcode{omp_fulfill_event} routine. +When a \kcode{detach} clause is used on a \kcode{task} construct, +completion of the detachable task occurs when the task's structured block is completed AND an \plc{allow-completion} event is -fulfilled by a call to the \code{omp\_fulfill\_event} +fulfilled by a call to the \kcode{omp_fulfill_event} routine with the \plc{event-handle} argument. 
The first example illustrates the basic components used in a detachable task. @@ -30,12 +30,12 @@ \section{Task Detachment} ------------------------- The first example creates a detachable task -that executes the asynchronous \plc{async\_work} routine, -passing the \plc{omp\_fulfill\_event} function and the (firstprivate) event handle -to the function. Here, the \code{omp\_fulfill\_event} function is -the ``callback'' function to be executed at the end of the \plc{async\_work} function's +that executes the asynchronous \ucode{async_work} routine, +passing the \kcode{omp_fulfill_event} function and the (firstprivate) event handle +to the function. Here, the OpenMP \kcode{omp_fulfill_event} procedure is +the ``callback'' function to be executed at the end of the \ucode{async_work} function's asynchronous operations, -with the associated data, \plc{event}. +with the associated data, \ucode{event}. \cexample[5.0]{task_detach}{1} @@ -44,21 +44,21 @@ \section{Task Detachment} %ASYNCHRONOUS IO -In the following example, text data is written asynchronously to the file \plc{async\_data}, -using POSIX asynchronous IO (aio). An aio ``control block'', \plc{cb}, is set up to -send a signal when IO is complete, and the \plc{sigaction} function registers -the signal action, a callback to \plc{callback\_aioSigHandler}. +In the following example, text data is written asynchronously to the file \ucode{async_data}, +using POSIX asynchronous IO (aio). An aio ``control block'', \ucode{cb}, is set up to +send a signal when IO is complete, and the \ucode{sigaction} function registers +the signal action, a callback to \ucode{callback_aioSigHandler}. -The first task (TASK1) starts the asynchronous IO and runs as a detachable task. -The second and third tasks (TASK2 and TASK3) perform synchronous IO to stdout with print statements. 
-The difference between the two types of tasks is that the thread for TASK1 is freed for -other execution within the parallel region, while the threads for TASK2 and TASK3 wait +The first task (Task1) starts the asynchronous IO and runs as a detachable task. +The second and third tasks (Task2 and Task3) perform synchronous IO to stdout with print statements. +The difference between the two types of tasks is that the thread for Task1 is freed for +other execution within the \kcode{parallel} region, while the threads for Task2 and Task3 wait on the (synchronous) IO to complete, and cannot perform other work while the operating system is performing the synchronous IO. -The \code{if} clause ensures that the detachable task is launched -and the call to the \splc{aio_write} function returns -before TASK2 and TASK3 are generated (while the async IO occurs in the ``background'' and eventually -executes the callback function). The barrier at the end of the parallel region ensures that the +The \kcode{if} clause ensures that the detachable task is launched +and the call to the \ucode{aio_write} function returns +before Task2 and Task3 are generated (while the async IO occurs in the ``background'' and eventually +executes the callback function). The barrier at the end of the \kcode{parallel} region ensures that the detachable task has completed. 
\cexample[5.0]{task_detach}{2} diff --git a/tasking/task_priority.tex b/tasking/task_priority.tex index bc3e7fa..229ef27 100644 --- a/tasking/task_priority.tex +++ b/tasking/task_priority.tex @@ -1,21 +1,21 @@ -\pagebreak +%\pagebreak \section{Task Priority} \label{sec:task_priority} -\index{task construct@\code{task} construct!priority clause@\code{priority} clause} -\index{priority clause@\code{priority} clause} -\index{clauses!priority@\code{priority}} +\index{task construct@\kcode{task} construct!priority clause@\kcode{priority} clause} +\index{priority clause@\kcode{priority} clause} +\index{clauses!priority@\kcode{priority}} %\subsection{Task Priority} %\label{subsec:task_priority} -In this example we compute arrays in a matrix through a \plc{compute\_array} routine. -Each task has a priority value equal to the value of the loop variable \plc{i} at the +In this example we compute arrays in a matrix through a \ucode{compute_array} routine. +Each task has a priority value equal to the value of the loop variable \ucode{i} at the moment of its creation. A higher priority on a task means that a task is a candidate to run sooner. The creation of tasks occurs in ascending order (according to the iteration space of -the loop) but a hint, by means of the \code{priority} clause, is provided to reverse +the loop) but a hint, by means of the \kcode{priority} clause, is provided to reverse the execution order. 
\cexample[4.5]{task_priority}{1} diff --git a/tasking/taskgroup.tex b/tasking/taskgroup.tex index 6fed0f6..df392ac 100644 --- a/tasking/taskgroup.tex +++ b/tasking/taskgroup.tex @@ -1,19 +1,19 @@ -\pagebreak -\section{\code{taskgroup} Construct} +%\pagebreak +\section{\kcode{taskgroup} Construct} \label{sec:taskgroup} -\index{constructs!taskgroup@\code{taskgroup}} -\index{taskgroup construct@\code{taskgroup} construct} +\index{constructs!taskgroup@\kcode{taskgroup}} +\index{taskgroup construct@\kcode{taskgroup} construct} -In this example, tasks are grouped and synchronized using the \code{taskgroup} +In this example, tasks are grouped and synchronized using the \kcode{taskgroup} construct. -Initially, one task (the task executing the \code{start\_background\_work()} -call) is created in the \code{parallel} region, and later a parallel tree traversal -is started (the task executing the root of the recursive \code{compute\_tree()} +Initially, one task (the task executing the \ucode{start_background_work()} +routine) is created in the \kcode{parallel} region, and later a parallel tree traversal +is started (the task executing the root of the recursive \ucode{compute_tree()} calls). While synchronizing tasks at the end of each tree traversal, using the -\code{taskgroup} construct ensures that the formerly started background task +\kcode{taskgroup} construct ensures that the formerly started background task does not participate in the synchronization and is left free to execute in parallel. -This is opposed to the behavior of the \code{taskwait} construct, which would +This is opposed to the behavior of the \kcode{taskwait} construct, which would include the background tasks in the synchronization. 
\cexample[4.0]{taskgroup}{1} diff --git a/tasking/tasking.tex b/tasking/tasking.tex index 3cd87ed..c0e4103 100644 --- a/tasking/tasking.tex +++ b/tasking/tasking.tex @@ -1,14 +1,14 @@ \pagebreak -\section{\code{task} and \code{taskwait} Constructs} +\section{\kcode{task} and \kcode{taskwait} Constructs} \label{sec:task_taskwait} -\index{constructs!task@\code{task}} -\index{task construct@\code{task} construct} -\index{constructs!taskwait@\code{taskwait}} -\index{taskwait construct@\code{taskwait} construct} +\index{constructs!task@\kcode{task}} +\index{task construct@\kcode{task} construct} +\index{constructs!taskwait@\kcode{taskwait}} +\index{taskwait construct@\kcode{taskwait} construct} The following example shows how to traverse a tree-like structure using explicit -tasks. Note that the \code{traverse} function should be called from within a -parallel region for the different specified tasks to be executed in parallel. Also +tasks. Note that the \ucode{traverse} function should be called from within a +\kcode{parallel} region for the different specified tasks to be executed in parallel. Also note that the tasks will be executed in no specified order because there are no synchronization directives. Thus, assuming that the traversal will be done in post order, as in the sequential code, is wrong. @@ -17,7 +17,7 @@ \section{\code{task} and \code{taskwait} Constructs} \ffreeexample[3.0]{tasking}{1} -In the next example, we force a postorder traversal of the tree by adding a \code{taskwait} +In the next example, we force a postorder traversal of the tree by adding a \kcode{taskwait} directive. Now, we can safely assume that the left and right sons have been executed before we process the current node. @@ -25,20 +25,21 @@ \section{\code{task} and \code{taskwait} Constructs} \ffreeexample[3.0]{tasking}{2} -The following example demonstrates how to use the \code{task} construct to process -elements of a linked list in parallel. 
The thread executing the \code{single} +The following example demonstrates how to use the \kcode{task} construct to process +elements of a linked list in parallel. The thread executing the \kcode{single} region generates all of the explicit tasks, which are then executed by the threads -in the current team. The pointer \plc{p} is \code{firstprivate} by default -on the \code{task} construct so it is not necessary to specify it in a \code{firstprivate} +in the current team. The pointer \ucode{p} is firstprivate by default +on the \kcode{task} construct so it is not necessary to specify it in a \kcode{firstprivate} clause. +% \cexample[3.0]{tasking}{3} \ffreeexample[3.0]{tasking}{3} -The \code{fib()} function should be called from within a \code{parallel} region +The \ucode{fib()} function should be called from within a \kcode{parallel} region for the different specified tasks to be executed in parallel. Also, only one thread -of the \code{parallel} region should call \code{fib()} unless multiple concurrent +of the \kcode{parallel} region should call \ucode{fib()} unless multiple concurrent Fibonacci computations are desired. \cexample[3.0]{tasking}{4} @@ -53,7 +54,7 @@ \section{\code{task} and \code{taskwait} Constructs} one thread and execute them with the threads in the team. While generating these tasks, the implementation may reach its limit on unassigned tasks. If it does, the implementation is allowed to cause the thread executing the task generating -loop to suspend its task at the task scheduling point in the \code{task} directive, +loop to suspend its task at the task scheduling point in the \kcode{task} directive, and start executing unassigned tasks. Once the number of unassigned tasks is sufficiently low, the thread may resume execution of the task generating loop. 
@@ -61,15 +62,15 @@ \section{\code{task} and \code{taskwait} Constructs} \fexample[3.0]{tasking}{5} -\index{task construct@\code{task} construct!untied clause@\code{untied} clause} -\index{untied clause@\code{untied} clause} -\index{clauses!untied@\code{untied}} +\index{task construct@\kcode{task} construct!untied clause@\kcode{untied} clause} +\index{untied clause@\kcode{untied} clause} +\index{clauses!untied@\kcode{untied}} \index{task scheduling point} The following example is the same as the previous one, except that the tasks are generated in an untied task. While generating the tasks, the implementation may reach its limit on unassigned tasks. If it does, the implementation is allowed to cause the thread executing the task generating loop to suspend its task at the -task scheduling point in the \code{task} directive, and start executing unassigned +task scheduling point in the \kcode{task} directive, and start executing unassigned tasks. If that thread begins execution of a task that takes a long time to complete, the other threads may complete all the other tasks before it is finished. @@ -83,24 +84,24 @@ \section{\code{task} and \code{taskwait} Constructs} \fexample[3.0]{tasking}{6} The following two examples demonstrate how the scheduling rules illustrated in -Section 2.11.3 of the OpenMP 4.0 specification affect the usage of -\code{threadprivate} variables in tasks. A \code{threadprivate} +the \docref{Task Scheduling} section of the OpenMP 4.0 specification affect the usage of +threadprivate variables in tasks. A threadprivate variable can be modified by another task that is executed by the same thread. Thus, -the value of a \code{threadprivate} variable cannot be assumed to be unchanged +the value of a threadprivate variable cannot be assumed to be unchanged across a task scheduling point. In untied tasks, task scheduling points may be added in any place by the implementation. A task switch may occur at a task scheduling point. 
A single thread may execute -both of the task regions that modify \code{tp}. The parts of these task regions -in which \code{tp} is modified may be executed in any order so the resulting -value of \code{var} can be either 1 or 2. +both of the \kcode{task} regions that modify \ucode{tp}. The parts of these \kcode{task} regions +in which \ucode{tp} is modified may be executed in any order so the resulting +value of \ucode{var} can be either 1 or 2. \cexample[3.0]{tasking}{7} \fexample[3.0]{tasking}{7} In this example, scheduling constraints prohibit a thread in the team from executing -a new task that modifies \code{tp} while another such task region tied to the +a new task that modifies \ucode{tp} while another such \kcode{task} region tied to the same thread is suspended. Therefore, the value written will persist across the task scheduling point. @@ -109,96 +110,96 @@ \section{\code{task} and \code{taskwait} Constructs} \fexample[3.0]{tasking}{8} The following two examples demonstrate how the scheduling rules illustrated in -Section 2.11.3 of the OpenMP 4.0 specification affect the usage of locks +the \docref{Task Scheduling} section of the OpenMP 4.0 specification affect the usage of locks and critical sections in tasks. If a lock is held across a task scheduling point, no attempt should be made to acquire the same lock in any code that may be interleaved. Otherwise, a deadlock is possible. In the example below, suppose the thread executing task 1 defers task 2. When it encounters the task scheduling point at task 3, it could suspend task 1 and -begin task 2 which will result in a deadlock when it tries to enter critical region +begin task 2 which will result in a deadlock when it tries to enter \kcode{critical} region 1. \cexample[3.0]{tasking}{9} \fexample[3.0]{tasking}{9} -In the following example, \code{lock} is held across a task scheduling point. +In the following example, \ucode{lock} is held across a task scheduling point. 
However, according to the scheduling restrictions, the executing thread can't -begin executing one of the non-descendant tasks that also acquires \code{lock} before -the task region is complete. Therefore, no deadlock is possible. +begin executing one of the non-descendant tasks that also acquires \ucode{lock} before +the \kcode{task} region is complete. Therefore, no deadlock is possible. \cexample[3.0]{tasking}{10} \ffreeexample[3.0]{tasking}{10} \clearpage -\index{task construct@\code{task} construct!mergeable clause@\code{mergeable} clause} -\index{clauses!mergeable@\code{mergeable}} -\index{mergeable clause@\code{mergeable} clause} -The following examples illustrate the use of the \code{mergeable} clause in the -\code{task} construct. In this first example, the \code{task} construct has -been annotated with the \code{mergeable} clause. The addition of this clause +\index{task construct@\kcode{task} construct!mergeable clause@\kcode{mergeable} clause} +\index{clauses!mergeable@\kcode{mergeable}} +\index{mergeable clause@\kcode{mergeable} clause} +The following examples illustrate the use of the \kcode{mergeable} clause in the +\kcode{task} construct. In this first example, the \kcode{task} construct has +been annotated with the \kcode{mergeable} clause. The addition of this clause allows the implementation to reuse the data environment (including the ICVs) of -the parent task for the task inside \code{foo} if the task is included or undeferred. +the parent task for the task inside \ucode{foo} if the task is included or undeferred. Thus, the result of the execution may differ depending on whether the task is merged or not. Therefore the mergeable clause needs to be used with caution. In this example, -the use of the mergeable clause is safe. As \code{x} is a shared variable the +the use of the mergeable clause is safe. 
As \ucode{x} is a shared variable the outcome does not depend on whether or not the task is merged (that is, the task will always increment the same variable and will always compute the same value -for \code{x}). +for \ucode{x}). \cexample[3.1]{tasking}{11} \ffreeexample[3.1]{tasking}{11} -This second example shows an incorrect use of the \code{mergeable} clause. In +This second example shows an incorrect use of the \kcode{mergeable} clause. In this example, the created task will access different instances of the variable -\code{x} if the task is not merged, as \code{x} is \code{firstprivate}, but -it will access the same variable \code{x} if the task is merged. As a result, +\ucode{x} if the task is not merged, as \ucode{x} is firstprivate, but +it will access the same variable \ucode{x} if the task is merged. As a result, the behavior of the program is unspecified, and it can print two different values -for \code{x} depending on the decisions taken by the implementation. +for \ucode{x} depending on the decisions taken by the implementation. \cexample[3.1]{tasking}{12} \ffreeexample[3.1]{tasking}{12} -\index{task construct@\code{task} construct!final clause@\code{final} clause} -\index{clauses!final@\code{final}} -\index{final clause@\code{final} clause} -\index{routines!omp_in_final@\scode{omp_in_final}} -\index{omp_in_final routine@\scode{omp_in_final} routine} -The following example shows the use of the \code{final} clause and the \code{omp\_in\_final} +\index{task construct@\kcode{task} construct!final clause@\kcode{final} clause} +\index{clauses!final@\kcode{final}} +\index{final clause@\kcode{final} clause} +\index{routines!omp_in_final@\kcode{omp_in_final}} +\index{omp_in_final routine@\kcode{omp_in_final} routine} +The following example shows the use of the \kcode{final} clause and the \kcode{omp_in_final} API call in a recursive binary search program. 
To reduce overhead, once a certain -depth of recursion is reached the program uses the \code{final} clause to create +depth of recursion is reached the program uses the \kcode{final} clause to create only included tasks, which allow additional optimizations. -The use of the \code{omp\_in\_final} API call allows programmers to optimize +The use of the \kcode{omp_in_final} API call allows programmers to optimize their code by specifying which parts of the program are not necessary when a task -can create only included tasks (that is, the code is inside a \code{final} task). +can create only included tasks (that is, the code is inside a final task). In this example, the use of a different state variable is not necessary so once the program reaches the part of the computation that is finalized and copying from -the parent state to the new state is eliminated. The allocation of \code{new\_state} +the parent state to the new state is eliminated. The allocation of \ucode{new_state} in the stack could also be avoided but it would make this example less clear. The -\code{final} clause is most effective when used in conjunction with the \code{mergeable} -clause since all tasks created in a \code{final} task region are included tasks -that can be merged if the \code{mergeable} clause is present. +\kcode{final} clause is most effective when used in conjunction with the \kcode{mergeable} +clause since all tasks created in a final \kcode{task} region are included tasks +that can be merged if the \kcode{mergeable} clause is present. \cexample[3.1]{tasking}{13} \ffreeexample[3.1]{tasking}{13} -\index{task construct@\code{task} construct!if clause@\code{if} clause} -\index{clauses!if@\code{if}} -\index{if clause@\code{if} clause} -The following example illustrates the difference between the \code{if} and the -\code{final} clauses. The \code{if} clause has a local effect. 
In the first -nest of tasks, the one that has the \code{if} clause will be undeferred but -the task nested inside that task will not be affected by the \code{if} clause -and will be created as usual. Alternatively, the \code{final} clause affects -all \code{task} constructs in the \code{final} task region but not the \code{final} +\index{task construct@\kcode{task} construct!if clause@\kcode{if} clause} +\index{clauses!if@\kcode{if}} +\index{if clause@\kcode{if} clause} +The following example illustrates the difference between the \kcode{if} and the +\kcode{final} clauses. The \kcode{if} clause has a local effect. In the first +nest of tasks, the one that has the \kcode{if} clause will be undeferred but +the task nested inside that task will not be affected by the \kcode{if} clause +and will be created as usual. Alternatively, the \kcode{final} clause affects +all \kcode{task} constructs in the final \kcode{task} region but not the final task itself. In the second nest of tasks, the nested tasks will be created as included -tasks. Note also that the conditions for the \code{if} and \code{final} clauses +tasks. Note also that the conditions for the \kcode{if} and \kcode{final} clauses are usually the opposite. 
\cexample[3.1]{tasking}{14} diff --git a/tasking/taskloop.tex b/tasking/taskloop.tex index aacaa7f..dcc677b 100644 --- a/tasking/taskloop.tex +++ b/tasking/taskloop.tex @@ -1,21 +1,23 @@ -\pagebreak -\section{\code{taskloop} Construct} +%\pagebreak +\section{\kcode{taskloop} Construct} \label{sec:taskloop} -\index{constructs!taskloop@\code{taskloop}} -\index{taskloop construct@\code{taskloop} construct} -\index{taskloop construct@\code{taskloop} construct!grainsize clause@\code{grainsize} clause} -\index{taskloop construct@\code{taskloop} construct!nogroup clause@\code{nogroup} clause} -\index{clauses!grainsize@\code{grainsize}} -\index{grainsize clause@\code{grainsize} clause} -\index{clauses!nogroup@\code{nogroup}} -\index{nogroup clause@\code{nogroup} clause} +\index{constructs!taskloop@\kcode{taskloop}} +\index{taskloop construct@\kcode{taskloop} construct} +\index{taskloop construct@\kcode{taskloop} construct!grainsize clause@\kcode{grainsize} clause} +\index{taskloop construct@\kcode{taskloop} construct!nogroup clause@\kcode{nogroup} clause} +\index{clauses!grainsize@\kcode{grainsize}} +\index{grainsize clause@\kcode{grainsize} clause} +\index{clauses!nogroup@\kcode{nogroup}} +\index{nogroup clause@\kcode{nogroup} clause} The following example illustrates how to execute a long running task concurrently with tasks created -with a \code{taskloop} directive for a loop having unbalanced amounts of work for its iterations. +with a \kcode{taskloop} directive for a loop having unbalanced amounts of work for its iterations. -The \code{grainsize} clause specifies that each task is to execute at least 500 iterations of the loop. +The \kcode{grainsize} clause specifies that each task is to execute at least \ucode{500} iterations of the loop. 
-The \code{nogroup} clause removes the implicit taskgroup of the \code{taskloop} construct; the explicit \code{taskgroup} construct in the example ensures that the function is not exited before the long-running task and the loops have finished execution. +The \kcode{nogroup} clause removes the implicit taskgroup of the \kcode{taskloop} construct; +the explicit \kcode{taskgroup} construct in the example ensures that the function is not exited +before the long-running task and the loops have finished execution. \cexample[4.5]{taskloop}{1} @@ -23,24 +25,24 @@ \section{\code{taskloop} Construct} %\clearpage -Because a \code{taskloop} construct encloses a loop, it is often incorrectly +Because a \kcode{taskloop} construct encloses a loop, it is often incorrectly perceived as a worksharing construct (when it is directly nested in -a \code{parallel} region). +a \kcode{parallel} region). While a worksharing construct distributes the loop iterations across all threads in a team, -the entire loop of a \code{taskloop} construct is executed by every thread of the team. +the entire loop of a \kcode{taskloop} construct is executed by every thread of the team. In the example below the first taskloop occurs closely nested within -a \code{parallel} region and the entire loop is executed by each of the \plc{T} threads; -hence the reduction sum is executed \plc{T}*\plc{N} times. +a \kcode{parallel} region and the entire loop is executed by each of the \ucode{T} threads; +hence the reduction sum is executed \ucode{T}*\ucode{N} times. -The loop of the second taskloop is within a \code{single} region and is executed -by a single thread so that only \plc{N} reduction sums occur. (The other -\plc{N}-1 threads of the \code{parallel} region will participate in executing the -tasks. This is the common use case for the \code{taskloop} construct.) +The loop of the second taskloop is within a \kcode{single} region and is executed +by a single thread so that only \ucode{N} reduction sums occur. 
(The other +\ucode{N}-1 threads of the \kcode{parallel} region will participate in executing the +tasks. This is the common use case for the \kcode{taskloop} construct.) -In the example, the code thus prints \code{x1 = 16384} (\plc{T}*\plc{N}) and -\code{x2 = 1024} (\plc{N}). +In the example, the code thus prints \pout{x1 = 16384} (\ucode{T}*\ucode{N}) and +\pout{x2 = 1024} (\ucode{N}). \cexample[4.5]{taskloop}{2} diff --git a/tasking/taskyield.tex b/tasking/taskyield.tex index 2cd2da5..26ccc6c 100644 --- a/tasking/taskyield.tex +++ b/tasking/taskyield.tex @@ -1,13 +1,13 @@ \pagebreak -\section{\code{taskyield} Construct} +\section{\kcode{taskyield} Construct} \label{sec:taskyield} -\index{constructs!taskyield@\code{taskyield}} -\index{taskyield construct@\code{taskyield} construct} +\index{constructs!taskyield@\kcode{taskyield}} +\index{taskyield construct@\kcode{taskyield} construct} -The following example illustrates the use of the \code{taskyield} directive. +The following example illustrates the use of the \kcode{taskyield} construct. The tasks in the example compute something useful and then do some computation -that must be done in a critical region. By using \code{taskyield} when a task -cannot get access to the \code{critical} region the implementation can suspend +that must be done in a critical region. By using \kcode{taskyield} when a task +cannot get access to the \kcode{critical} region the implementation can suspend the current task and schedule some other task that can do something useful. 
\cexample[3.1]{taskyield}{1} diff --git a/util/chk_tags.c b/util/chk_tags.c index ca56b3c..34f6f55 100644 --- a/util/chk_tags.c +++ b/util/chk_tags.c @@ -31,6 +31,9 @@ #define T_PNAME 0x02 #define T_MTAGS 0x04 +#define LLN_DEF 75 +#define PRE_OMP "pre_omp_3.0" + #define skipspc(ic, cp) \ for (cp += ic; *cp == ' ' || *cp == '\t'; cp++) @@ -49,7 +52,7 @@ static struct tags_s { {"type", "", 4, 0, 0}, {"operation", "view|compile|link|run", 9, 0, 0}, {"expect", "success|error|ct-error|rt-error" - "|undefined|unspecified", 6, 0, 0}, + "|undefined|unspecified", 6, 0, 0}, {"version", "omp_*", 7, 0, 0}, {"env", "", 3, 0, 0}, {"depend", "", 6, 0, 0}, @@ -142,29 +145,21 @@ int check_type(char *ttype, char *fext, int *iext) } /* check validity of a tag value. - return 0 - unexpected, >= 1 - OK */ + return 0 - OK, 1 - unexpected */ int check_tvalue(char *tvalue, int itag) { - int s = 1; + int s = 0; if (*tvalue == '\0') - s = 0; + s = 1; else if (itag == vtag_idx) { - if (strncasecmp(tvalue, "pre_omp", 7) == 0) + if (strncasecmp(tvalue, "pre_omp", 7) != 0 && + strncasecmp(tvalue, tags[itag].tvals, 4) != 0) s = 1; - else - s = (strncasecmp(tvalue, tags[itag].tvals, 4) == 0)? 1 : 0; } else if (itag > 1 && itag < etag_idx) { char *cp = strcasestr(tags[itag].tvals, tvalue); if (!cp) - s = 0; - else { - char *vp = tags[itag].tvals; - while (vp != cp) { - if (*vp == '|') s++; - vp++; - } - } + s = 1; } return s; } @@ -207,7 +202,7 @@ int fix_tags(char *fname, char *mname, char *fext, int iext) if (tcnt == 1) { if (!tags[vtag_idx].c) { fprintf(fou, "%c", (iext>1)? 
'!':'*'); - fprintf(fou, " @@version:\tpre_omp_3.0\n"); + fprintf(fou, " @@version:\t%s\n", PRE_OMP); } tcnt = 2; } @@ -366,7 +361,7 @@ int proc_file(char *fname, int vflg) ic++; else if (!ctag) { prn_fname(&tcnt, fname, vflg); - fprintf(stderr, "\tmissing (:) after @@%s\n", tags[i].name); + fprintf(stderr, " missing (:) after @@%s\n", tags[i].name); tags[i].c = ic; tcnt |= T_MTAGS; } @@ -390,14 +385,14 @@ int proc_file(char *fname, int vflg) if (tags[i].c == 0) tags[i].c = ic; prn_fname(&tcnt, fname, vflg); tcnt |= T_MTAGS; - fprintf(stderr, "\tmis-matched @@%s: %s\n", tags[i].name, cp); + fprintf(stderr, " mis-matched @@%s: %s\n", tags[i].name, cp); } } else { s = check_tvalue(cp, i); - if (!s) { + if (s) { prn_fname(&tcnt, fname, vflg); - fprintf(stderr, "\t*** unknown value for @@%s: %s\n", + fprintf(stderr, " *** unknown value for @@%s: %s\n", tags[i].name, cp); } } @@ -408,10 +403,10 @@ int proc_file(char *fname, int vflg) else if (i < etag_idx || i >= max_tags) { prn_fname(&tcnt, fname, vflg); if (tags[i].c != 0) - fprintf(stderr, "\t*** duplicated tag - @@%.*s (lines %d, %d)\n", + fprintf(stderr, " *** duplicated tag - @@%.*s (lines %d, %d)\n", ic, cp, tags[i].r, lineno); else - fprintf(stderr, "\t*** unrecognized tag - @@%.*s\n", ic, cp); + fprintf(stderr, " *** unrecognized tag - @@%.*s\n", ic, cp); } } fclose(fp); @@ -419,7 +414,7 @@ int proc_file(char *fname, int vflg) if (ctag) return cnt; if (!tags[0].c || !tags[1].c) { prn_fname(&tcnt, fname, vflg); - fprintf(stderr, "\t*** no"); + fprintf(stderr, " *** no"); if (!tags[0].c) fprintf(stderr, " @@%s", tags[0].name); if (!tags[0].c && !tags[1].c) fprintf(stderr, " or"); if (!tags[1].c) fprintf(stderr, " @@%s", tags[1].name); @@ -428,7 +423,7 @@ int proc_file(char *fname, int vflg) if ((vflg&F_STRICT) && !tags[vtag_idx].c) { tcnt |= T_MTAGS; prn_fname(&tcnt, fname, vflg); - fprintf(stderr, "\t*** no @@%s tag found\n", tags[vtag_idx].name); + fprintf(stderr, " *** no @@%s tag found\n", 
tags[vtag_idx].name); } /* fix tag values when required */ @@ -466,7 +461,7 @@ int get_mkeys(char *kbuf, char *sname, char *tag) n = 0; tag[n] = '\0'; if (*cp == ']') cp++; - while (*cp == ' ' || *cp == '\t') cp++; + skipspc(0, cp); } else tag[0] = '\0'; @@ -617,17 +612,17 @@ void usage(char *pgnam) printf("options:\n\ -sc ; strictly check for \"@@:\"\n\ -vtag ; check version tags in tex file\n\ - -clen ; check line length over a limit (def =75)\n\ + -clen ; check line length over a limit (def =%d)\n\ -fix ; apply tag fix if needed (to _fix)\n\ -list ; list tags (with -vtag to list version tags)\n\ - -v ; view filenames\n"); + -v ; view filenames\n", LLN_DEF); exit(0); } /* the driver */ int main(int argc, char *argv[]) { - int cnt, fcnt = -1, bcnt = -1, scnt = 0, tcnt = 0, vflg = 0, lln = 75; + int cnt, fcnt = -1, bcnt = -1, scnt = 0, tcnt = 0, vflg = 0, lln = LLN_DEF; char *pgnam = strrchr(argv[0], '/'); pgnam = pgnam? (pgnam+1) : argv[0]; while (--argc > 0) { @@ -648,7 +643,7 @@ int main(int argc, char *argv[]) if (*cp == '=') cp++; if (*cp) { lln = atoi(cp); - if (lln < 2) lln = 75; + if (lln < 2) lln = LLN_DEF; } } else if (strcmp(cp, "-h") == 0) diff --git a/util/list_tags b/util/list_tags index 73a4b63..e10b7d3 100755 --- a/util/list_tags +++ b/util/list_tags @@ -10,7 +10,8 @@ class Pflag(Enum): clen = 2 env = 3 fenv = 4 -tag_names = ['name', 'type', 'operation', 'expect', 'version', 'env', 'depend'] +tag_names = ['name', 'type', 'operation', 'expect', 'version', + 'env', 'depend'] total_cnt = 0 def print_hline(): @@ -32,8 +33,18 @@ def esc_vstr(tag, val): a += '"' return a +def chk_ext(fname): + idx = fname.rfind(".") + 1 + if idx > 0: + ext = "cfht" + if ext.find(fname[idx]) >= 0: + return 1 + return 0 + def list_tags(fname, pflag): global total_cnt + if chk_ext(fname) == 0: + return 0 with open(fname, 'r') as f: tags = {} cnt = 0 @@ -52,6 +63,7 @@ def list_tags(fname, pflag): tags[m[0]] = re.sub('\n', '', m[1]) elif cnt > 0: break + if cnt > 0 and pflag 
== Pflag.vtag: idx = fname.rfind('/') + 1 if idx < 1: @@ -62,6 +74,7 @@ def list_tags(fname, pflag): a += re.sub("omp_","",tags[tag]) a += '}{'+'{0:d}'.format(cnt)+'}{}{}{}}' print(a) + elif pflag == Pflag.list: total_cnt += 1 if total_cnt == 1: @@ -72,9 +85,12 @@ def list_tags(fname, pflag): if tag in tags: a += esc_vstr(tag, tags[tag]) print(a) + return cnt def check_lln(fname, lsize): + if chk_ext(fname) == 0: + return 0 with open(fname, 'r') as f: cnt, s, lineno = 0, 0, 0 for line in f: diff --git a/versioninfo b/versioninfo new file mode 100644 index 0000000..a70cecf --- /dev/null +++ b/versioninfo @@ -0,0 +1,8 @@ +# Examples Document Version +version=5.2.2 + +# Supported Spec Version +version_spec=5.2 + +# Document Release Date +version_date=April 2024