Skip to content

Commit

Permalink
Merge branch 'master' of github.com:RRZE-HPC/likwid
Browse files Browse the repository at this point in the history
  • Loading branch information
TomTheBear committed Nov 5, 2023
2 parents df82943 + 5dddffb commit 31854d9
Show file tree
Hide file tree
Showing 56 changed files with 15,008 additions and 6,136 deletions.
13 changes: 13 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -122,9 +122,15 @@ OBJ := $(filter-out $(BUILD_DIR)/loadDataARM.o,$(OBJ))
endif
# Unless the Nvidia interface is enabled, remove the Nvidia-specific objects
# from OBJ in a single filter-out pass.
ifneq ($(NVIDIA_INTERFACE), true)
OBJ := $(filter-out $(addprefix $(BUILD_DIR)/,nvmon.o nvmon_nvml.o topology_gpu.o libnvctr.o),$(OBJ))
endif
# Unless the ROCm interface is enabled, remove the AMD GPU objects from OBJ.
# The names must match the object files exactly: the marker object is built
# from rocmon_marker.c as rocmon_marker.o (underscore), and filter-out
# silently matches nothing for a misspelled name.
ifneq ($(ROCM_INTERFACE), true)
OBJ := $(filter-out $(BUILD_DIR)/rocmon.o,$(OBJ))
OBJ := $(filter-out $(BUILD_DIR)/rocmon_marker.o,$(OBJ))
OBJ := $(filter-out $(BUILD_DIR)/topology_gpu_rocm.o,$(OBJ))
endif
ifeq ($(COMPILER),GCCPOWER)
OBJ := $(filter-out $(BUILD_DIR)/topology_cpuid.o,$(OBJ))
OBJ := $(filter-out $(BUILD_DIR)/access_x86.o,$(OBJ))
Expand Down Expand Up @@ -195,6 +201,7 @@ $(L_APPS): $(addprefix $(SRC_DIR)/applications/,$(addsuffix .lua,$(L_APPS)))
@echo "===> ADJUSTING $@"
@if [ "$(ACCESSMODE)" = "direct" ]; then sed -i -e s/"access_mode = 1"/"access_mode = 0"/g $(SRC_DIR)/applications/$@.lua;fi
@sed -e s/'<INSTALLED_BINPREFIX>'/$(subst /,\\/,$(INSTALLED_BINPREFIX))/g \
-e s/'<INSTALLED_LIBPREFIX>'/$(subst /,\\/,$(INSTALLED_LIBPREFIX))/g \
-e s/'<INSTALLED_PREFIX>'/$(subst /,\\/,$(INSTALLED_PREFIX))/g \
-e s/'<VERSION>'/$(VERSION).$(RELEASE).$(MINOR)/g \
-e s/'<DATE>'/$(DATE)/g \
Expand Down Expand Up @@ -236,6 +243,7 @@ $(DYNAMIC_TARGET_LIB): $(BUILD_DIR) $(PERFMONHEADERS) $(OBJ) $(TARGET_HWLOC_LIB)
@ln -sf $(TARGET_LIB) $(TARGET_LIB).$(VERSION).$(RELEASE)
@sed -e s+'@PREFIX@'+$(INSTALLED_PREFIX)+g \
-e s+'@NVIDIA_INTERFACE@'+$(NVIDIA_INTERFACE)+g \
-e s+'@ROCM_INTERFACE@'+$(ROCM_INTERFACE)+g \
-e s+'@FORTRAN_INTERFACE@'+$(FORTRAN_INTERFACE)+g \
-e s+'@LIBPREFIX@'+$(INSTALLED_LIBPREFIX)+g \
-e s+'@BINPREFIX@'+$(INSTALLED_BINPREFIX)+g \
Expand Down Expand Up @@ -303,6 +311,11 @@ $(BUILD_DIR)/%.o: %.c
$(Q)$(CC) -c $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@
$(Q)$(CC) $(DEBUG_FLAGS) $(CPPFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d

# Dedicated rule for the ROCm marker object. It compiles rocmon_marker.c with
# the same flags as the generic C rule, then uses objcopy to rename the symbol
# HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE inside the object
# (NOTE(review): presumably to avoid a duplicate-symbol clash at link time --
# confirm). Because this explicit rule takes precedence over the generic
# %.o: %.c rule, it also has to write the .d dependency file itself; otherwise
# header changes would not trigger a rebuild of this object.
$(BUILD_DIR)/rocmon_marker.o: rocmon_marker.c
	@echo "===> COMPILE $@"
	$(Q)$(CC) -c $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@
	$(Q)objcopy --redefine-sym HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE=HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE2 $@
	$(Q)$(CC) $(DEBUG_FLAGS) $(CPPFLAGS) -MT $@ -MM $< > $(@:.o=.d)

$(BUILD_DIR)/%.o: %.cc
@echo "===> COMPILE $@"
$(Q)$(CXX) -c $(DEBUG_FLAGS) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
Expand Down
28 changes: 28 additions & 0 deletions README_ROCM.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
## Build & Install

```bash
export ROCM_HOME=/opt/rocm
make
make install
```

## Test

Build

```bash
cd test
# make clean
make test-topology-gpu-rocm
make test-rocmon-triad
make test-rocmon-triad-marker
```

Run

```bash
export LD_LIBRARY_PATH=/path/to/likwid/install/lib:/opt/rocm/hip/lib:/opt/rocm/hsa/lib:/opt/rocm/rocprofiler/lib:$LD_LIBRARY_PATH # replace /path/to/likwid/install with your LIKWID install prefix
export ROCP_METRICS=/opt/rocm/rocprofiler/lib/metrics.xml # for rocmon test
export HSA_TOOLS_LIB=librocprofiler64.so.1 # allows rocmon to intercept hsa commands
./gpu-test-topology-gpu-rocm
```
14 changes: 14 additions & 0 deletions config.mk
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ INSTRUMENT_BENCH = true#NO SPACE
# For configuring include paths, go to CUDA section
NVIDIA_INTERFACE = false#NO SPACE

# Build LIKWID with AMD GPU interface (ROCm)
# For configuring include paths, go to ROCm section
ROCM_INTERFACE = false#NO SPACE

#################################################################
#################################################################
# Advanced configuration options #
Expand Down Expand Up @@ -172,3 +176,13 @@ CUPTIINCLUDE = $(CUDA_HOME)/extras/CUPTI/include
# In order to hook into the CUDA application, the appDaemon is required
# If you just want the NvMarkerAPI, you can keep it false
BUILDAPPDAEMON=false

# ROCm build data
# LIKWID requires ROCm to be present only for compilation with
# ROCM_INTERFACE=true. At runtime, the ROCm libraries have
# to be in LD_LIBRARY_PATH so that they can be loaded dynamically.
# Include directory for ROCm headers
HSAINCLUDE = $(ROCM_HOME)/include
ROCPROFILERINCLUDE = $(ROCM_HOME)/include/rocprofiler
HIPINCLUDE = $(ROCM_HOME)/include
RSMIINCLUDE = $(ROCM_HOME)/include
16 changes: 13 additions & 3 deletions doc/applications/likwid-perfctr.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,11 @@ custom event sets. The \ref Marker_API can measure mulitple named regions and th
</TR>
<TR>
<TD>-W, --gpugroup &lt;arg&gt;</TD>
<TD>Specify which event string or performance group should be measured on the GPUs. Only if built with NVIDIA_INTERFACE=true.</TD>
<TD>Specify which event string or performance group should be measured on the Nvidia GPUs. Only if built with NVIDIA_INTERFACE=true.</TD>
</TR>
<TR>
<TD>-R &lt;arg&gt;</TD>
<TD>Specify which event string or performance group should be measured on the AMD GPUs. Only if built with ROCM_INTERFACE=true.</TD>
</TR>
<TR>
<TD>-c &lt;arg&gt;</TD>
Expand All @@ -68,7 +72,11 @@ custom event sets. The \ref Marker_API can measure mulitple named regions and th
</TR>
<TR>
<TD>-G &lt;arg&gt;</TD>
<TD>Defines the GPUs that should be measured<BR>You can use simple lists like 0,1,3 or ranges like 0-2. Only if built with NVIDIA_INTERFACE=true.</TD>
<TD>Defines the Nvidia GPUs that should be measured<BR>You can use simple lists like 0,1,3 or ranges like 0-2. Only if built with NVIDIA_INTERFACE=true.</TD>
</TR>
<TR>
<TD>-I &lt;arg&gt;</TD>
<TD>Defines the AMD GPUs that should be measured<BR>You can use simple lists like 0,1,3 or ranges like 0-2. Only if built with ROCM_INTERFACE=true.</TD>
</TR>
<TR>
<TD>-H</TD>
Expand Down Expand Up @@ -274,6 +282,8 @@ The LIKWID package contains an example code: see \ref F-markerAPI-code.
Since the calls to the LIKWID library are executed by your application, the runtime will increase and, in specific circumstances, there are some other problems like the time measurement. You can execute <CODE>LIKWID_MARKER_THREADINIT</CODE> and <CODE>LIKWID_MARKER_START</CODE> inside the same parallel region but put a barrier between the calls to ensure that there is no big timing difference between the threads. The common way is to init LIKWID and the participating threads inside of an initialization routine, use only START and STOP in your code and close the Marker API in a finalization routine. Be aware that at the first start of a region, the thread-local hash table gets a new entry to store the measured values. If your code inside the region is short or you are executing the region only once, the overhead of creating the hash table entry can be significant compared to the execution of the region code. The hash table entries can be created in advance by using the <CODE>LIKWID_MARKER_REGISTER</CODE> function. It must be called by each thread and one time for each compute region. It is completely <I>optional</I>; <CODE>LIKWID_MARKER_START</CODE> performs the same operations.

<H2>CUDA code</H2>
With LIKWID 5.0 CUDA kernels can be measured. There is a special NvMarkerAPI for Nvidia GPUs. The usage is similar to the CPU MarkerAPI, just replace <CODE>LIKWID_MARKER_</CODE> with <CODE>LIKWID_NVMARKER_</CODE>. The two MarkerAPIs can be mixed.
With LIKWID 5.0 CUDA kernels can be measured. There is a special NvMarkerAPI for Nvidia GPUs. The usage is similar to the CPU MarkerAPI, just replace <CODE>LIKWID_MARKER_</CODE> with <CODE>LIKWID_NVMARKER_</CODE>. All MarkerAPIs can be mixed.

<H2>ROCm code</H2>
ROCm kernels can be measured. There is a special RocmonMarkerAPI for AMD GPUs. The usage is similar to the CPU or Nvidia MarkerAPI, just replace <CODE>LIKWID_MARKER_</CODE> with <CODE>ROCMON_MARKER_</CODE>. All MarkerAPIs can be mixed.
*/
7 changes: 5 additions & 2 deletions doc/likwid-doxygen.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/*! \mainpage LIKWID - Like I Knew What I Am Doing

\section Introduction
This is an effort to develop easy to use but yet powerful performance tools for the GNU Linux operating system. While the focus of LIKWID was on x86 processors, it is now ported to ARM and POWER processors. A backend for Nvidia GPUs is part of LIKWID with version 5.0.<BR>
This is an effort to develop easy to use but yet powerful performance tools for the GNU Linux operating system. While the focus of LIKWID was on x86 processors, it is now ported to ARM and POWER processors. A backend for Nvidia GPUs is part of LIKWID with version 5.0. With the Rocmon backend, AMD GPUs can be monitored.<BR>

LIKWID follows the philosophy:
- Simple
Expand All @@ -16,7 +16,7 @@ LIKWID follows the philosophy:
\section Tools LIKWID Tools
- \ref likwid-topology : A tool to display the thread and cache topology on multicore/multisocket computers.
- \ref likwid-pin : A tool to pin your threaded application without changing your code. Works for pthreads and OpenMP.
- \ref likwid-perfctr : A tool to measure hardware performance counters on x86, ARM and POWER processors as well as Nvidia GPUs. It can be used as wrapper application without modifying the profiled code or with a marker API to measure only parts of the code.
- \ref likwid-perfctr : A tool to measure hardware performance counters on x86, ARM and POWER processors as well as Nvidia/AMD GPUs. It can be used as wrapper application without modifying the profiled code or with a marker API to measure only parts of the code.
- \ref likwid-powermeter : A tool for accessing RAPL counters and query Turbo mode steps on Intel processor. RAPL counters are also available in \ref likwid-perfctr.
- \ref likwid-setFrequencies : A tool to print and manage the clock frequency of CPU hardware threads and the Uncore (Intel only).
- \ref likwid-memsweeper : A tool to cleanup ccNUMA domains and LLC caches to get a clean environment for benchmarks.
Expand Down Expand Up @@ -133,6 +133,9 @@ Optionally, a global configuration file \ref likwid.cfg can be given to modify s
- For compute capability < 7.0: support based on CUPTI Events API
- For compute capability >= 7.0: support based on CUPTI Profiling API

\subsection Architectures_AMD AMD GPU architectures
- ROCm 5.0 and higher capable GPUs

\section Examples Example Codes
Using the Likwid API:
- \ref C-likwidAPI-code
Expand Down
23 changes: 20 additions & 3 deletions doc/likwid-perfctr.1
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
.TH LIKWID-PERFCTR 1 <DATE> likwid\-<VERSION>
.SH NAME
likwid-perfctr \- configure and read out hardware performance counters on x86, ARM and POWER CPUs and Nvidia GPUs
likwid-perfctr \- configure and read out hardware performance counters on x86, ARM and POWER CPUs and Nvidia/AMD GPUs
.SH SYNOPSIS
.B likwid-perfctr
.RB [\-vhHmaiefO]
Expand Down Expand Up @@ -34,6 +34,12 @@ or
.IR gpu_performance_group
or
.IR gpu_performance_event_string (*) ]
.RB [ \-I
.IR gpu_list (**) ]
.RB [ \-R
.IR gpu_performance_group
or
.IR gpu_performance_event_string (**) ]
.RB [ \-\-stats ]
.SH DESCRIPTION
.B likwid-perfctr
Expand All @@ -44,6 +50,7 @@ There are preconfigured performance groups with useful event sets and derived me
events can be measured with custom event sets. The marker API can measure multiple named regions and the
results are accumulated over multiple region calls.
.IR (*) Option only available if built with Nvidia GPU support
.IR (**) Option only available if built with AMD GPU support

.SH OPTIONS
.TP
Expand All @@ -66,7 +73,7 @@ run in marker API mode
print available performance groups for current processor, then exit.
.TP
.B \-\^e
print available counters and performance events of current processor and (if available) Nvidia GPUs.
print available counters and performance events of current processor and (if available) Nvidia or AMD GPUs.
.TP
.B \-\^o, \-\-\^output <filename>
store all output to a file instead of stdout. For the filename the following placeholders are supported:
Expand Down Expand Up @@ -116,7 +123,7 @@ Force writing of registers even if they are in use.
Print only events and corresponding counters matching <search_str>
.TP
.B \-\^G, \-\-\^gpus <gpu_list>
specify a numerical list of GPU IDs. The list may contain multiple
specify a numerical list of Nvidia GPU IDs. The list may contain multiple
items, separated by comma, and ranges. For example 0,3,9-11.
.TP
.B \-\^W, \-\-\^gpugroup <gpu performance group> or <gpu performance event set string>
Expand All @@ -125,6 +132,16 @@ This can be one of the tags output with the -a flag in the GPU section.
Also a custom event set can be specified by a comma separated list of events. Each event has the format
eventId:GPUx (x=0,1,2,...). You can add as many events to the string until you hit an error.
.TP
.B \-\^I <gpu_list>
specify a numerical list of AMD GPU IDs. The list may contain multiple
items, separated by comma, and ranges. For example 0,3,9-11.
.TP
.B \-\^R <gpu performance group> or <gpu performance event set string>
specify which performance group to measure on the specified AMD GPUs.
This can be one of the tags output with the -a flag in the GPU section.
Also a custom event set can be specified by a comma separated list of events. Each event has the format
eventId:GPUx (x=0,1,2,...). You can add as many events to the string until you hit an error.
.TP
.B \-\-\^stats
Always print statistics table

Expand Down
8 changes: 4 additions & 4 deletions doc/likwid-topology.1
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
.TH LIKWID-TOPOLOGY 1 <DATE> likwid\-<VERSION>
.SH NAME
likwid-topology \- print thread, cache, NUMA and Nvidia GPU topology
likwid-topology \- print thread, cache, NUMA and Nvidia/AMD GPU topology
.SH SYNOPSIS
.B likwid-topology
.RB [\-hvgcCG]
Expand All @@ -11,12 +11,12 @@ likwid-topology \- print thread, cache, NUMA and Nvidia GPU topology
.SH DESCRIPTION
.B likwid-topology
is a command line application to print the thread and cache
topology on multicore x86, ARM and POWER processors and Nvidia GPUs.
topology on multicore x86, ARM and POWER processors and Nvidia/AMD GPUs.
Used with mono spaced fonts it can draw the processor topology of a
machine in ASCII art. Beyond topology likwid-topology determines the
clock of a processor and prints detailed information about the cache hierarchy.
When compiled with NVIDIA_INTERFACE=true in config.mk and the CUDA/CUPTI library reachable
at runtime, likwid-topology prints information about the Nvidia GPUs in the system.
at runtime, likwid-topology prints information about the Nvidia GPUs in the system. The same is possible for AMD GPUs with ROCM_INTERFACE=true and the required ROCm libraries.
.SH OPTIONS
.TP
.B \-h, \-\-\^help
Expand All @@ -38,7 +38,7 @@ prints detailed information about cache hierarchy
measures and outputs the processor clock. This involves a longer run time of likwid-topology.
.TP
.B \-G, \-\-\^gpus
prints detailed information about the Nvidia GPUs in the system (if compiled with Nvidia support)
prints detailed information about the Nvidia/AMD GPUs in the system (if compiled with Nvidia or AMD support)
.TP
.B \-o, \-\-\^output <file>
write the output to file instead of stdout.
Expand Down
13 changes: 13 additions & 0 deletions groups/amd_gpu/GDS.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
SHORT GDS Instructions

EVENTSET
ROCM0 ROCP_SQ_INSTS_GDS
ROCM1 ROCP_SQ_WAVES

METRICS
GPU GDS rw insts per work-item ROCM0/ROCM1

LONG
--
The average number of GDS read or GDS write instructions executed
per work item (affected by flow control).
16 changes: 16 additions & 0 deletions groups/amd_gpu/MEM.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
SHORT Memory utilization

EVENTSET
ROCM0 ROCP_TA_TA_BUSY
ROCM1 ROCP_GRBM_GUI_ACTIVE
ROCM2 ROCP_SE_NUM

METRICS
GPU memory utilization 100*max(ROCM0,16)/ROCM1/ROCM2

LONG
--
The percentage of GPUTime the memory unit is active. The result includes
the stall time (MemUnitStalled). This is measured with all extra fetches
and writes and any cache or memory effects taken into account.
Value range: 0% to 100% (fetch-bound).
18 changes: 18 additions & 0 deletions groups/amd_gpu/PCI.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
SHORT PCI Transfers

EVENTSET
ROCM0 RSMI_PCI_THROUGHPUT_SENT
ROCM1 RSMI_PCI_THROUGHPUT_RECEIVED


METRICS
Runtime time
PCI sent ROCM0
PCI received ROCM1
PCI send bandwidth 1E-6*ROCM0/time
PCI recv bandwidth 1E-6*ROCM1/time

LONG
--
Currently not usable since the RSMI_PCI_THROUGHPUT_* events require
one second per call, so 2 seconds for both of them.
17 changes: 17 additions & 0 deletions groups/amd_gpu/POWER.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
SHORT Power, temperature and voltage

EVENTSET
ROCM0 RSMI_POWER_AVE[0]
ROCM1 RSMI_TEMP_EDGE
ROCM2 RSMI_VOLT_VDDGFX


METRICS
Power average 1E-6*ROCM0
Edge temperature 1E-3*ROCM1
Voltage 1E-3*ROCM2

LONG
--
Gets the current average power consumption in watts, the
temperature in celsius and the voltage in volts.
13 changes: 13 additions & 0 deletions groups/amd_gpu/SALU.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
SHORT SALU Instructions

EVENTSET
ROCM0 ROCP_SQ_INSTS_SALU
ROCM1 ROCP_SQ_WAVES

METRICS
GPU SALU insts per work-item ROCM0/ROCM1

LONG
--
The average number of scalar ALU instructions executed per work-item
(affected by flow control).
13 changes: 13 additions & 0 deletions groups/amd_gpu/SFETCH.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
SHORT SFetch Instructions

EVENTSET
ROCM0 ROCP_SQ_INSTS_SMEM
ROCM1 ROCP_SQ_WAVES

METRICS
GPU SFETCH insts per work-item ROCM0/ROCM1

LONG
--
The average number of scalar fetch instructions from the video memory
executed per work-item (affected by flow control).
17 changes: 17 additions & 0 deletions groups/amd_gpu/STALLED.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
SHORT ALU stalled by LDS

EVENTSET
ROCM0 ROCP_SQ_WAIT_INST_LDS
ROCM1 ROCP_SQ_WAVES
ROCM2 ROCP_GRBM_GUI_ACTIVE

METRICS
GPU ALU stalled 100*ROCM0*4/ROCM1/ROCM2

LONG
--
The percentage of GPUTime ALU units are stalled by the LDS input queue
being full or the output queue being not ready. If there are LDS bank
conflicts, reduce them. Otherwise, try reducing the number of LDS
accesses if possible.
Value range: 0% (optimal) to 100% (bad).
16 changes: 16 additions & 0 deletions groups/amd_gpu/UTIL.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
SHORT GPU utilization

EVENTSET
ROCM0 ROCP_GRBM_COUNT
ROCM1 ROCP_GRBM_GUI_ACTIVE


METRICS
GPU utilization 100*ROCM1/ROCM0


LONG
--
This group resembles the 'GPUBusy' metric provided by RocProfiler.
Note that the GPUBusy metric can also be selected directly; in that case
the calculations are done internally, in case the metric formula changes.
Loading

0 comments on commit 31854d9

Please sign in to comment.