Skip to content
This repository has been archived by the owner on Jan 7, 2023. It is now read-only.

Commit

Permalink
intel/fs/copy-prop: Don't walk all the ACPs for each instruction
Browse files Browse the repository at this point in the history
In order to set up KILL sets, the dataflow code was walking the entire
array of ACPs for every instruction.  If you assume the number of ACPs
increases roughly with the number of instructions, this is O(n^2).  As
it turns out, regions_overlap() is not nearly as cheap as one would like
and shows up as a significant chunk on perf traces.

This commit changes things around and instead first builds an array of
exec_lists which it uses like a hash table (keyed off ACP source or
destination) similar to what's done in the rest of the copy-prop code.
By first walking the list of ACPs and populating the table and then
walking instructions and only looking at ACPs which probably have the
same VGRF number, we can reduce the complexity to O(n).  This takes the
execution time of the piglit vs-isnan-dvec test from about 56.4 seconds
on an unoptimized debug build (what we run in CI) with NIR_VALIDATE=0 to
about 38.7 seconds.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
  • Loading branch information
gfxstrand committed May 10, 2019
1 parent 20bbc17 commit f8bda81
Showing 1 changed file with 68 additions and 11 deletions.
79 changes: 68 additions & 11 deletions src/intel/compiler/brw_fs_copy_propagation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#define ACP_HASH_SIZE 64

#include "util/bitset.h"
#include "util/u_math.h"
#include "brw_fs.h"
#include "brw_fs_live_variables.h"
#include "brw_cfg.h"
Expand All @@ -46,6 +47,7 @@ namespace { /* avoid conflict with opt_copy_propagation_elements */
struct acp_entry : public exec_node {
fs_reg dst;
fs_reg src;
unsigned global_idx;
uint8_t size_written;
uint8_t size_read;
enum opcode opcode;
Expand Down Expand Up @@ -142,6 +144,8 @@ fs_copy_prop_dataflow::fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg,
foreach_in_list(acp_entry, entry, &out_acp[block->num][i]) {
acp[next_acp] = entry;

entry->global_idx = next_acp;

/* opt_copy_propagation_local populates out_acp with copies created
* in a block which are still live at the end of the block. This
* is exactly what we want in the COPY set.
Expand All @@ -167,21 +171,74 @@ void
fs_copy_prop_dataflow::setup_initial_values()
{
/* Initialize the COPY and KILL sets. */
foreach_block (block, cfg) {
foreach_inst_in_block(fs_inst, inst, block) {
if (inst->dst.file != VGRF)
continue;
{
/* Create a temporary table of ACP entries which we'll use for efficient
* look-up. Unfortunately, we have to do this in two steps because we
* have to match both sources and destinations and an ACP entry can only
* be in one list at a time.
*
* We choose to make the table size between num_acp/2 and num_acp/4 to
* try and trade off between the time it takes to initialize the table
* via exec_list constructors or make_empty() and the cost of
* collisions. In practice, it doesn't appear to matter too much what
* size we make the table as long as it's roughly the same order of
* magnitude as num_acp. We get most of the benefit of the table
* approach even if we use a table of size ACP_HASH_SIZE though a
* full-sized table is 1-2% faster in practice.
*/
unsigned acp_table_size = util_next_power_of_two(num_acp) / 4;
acp_table_size = MAX2(acp_table_size, ACP_HASH_SIZE);
exec_list *acp_table = new exec_list[acp_table_size];

/* Mark ACP entries which are killed by this instruction. */
for (int i = 0; i < num_acp; i++) {
if (regions_overlap(inst->dst, inst->size_written,
acp[i]->dst, acp[i]->size_written) ||
regions_overlap(inst->dst, inst->size_written,
acp[i]->src, acp[i]->size_read)) {
BITSET_SET(bd[block->num].kill, i);
/* First, get all the KILLs for instructions which overwrite ACP
* destinations.
*/
for (int i = 0; i < num_acp; i++) {
unsigned idx = acp[i]->dst.nr & (acp_table_size - 1);
acp_table[idx].push_tail(acp[i]);
}

foreach_block (block, cfg) {
foreach_inst_in_block(fs_inst, inst, block) {
if (inst->dst.file != VGRF)
continue;

unsigned idx = inst->dst.nr & (acp_table_size - 1);
foreach_in_list(acp_entry, entry, &acp_table[idx]) {
if (regions_overlap(inst->dst, inst->size_written,
entry->dst, entry->size_written))
BITSET_SET(bd[block->num].kill, entry->global_idx);
}
}
}

/* Clear the table for the second pass */
for (unsigned i = 0; i < acp_table_size; i++)
acp_table[i].make_empty();

/* Next, get all the KILLs for instructions which overwrite ACP
* sources.
*/
for (int i = 0; i < num_acp; i++) {
unsigned idx = acp[i]->src.nr & (acp_table_size - 1);
acp_table[idx].push_tail(acp[i]);
}

foreach_block (block, cfg) {
foreach_inst_in_block(fs_inst, inst, block) {
if (inst->dst.file != VGRF)
continue;

unsigned idx = inst->dst.nr & (acp_table_size - 1);
foreach_in_list(acp_entry, entry, &acp_table[idx]) {
if (regions_overlap(inst->dst, inst->size_written,
entry->src, entry->size_read))
BITSET_SET(bd[block->num].kill, entry->global_idx);
}
}
}

delete [] acp_table;
}

/* Populate the initial values for the livein and liveout sets. For the
Expand Down

0 comments on commit f8bda81

Please sign in to comment.