From 0dd18d70aecd3bafb7dcde15430a024942fe243c Mon Sep 17 00:00:00 2001 From: Francisco Jerez Date: Fri, 3 Jan 2020 15:06:52 -0800 Subject: [PATCH] intel/fs/gen6: Generalize aligned_pairs_class to SIMD16 aligned barycentrics. This is mainly meant to avoid shader-db regressions on SNB as we start using VGRFs for barycentrics more frequently. Currently the aligned_pairs_class is only useful in SIMD8 mode, because in SIMD16 mode barycentric vectors are typically 4 GRFs. This is not a problem on Gen4-5, because on those platforms all VGRF allocations are pair-aligned in SIMD16 mode. However on Gen6 we end up using either the fast or the slow path of LINTERP rather non-deterministically based on the behavior of the register allocator. Fix it by repurposing aligned_pairs_class to hold PLN-aligned registers of whatever the natural size of a barycentric vector is in the current dispatch width. On SNB this prevents the following shader-db regressions (including SIMD32 programs) in combination with the interpolation rework part of this series: total instructions in shared programs: 13983257 -> 14527274 (3.89%) instructions in affected programs: 1766255 -> 2310272 (30.80%) helped: 0 HURT: 11608 LOST: 26 GAINED: 13 Reviewed-by: Kenneth Graunke --- src/intel/compiler/brw_compiler.h | 4 +- src/intel/compiler/brw_fs_reg_allocate.cpp | 73 +++++++++++++--------- 2 files changed, 46 insertions(+), 31 deletions(-) diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index 6e6610a26af..61b7b4f21e6 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -83,10 +83,10 @@ struct brw_compiler { uint8_t *ra_reg_to_grf; /** - * ra class for the aligned pairs we use for PLN, which doesn't + * ra class for the aligned barycentrics we use for PLN, which doesn't * appear in *classes. */ - int aligned_pairs_class; + int aligned_bary_class; } fs_reg_sets[3]; void (*shader_debug_log)(void *, const char *str, ...) PRINTFLIKE(2, 3); diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp index f53fb2bf253..b26f10ea44e 100644 --- a/src/intel/compiler/brw_fs_reg_allocate.cpp +++ b/src/intel/compiler/brw_fs_reg_allocate.cpp @@ -72,6 +72,15 @@ fs_visitor::assign_regs_trivial() } +/** + * Size of a register from the aligned_bary_class register class. + */ +static unsigned +aligned_bary_size(unsigned dispatch_width) +{ + return (dispatch_width == 8 ? 2 : 4); +} + static void brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width) { @@ -145,10 +154,11 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width) if (devinfo->gen >= 6) ra_set_allocate_round_robin(regs); int *classes = ralloc_array(compiler, int, class_count); - int aligned_pairs_class = -1; + int aligned_bary_class = -1; /* Allocate space for q values. We allocate class_count + 1 because we - * want to leave room for the aligned pairs class if we have it. */ + * want to leave room for the aligned barycentric class if we have it. + */ unsigned int **q_values = ralloc_array(compiler, unsigned int *, class_count + 1); for (int i = 0; i < class_count + 1; ++i) @@ -158,8 +168,8 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width) * between them and the base GRF registers (and also each other). */ int reg = 0; - int pairs_base_reg = 0; - int pairs_reg_count = 0; + int aligned_bary_base_reg = 0; + int aligned_bary_reg_count = 0; for (int i = 0; i < class_count; i++) { int class_reg_count; if (devinfo->gen <= 5 && dispatch_width >= 16) { @@ -202,10 +212,10 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width) } classes[i] = ra_alloc_reg_class(regs); - /* Save this off for the aligned pair class at the end. */ - if (class_sizes[i] == 2) { - pairs_base_reg = reg; - pairs_reg_count = class_reg_count; + /* Save this off for the aligned barycentric class at the end. */ + if (class_sizes[i] == int(aligned_bary_size(dispatch_width))) { + aligned_bary_base_reg = reg; + aligned_bary_reg_count = class_reg_count; } if (devinfo->gen <= 5 && dispatch_width >= 16) { @@ -246,29 +256,33 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width) for (int reg = 0; reg < base_reg_count; reg++) ra_make_reg_conflicts_transitive(regs, reg); - /* Add a special class for aligned pairs, which we'll put delta_xy - * in on Gen <= 6 so that we can do PLN. + /* Add a special class for aligned barycentrics, which we'll put the + * first source of LINTERP on so that we can do PLN on Gen <= 6. */ - if (devinfo->has_pln && dispatch_width == 8 && devinfo->gen <= 6) { - aligned_pairs_class = ra_alloc_reg_class(regs); - - for (int i = 0; i < pairs_reg_count; i++) { - if ((ra_reg_to_grf[pairs_base_reg + i] & 1) == 0) { - ra_class_add_reg(regs, aligned_pairs_class, pairs_base_reg + i); + if (devinfo->has_pln && (devinfo->gen == 6 || + (dispatch_width == 8 && devinfo->gen <= 5))) { + aligned_bary_class = ra_alloc_reg_class(regs); + + for (int i = 0; i < aligned_bary_reg_count; i++) { + if ((ra_reg_to_grf[aligned_bary_base_reg + i] & 1) == 0) { + ra_class_add_reg(regs, aligned_bary_class, + aligned_bary_base_reg + i); } } for (int i = 0; i < class_count; i++) { - /* These are a little counter-intuitive because the pair registers - * are required to be aligned while the register they are - * potentially interferring with are not. In the case where the - * size is even, the worst-case is that the register is - * odd-aligned. In the odd-size case, it doesn't matter. + /* These are a little counter-intuitive because the barycentric + * registers are required to be aligned while the register they are + * potentially interferring with are not. In the case where the size + * is even, the worst-case is that the register is odd-aligned. In + * the odd-size case, it doesn't matter. */ - q_values[class_count][i] = class_sizes[i] / 2 + 1; - q_values[i][class_count] = class_sizes[i] + 1; + q_values[class_count][i] = class_sizes[i] / 2 + + aligned_bary_size(dispatch_width) / 2; + q_values[i][class_count] = class_sizes[i] + + aligned_bary_size(dispatch_width) - 1; } - q_values[class_count][class_count] = 1; + q_values[class_count][class_count] = aligned_bary_size(dispatch_width) - 1; } ra_set_finalize(regs, q_values); @@ -281,7 +295,7 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width) for (int i = 0; i < class_count; i++) compiler->fs_reg_sets[index].classes[class_sizes[i] - 1] = classes[i]; compiler->fs_reg_sets[index].ra_reg_to_grf = ra_reg_to_grf; - compiler->fs_reg_sets[index].aligned_pairs_class = aligned_pairs_class; + compiler->fs_reg_sets[index].aligned_bary_class = aligned_bary_class; } void @@ -782,14 +796,15 @@ fs_reg_alloc::build_interference_graph(bool allow_spilling) /* Special case: on pre-Gen7 hardware that supports PLN, the second operand * of a PLN instruction needs to be an even-numbered register, so we have a - * special register class aligned_pairs_class to handle this case. + * special register class aligned_bary_class to handle this case. */ - if (compiler->fs_reg_sets[rsi].aligned_pairs_class >= 0) { + if (compiler->fs_reg_sets[rsi].aligned_bary_class >= 0) { foreach_block_and_inst(block, fs_inst, inst, fs->cfg) { if (inst->opcode == FS_OPCODE_LINTERP && inst->src[0].file == VGRF && - fs->alloc.sizes[inst->src[0].nr] == 2) { + fs->alloc.sizes[inst->src[0].nr] == + aligned_bary_size(fs->dispatch_width)) { ra_set_node_class(g, first_vgrf_node + inst->src[0].nr, - compiler->fs_reg_sets[rsi].aligned_pairs_class); + compiler->fs_reg_sets[rsi].aligned_bary_class); } } } -- 2.30.2