From 0b4cd91071fdf9802559974aa9fd32ac4bbd7439 Mon Sep 17 00:00:00 2001 From: Francisco Jerez Date: Thu, 19 May 2016 21:43:48 -0700 Subject: [PATCH] i965/fs: Extend region width calculation to allow arbitrary execution sizes. Instead of just halving the execution size when the instruction is compressed hoping that it will give a legal source region width, we can calculate the maximum legal width value in closed form from the component size and stride. This makes sure that brw_reg_from_fs_reg() always returns a valid hardware region even for virtual 32-wide instructions (e.g. send-like instructions) that would seem to exceed the hardware region width limit after halving. Reviewed-by: Jason Ekstrand --- .../drivers/dri/i965/brw_fs_generator.cpp | 39 +++++++++++-------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index 67c7aa5aa2f..85b24bea107 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -65,27 +65,34 @@ brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg, unsigned gen, bool compressed) case VGRF: if (reg->stride == 0) { brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0); - } else if (!compressed && - inst->exec_size * reg->stride * type_sz(reg->type) <= 32) { - brw_reg = brw_vecn_reg(inst->exec_size, brw_file_from_reg(reg), - reg->nr, 0); - brw_reg = stride(brw_reg, inst->exec_size * reg->stride, - inst->exec_size, reg->stride); } else { /* From the Haswell PRM: * - * VertStride must be used to cross GRF register boundaries. This - * rule implies that elements within a 'Width' cannot cross GRF - * boundaries. + * "VertStride must be used to cross GRF register boundaries. This + * rule implies that elements within a 'Width' cannot cross GRF + * boundaries." * - * So, for registers that are large enough, we have to split the exec - * size in two and trust the compression state to sort it out. + * The maximum width value that could satisfy this restriction is: */ - assert(inst->exec_size / 2 * reg->stride * type_sz(reg->type) <= 32); - brw_reg = brw_vecn_reg(inst->exec_size / 2, brw_file_from_reg(reg), - reg->nr, 0); - brw_reg = stride(brw_reg, inst->exec_size / 2 * reg->stride, - inst->exec_size / 2, reg->stride); + const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type)); + + /* Because the hardware can only split source regions at a whole + * multiple of width during decompression (i.e. vertically), clamp + * the value obtained above to the physical execution size of a + * single decompressed chunk of the instruction: + */ + const unsigned phys_width = compressed ? inst->exec_size / 2 : + inst->exec_size; + + /* XXX - The equation above is strictly speaking not correct on + * hardware that supports unbalanced GRF writes -- On Gen9+ + * each decompressed chunk of the instruction may have a + * different execution size when the number of components + * written to each destination GRF is not the same. + */ + const unsigned width = MIN2(reg_width, phys_width); + brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0); + brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride); } brw_reg = retype(brw_reg, reg->type); -- 2.30.2