From a21dc2b500cff6e0aaf31867c5b42651306ddaf1 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Samuel=20Iglesias=20Gons=C3=A1lvez?= Date: Mon, 29 Aug 2016 10:10:30 +0200 Subject: [PATCH] i965/vec4: split DF instructions and later double its execsize in IVB/BYT MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit We need to split DF instructions in two on IVB/BYT as it needs an execsize 8 to process 4 DF values (one GRF in total). v2: - Rename helper and make it a static inline function (Matt). - Fix indentation and add braces (Matt). v3: - Don't edit IR instruction when doubling exec_size (Curro) - Add comment into the code (Curro). - Manage ARF registers like the others (Curro) v4: - Add get_exec_type() function and use it to calculate the execution size. Signed-off-by: Samuel Iglesias Gonsálvez [ Francisco Jerez: Fix bogus 'type != BAD_FILE' check. Take destination type as execution type where there is no valid source. Assert-fail if the deduced execution type is byte. Clarify comment in get_lowered_simd_width(). Move SIMD width workaround outside of 'if (...inst->size_written > REG_SIZE)' conditional block, since the problem should be independent of whether the amount of data written by the instruction is greater or lower than a GRF. Drop redundant is_ivb_df definition. Drop bogus inst->exec_size < 8 check. Simplify channel group assertion. 
] Reviewed-by: Francisco Jerez --- src/intel/compiler/brw_ir_vec4.h | 33 +++++++++++++++++++++++ src/intel/compiler/brw_vec4.cpp | 10 +++++++ src/intel/compiler/brw_vec4_generator.cpp | 11 +++++++- 3 files changed, 53 insertions(+), 1 deletion(-) diff --git a/src/intel/compiler/brw_ir_vec4.h b/src/intel/compiler/brw_ir_vec4.h index bd026eb2aeb..56548c38830 100644 --- a/src/intel/compiler/brw_ir_vec4.h +++ b/src/intel/compiler/brw_ir_vec4.h @@ -404,6 +404,39 @@ regs_read(const vec4_instruction *inst, unsigned i) reg_size); } +static inline enum brw_reg_type +get_exec_type(const vec4_instruction *inst) +{ + enum brw_reg_type exec_type = BRW_REGISTER_TYPE_B; + + for (int i = 0; i < 3; i++) { + if (inst->src[i].file != BAD_FILE) { + const brw_reg_type t = get_exec_type(brw_reg_type(inst->src[i].type)); + if (type_sz(t) > type_sz(exec_type)) + exec_type = t; + else if (type_sz(t) == type_sz(exec_type) && + brw_reg_type_is_floating_point(t)) + exec_type = t; + } + } + + if (exec_type == BRW_REGISTER_TYPE_B) + exec_type = inst->dst.type; + + /* TODO: We need to handle half-float conversions. 
*/ + assert(exec_type != BRW_REGISTER_TYPE_HF || + inst->dst.type == BRW_REGISTER_TYPE_HF); + assert(exec_type != BRW_REGISTER_TYPE_B); + + return exec_type; +} + +static inline unsigned +get_exec_type_size(const vec4_instruction *inst) +{ + return type_sz(get_exec_type(inst)); +} + } /* namespace brw */ #endif diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp index d7c09093032..adbd85036e0 100644 --- a/src/intel/compiler/brw_vec4.cpp +++ b/src/intel/compiler/brw_vec4.cpp @@ -2115,6 +2115,16 @@ get_lowered_simd_width(const struct gen_device_info *devinfo, } } + /* IvyBridge can manage a maximum of 4 DFs per SIMD4x2 instruction, since + * it doesn't support compression in Align16 mode, no matter if it has + * force_writemask_all enabled or disabled (the latter is affected by the + * compressed instruction bug in gen7, which is another reason to enforce + * this limit). + */ + if (devinfo->gen == 7 && !devinfo->is_haswell && + (get_exec_type_size(inst) == 8 || type_sz(inst->dst.type) == 8)) + lowered_width = MIN2(lowered_width, 4); + return lowered_width; } diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp index d3192ab7db3..15d6d290fc2 100644 --- a/src/intel/compiler/brw_vec4_generator.cpp +++ b/src/intel/compiler/brw_vec4_generator.cpp @@ -1522,7 +1522,6 @@ generate_code(struct brw_codegen *p, brw_set_default_saturate(p, inst->saturate); brw_set_default_mask_control(p, inst->force_writemask_all); brw_set_default_acc_write_control(p, inst->writes_accumulator); - brw_set_default_exec_size(p, cvt(inst->exec_size) - 1); assert(inst->group % inst->exec_size == 0); assert(inst->group % 8 == 0 || @@ -1530,6 +1529,16 @@ generate_code(struct brw_codegen *p, inst->src[0].type == BRW_REGISTER_TYPE_DF || inst->src[1].type == BRW_REGISTER_TYPE_DF || inst->src[2].type == BRW_REGISTER_TYPE_DF); + + unsigned exec_size = inst->exec_size; + if (devinfo->gen == 7 && + !devinfo->is_haswell && + 
(get_exec_type_size(inst) == 8 || + inst->dst.type == BRW_REGISTER_TYPE_DF)) + exec_size *= 2; + + brw_set_default_exec_size(p, cvt(exec_size) - 1); + if (!inst->force_writemask_all) brw_set_default_group(p, inst->group); -- 2.30.2