From 4ec09c77471e39e6ff81c99f1edde2e1713a7f24 Mon Sep 17 00:00:00 2001 From: Matt Turner Date: Tue, 12 May 2015 15:51:44 -0700 Subject: [PATCH] i965/fs: Support integer multiplication in SIMD16 on Haswell. Ivybridge (and presumably Baytrail) have a bug that prevents this from working. Reviewed-by: Jason Ekstrand --- src/mesa/drivers/dri/i965/brw_fs.cpp | 52 +++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index cb13fcb1cc8..01e3139229d 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -3561,10 +3561,42 @@ fs_visitor::lower_integer_multiplication() insert(MUL(inst->dst, inst->src[0], inst->src[1])); } } else { - if (devinfo->gen >= 7) + /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot + * do 32-bit integer multiplication in one instruction, but instead + * must do a sequence (which actually calculates a 64-bit result): + * + * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D + * mach(8) null g3<8,8,1>D g4<8,8,1>D + * mov(8) g2<1>D acc0<8,8,1>D + * + * But on Gen > 6, the ability to use second accumulator register + * (acc1) for non-float data types was removed, preventing a simple + * implementation in SIMD16. A 16-channel result can be calculated by + * executing the three instructions twice in SIMD8, once with quarter + * control of 1Q for the first eight channels and again with 2Q for + * the second eight channels. + * + * Which accumulator register is implicitly accessed (by AccWrEnable + * for instance) is determined by the quarter control. Unfortunately + * Ivybridge (and presumably Baytrail) has a hardware bug in which an + * implicit accumulator access by an instruction with 2Q will access + * acc1 regardless of whether the data type is usable in acc1. + * + * Specifically, the 2Q mach(8) writes acc1 which does not exist for + * integer data types. + */ + if (devinfo->gen == 7 && !devinfo->is_haswell) no16("SIMD16 integer multiply unsupported\n"); - const unsigned channels = dispatch_width; + /* From the IVB PRM, volume 4 part 3, page 42: + * + * "For any DWord operation, including DWord multiply, accumulator + * can store up to 8 channels of data, with only acc0 supported." + * + * So make the accumulator (and null register) only 8-channels wide on + * Gen7+. + */ + const unsigned channels = devinfo->gen >= 7 ? 8 : dispatch_width; const enum brw_reg_type type = inst->dst.type; const fs_reg acc(retype(brw_acc_reg(channels), type)); const fs_reg null(retype(brw_null_vec(channels), type)); @@ -3572,9 +3604,19 @@ fs_visitor::lower_integer_multiplication() const fs_reg &src0 = inst->src[0]; const fs_reg &src1 = inst->src[1]; - insert(MUL(acc, src0, src1)); - insert(MACH(null, src0, src1)); - insert(MOV(inst->dst, acc)); + if (devinfo->gen >= 7 && dispatch_width == 16) { + insert(MUL(acc, half(src0, 0), half(src1, 0))); + insert(MACH(null, half(src0, 0), half(src1, 0))); + insert(MOV(half(inst->dst, 0), acc)); + + insert(set_sechalf(MUL(acc, half(src0, 1), half(src1, 1)))); + insert(set_sechalf(MACH(null, half(src0, 1), half(src1, 1)))); + insert(set_sechalf(MOV(half(inst->dst, 1), acc))); + } else { + insert(MUL(acc, src0, src1)); + insert(MACH(null, src0, src1)); + insert(MOV(inst->dst, acc)); + } } #undef insert -- 2.30.2