nv50/ir: fix s32 x s32 -> high s32 multiply logic

author Ilia Mirkin <imirkin@alum.mit.edu>

Thu, 15 May 2014 03:30:16 +0000 (23:30 -0400)

committer Ilia Mirkin <imirkin@alum.mit.edu>

Wed, 21 May 2014 13:31:16 +0000 (09:31 -0400)
author Ilia Mirkin <imirkin@alum.mit.edu>
Thu, 15 May 2014 03:30:16 +0000 (23:30 -0400)
committer Ilia Mirkin <imirkin@alum.mit.edu>
Wed, 21 May 2014 13:31:16 +0000 (09:31 -0400)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp

index b17d57d0bfd2cb51a8ff129184855a81853f224d..0fb76663ffe5c9c24e5987cc2badaafb8dca8439 100644 (file)
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -37,18 +37,25 @@ namespace nv50_ir {
  //    ah*bl 00
  //
  // fffe0001 + fffe0001
+//
+// Note that this sort of splitting doesn't work for signed values, so we
+// compute the sign on those manually and then perform an unsigned multiply.
  static bool
  expandIntegerMUL(BuildUtil *bld, Instruction *mul)
  {
     const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
  
-   DataType fTy = mul->sType; // full type
-   DataType hTy;
+   DataType fTy; // full type
+   switch (mul->sType) {
+   case TYPE_S32: fTy = TYPE_U32; break;
+   case TYPE_S64: fTy = TYPE_U64; break;
+   default: fTy = mul->sType; break;
+   }
+
+   DataType hTy; // half type
     switch (fTy) {
-   case TYPE_S32: hTy = TYPE_S16; break;
     case TYPE_U32: hTy = TYPE_U16; break;
     case TYPE_U64: hTy = TYPE_U32; break;
-   case TYPE_S64: hTy = TYPE_S32; break;
     default:
        return false;
     }
@@ -59,15 +66,25 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul)
  
     bld->setPosition(mul, true);
  
+   Value *s[2];
     Value *a[2], *b[2];
-   Value *c[2];
     Value *t[4];
     for (int j = 0; j < 4; ++j)
        t[j] = bld->getSSA(fullSize);
  
+   s[0] = mul->getSrc(0);
+   s[1] = mul->getSrc(1);
+
+   if (isSignedType(mul->sType)) {
+      s[0] = bld->getSSA(fullSize);
+      s[1] = bld->getSSA(fullSize);
+      bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
+      bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
+   }
+
     // split sources into halves
-   i[0] = bld->mkSplit(a, halfSize, mul->getSrc(0));
-   i[1] = bld->mkSplit(b, halfSize, mul->getSrc(1));
+   i[0] = bld->mkSplit(a, halfSize, s[0]);
+   i[1] = bld->mkSplit(b, halfSize, s[1]);
  
     i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
     i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
@@ -75,24 +92,76 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul)
     i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
  
     if (highResult) {
-      Value *r[4];
+      Value *c[2];
+      Value *r[5];
        Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
        c[0] = bld->getSSA(1, FILE_FLAGS);
        c[1] = bld->getSSA(1, FILE_FLAGS);
-      for (int j = 0; j < 4; ++j)
+      for (int j = 0; j < 5; ++j)
           r[j] = bld->getSSA(fullSize);
  
        i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
        i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
        bld->mkMov(r[3], r[0])->setPredicate(CC_NC, c[0]);
        bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[3]);
-      i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]);
+      i[5] = bld->mkOp3(OP_MAD, fTy, r[4], a[1], b[1], r[2]);
  
        // set carry defs / sources
        i[3]->setFlagsDef(1, c[0]);
-      i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry
+      // actual result required in negative case, but ignored for
+      // unsigned. for some reason the compiler ends up dropping the whole
+      // instruction if the destination is unused but the flags are used.
+      if (isSignedType(mul->sType))
+         i[4]->setFlagsDef(1, c[1]);
+      else
+         i[4]->setFlagsDef(0, c[1]);
        i[6]->setPredicate(CC_C, c[0]);
        i[5]->setFlagsSrc(3, c[1]);
+
+      if (isSignedType(mul->sType)) {
+         Value *cc[2];
+         Value *rr[7];
+         Value *one = bld->getSSA(fullSize);
+         bld->loadImm(one, 1);
+         for (int j = 0; j < 7; j++)
+            rr[j] = bld->getSSA(fullSize);
+
+         // NOTE: this logic uses predicates because splitting basic blocks is
+         // ~impossible during the SSA phase. The RA relies on a correlation
+         // between edge order and phi node sources.
+
+         // Set the sign of the result based on the inputs
+         bld->mkOp2(OP_XOR, fTy, NULL, mul->getSrc(0), mul->getSrc(1))
+            ->setFlagsDef(0, (cc[0] = bld->getSSA(1, FILE_FLAGS)));
+
+         // 1s complement of 64-bit value
+         bld->mkOp1(OP_NOT, fTy, rr[0], r[4])
+            ->setPredicate(CC_S, cc[0]);
+         bld->mkOp1(OP_NOT, fTy, rr[1], t[3])
+            ->setPredicate(CC_S, cc[0]);
+
+         // add 1 to the low 32 bits, keeping track of the carry
+         Instruction *n = bld->mkOp2(OP_ADD, fTy, NULL, rr[1], one);
+         n->setPredicate(CC_S, cc[0]);
+         n->setFlagsDef(0, (cc[1] = bld->getSSA(1, FILE_FLAGS)));
+
+         // If there was a carry, add 1 to the upper 32 bits
+         // XXX: These get executed even if they shouldn't be
+         bld->mkOp2(OP_ADD, fTy, rr[2], rr[0], one)
+            ->setPredicate(CC_C, cc[1]);
+         bld->mkMov(rr[3], rr[0])
+            ->setPredicate(CC_NC, cc[1]);
+         bld->mkOp2(OP_UNION, fTy, rr[4], rr[2], rr[3]);
+
+         // Merge the results from the negative and non-negative paths
+         bld->mkMov(rr[5], rr[4])
+            ->setPredicate(CC_S, cc[0]);
+         bld->mkMov(rr[6], r[4])
+            ->setPredicate(CC_NS, cc[0]);
+         bld->mkOp2(OP_UNION, mul->sType, mul->getDef(0), rr[5], rr[6]);
+      } else {
+         bld->mkMov(mul->getDef(0), r[4]);
+      }
     } else {
        bld->mkMov(mul->getDef(0), t[3]);
     }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp

index 799ac2fd2ab46d4fda4d737b2ecf1578910e5937..abadc7fb4e4b1df818d04bc09c9326afa5bd162d 100644 (file)
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -332,6 +332,8 @@ TargetNV50::insnCanLoad(const Instruction *i, int s,
           return false;
        if (sf == FILE_IMMEDIATE)
           return false;
+      if (i->subOp == NV50_IR_SUBOP_MUL_HIGH && sf == FILE_MEMORY_CONST)
+         return false;
        ldSize = 2;
     } else {
        ldSize = typeSizeof(ld->dType);
author	Ilia Mirkin <imirkin@alum.mit.edu>
	Thu, 15 May 2014 03:30:16 +0000 (23:30 -0400)
committer	Ilia Mirkin <imirkin@alum.mit.edu>
	Wed, 21 May 2014 13:31:16 +0000 (09:31 -0400)
src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp		patch \| blob \| history
src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp		patch \| blob \| history