freedreno/ir3: shared variable support
author     Rob Clark <robdclark@gmail.com>
           Thu, 9 Nov 2017 15:56:43 +0000 (10:56 -0500)
committer  Rob Clark <robdclark@gmail.com>
           Sun, 12 Nov 2017 17:28:59 +0000 (12:28 -0500)
Signed-off-by: Rob Clark <robdclark@gmail.com>
src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
src/gallium/drivers/freedreno/ir3/ir3_cp.c
src/gallium/drivers/freedreno/ir3/ir3_legalize.c

src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 0d642772f9e5b1db08ef4f13b631103c2599d476..640805a4f68f980f6ed1c1248b8ac74f2071e658 100644
@@ -1432,6 +1432,149 @@ emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
        return atomic;
 }
 
+/* src[] = { offset }. const_index[] = { base } */
+static void
+emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+               struct ir3_instruction **dst)
+{
+       struct ir3_block *b = ctx->block;
+       struct ir3_instruction *ldl, *offset;
+       unsigned base;
+
+       offset = get_src(ctx, &intr->src[0])[0];
+       base   = intr->const_index[0];
+
+       ldl = ir3_LDL(b, offset, 0, create_immed(b, intr->num_components), 0);
+       ldl->cat6.src_offset = base;
+       ldl->cat6.type = TYPE_U32;
+       ldl->regs[0]->wrmask = MASK(intr->num_components);
+
+       mark_read(ctx, ldl);
+
+       split_dest(b, dst, ldl, 0, intr->num_components);
+}
+
+/* src[] = { value, offset }. const_index[] = { base, write_mask } */
+static void
+emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+       struct ir3_block *b = ctx->block;
+       struct ir3_instruction *stl, *offset;
+       struct ir3_instruction * const *value;
+       unsigned base, wrmask;
+
+       value  = get_src(ctx, &intr->src[0]);
+       offset = get_src(ctx, &intr->src[1])[0];
+
+       base   = intr->const_index[0];
+       wrmask = intr->const_index[1];
+
+       /* Combine groups of consecutive enabled channels in one write
+        * message. We use ffs to find the first enabled channel and then ffs on
+        * the bit-inverse, down-shifted writemask to determine the length of
+        * the block of enabled bits.
+        *
+        * (trick stolen from i965's fs_visitor::nir_emit_cs_intrinsic())
+        */
+       while (wrmask) {
+               unsigned first_component = ffs(wrmask) - 1;
+               unsigned length = ffs(~(wrmask >> first_component)) - 1;
+
+               stl = ir3_STL(b, offset, 0,
+                       create_collect(b, &value[first_component], length), 0,
+                       create_immed(b, length), 0);
+               stl->cat6.dst_offset = first_component + base;
+               stl->cat6.type = TYPE_U32;
+
+               mark_write(ctx, stl);
+               array_insert(b, b->keeps, stl);
+
+               /* Clear the bits in the writemask that we just wrote, then try
+                * again to see if more channels are left.
+                */
+               wrmask &= (15 << (first_component + length));
+       }
+}
+
+/*
+ * CS shared variable atomic intrinsics
+ *
+ * All of the shared variable atomic memory operations read a value from
+ * memory, compute a new value using one of the operations below, write the
+ * new value to memory, and return the original value read.
+ *
+ * All operations take 2 sources except CompSwap that takes 3. These
+ * sources represent:
+ *
+ * 0: The offset into the shared variable storage region that the atomic
+ *    operation will operate on.
+ * 1: The data parameter to the atomic function (i.e. the value to add
+ *    in shared_atomic_add, etc).
+ * 2: For CompSwap only: the second data parameter.
+ */
+static struct ir3_instruction *
+emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+       struct ir3_block *b = ctx->block;
+       struct ir3_instruction *atomic, *src0, *src1;
+       type_t type = TYPE_U32;
+
+       src0 = get_src(ctx, &intr->src[0])[0];   /* offset */
+       src1 = get_src(ctx, &intr->src[1])[0];   /* value */
+
+       switch (intr->intrinsic) {
+       case nir_intrinsic_shared_atomic_add:
+               atomic = ir3_ATOMIC_ADD(b, src0, 0, src1, 0);
+               break;
+       case nir_intrinsic_shared_atomic_imin:
+               atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
+               type = TYPE_S32;
+               break;
+       case nir_intrinsic_shared_atomic_umin:
+               atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
+               break;
+       case nir_intrinsic_shared_atomic_imax:
+               atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
+               type = TYPE_S32;
+               break;
+       case nir_intrinsic_shared_atomic_umax:
+               atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
+               break;
+       case nir_intrinsic_shared_atomic_and:
+               atomic = ir3_ATOMIC_AND(b, src0, 0, src1, 0);
+               break;
+       case nir_intrinsic_shared_atomic_or:
+               atomic = ir3_ATOMIC_OR(b, src0, 0, src1, 0);
+               break;
+       case nir_intrinsic_shared_atomic_xor:
+               atomic = ir3_ATOMIC_XOR(b, src0, 0, src1, 0);
+               break;
+       case nir_intrinsic_shared_atomic_exchange:
+               atomic = ir3_ATOMIC_XCHG(b, src0, 0, src1, 0);
+               break;
+       case nir_intrinsic_shared_atomic_comp_swap:
+               /* for cmpxchg, src1 is [ui]vec2(data, compare): */
+               src1 = create_collect(b, (struct ir3_instruction*[]){
+                       get_src(ctx, &intr->src[2])[0],
+                       src1,
+               }, 2);
+               atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0);
+               break;
+       default:
+               unreachable("boo");
+       }
+
+       atomic->cat6.iim_val = 1;
+       atomic->cat6.d = 1;
+       atomic->cat6.type = type;
+       mark_write(ctx, atomic);
+
+       /* even if nothing consumes the result, we can't DCE the instruction: */
+       array_insert(b, b->keeps, atomic);
+
+       return atomic;
+}
+
 static void
 emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
@@ -1586,6 +1729,24 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
        case nir_intrinsic_ssbo_atomic_comp_swap:
                dst[0] = emit_intrinsic_atomic_ssbo(ctx, intr);
                break;
+       case nir_intrinsic_load_shared:
+               emit_intrinsic_load_shared(ctx, intr, dst);
+               break;
+       case nir_intrinsic_store_shared:
+               emit_intrinsic_store_shared(ctx, intr);
+               break;
+       case nir_intrinsic_shared_atomic_add:
+       case nir_intrinsic_shared_atomic_imin:
+       case nir_intrinsic_shared_atomic_umin:
+       case nir_intrinsic_shared_atomic_imax:
+       case nir_intrinsic_shared_atomic_umax:
+       case nir_intrinsic_shared_atomic_and:
+       case nir_intrinsic_shared_atomic_or:
+       case nir_intrinsic_shared_atomic_xor:
+       case nir_intrinsic_shared_atomic_exchange:
+       case nir_intrinsic_shared_atomic_comp_swap:
+               dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
+               break;
        case nir_intrinsic_barrier:
        case nir_intrinsic_memory_barrier:
        case nir_intrinsic_group_memory_barrier:
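
The writemask-splitting loop in emit_intrinsic_store_shared() above is easiest to follow with a concrete input, so here is a minimal standalone C sketch of the same ffs() trick (illustration only, not part of the patch). For wrmask = 0b1011 (channels x, y and w enabled) it peels off two runs: two components starting at channel 0, then one component starting at channel 3.

    /* Standalone illustration of the run-splitting trick used in
     * emit_intrinsic_store_shared(): each iteration handles one run of
     * consecutive enabled channels.
     */
    #include <stdio.h>
    #include <strings.h>   /* ffs() */

    int main(void)
    {
            unsigned wrmask = 0xb;   /* 0b1011: channels x, y, w enabled */

            while (wrmask) {
                    unsigned first  = ffs(wrmask) - 1;
                    unsigned length = ffs(~(wrmask >> first)) - 1;

                    printf("store %u component(s) starting at channel %u\n",
                           length, first);

                    /* clear the bits just handled, keep any higher runs: */
                    wrmask &= (15 << (first + length));
            }
            return 0;
    }

The final "wrmask &= (15 << ...)" step is only correct because a NIR writemask has at most four bits (vec4), so the shifted 4-bit constant always covers every remaining channel while clearing the ones just written.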

src/gallium/drivers/freedreno/ir3/ir3_cp.c
index 8c907eb5a534d9be15b7d6849d6793b1c78d1741..61b4b20121511f7a434f1a1450c71081885c723c 100644
@@ -194,11 +194,20 @@ static bool valid_flags(struct ir3_instruction *instr, unsigned n,
                        if (is_store(instr) && (n == 1))
                                return false;
 
+                       if ((instr->opc == OPC_LDL) && (n != 1))
+                               return false;
+
+                       if ((instr->opc == OPC_STL) && (n != 2))
+                               return false;
+
                        /* disallow CP into anything but the SSBO slot argument for
                         * atomics:
                         */
                        if (is_atomic(instr->opc) && (n != 0))
                                return false;
+
+                       if (is_atomic(instr->opc) && !(instr->flags & IR3_INSTR_G))
+                               return false;
                }
 
                break;
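
Taken together, the ir3_cp.c checks above restrict where copy propagation may fold an immediate into the new cat6 instructions: only one source slot each for ldl and stl, and for atomics only the first source and only the global (IR3_INSTR_G) variants. The following predicate is an illustrative restatement of that combined rule, not code from the patch; the helper name is made up, and it assumes n is the 0-based source index (regs[n + 1]) as iterated by ir3's foreach_src_n.

    #include "ir3.h"

    /* Illustration only: summarizes the cat6 immediate-source checks
     * added above.  Returns false when copy propagation of an immediate
     * into source n must be rejected.
     */
    static bool cat6_cp_immed_ok(const struct ir3_instruction *instr, unsigned n)
    {
            if ((instr->opc == OPC_LDL) && (n != 1))
                    return false;           /* ldl: only src slot 1 */
            if ((instr->opc == OPC_STL) && (n != 2))
                    return false;           /* stl: only src slot 2 */
            if (is_atomic(instr->opc) && (n != 0))
                    return false;           /* atomics: only the first src */
            if (is_atomic(instr->opc) && !(instr->flags & IR3_INSTR_G))
                    return false;           /* local atomics: never */
            return true;
    }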

src/gallium/drivers/freedreno/ir3/ir3_legalize.c
index d6850eb12a09edb9ca85c545ee31a98f4f72c183..a206837ef8431dcd26aba15701b206f3a1941f90 100644
@@ -191,13 +191,18 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
                        /* seems like ldlv needs (ss) bit instead??  which is odd but
                         * makes a bunch of flat-varying tests start working on a4xx.
                         */
-                       if (n->opc == OPC_LDLV)
+                       if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL))
                                regmask_set(&needs_ss, n->regs[0]);
                        else
                                regmask_set(&needs_sy, n->regs[0]);
+               } else if (is_atomic(n->opc)) {
+                       if (n->flags & IR3_INSTR_G)
+                               regmask_set(&needs_sy, n->regs[0]);
+                       else
+                               regmask_set(&needs_ss, n->regs[0]);
                }
 
-               if ((n->opc == OPC_LDGB) || (n->opc == OPC_STGB) || is_atomic(n->opc))
+               if (is_ssbo(n->opc) || (is_atomic(n->opc) && (n->flags & IR3_INSTR_G)))
                        ctx->has_ssbo = true;
 
                /* both tex/sfu appear to not always immediately consume
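
A note on the ir3_legalize.c hunk above: results of local loads (ldl, like the existing ldlv special case) are now waited on with (ss) rather than (sy), atomics pick the sync bit based on whether they are the global (IR3_INSTR_G) variant, and only SSBO accesses and global atomics set has_ssbo. A minimal sketch of that classification, with an invented helper name and restricted to cat6 instructions that write a result (illustration only, not part of the patch):

    /* Illustration only: which sync bit the destination of a
     * result-writing cat6 instruction needs after this change.
     */
    static bool cat6_dest_needs_sy(const struct ir3_instruction *n)
    {
            if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL))
                    return false;                      /* local loads: (ss) */
            if (is_atomic(n->opc))
                    return !!(n->flags & IR3_INSTR_G); /* global atomics: (sy), local: (ss) */
            return true;                               /* other global/SSBO loads: (sy) */
    }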