From eaae81058cdd0ed103c55be7a1722546d63c86da Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Thu, 9 Nov 2017 10:56:43 -0500 Subject: [PATCH] freedreno/ir3: shared variable support Signed-off-by: Rob Clark --- .../drivers/freedreno/ir3/ir3_compiler_nir.c | 161 ++++++++++++++++++ src/gallium/drivers/freedreno/ir3/ir3_cp.c | 9 + .../drivers/freedreno/ir3/ir3_legalize.c | 9 +- 3 files changed, 177 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index 0d642772f9e..640805a4f68 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -1432,6 +1432,149 @@ emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) return atomic; } +/* src[] = { offset }. const_index[] = { base } */ +static void +emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr, + struct ir3_instruction **dst) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *ldl, *offset; + unsigned base; + + offset = get_src(ctx, &intr->src[0])[0]; + base = intr->const_index[0]; + + ldl = ir3_LDL(b, offset, 0, create_immed(b, intr->num_components), 0); + ldl->cat6.src_offset = base; + ldl->cat6.type = TYPE_U32; + ldl->regs[0]->wrmask = MASK(intr->num_components); + + mark_read(ctx, ldl); + + split_dest(b, dst, ldl, 0, intr->num_components); +} + +/* src[] = { value, offset }. const_index[] = { base, write_mask } */ +static void +emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *stl, *offset; + struct ir3_instruction * const *value; + unsigned base, wrmask; + + value = get_src(ctx, &intr->src[0]); + offset = get_src(ctx, &intr->src[1])[0]; + + base = intr->const_index[0]; + wrmask = intr->const_index[1]; + + /* Combine groups of consecutive enabled channels in one write + * message. We use ffs to find the first enabled channel and then ffs on + * the bit-inverse, down-shifted writemask to determine the length of + * the block of enabled bits. + * + * (trick stolen from i965's fs_visitor::nir_emit_cs_intrinsic()) + */ + while (wrmask) { + unsigned first_component = ffs(wrmask) - 1; + unsigned length = ffs(~(wrmask >> first_component)) - 1; + + stl = ir3_STL(b, offset, 0, + create_collect(b, &value[first_component], length), 0, + create_immed(b, length), 0); + stl->cat6.dst_offset = first_component + base; + stl->cat6.type = TYPE_U32; + + mark_write(ctx, stl); + array_insert(b, b->keeps, stl); + + /* Clear the bits in the writemask that we just wrote, then try + * again to see if more channels are left. + */ + wrmask &= (15 << (first_component + length)); + } +} + +/* + * CS shared variable atomic intrinsics + * + * All of the shared variable atomic memory operations read a value from + * memory, compute a new value using one of the operations below, write the + * new value to memory, and return the original value read. + * + * All operations take 2 sources except CompSwap that takes 3. These + * sources represent: + * + * 0: The offset into the shared variable storage region that the atomic + * operation will operate on. + * 1: The data parameter to the atomic function (i.e. the value to add + * in shared_atomic_add, etc). + * 2: For CompSwap only: the second data parameter. + */ +static struct ir3_instruction * +emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *atomic, *src0, *src1; + type_t type = TYPE_U32; + + src0 = get_src(ctx, &intr->src[0])[0]; /* offset */ + src1 = get_src(ctx, &intr->src[1])[0]; /* value */ + + switch (intr->intrinsic) { + case nir_intrinsic_shared_atomic_add: + atomic = ir3_ATOMIC_ADD(b, src0, 0, src1, 0); + break; + case nir_intrinsic_shared_atomic_imin: + atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0); + type = TYPE_S32; + break; + case nir_intrinsic_shared_atomic_umin: + atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0); + break; + case nir_intrinsic_shared_atomic_imax: + atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0); + type = TYPE_S32; + break; + case nir_intrinsic_shared_atomic_umax: + atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0); + break; + case nir_intrinsic_shared_atomic_and: + atomic = ir3_ATOMIC_AND(b, src0, 0, src1, 0); + break; + case nir_intrinsic_shared_atomic_or: + atomic = ir3_ATOMIC_OR(b, src0, 0, src1, 0); + break; + case nir_intrinsic_shared_atomic_xor: + atomic = ir3_ATOMIC_XOR(b, src0, 0, src1, 0); + break; + case nir_intrinsic_shared_atomic_exchange: + atomic = ir3_ATOMIC_XCHG(b, src0, 0, src1, 0); + break; + case nir_intrinsic_shared_atomic_comp_swap: + /* for cmpxchg, src1 is [ui]vec2(data, compare): */ + src1 = create_collect(b, (struct ir3_instruction*[]){ + get_src(ctx, &intr->src[2])[0], + src1, + }, 2); + atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0); + break; + default: + unreachable("boo"); + } + + atomic->cat6.iim_val = 1; + atomic->cat6.d = 1; + atomic->cat6.type = type; + mark_write(ctx, atomic); + + /* even if nothing consume the result, we can't DCE the instruction: */ + array_insert(b, b->keeps, atomic); + + return atomic; +} + static void emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr) { @@ -1586,6 +1729,24 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) case nir_intrinsic_ssbo_atomic_comp_swap: dst[0] = emit_intrinsic_atomic_ssbo(ctx, intr); break; + case nir_intrinsic_load_shared: + emit_intrinsic_load_shared(ctx, intr, dst); + break; + case nir_intrinsic_store_shared: + emit_intrinsic_store_shared(ctx, intr); + break; + case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_shared_atomic_imin: + case nir_intrinsic_shared_atomic_umin: + case nir_intrinsic_shared_atomic_imax: + case nir_intrinsic_shared_atomic_umax: + case nir_intrinsic_shared_atomic_and: + case nir_intrinsic_shared_atomic_or: + case nir_intrinsic_shared_atomic_xor: + case nir_intrinsic_shared_atomic_exchange: + case nir_intrinsic_shared_atomic_comp_swap: + dst[0] = emit_intrinsic_atomic_shared(ctx, intr); + break; case nir_intrinsic_barrier: case nir_intrinsic_memory_barrier: case nir_intrinsic_group_memory_barrier: diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c index 8c907eb5a53..61b4b201215 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c @@ -194,11 +194,20 @@ static bool valid_flags(struct ir3_instruction *instr, unsigned n, if (is_store(instr) && (n == 1)) return false; + if ((instr->opc == OPC_LDL) && (n != 1)) + return false; + + if ((instr->opc == OPC_STL) && (n != 2)) + return false; + /* disallow CP into anything but the SSBO slot argument for * atomics: */ if (is_atomic(instr->opc) && (n != 0)) return false; + + if (is_atomic(instr->opc) && !(instr->flags & IR3_INSTR_G)) + return false; } break; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c index d6850eb12a0..a206837ef84 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c @@ -191,13 +191,18 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) /* seems like ldlv needs (ss) bit instead?? which is odd but * makes a bunch of flat-varying tests start working on a4xx. */ - if (n->opc == OPC_LDLV) + if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL)) regmask_set(&needs_ss, n->regs[0]); else regmask_set(&needs_sy, n->regs[0]); + } else if (is_atomic(n->opc)) { + if (n->flags & IR3_INSTR_G) + regmask_set(&needs_sy, n->regs[0]); + else + regmask_set(&needs_ss, n->regs[0]); } - if ((n->opc == OPC_LDGB) || (n->opc == OPC_STGB) || is_atomic(n->opc)) + if (is_ssbo(n->opc) || (is_atomic(n->opc) && (n->flags & IR3_INSTR_G))) ctx->has_ssbo = true; /* both tex/sfu appear to not always immediately consume -- 2.30.2