From edde00f5f1a1a6b3f9827af0475f6ff097705c1f Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 17 Apr 2017 11:04:00 -0400 Subject: [PATCH] freedreno/ir3: SSBO/atomic support TODO cwabbott pointed out a write-after-read hazard, which affects both this and arrays. A write needs to depend on *all* reads since the last write, not just the last read. Signed-off-by: Rob Clark --- src/gallium/drivers/freedreno/ir3/ir3.c | 74 ++++++- src/gallium/drivers/freedreno/ir3/ir3.h | 37 +++- .../drivers/freedreno/ir3/ir3_compiler_nir.c | 195 +++++++++++++++++- src/gallium/drivers/freedreno/ir3/ir3_cp.c | 6 + .../drivers/freedreno/ir3/ir3_legalize.c | 7 +- .../drivers/freedreno/ir3/ir3_shader.h | 3 + 6 files changed, 309 insertions(+), 13 deletions(-) diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c index ff2c342c357..d703f4e7f38 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.c +++ b/src/gallium/drivers/freedreno/ir3/ir3.c @@ -475,6 +475,13 @@ static int emit_cat6(struct ir3_instruction *instr, void *ptr, struct ir3_register *dst, *src1, *src2; instr_cat6_t *cat6 = ptr; + cat6->type = instr->cat6.type; + cat6->opc = instr->opc; + cat6->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat6->sync = !!(instr->flags & IR3_INSTR_SY); + cat6->g = !!(instr->flags & IR3_INSTR_G); + cat6->opc_cat = 6; + /* the "dst" for a store instruction is (from the perspective * of data flow in the shader, ie. register use/def, etc) in * fact a register that is read by the instruction, rather @@ -500,7 +507,65 @@ static int emit_cat6(struct ir3_instruction *instr, void *ptr, * indicate to use the src_off encoding even if offset is zero * (but then what to do about dst_off?) */ - if (instr->cat6.src_offset || (instr->opc == OPC_LDG)) { + if ((instr->opc == OPC_LDGB) || is_atomic(instr->opc)) { + struct ir3_register *src3 = instr->regs[3]; + instr_cat6ldgb_t *ldgb = ptr; + + /* maybe these two bits both determine the instruction encoding? 
*/ + cat6->src_off = false; + + ldgb->d = 4 - 1; /* always .4d ? */ + ldgb->typed = false; /* TODO true for images */ + ldgb->type_size = instr->cat6.iim_val - 1; + + ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + + /* first src is src_ssbo: */ + iassert(src1->flags & IR3_REG_IMMED); + ldgb->src_ssbo = src1->uim_val; + + /* then next two are src1/src2: */ + ldgb->src1 = reg(src2, info, instr->repeat, IR3_REG_IMMED); + ldgb->src1_im = !!(src2->flags & IR3_REG_IMMED); + ldgb->src2 = reg(src3, info, instr->repeat, IR3_REG_IMMED); + ldgb->src2_im = !!(src3->flags & IR3_REG_IMMED); + + if (is_atomic(instr->opc)) { + struct ir3_register *src4 = instr->regs[4]; + ldgb->src3 = reg(src4, info, instr->repeat, 0); + ldgb->pad0 = 0x1; + ldgb->pad3 = 0x3; + } else { + ldgb->pad0 = 0x0; + ldgb->pad3 = 0x2; + } + + return 0; + } else if (instr->opc == OPC_STGB) { + struct ir3_register *src3 = instr->regs[4]; + instr_cat6stgb_t *stgb = ptr; + + /* maybe these two bits both determine the instruction encoding? */ + cat6->src_off = true; + stgb->pad3 = 0x2; + + stgb->d = 4 - 1; /* always .4d ? 
*/ + stgb->typed = false; + stgb->type_size = instr->cat6.iim_val - 1; + + /* first src is dst_ssbo: */ + iassert(dst->flags & IR3_REG_IMMED); + stgb->dst_ssbo = dst->uim_val; + + /* then src1/src2/src3: */ + stgb->src1 = reg(src1, info, instr->repeat, 0); + stgb->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED); + stgb->src2_im = !!(src2->flags & IR3_REG_IMMED); + stgb->src3 = reg(src3, info, instr->repeat, IR3_REG_IMMED); + stgb->src3_im = !!(src3->flags & IR3_REG_IMMED); + + return 0; + } else if (instr->cat6.src_offset || (instr->opc == OPC_LDG)) { instr_cat6a_t *cat6a = ptr; cat6->src_off = true; @@ -536,13 +601,6 @@ static int emit_cat6(struct ir3_instruction *instr, void *ptr, cat6d->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); } - cat6->type = instr->cat6.type; - cat6->opc = instr->opc; - cat6->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); - cat6->sync = !!(instr->flags & IR3_INSTR_SY); - cat6->g = !!(instr->flags & IR3_INSTR_G); - cat6->opc_cat = 6; - return 0; } diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h index 8d75ec168cb..beb125c5e97 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.h +++ b/src/gallium/drivers/freedreno/ir3/ir3.h @@ -226,7 +226,7 @@ struct ir3_instruction { type_t type; int src_offset; int dst_offset; - int iim_val; + int iim_val; /* for ldgb/stgb, # of components */ } cat6; /* for meta-instructions, just used to hold extra data * before instruction scheduling, etc @@ -602,6 +602,7 @@ is_store(struct ir3_instruction *instr) */ switch (instr->opc) { case OPC_STG: + case OPC_STGB: case OPC_STP: case OPC_STL: case OPC_STLW: @@ -617,6 +618,7 @@ static inline bool is_load(struct ir3_instruction *instr) { switch (instr->opc) { case OPC_LDG: + case OPC_LDGB: case OPC_LDL: case OPC_LDP: case OPC_L2G: @@ -931,7 +933,7 @@ int ir3_ra(struct ir3 *ir3, enum shader_t type, bool frag_coord, bool frag_face); /* legalize: */ -void ir3_legalize(struct ir3 *ir, bool *has_samp, int 
*max_bary); +void ir3_legalize(struct ir3 *ir, bool *has_samp, bool *has_ssbo, int *max_bary); /* ************************************************************************* */ /* instruction helpers */ @@ -1025,6 +1027,24 @@ ir3_##name(struct ir3_block *block, \ return instr; \ } +#define INSTR4(name) \ +static inline struct ir3_instruction * \ +ir3_##name(struct ir3_block *block, \ + struct ir3_instruction *a, unsigned aflags, \ + struct ir3_instruction *b, unsigned bflags, \ + struct ir3_instruction *c, unsigned cflags, \ + struct ir3_instruction *d, unsigned dflags) \ +{ \ + struct ir3_instruction *instr = \ + ir3_instr_create2(block, OPC_##name, 5); \ + ir3_reg_create(instr, 0, 0); /* dst */ \ + ir3_reg_create(instr, 0, IR3_REG_SSA | aflags)->instr = a; \ + ir3_reg_create(instr, 0, IR3_REG_SSA | bflags)->instr = b; \ + ir3_reg_create(instr, 0, IR3_REG_SSA | cflags)->instr = c; \ + ir3_reg_create(instr, 0, IR3_REG_SSA | dflags)->instr = d; \ + return instr; \ +} + /* cat0 instructions: */ INSTR0(BR); INSTR0(JUMP); @@ -1142,6 +1162,19 @@ ir3_SAM(struct ir3_block *block, opc_t opc, type_t type, INSTR2(LDLV) INSTR2(LDG) INSTR3(STG) +INSTR3(LDGB); +INSTR4(STGB); +INSTR4(ATOMIC_ADD); +INSTR4(ATOMIC_SUB); +INSTR4(ATOMIC_XCHG); +INSTR4(ATOMIC_INC); +INSTR4(ATOMIC_DEC); +INSTR4(ATOMIC_CMPXCHG); +INSTR4(ATOMIC_MIN); +INSTR4(ATOMIC_MAX); +INSTR4(ATOMIC_AND); +INSTR4(ATOMIC_OR); +INSTR4(ATOMIC_XOR); /* ************************************************************************* */ /* split this out or find some helper to use.. like main/bitset.h.. 
*/ diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index 22619e852c2..a164675ed24 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -71,6 +71,20 @@ struct ir3_compile { /* For vertex shaders, keep track of the system values sources */ struct ir3_instruction *vertex_id, *basevertex, *instance_id; + /* For SSBO's and atomics, we need to preserve order, such + * that reads don't overtake writes, and the order of writes + * is preserved. Atomics are considered as a write. + * + * To do this, we track last write and last access, in a + * similar way to ir3_array. But since we don't know whether + * the same SSBO is bound to multiple slots, we simply + * track this globally rather than per-SSBO. + * + * TODO should we track this per block instead? I guess it + * shouldn't matter much? + */ + struct ir3_instruction *last_write, *last_access; + /* mapping from nir_register to defining instruction: */ struct hash_table *def_ht; @@ -430,7 +444,7 @@ create_uniform_indirect(struct ir3_compile *ctx, int n, } static struct ir3_instruction * -create_collect(struct ir3_block *block, struct ir3_instruction **arr, +create_collect(struct ir3_block *block, struct ir3_instruction *const *arr, unsigned arrsz) { struct ir3_instruction *collect; @@ -1136,6 +1150,165 @@ emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr) } } +static void +mark_ssbo_read(struct ir3_compile *ctx, struct ir3_instruction *instr) +{ + instr->regs[0]->instr = ctx->last_write; + instr->regs[0]->flags |= IR3_REG_SSA; + ctx->last_access = instr; +} + +static void +mark_ssbo_write(struct ir3_compile *ctx, struct ir3_instruction *instr) +{ + instr->regs[0]->instr = ctx->last_access; + instr->regs[0]->flags |= IR3_REG_SSA; + ctx->last_write = ctx->last_access = instr; +} + +static void +emit_intrinsic_load_ssbo(struct ir3_compile *ctx, 
nir_intrinsic_instr *intr, + struct ir3_instruction **dst) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *ldgb, *src0, *src1, *offset; + nir_const_value *const_offset; + + /* can this be non-const buffer_index? how do we handle that? */ + const_offset = nir_src_as_const_value(intr->src[0]); + compile_assert(ctx, const_offset); + + offset = get_src(ctx, &intr->src[1])[0]; + + /* src0 is uvec2(offset*4, 0), src1 is offset.. nir already *= 4: */ + src0 = create_collect(b, (struct ir3_instruction*[]){ + offset, + create_immed(b, 0), + }, 2); + src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); + + ldgb = ir3_LDGB(b, create_immed(b, const_offset->u32[0]), 0, + src0, 0, src1, 0); + ldgb->regs[0]->wrmask = (1 << intr->num_components) - 1; + ldgb->cat6.iim_val = intr->num_components; + ldgb->cat6.type = TYPE_U32; + mark_ssbo_read(ctx, ldgb); + + split_dest(b, dst, ldgb, 0, intr->num_components); +} + +/* src[] = { value, block_index, offset }. const_index[] = { write_mask } */ +static void +emit_intrinsic_store_ssbo(struct ir3_compile *ctx, nir_intrinsic_instr *intr) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *stgb, *src0, *src1, *src2, *offset; + nir_const_value *const_offset; + unsigned ncomp = ffs(~intr->const_index[0]) - 1; + + /* can this be non-const buffer_index? how do we handle that? */ + const_offset = nir_src_as_const_value(intr->src[1]); + compile_assert(ctx, const_offset); + + offset = get_src(ctx, &intr->src[2])[0]; + + /* src0 is value, src1 is offset, src2 is uvec2(offset*4, 0).. 
+ * nir already *= 4: + */ + src0 = create_collect(b, get_src(ctx, &intr->src[0]), ncomp); + src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); + src2 = create_collect(b, (struct ir3_instruction*[]){ + offset, + create_immed(b, 0), + }, 2); + + stgb = ir3_STGB(b, create_immed(b, const_offset->u32[0]), 0, + src0, 0, src1, 0, src2, 0); + stgb->cat6.iim_val = ncomp; + stgb->cat6.type = TYPE_U32; + mark_ssbo_write(ctx, stgb); + + array_insert(b, b->keeps, stgb); +} + +static void +emit_intrinsic_atomic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *atomic, *ssbo, *src0, *src1, *src2, *offset; + nir_const_value *const_offset; + type_t type = TYPE_U32; + + /* can this be non-const buffer_index? how do we handle that? */ + const_offset = nir_src_as_const_value(intr->src[0]); + compile_assert(ctx, const_offset); + ssbo = create_immed(b, const_offset->u32[0]); + + offset = get_src(ctx, &intr->src[1])[0]; + + /* src0 is data (or uvec2(data, compare)) + * src1 is offset + * src2 is uvec2(offset*4, 0) + * + * Note that nir already multiplies the offset by four + */ + src0 = get_src(ctx, &intr->src[2])[0]; + src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); + src2 = create_collect(b, (struct ir3_instruction*[]){ + offset, + create_immed(b, 0), + }, 2); + + switch (intr->intrinsic) { + case nir_intrinsic_ssbo_atomic_add: + atomic = ir3_ATOMIC_ADD(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_ssbo_atomic_imin: + atomic = ir3_ATOMIC_MIN(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + type = TYPE_S32; + break; + case nir_intrinsic_ssbo_atomic_umin: + atomic = ir3_ATOMIC_MIN(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_ssbo_atomic_imax: + atomic = ir3_ATOMIC_MAX(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + type = TYPE_S32; + break; + case nir_intrinsic_ssbo_atomic_umax: + atomic = ir3_ATOMIC_MAX(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + break; + case 
nir_intrinsic_ssbo_atomic_and: + atomic = ir3_ATOMIC_AND(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_ssbo_atomic_or: + atomic = ir3_ATOMIC_OR(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_ssbo_atomic_xor: + atomic = ir3_ATOMIC_XOR(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_ssbo_atomic_exchange: + atomic = ir3_ATOMIC_XCHG(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_ssbo_atomic_comp_swap: + /* for cmpxchg, src0 is [ui]vec2(data, compare): */ + src0 = create_collect(b, (struct ir3_instruction*[]){ + src0, + get_src(ctx, &intr->src[3])[0], + }, 2); + atomic = ir3_ATOMIC_CMPXCHG(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + break; + default: + unreachable("boo"); + } + + atomic->cat6.iim_val = 1; + atomic->cat6.type = type; + mark_ssbo_write(ctx, atomic); + + /* even if nothing consumes the result, we can't DCE the instruction: */ + array_insert(b, b->keeps, atomic); +} + static void add_sysval_input(struct ir3_compile *ctx, gl_system_value slot, struct ir3_instruction *instr) { @@ -1225,6 +1398,24 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) case nir_intrinsic_store_var: emit_intrinsic_store_var(ctx, intr); break; + case nir_intrinsic_load_ssbo: + emit_intrinsic_load_ssbo(ctx, intr, dst); + break; + case nir_intrinsic_store_ssbo: + emit_intrinsic_store_ssbo(ctx, intr); + break; + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + emit_intrinsic_atomic(ctx, intr); + break; case nir_intrinsic_store_output: idx = nir_intrinsic_base(intr); const_offset = nir_src_as_const_value(intr->src[1]); @@ -2541,7 +2732,7 @@ 
ir3_compile_shader_nir(struct ir3_compiler *compiler, /* We need to do legalize after (for frag shader's) the "bary.f" * offsets (inloc) have been assigned. */ - ir3_legalize(ir, &so->has_samp, &max_bary); + ir3_legalize(ir, &so->has_samp, &so->has_ssbo, &max_bary); if (fd_mesa_debug & FD_DBG_OPTMSGS) { printf("AFTER LEGALIZE:\n"); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c index 7bb858df4d4..8c907eb5a53 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c @@ -193,6 +193,12 @@ static bool valid_flags(struct ir3_instruction *instr, unsigned n, */ if (is_store(instr) && (n == 1)) return false; + + /* disallow CP into anything but the SSBO slot argument for + * atomics: + */ + if (is_atomic(instr->opc) && (n != 0)) + return false; } break; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c index 6acea011d5c..fffa76504da 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c @@ -43,6 +43,7 @@ struct ir3_legalize_ctx { bool has_samp; + bool has_ssbo; int max_bary; }; @@ -192,6 +193,9 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) regmask_set(&needs_sy, n->regs[0]); } + if ((n->opc == OPC_LDGB) || (n->opc == OPC_STGB) || is_atomic(n->opc)) + ctx->has_ssbo = true; + /* both tex/sfu appear to not always immediately consume * their src register(s): */ @@ -388,7 +392,7 @@ mark_convergence_points(struct ir3 *ir) } void -ir3_legalize(struct ir3 *ir, bool *has_samp, int *max_bary) +ir3_legalize(struct ir3 *ir, bool *has_samp, bool *has_ssbo, int *max_bary) { struct ir3_legalize_ctx ctx = { .max_bary = -1, @@ -399,6 +403,7 @@ ir3_legalize(struct ir3 *ir, bool *has_samp, int *max_bary) } *has_samp = ctx.has_samp; + *has_ssbo = ctx.has_ssbo; *max_bary = ctx.max_bary; do { diff --git 
a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h index e5dcb739783..a06dd04904c 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h @@ -249,6 +249,9 @@ struct ir3_shader_variant { /* do we have one or more texture sample instructions: */ bool has_samp; + /* do we have one or more SSBO instructions: */ + bool has_ssbo; + /* do we have kill instructions: */ bool has_kill; -- 2.30.2