struct ir3_register *dst, *src1, *src2;
instr_cat6_t *cat6 = ptr;
+ cat6->type = instr->cat6.type;
+ cat6->opc = instr->opc;
+ cat6->jmp_tgt = !!(instr->flags & IR3_INSTR_JP);
+ cat6->sync = !!(instr->flags & IR3_INSTR_SY);
+ cat6->g = !!(instr->flags & IR3_INSTR_G);
+ cat6->opc_cat = 6;
+
/* the "dst" for a store instruction is (from the perspective
* of data flow in the shader, ie. register use/def, etc) in
* fact a register that is read by the instruction, rather
* indicate to use the src_off encoding even if offset is zero
* (but then what to do about dst_off?)
*/
- if (instr->cat6.src_offset || (instr->opc == OPC_LDG)) {
+ if ((instr->opc == OPC_LDGB) || is_atomic(instr->opc)) {
+ struct ir3_register *src3 = instr->regs[3];
+ instr_cat6ldgb_t *ldgb = ptr;
+
+ /* maybe these two bits both determine the instruction encoding? */
+ cat6->src_off = false;
+
+ ldgb->d = 4 - 1; /* always .4d ? */
+ ldgb->typed = false; /* TODO true for images */
+ ldgb->type_size = instr->cat6.iim_val - 1;
+
+ ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+
+ /* first src is src_ssbo: */
+ iassert(src1->flags & IR3_REG_IMMED);
+ ldgb->src_ssbo = src1->uim_val;
+
+ /* then next two are src1/src2: */
+ ldgb->src1 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+ ldgb->src1_im = !!(src2->flags & IR3_REG_IMMED);
+ ldgb->src2 = reg(src3, info, instr->repeat, IR3_REG_IMMED);
+ ldgb->src2_im = !!(src3->flags & IR3_REG_IMMED);
+
+ if (is_atomic(instr->opc)) {
+ struct ir3_register *src4 = instr->regs[4];
+ ldgb->src3 = reg(src4, info, instr->repeat, 0);
+ ldgb->pad0 = 0x1;
+ ldgb->pad3 = 0x3;
+ } else {
+ ldgb->pad0 = 0x0;
+ ldgb->pad3 = 0x2;
+ }
+
+ return 0;
+ } else if (instr->opc == OPC_STGB) {
+ struct ir3_register *src3 = instr->regs[4];
+ instr_cat6stgb_t *stgb = ptr;
+
+ /* maybe these two bits both determine the instruction encoding? */
+ cat6->src_off = true;
+ stgb->pad3 = 0x2;
+
+ stgb->d = 4 - 1; /* always .4d ? */
+ stgb->typed = false;
+ stgb->type_size = instr->cat6.iim_val - 1;
+
+ /* first src is dst_ssbo: */
+ iassert(dst->flags & IR3_REG_IMMED);
+ stgb->dst_ssbo = dst->uim_val;
+
+ /* then src1/src2/src3: */
+ stgb->src1 = reg(src1, info, instr->repeat, 0);
+ stgb->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+ stgb->src2_im = !!(src2->flags & IR3_REG_IMMED);
+ stgb->src3 = reg(src3, info, instr->repeat, IR3_REG_IMMED);
+ stgb->src3_im = !!(src3->flags & IR3_REG_IMMED);
+
+ return 0;
+ } else if (instr->cat6.src_offset || (instr->opc == OPC_LDG)) {
instr_cat6a_t *cat6a = ptr;
cat6->src_off = true;
cat6d->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
}
- cat6->type = instr->cat6.type;
- cat6->opc = instr->opc;
- cat6->jmp_tgt = !!(instr->flags & IR3_INSTR_JP);
- cat6->sync = !!(instr->flags & IR3_INSTR_SY);
- cat6->g = !!(instr->flags & IR3_INSTR_G);
- cat6->opc_cat = 6;
-
return 0;
}
/* For vertex shaders, keep track of the system values sources */
struct ir3_instruction *vertex_id, *basevertex, *instance_id;
+ /* For SSBO's and atomics, we need to preserve order, such
+ * that reads don't overtake writes, and the order of writes
+ * is preserved. Atomics are considered as a write.
+ *
+ * To do this, we track last write and last access, in a
+ * similar way to ir3_array. But since we don't know whether
+ * the same SSBO is bound to multiple slots, so we simply
+ * track this globally rather than per-SSBO.
+ *
+ * TODO should we track this per block instead? I guess it
+ * shouldn't matter much?
+ */
+ struct ir3_instruction *last_write, *last_access;
+
/* mapping from nir_register to defining instruction: */
struct hash_table *def_ht;
}
static struct ir3_instruction *
-create_collect(struct ir3_block *block, struct ir3_instruction **arr,
+create_collect(struct ir3_block *block, struct ir3_instruction *const *arr,
unsigned arrsz)
{
struct ir3_instruction *collect;
}
}
+static void
+mark_ssbo_read(struct ir3_compile *ctx, struct ir3_instruction *instr)
+{
+ instr->regs[0]->instr = ctx->last_write;
+ instr->regs[0]->flags |= IR3_REG_SSA;
+ ctx->last_access = instr;
+}
+
+static void
+mark_ssbo_write(struct ir3_compile *ctx, struct ir3_instruction *instr)
+{
+ instr->regs[0]->instr = ctx->last_access;
+ instr->regs[0]->flags |= IR3_REG_SSA;
+ ctx->last_write = ctx->last_access = instr;
+}
+
+static void
+emit_intrinsic_load_ssbo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
+ struct ir3_instruction **dst)
+{
+ struct ir3_block *b = ctx->block;
+ struct ir3_instruction *ldgb, *src0, *src1, *offset;
+ nir_const_value *const_offset;
+
+ /* can this be non-const buffer_index? how do we handle that? */
+ const_offset = nir_src_as_const_value(intr->src[0]);
+ compile_assert(ctx, const_offset);
+
+ offset = get_src(ctx, &intr->src[1])[0];
+
+ /* src0 is uvec2(offset*4, 0), src1 is offset.. nir already *= 4: */
+ src0 = create_collect(b, (struct ir3_instruction*[]){
+ offset,
+ create_immed(b, 0),
+ }, 2);
+ src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+
+ ldgb = ir3_LDGB(b, create_immed(b, const_offset->u32[0]), 0,
+ src0, 0, src1, 0);
+ ldgb->regs[0]->wrmask = (1 << intr->num_components) - 1;
+ ldgb->cat6.iim_val = intr->num_components;
+ ldgb->cat6.type = TYPE_U32;
+ mark_ssbo_read(ctx, ldgb);
+
+ split_dest(b, dst, ldgb, 0, intr->num_components);
+}
+
+/* src[] = { value, block_index, offset }. const_index[] = { write_mask } */
+static void
+emit_intrinsic_store_ssbo(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
+{
+ struct ir3_block *b = ctx->block;
+ struct ir3_instruction *stgb, *src0, *src1, *src2, *offset;
+ nir_const_value *const_offset;
+ unsigned ncomp = ffs(~intr->const_index[0]) - 1;
+
+ /* can this be non-const buffer_index? how do we handle that? */
+ const_offset = nir_src_as_const_value(intr->src[1]);
+ compile_assert(ctx, const_offset);
+
+ offset = get_src(ctx, &intr->src[2])[0];
+
+ /* src0 is value, src1 is offset, src2 is uvec2(offset*4, 0)..
+ * nir already *= 4:
+ */
+ src0 = create_collect(b, get_src(ctx, &intr->src[0]), ncomp);
+ src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+ src2 = create_collect(b, (struct ir3_instruction*[]){
+ offset,
+ create_immed(b, 0),
+ }, 2);
+
+ stgb = ir3_STGB(b, create_immed(b, const_offset->u32[0]), 0,
+ src0, 0, src1, 0, src2, 0);
+ stgb->cat6.iim_val = ncomp;
+ stgb->cat6.type = TYPE_U32;
+ mark_ssbo_write(ctx, stgb);
+
+ array_insert(b, b->keeps, stgb);
+}
+
+static void
+emit_intrinsic_atomic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
+{
+ struct ir3_block *b = ctx->block;
+ struct ir3_instruction *atomic, *ssbo, *src0, *src1, *src2, *offset;
+ nir_const_value *const_offset;
+ type_t type = TYPE_U32;
+
+ /* can this be non-const buffer_index? how do we handle that? */
+ const_offset = nir_src_as_const_value(intr->src[0]);
+ compile_assert(ctx, const_offset);
+ ssbo = create_immed(b, const_offset->u32[0]);
+
+ offset = get_src(ctx, &intr->src[1])[0];
+
+ /* src0 is data (or uvec2(data, compare)
+ * src1 is offset
+ * src2 is uvec2(offset*4, 0)
+ *
+ * Note that nir already multiplies the offset by four
+ */
+ src0 = get_src(ctx, &intr->src[2])[0];
+ src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+ src2 = create_collect(b, (struct ir3_instruction*[]){
+ offset,
+ create_immed(b, 0),
+ }, 2);
+
+ switch (intr->intrinsic) {
+ case nir_intrinsic_ssbo_atomic_add:
+ atomic = ir3_ATOMIC_ADD(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_ssbo_atomic_imin:
+ atomic = ir3_ATOMIC_MIN(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+ type = TYPE_S32;
+ break;
+ case nir_intrinsic_ssbo_atomic_umin:
+ atomic = ir3_ATOMIC_MIN(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_ssbo_atomic_imax:
+ atomic = ir3_ATOMIC_MAX(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+ type = TYPE_S32;
+ break;
+ case nir_intrinsic_ssbo_atomic_umax:
+ atomic = ir3_ATOMIC_MAX(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_ssbo_atomic_and:
+ atomic = ir3_ATOMIC_AND(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_ssbo_atomic_or:
+ atomic = ir3_ATOMIC_OR(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_ssbo_atomic_xor:
+ atomic = ir3_ATOMIC_XOR(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_ssbo_atomic_exchange:
+ atomic = ir3_ATOMIC_XCHG(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ case nir_intrinsic_ssbo_atomic_comp_swap:
+ /* for cmpxchg, src0 is [ui]vec2(data, compare): */
+ src0 = create_collect(b, (struct ir3_instruction*[]){
+ src0,
+ get_src(ctx, &intr->src[3])[0],
+ }, 2);
+ atomic = ir3_ATOMIC_CMPXCHG(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+ break;
+ default:
+ unreachable("boo");
+ }
+
+ atomic->cat6.iim_val = 1;
+ atomic->cat6.type = type;
+ mark_ssbo_write(ctx, atomic);
+
+ /* even if nothing consume the result, we can't DCE the instruction: */
+ array_insert(b, b->keeps, atomic);
+}
+
static void add_sysval_input(struct ir3_compile *ctx, gl_system_value slot,
struct ir3_instruction *instr)
{
case nir_intrinsic_store_var:
emit_intrinsic_store_var(ctx, intr);
break;
+ case nir_intrinsic_load_ssbo:
+ emit_intrinsic_load_ssbo(ctx, intr, dst);
+ break;
+ case nir_intrinsic_store_ssbo:
+ emit_intrinsic_store_ssbo(ctx, intr);
+ break;
+ case nir_intrinsic_ssbo_atomic_add:
+ case nir_intrinsic_ssbo_atomic_imin:
+ case nir_intrinsic_ssbo_atomic_umin:
+ case nir_intrinsic_ssbo_atomic_imax:
+ case nir_intrinsic_ssbo_atomic_umax:
+ case nir_intrinsic_ssbo_atomic_and:
+ case nir_intrinsic_ssbo_atomic_or:
+ case nir_intrinsic_ssbo_atomic_xor:
+ case nir_intrinsic_ssbo_atomic_exchange:
+ case nir_intrinsic_ssbo_atomic_comp_swap:
+ emit_intrinsic_atomic(ctx, intr);
+ break;
case nir_intrinsic_store_output:
idx = nir_intrinsic_base(intr);
const_offset = nir_src_as_const_value(intr->src[1]);
/* We need to do legalize after (for frag shader's) the "bary.f"
* offsets (inloc) have been assigned.
*/
- ir3_legalize(ir, &so->has_samp, &max_bary);
+ ir3_legalize(ir, &so->has_samp, &so->has_ssbo, &max_bary);
if (fd_mesa_debug & FD_DBG_OPTMSGS) {
printf("AFTER LEGALIZE:\n");