}
static struct ir3_instruction *
-create_frag_input(struct ir3_context *ctx, bool use_ldlv)
+create_frag_input(struct ir3_context *ctx, bool use_ldlv, unsigned n)
{
struct ir3_block *block = ctx->block;
struct ir3_instruction *instr;
- /* actual inloc is assigned and fixed up later: */
- struct ir3_instruction *inloc = create_immed(block, 0);
+ /* packed inloc is fixed up later: */
+ struct ir3_instruction *inloc = create_immed(block, n);
if (use_ldlv) {
instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
instr->cat6.type = TYPE_U32;
instr->cat6.iim_val = 1;
} else {
- instr = ir3_BARY_F(block, inloc, 0, ctx->frag_vcoord, 0);
+ instr = ir3_BARY_F(block, inloc, 0, ctx->ij_pixel, 0);
instr->regs[2]->wrmask = 0x3;
}
{
/* first four vec4 sysval's reserved for UBOs: */
/* NOTE: dp is in scalar, but there can be >4 dp components: */
- unsigned n = ctx->so->constbase.driver_param;
+ struct ir3_const_state *const_state = &ctx->so->shader->const_state;
+ unsigned n = const_state->offsets.driver_param;
unsigned r = regid(n + dp / 4, dp % 4);
return create_uniform(ctx->block, r);
}
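
The driver-param fetch above computes a scalar const-register id from a vec4 base plus a scalar offset. A minimal standalone sketch of that addressing math, assuming ir3's usual num*4+comp packing for regid() (the helper here is a local stand-in for illustration, not the driver's macro):

    #include <assert.h>

    /* stand-in modeled on ir3's regid(): pack a vec4 const-file index
     * and a component into a scalar register id (assumed encoding): */
    static unsigned regid(unsigned num, unsigned comp)
    {
        return (num << 2) | comp;
    }

    int main(void)
    {
        unsigned n = 4;   /* hypothetical offsets.driver_param (vec4 units) */
        unsigned dp = 6;  /* 7th scalar driver param */

        /* dp is scalar, so it can spill into the following vec4: */
        assert(regid(n + dp / 4, dp % 4) == (5 << 2 | 2));
        return 0;
    }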
unsigned bs[info->num_inputs]; /* bit size */
struct ir3_block *b = ctx->block;
unsigned dst_sz, wrmask;
+ type_t dst_type = nir_dest_bit_size(alu->dest.dest) < 32 ?
+ TYPE_U16 : TYPE_U32;
if (alu->dest.dest.is_ssa) {
dst_sz = alu->dest.dest.ssa.num_components;
src[i] = ir3_get_src(ctx, &asrc->src)[asrc->swizzle[0]];
if (!src[i])
- src[i] = create_immed(ctx->block, 0);
- dst[i] = ir3_MOV(b, src[i], TYPE_U32);
+ src[i] = create_immed_typed(ctx->block, 0, dst_type);
+ dst[i] = ir3_MOV(b, src[i], dst_type);
}
ir3_put_dst(ctx, &alu->dest.dest);
/* We also get mov's with more than one component, so handle
 * those specially:
*/
- if ((alu->op == nir_op_imov) || (alu->op == nir_op_fmov)) {
- type_t type = (alu->op == nir_op_imov) ? TYPE_U32 : TYPE_F32;
+ if (alu->op == nir_op_mov) {
nir_alu_src *asrc = &alu->src[0];
struct ir3_instruction *const *src0 = ir3_get_src(ctx, &asrc->src);
for (unsigned i = 0; i < dst_sz; i++) {
if (wrmask & (1 << i)) {
- dst[i] = ir3_MOV(b, src0[asrc->swizzle[i]], type);
+ dst[i] = ir3_MOV(b, src0[asrc->swizzle[i]], dst_type);
} else {
dst[i] = NULL;
}
dst[0] = ir3_n2b(b, dst[0]);
break;
case nir_op_b2f16:
+ dst[0] = ir3_COV(b, ir3_b2n(b, src[0]), TYPE_U32, TYPE_F16);
+ break;
case nir_op_b2f32:
dst[0] = ir3_COV(b, ir3_b2n(b, src[0]), TYPE_U32, TYPE_F32);
break;
(list_length(&alu->src[0].src.ssa->uses) == 1) &&
((opc_cat(src[0]->opc) == 2) || (opc_cat(src[0]->opc) == 3))) {
src[0]->flags |= IR3_INSTR_SAT;
- dst[0] = ir3_MOV(b, src[0], TYPE_U32);
+ dst[0] = ir3_MOV(b, src[0], dst_type);
} else {
/* otherwise generate a max.f that saturates.. blob does
* similar (generating a cat2 mov using max.f)
case nir_op_umin:
dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0);
break;
- case nir_op_imul:
- /*
- * dst = (al * bl) + (ah * bl << 16) + (al * bh << 16)
- * mull.u tmp0, a, b ; mul low, i.e. al * bl
- * madsh.m16 tmp1, a, b, tmp0 ; mul-add shift high mix, i.e. ah * bl << 16
- * madsh.m16 dst, b, a, tmp1 ; i.e. al * bh << 16
- */
- dst[0] = ir3_MADSH_M16(b, src[1], 0, src[0], 0,
- ir3_MADSH_M16(b, src[0], 0, src[1], 0,
- ir3_MULL_U(b, src[0], 0, src[1], 0), 0), 0);
+ case nir_op_umul_low:
+ dst[0] = ir3_MULL_U(b, src[0], 0, src[1], 0);
+ break;
+ case nir_op_imadsh_mix16:
+ dst[0] = ir3_MADSH_M16(b, src[0], 0, src[1], 0, src[2], 0);
break;
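
The old open-coded imul sequence (kept in the deleted comment above) now arrives from NIR pre-lowered into umul_low and imadsh_mix16. A standalone sketch, not driver code, showing why the three-instruction scheme reproduces a full 32-bit multiply modulo 2^32 (the ah*bh partial product only affects bits above 31):

    #include <assert.h>
    #include <stdint.h>

    static uint32_t umul_low(uint32_t a, uint32_t b)
    {
        return (a & 0xffff) * (b & 0xffff);           /* al * bl */
    }

    static uint32_t imadsh_mix16(uint32_t a, uint32_t b, uint32_t c)
    {
        return ((a >> 16) * (b & 0xffff) << 16) + c;  /* (ah * bl) << 16 + c */
    }

    int main(void)
    {
        uint32_t a = 0xdeadbeef, b = 0x12345678;

        /* dst = (al * bl) + (ah * bl << 16) + (al * bh << 16): */
        uint32_t dst = imadsh_mix16(b, a, imadsh_mix16(a, b, umul_low(a, b)));

        assert(dst == a * b);  /* uint32_t arithmetic wraps mod 2^32 */
        return 0;
    }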
case nir_op_ineg:
dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0);
break;
}
- case nir_op_bit_count:
- dst[0] = ir3_CBITS_B(b, src[0], 0);
+ case nir_op_bit_count: {
+		/* TODO: we need to do this 16b at a time on a5xx/a6xx; need to
+		 * double-check on earlier gens.  Once half-precision support is
+		 * in place, this should probably move to a NIR lowering pass:
+		 */
+ struct ir3_instruction *hi, *lo;
+
+ hi = ir3_COV(b, ir3_SHR_B(b, src[0], 0, create_immed(b, 16), 0),
+ TYPE_U32, TYPE_U16);
+ lo = ir3_COV(b, src[0], TYPE_U32, TYPE_U16);
+
+ hi = ir3_CBITS_B(b, hi, 0);
+ lo = ir3_CBITS_B(b, lo, 0);
+
+		/* TODO: maybe the builders should default to making dst half-precision
+		 * if the srcs were half-precision, to make this less awkward.. OTOH
+		 * we should probably just do this lowering in NIR.
+		 */
+ hi->regs[0]->flags |= IR3_REG_HALF;
+ lo->regs[0]->flags |= IR3_REG_HALF;
+
+ dst[0] = ir3_ADD_S(b, hi, 0, lo, 0);
+ dst[0]->regs[0]->flags |= IR3_REG_HALF;
+ dst[0] = ir3_COV(b, dst[0], TYPE_U16, TYPE_U32);
break;
+ }
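
Splitting the popcount into 16-bit halves is sound because bit counts are independent per bit: popcount(x) == popcount(x >> 16) + popcount(x & 0xffff). A quick standalone check of that identity (using the GCC/Clang builtin as the reference):

    #include <assert.h>
    #include <stdint.h>

    static unsigned popcount16(uint16_t v)
    {
        unsigned n = 0;
        for (; v; v >>= 1)
            n += v & 1;
        return n;
    }

    int main(void)
    {
        uint32_t x = 0xf00dcafe;
        assert(popcount16(x >> 16) + popcount16(x & 0xffff) ==
               (unsigned)__builtin_popcount(x));
        return 0;
    }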
case nir_op_ifind_msb: {
struct ir3_instruction *cmp;
dst[0] = ir3_CLZ_S(b, src[0], 0);
{
struct ir3_block *b = ctx->block;
struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
- nir_const_value *const_offset;
- /* UBO addresses are the first driver params: */
- unsigned ubo = regid(ctx->so->constbase.ubo, 0);
- const unsigned ptrsz = ir3_pointer_size(ctx);
+ /* UBO addresses are the first driver params, but subtract 2 here to
+ * account for nir_lower_uniforms_to_ubo rebasing the UBOs such that UBO 0
+ * is the uniforms: */
+ struct ir3_const_state *const_state = &ctx->so->shader->const_state;
+ unsigned ubo = regid(const_state->offsets.ubo, 0) - 2;
+ const unsigned ptrsz = ir3_pointer_size(ctx->compiler);
int off = 0;
base_lo = create_uniform(b, ubo + (src0->regs[1]->iim_val * ptrsz));
base_hi = create_uniform(b, ubo + (src0->regs[1]->iim_val * ptrsz) + 1);
} else {
- base_lo = create_uniform_indirect(b, ubo, ir3_get_addr(ctx, src0, 4));
- base_hi = create_uniform_indirect(b, ubo + 1, ir3_get_addr(ctx, src0, 4));
+ base_lo = create_uniform_indirect(b, ubo, ir3_get_addr(ctx, src0, ptrsz));
+ base_hi = create_uniform_indirect(b, ubo + 1, ir3_get_addr(ctx, src0, ptrsz));
+
+ /* NOTE: since relative addressing is used, make sure constlen is
+ * at least big enough to cover all the UBO addresses, since the
+ * assembler won't know what the max address reg is.
+ */
+ ctx->so->constlen = MAX2(ctx->so->constlen,
+ const_state->offsets.ubo + (ctx->s->info.num_ubos * ptrsz));
}
/* note: on 32bit gpu's base_hi is ignored and DCE'd */
addr = base_lo;
- const_offset = nir_src_as_const_value(intr->src[1]);
- if (const_offset) {
- off += const_offset->u32[0];
+ if (nir_src_is_const(intr->src[1])) {
+ off += nir_src_as_uint(intr->src[1]);
} else {
/* For load_ubo_indirect, second src is indirect offset: */
src1 = ir3_get_src(ctx, &intr->src[1])[0];
struct ir3_instruction **dst)
{
/* SSBO size stored as a const starting at ssbo_sizes: */
- unsigned blk_idx = nir_src_as_const_value(intr->src[0])->u32[0];
- unsigned idx = regid(ctx->so->constbase.ssbo_sizes, 0) +
- ctx->so->const_layout.ssbo_size.off[blk_idx];
+ struct ir3_const_state *const_state = &ctx->so->shader->const_state;
+ unsigned blk_idx = nir_src_as_uint(intr->src[0]);
+ unsigned idx = regid(const_state->offsets.ssbo_sizes, 0) +
+ const_state->ssbo_size.off[blk_idx];
- debug_assert(ctx->so->const_layout.ssbo_size.mask & (1 << blk_idx));
+ debug_assert(const_state->ssbo_size.mask & (1 << blk_idx));
dst[0] = create_uniform(ctx->block, idx);
}
* bytes-per-pixel should have been emitted in 2nd slot of
* image_dims. See ir3_shader::emit_image_dims().
*/
- unsigned cb = regid(ctx->so->constbase.image_dims, 0) +
- ctx->so->const_layout.image_dims.off[var->data.driver_location];
+ struct ir3_const_state *const_state = &ctx->so->shader->const_state;
+ unsigned cb = regid(const_state->offsets.image_dims, 0) +
+ const_state->image_dims.off[var->data.driver_location];
struct ir3_instruction *aux = create_uniform(b, cb + 1);
tmp[0] = ir3_SHR_B(b, tmp[0], 0, aux, 0);
barrier->cat7.g = true;
barrier->cat7.r = true;
barrier->cat7.w = true;
+ barrier->cat7.l = true;
barrier->barrier_class = IR3_BARRIER_IMAGE_W |
IR3_BARRIER_BUFFER_W;
barrier->barrier_conflict =
add_sysval_input_compmask(ctx, slot, 0x1, instr);
}
+static struct ir3_instruction *
+get_barycentric_centroid(struct ir3_context *ctx)
+{
+ if (!ctx->ij_centroid) {
+ struct ir3_instruction *xy[2];
+ struct ir3_instruction *ij;
+
+ ij = create_input_compmask(ctx, 0, 0x3);
+ ir3_split_dest(ctx->block, xy, ij, 0, 2);
+
+ ctx->ij_centroid = ir3_create_collect(ctx, xy, 2);
+
+ add_sysval_input_compmask(ctx,
+ SYSTEM_VALUE_BARYCENTRIC_CENTROID,
+ 0x3, ij);
+ }
+
+ return ctx->ij_centroid;
+}
+
+static struct ir3_instruction *
+get_barycentric_sample(struct ir3_context *ctx)
+{
+ if (!ctx->ij_sample) {
+ struct ir3_instruction *xy[2];
+ struct ir3_instruction *ij;
+
+ ij = create_input_compmask(ctx, 0, 0x3);
+ ir3_split_dest(ctx->block, xy, ij, 0, 2);
+
+ ctx->ij_sample = ir3_create_collect(ctx, xy, 2);
+
+ add_sysval_input_compmask(ctx,
+ SYSTEM_VALUE_BARYCENTRIC_SAMPLE,
+ 0x3, ij);
+ }
+
+ return ctx->ij_sample;
+}
+
+static struct ir3_instruction *
+get_barycentric_pixel(struct ir3_context *ctx)
+{
+ /* TODO when tgsi_to_nir supports "new-style" FS inputs switch
+ * this to create ij_pixel only on demand:
+ */
+ return ctx->ij_pixel;
+}
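
All three helpers yield a two-component (i, j) collect; which one feeds bary.f decides where within the pixel a varying is evaluated. Conceptually (a sketch only; the names and the plane-equation form below are assumptions, not the hw-exact instruction semantics), interpolation evaluates per-primitive plane coefficients at that position:

    #include <stdio.h>

    /* hypothetical plane coefficients for one varying component */
    struct plane { float c0, di, dj; };

    static float interp(struct plane p, float i, float j)
    {
        return p.c0 + p.di * i + p.dj * j;
    }

    int main(void)
    {
        struct plane red = { 1.0f, -0.5f, -0.25f };

        /* same varying evaluated at pixel-center ij vs. a sample ij: */
        printf("center=%f sample=%f\n",
               interp(red, 0.25f, 0.25f),
               interp(red, 0.30f, 0.20f));
        return 0;
    }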
+
+static struct ir3_instruction *
+get_frag_coord(struct ir3_context *ctx)
+{
+ if (!ctx->frag_coord) {
+ struct ir3_block *b = ctx->block;
+ struct ir3_instruction *xyzw[4];
+ struct ir3_instruction *hw_frag_coord;
+
+ hw_frag_coord = create_input_compmask(ctx, 0, 0xf);
+ ir3_split_dest(ctx->block, xyzw, hw_frag_coord, 0, 4);
+
+ /* for frag_coord.xy, we get unsigned values.. we need
+ * to subtract (integer) 8 and divide by 16 (right-
+ * shift by 4) then convert to float:
+ *
+ * sub.s tmp, src, 8
+ * shr.b tmp, tmp, 4
+ * mov.u32f32 dst, tmp
+ *
+ */
+ for (int i = 0; i < 2; i++) {
+ xyzw[i] = ir3_SUB_S(b, xyzw[i], 0,
+ create_immed(b, 8), 0);
+ xyzw[i] = ir3_SHR_B(b, xyzw[i], 0,
+ create_immed(b, 4), 0);
+ xyzw[i] = ir3_COV(b, xyzw[i], TYPE_U32, TYPE_F32);
+ }
+
+ ctx->frag_coord = ir3_create_collect(ctx, xyzw, 4);
+
+ add_sysval_input_compmask(ctx,
+ SYSTEM_VALUE_FRAG_COORD,
+ 0xf, hw_frag_coord);
+
+ ctx->so->frag_coord = true;
+ }
+
+ return ctx->frag_coord;
+}
+
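
The xy conversion mirrors what the comment in get_frag_coord() describes: the hardware hands over coordinates in unsigned 1/16-pixel fixed point, sampled at the pixel center (px * 16 + 8), so subtracting 8 and shifting right by 4 recovers the integer pixel index. A standalone check of that arithmetic:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        for (uint32_t px = 0; px < 4096; px++) {
            uint32_t hw = px * 16 + 8;   /* center of pixel px, .4 fixed point */
            assert(((hw - 8) >> 4) == px);
        }
        return 0;
    }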
static void
emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
struct ir3_instruction **dst;
struct ir3_instruction * const *src;
struct ir3_block *b = ctx->block;
- nir_const_value *const_offset;
int idx, comp;
if (info->has_dest) {
switch (intr->intrinsic) {
case nir_intrinsic_load_uniform:
idx = nir_intrinsic_base(intr);
- const_offset = nir_src_as_const_value(intr->src[0]);
- if (const_offset) {
- idx += const_offset->u32[0];
+ if (nir_src_is_const(intr->src[0])) {
+ idx += nir_src_as_uint(intr->src[0]);
for (int i = 0; i < intr->num_components; i++) {
- unsigned n = idx * 4 + i;
- dst[i] = create_uniform(b, n);
+ dst[i] = create_uniform_typed(b, idx + i,
+ nir_dest_bit_size(intr->dest) < 32 ? TYPE_F16 : TYPE_F32);
}
} else {
src = ir3_get_src(ctx, &intr->src[0]);
for (int i = 0; i < intr->num_components; i++) {
- int n = idx * 4 + i;
- dst[i] = create_uniform_indirect(b, n,
- ir3_get_addr(ctx, src[0], 4));
+ dst[i] = create_uniform_indirect(b, idx + i,
+ ir3_get_addr(ctx, src[0], 1));
}
/* NOTE: if relative addressing is used, we set
* constlen in the compiler (to worst-case value)
* since we don't know in the assembler what the max
* addr reg value can be:
*/
- ctx->so->constlen = ctx->s->num_uniforms;
+ ctx->so->constlen = MAX2(ctx->so->constlen,
+ ctx->so->shader->ubo_state.size / 16);
}
break;
case nir_intrinsic_load_ubo:
emit_intrinsic_load_ubo(ctx, intr, dst);
break;
+ case nir_intrinsic_load_frag_coord:
+ ir3_split_dest(b, dst, get_frag_coord(ctx), 0, 4);
+ break;
+ case nir_intrinsic_load_sample_pos_from_id: {
+ /* NOTE: blob seems to always use TYPE_F16 and then cov.f16f32,
+ * but that doesn't seem necessary.
+ */
+ struct ir3_instruction *offset =
+ ir3_RGETPOS(b, ir3_get_src(ctx, &intr->src[0])[0], 0);
+ offset->regs[0]->wrmask = 0x3;
+ offset->cat5.type = TYPE_F32;
+
+ ir3_split_dest(b, dst, offset, 0, 2);
+
+ break;
+ }
+ case nir_intrinsic_load_size_ir3:
+ if (!ctx->ij_size) {
+ ctx->ij_size = create_input(ctx, 0);
+
+ add_sysval_input(ctx, SYSTEM_VALUE_BARYCENTRIC_SIZE,
+ ctx->ij_size);
+ }
+ dst[0] = ctx->ij_size;
+ break;
+ case nir_intrinsic_load_barycentric_centroid:
+ ir3_split_dest(b, dst, get_barycentric_centroid(ctx), 0, 2);
+ break;
+ case nir_intrinsic_load_barycentric_sample:
+ if (ctx->so->key.msaa) {
+ ir3_split_dest(b, dst, get_barycentric_sample(ctx), 0, 2);
+ } else {
+ ir3_split_dest(b, dst, get_barycentric_pixel(ctx), 0, 2);
+ }
+ break;
+ case nir_intrinsic_load_barycentric_pixel:
+ ir3_split_dest(b, dst, get_barycentric_pixel(ctx), 0, 2);
+ break;
+ case nir_intrinsic_load_interpolated_input:
+ idx = nir_intrinsic_base(intr);
+ comp = nir_intrinsic_component(intr);
+ src = ir3_get_src(ctx, &intr->src[0]);
+ if (nir_src_is_const(intr->src[1])) {
+ struct ir3_instruction *coord = ir3_create_collect(ctx, src, 2);
+ idx += nir_src_as_uint(intr->src[1]);
+ for (int i = 0; i < intr->num_components; i++) {
+ unsigned inloc = idx * 4 + i + comp;
+ if (ctx->so->inputs[idx].bary &&
+ !ctx->so->inputs[idx].use_ldlv) {
+ dst[i] = ir3_BARY_F(b, create_immed(b, inloc), 0, coord, 0);
+ } else {
+ /* for non-varyings use the pre-setup input, since
+ * that is easier than mapping things back to a
+ * nir_variable to figure out what it is.
+ */
+ dst[i] = ctx->ir->inputs[inloc];
+ }
+ }
+ } else {
+ ir3_context_error(ctx, "unhandled");
+ }
+ break;
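
Before pack_inlocs() runs, inlocs are handed out at vec4 granularity, so the constant-offset path above computes them directly from the NIR base/component. A small sketch of that pre-packing layout (the helper name is illustrative, not from the driver):

    #include <assert.h>

    /* pre-packing inloc: vec4 slot (base + const offset) times 4,
     * plus component within the vec4: */
    static unsigned initial_inloc(unsigned base, unsigned off,
                                  unsigned comp, unsigned i)
    {
        return (base + off) * 4 + i + comp;
    }

    int main(void)
    {
        /* .zw of input 2, no indirect offset: */
        assert(initial_inloc(2, 0, 2, 0) == 10);
        assert(initial_inloc(2, 0, 2, 1) == 11);
        return 0;
    }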
case nir_intrinsic_load_input:
idx = nir_intrinsic_base(intr);
comp = nir_intrinsic_component(intr);
- const_offset = nir_src_as_const_value(intr->src[0]);
- if (const_offset) {
- idx += const_offset->u32[0];
+ if (nir_src_is_const(intr->src[0])) {
+ idx += nir_src_as_uint(intr->src[0]);
for (int i = 0; i < intr->num_components; i++) {
unsigned n = idx * 4 + i + comp;
dst[i] = ctx->ir->inputs[n];
+ compile_assert(ctx, ctx->ir->inputs[n]);
}
} else {
src = ir3_get_src(ctx, &intr->src[0]);
}
}
break;
- /* All SSBO intrinsics should have been lowered by 'lower_io_offsets'
- * pass and replaced by an ir3-specifc version that adds the
- * dword-offset in the last source.
- */
+ /* All SSBO intrinsics should have been lowered by 'lower_io_offsets'
+	 * pass and replaced by an ir3-specific version that adds the
+ * dword-offset in the last source.
+ */
case nir_intrinsic_load_ssbo_ir3:
ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
break;
case nir_intrinsic_store_ssbo_ir3:
+ if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
+ !ctx->s->info.fs.early_fragment_tests)
+ ctx->so->no_earlyz = true;
ctx->funcs->emit_intrinsic_store_ssbo(ctx, intr);
break;
case nir_intrinsic_get_buffer_size:
case nir_intrinsic_ssbo_atomic_xor_ir3:
case nir_intrinsic_ssbo_atomic_exchange_ir3:
case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
+ if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
+ !ctx->s->info.fs.early_fragment_tests)
+ ctx->so->no_earlyz = true;
dst[0] = ctx->funcs->emit_intrinsic_atomic_ssbo(ctx, intr);
break;
case nir_intrinsic_load_shared:
emit_intrinsic_load_image(ctx, intr, dst);
break;
case nir_intrinsic_image_deref_store:
+ if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
+ !ctx->s->info.fs.early_fragment_tests)
+ ctx->so->no_earlyz = true;
ctx->funcs->emit_intrinsic_store_image(ctx, intr);
break;
case nir_intrinsic_image_deref_size:
emit_intrinsic_image_size(ctx, intr, dst);
break;
case nir_intrinsic_image_deref_atomic_add:
- case nir_intrinsic_image_deref_atomic_min:
- case nir_intrinsic_image_deref_atomic_max:
+ case nir_intrinsic_image_deref_atomic_imin:
+ case nir_intrinsic_image_deref_atomic_umin:
+ case nir_intrinsic_image_deref_atomic_imax:
+ case nir_intrinsic_image_deref_atomic_umax:
case nir_intrinsic_image_deref_atomic_and:
case nir_intrinsic_image_deref_atomic_or:
case nir_intrinsic_image_deref_atomic_xor:
case nir_intrinsic_image_deref_atomic_exchange:
case nir_intrinsic_image_deref_atomic_comp_swap:
+ if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
+ !ctx->s->info.fs.early_fragment_tests)
+ ctx->so->no_earlyz = true;
dst[0] = ctx->funcs->emit_intrinsic_atomic_image(ctx, intr);
break;
case nir_intrinsic_barrier:
case nir_intrinsic_store_output:
idx = nir_intrinsic_base(intr);
comp = nir_intrinsic_component(intr);
- const_offset = nir_src_as_const_value(intr->src[1]);
- compile_assert(ctx, const_offset != NULL);
- idx += const_offset->u32[0];
+ compile_assert(ctx, nir_src_is_const(intr->src[1]));
+ idx += nir_src_as_uint(intr->src[1]);
src = ir3_get_src(ctx, &intr->src[0]);
for (int i = 0; i < intr->num_components; i++) {
dst[0] = ctx->instance_id;
break;
case nir_intrinsic_load_sample_id:
+ ctx->so->per_samp = true;
+ /* fall-thru */
case nir_intrinsic_load_sample_id_no_per_sample:
if (!ctx->samp_id) {
ctx->samp_id = create_input(ctx, 0);
array_insert(ctx->ir, ctx->ir->predicates, kill);
array_insert(b, b->keeps, kill);
- ctx->so->has_kill = true;
+ ctx->so->no_earlyz = true;
break;
}
{
struct ir3_instruction **dst = ir3_get_dst_ssa(ctx, &instr->def,
instr->def.num_components);
- type_t type = (instr->def.bit_size < 32) ? TYPE_U16 : TYPE_U32;
- for (int i = 0; i < instr->def.num_components; i++)
- dst[i] = create_immed_typed(ctx->block, instr->value.u32[i], type);
+ if (instr->def.bit_size < 32) {
+ for (int i = 0; i < instr->def.num_components; i++)
+ dst[i] = create_immed_typed(ctx->block,
+ instr->value[i].u16,
+ TYPE_U16);
+ } else {
+ for (int i = 0; i < instr->def.num_components; i++)
+ dst[i] = create_immed_typed(ctx->block,
+ instr->value[i].u32,
+ TYPE_U32);
+ }
+
}
static void
case 3: opc = OPC_GATHER4A; break;
}
break;
+ case nir_texop_txf_ms_fb:
case nir_texop_txf_ms: opc = OPC_ISAMM; break;
- case nir_texop_txs:
- case nir_texop_query_levels:
- case nir_texop_texture_samples:
- case nir_texop_samples_identical:
- case nir_texop_txf_ms_mcs:
+ default:
ir3_context_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
return;
}
/* NOTE a3xx (and possibly a4xx?) might be different, using isaml
* with scaled x coord according to requested sample:
*/
- if (tex->op == nir_texop_txf_ms) {
+ if (opc == OPC_ISAMM) {
if (ctx->compiler->txf_ms_with_isaml) {
/* the samples are laid out in x dimension as
* 0 1 2 3
if (opc == OPC_GETLOD)
type = TYPE_U32;
- struct ir3_instruction *samp_tex = get_tex_samp_tex_src(ctx, tex);
+ struct ir3_instruction *samp_tex;
+
+ if (tex->op == nir_texop_txf_ms_fb) {
+ /* only expect a single txf_ms_fb per shader: */
+ compile_assert(ctx, !ctx->so->fb_read);
+ compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT);
+
+ ctx->so->fb_read = true;
+ samp_tex = ir3_create_collect(ctx, (struct ir3_instruction*[]){
+ create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16),
+ create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16),
+ }, 2);
+
+ ctx->so->num_samp++;
+ } else {
+ samp_tex = get_tex_samp_tex_src(ctx, tex);
+ }
+
struct ir3_instruction *col0 = ir3_create_collect(ctx, src0, nsrc0);
struct ir3_instruction *col1 = ir3_create_collect(ctx, src1, nsrc1);
{
struct ir3_block *block;
struct hash_entry *hentry;
- unsigned i;
hentry = _mesa_hash_table_search(ctx->block_ht, nblock);
if (hentry)
block->nblock = nblock;
_mesa_hash_table_insert(ctx->block_ht, nblock, block);
- block->predecessors_count = nblock->predecessors->entries;
- block->predecessors = ralloc_array_size(block,
- sizeof(block->predecessors[0]), block->predecessors_count);
- i = 0;
+ block->predecessors = _mesa_pointer_set_create(block);
set_foreach(nblock->predecessors, sentry) {
- block->predecessors[i++] = get_block(ctx, sentry->key);
+ _mesa_set_add(block->predecessors, get_block(ctx, sentry->key));
}
return block;
emit_loop(struct ir3_context *ctx, nir_loop *nloop)
{
emit_cf_list(ctx, &nloop->body);
+ ctx->so->loops++;
}
static void
static void
emit_stream_out(struct ir3_context *ctx)
{
- struct ir3_shader_variant *v = ctx->so;
struct ir3 *ir = ctx->ir;
struct ir3_stream_output_info *strmout =
&ctx->so->shader->stream_output;
* stripped out in the backend.
*/
for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
+ struct ir3_const_state *const_state = &ctx->so->shader->const_state;
unsigned stride = strmout->stride[i];
struct ir3_instruction *base, *off;
- base = create_uniform(ctx->block, regid(v->constbase.tfbo, i));
+ base = create_uniform(ctx->block, regid(const_state->offsets.tfbo, i));
/* 24-bit should be enough: */
off = ir3_MUL_U(ctx->block, vtxcnt, 0,
ir3_END(ctx->block);
}
-static struct ir3_instruction *
-create_frag_coord(struct ir3_context *ctx, unsigned comp)
-{
- struct ir3_block *block = ctx->block;
- struct ir3_instruction *instr;
-
- if (!ctx->frag_coord) {
- ctx->frag_coord = create_input_compmask(ctx, 0, 0xf);
- /* defer add_sysval_input() until after all inputs created */
- }
-
- ir3_split_dest(block, &instr, ctx->frag_coord, comp, 1);
-
- switch (comp) {
- case 0: /* .x */
- case 1: /* .y */
- /* for frag_coord, we get unsigned values.. we need
- * to subtract (integer) 8 and divide by 16 (right-
- * shift by 4) then convert to float:
- *
- * sub.s tmp, src, 8
- * shr.b tmp, tmp, 4
- * mov.u32f32 dst, tmp
- *
- */
- instr = ir3_SUB_S(block, instr, 0,
- create_immed(block, 8), 0);
- instr = ir3_SHR_B(block, instr, 0,
- create_immed(block, 4), 0);
- instr = ir3_COV(block, instr, TYPE_U32, TYPE_F32);
-
- return instr;
- case 2: /* .z */
- case 3: /* .w */
- default:
- /* seems that we can use these as-is: */
- return instr;
- }
-}
-
static void
setup_input(struct ir3_context *ctx, nir_variable *in)
{
so->inputs[n].compmask = (1 << (ncomp + frac)) - 1;
so->inputs_count = MAX2(so->inputs_count, n + 1);
so->inputs[n].interpolate = in->data.interpolation;
+ so->inputs[n].ncomp = ncomp;
if (ctx->so->type == MESA_SHADER_FRAGMENT) {
+
+	/* if any varyings have 'sample' qualifier, that triggers us
+ * to run in per-sample mode:
+ */
+ so->per_samp |= in->data.sample;
+
for (int i = 0; i < ncomp; i++) {
struct ir3_instruction *instr = NULL;
unsigned idx = (n * 4) + i + frac;
if (slot == VARYING_SLOT_POS) {
- so->inputs[n].bary = false;
- so->frag_coord = true;
- instr = create_frag_coord(ctx, i);
+ ir3_context_error(ctx, "fragcoord should be a sysval!\n");
} else if (slot == VARYING_SLOT_PNTC) {
/* see for example st_nir_fixup_varying_slots().. this is
* maybe a bit mesa/st specific. But we need things to line
*/
so->inputs[n].slot = VARYING_SLOT_VAR8;
so->inputs[n].bary = true;
- instr = create_frag_input(ctx, false);
+ instr = create_frag_input(ctx, false, idx);
} else {
- bool use_ldlv = false;
-
/* detect the special case for front/back colors where
* we need to do flat vs smooth shading depending on
* rast state:
if (ctx->compiler->flat_bypass) {
if ((so->inputs[n].interpolate == INTERP_MODE_FLAT) ||
(so->inputs[n].rasterflat && ctx->so->key.rasterflat))
- use_ldlv = true;
+ so->inputs[n].use_ldlv = true;
}
so->inputs[n].bary = true;
- instr = create_frag_input(ctx, use_ldlv);
+ instr = create_frag_input(ctx, so->inputs[n].use_ldlv, idx);
}
compile_assert(ctx, idx < ctx->ir->ninputs);
}
}
+/* Initially we assign non-packed inlocs for varyings, as we don't really
+ * know up-front which components will be unused. After all the compilation
+ * stages we scan the shader to see which components are actually used, and
+ * re-pack the inlocs to eliminate unneeded varyings.
+ */
+static void
+pack_inlocs(struct ir3_context *ctx)
+{
+ struct ir3_shader_variant *so = ctx->so;
+ uint8_t used_components[so->inputs_count];
+
+ memset(used_components, 0, sizeof(used_components));
+
+ /*
+ * First Step: scan shader to find which bary.f/ldlv remain:
+ */
+
+ list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ if (is_input(instr)) {
+ unsigned inloc = instr->regs[1]->iim_val;
+ unsigned i = inloc / 4;
+ unsigned j = inloc % 4;
+
+ compile_assert(ctx, instr->regs[1]->flags & IR3_REG_IMMED);
+ compile_assert(ctx, i < so->inputs_count);
+
+ used_components[i] |= 1 << j;
+ }
+ }
+ }
+
+ /*
+ * Second Step: reassign varying inloc/slots:
+ */
+
+ unsigned actual_in = 0;
+ unsigned inloc = 0;
+
+ for (unsigned i = 0; i < so->inputs_count; i++) {
+ unsigned compmask = 0, maxcomp = 0;
+
+ so->inputs[i].ncomp = 0;
+ so->inputs[i].inloc = inloc;
+ so->inputs[i].bary = false;
+
+ for (unsigned j = 0; j < 4; j++) {
+ if (!(used_components[i] & (1 << j)))
+ continue;
+
+ compmask |= (1 << j);
+ actual_in++;
+ so->inputs[i].ncomp++;
+ maxcomp = j + 1;
+
+			/* at this point, since the used_components[i] mask only
+			 * considers varyings (ie. not sysvals), we know this
+			 * is a varying:
+ */
+ so->inputs[i].bary = true;
+ }
+
+ if (so->inputs[i].bary) {
+ so->varying_in++;
+ so->inputs[i].compmask = (1 << maxcomp) - 1;
+ inloc += maxcomp;
+ }
+ }
+
+ /*
+ * Third Step: reassign packed inloc's:
+ */
+
+ list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+ list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ if (is_input(instr)) {
+ unsigned inloc = instr->regs[1]->iim_val;
+ unsigned i = inloc / 4;
+ unsigned j = inloc % 4;
+
+ instr->regs[1]->iim_val = so->inputs[i].inloc + j;
+ }
+ }
+ }
+}
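
To see the repacking end-to-end, consider a toy model (hypothetical masks, not driver code) of the second step: each input keeps maxcomp slots and is assigned the next free inloc, so components that turned out unused after compilation stop consuming varying space:

    #include <stdio.h>

    int main(void)
    {
        /* used_components after step one: .x used, unused, .xyz used */
        unsigned char used[3] = { 0x1, 0x0, 0x7 };
        unsigned inloc = 0;

        for (int i = 0; i < 3; i++) {
            unsigned maxcomp = 0;
            for (int j = 0; j < 4; j++)
                if (used[i] & (1 << j))
                    maxcomp = j + 1;
            printf("input %d: inloc=%u slots=%u\n", i, inloc, maxcomp);
            inloc += maxcomp;
        }
        /* input 0: inloc=0 slots=1
         * input 1: inloc=1 slots=0
         * input 2: inloc=1 slots=3
         */
        return 0;
    }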
+
static void
setup_output(struct ir3_context *ctx, nir_variable *out)
{
case FRAG_RESULT_COLOR:
so->color0_mrt = 1;
break;
+ case FRAG_RESULT_SAMPLE_MASK:
+ so->writes_smask = true;
+ break;
default:
if (slot >= FRAG_RESULT_DATA0)
break;
*/
ninputs += max_sysvals[ctx->so->type];
- ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs);
+ ctx->ir = ir3_create(ctx->compiler, ctx->so->type, ninputs, noutputs);
/* Create inputs in first block: */
ctx->block = get_block(ctx, nir_start_block(fxn));
/* for fragment shader, the vcoord input register is used as the
* base for bary.f varying fetch instrs:
+ *
+ * TODO defer creating ctx->ij_pixel and corresponding sysvals
+ * until emit_intrinsic when we know they are actually needed.
+ * For now, we defer creating ctx->ij_centroid, etc, since we
+ * only need ij_pixel for "old style" varying inputs (ie.
+ * tgsi_to_nir)
*/
struct ir3_instruction *vcoord = NULL;
if (ctx->so->type == MESA_SHADER_FRAGMENT) {
vcoord = create_input_compmask(ctx, 0, 0x3);
ir3_split_dest(ctx->block, xy, vcoord, 0, 2);
- ctx->frag_vcoord = ir3_create_collect(ctx, xy, 2);
+ ctx->ij_pixel = ir3_create_collect(ctx, xy, 2);
}
/* Setup inputs: */
* because sysvals need to be appended after varyings:
*/
if (vcoord) {
- add_sysval_input_compmask(ctx, SYSTEM_VALUE_VARYING_COORD,
+ add_sysval_input_compmask(ctx, SYSTEM_VALUE_BARYCENTRIC_PIXEL,
0x3, vcoord);
}
- if (ctx->frag_coord) {
- add_sysval_input_compmask(ctx, SYSTEM_VALUE_FRAG_COORD,
- 0xf, ctx->frag_coord);
- }
-
/* Setup outputs: */
nir_foreach_variable(var, &ctx->s->outputs) {
setup_output(ctx, var);
ctx->so->num_samp += glsl_type_get_image_count(var->type);
}
- /* Setup registers (which should only be arrays): */
- nir_foreach_register(reg, &ctx->s->registers) {
- ir3_declare_array(ctx, reg);
- }
-
/* NOTE: need to do something more clever when we support >1 fxn */
nir_foreach_register(reg, &fxn->registers) {
ir3_declare_array(ctx, reg);
struct ir3_context *ctx;
struct ir3 *ir;
struct ir3_instruction **inputs;
- unsigned i, actual_in, inloc;
+ unsigned i;
int ret = 0, max_bary;
assert(!so->ir);
if (so->binning_pass && (ctx->compiler->gpu_id >= 600))
fixup_binning_pass(ctx);
+ /* for a6xx+, binning and draw pass VS use same VBO state, so we
+ * need to make sure not to remove any inputs that are used by
+ * the nonbinning VS.
+ */
+ if (ctx->compiler->gpu_id >= 600 && so->binning_pass) {
+ debug_assert(so->type == MESA_SHADER_VERTEX);
+ for (int i = 0; i < ir->ninputs; i++) {
+ struct ir3_instruction *in = ir->inputs[i];
+
+ if (!in)
+ continue;
+
+ unsigned n = i / 4;
+ unsigned c = i % 4;
+
+ debug_assert(n < so->nonbinning->inputs_count);
+
+ if (so->nonbinning->inputs[n].sysval)
+ continue;
+
+ /* be sure to keep inputs, even if only used in VS */
+ if (so->nonbinning->inputs[n].compmask & (1 << c))
+ array_insert(in->block, in->block->keeps, in);
+ }
+ }
+
/* Insert mov if there's the same instruction for each output.
* eg. dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.const_expression.vertex.sampler2dshadow
*/
ir3_print(ir);
}
- ret = ir3_ra(ir, so->type, so->frag_coord, so->frag_face);
+ ret = ir3_ra(so);
if (ret) {
DBG("RA failed!");
goto out;
ir3_print(ir);
}
+ if (so->type == MESA_SHADER_FRAGMENT)
+ pack_inlocs(ctx);
+
/* fixup input/outputs: */
for (i = 0; i < so->outputs_count; i++) {
/* sometimes we get outputs that don't write the .x coord, like:
struct ir3_instruction *instr = ir->outputs[(i*4) + j];
if (instr) {
so->outputs[i].regid = instr->regs[0]->num;
+ so->outputs[i].half = !!(instr->regs[0]->flags & IR3_REG_HALF);
break;
}
}
}
/* Note that some or all channels of an input may be unused: */
- actual_in = 0;
- inloc = 0;
for (i = 0; i < so->inputs_count; i++) {
- unsigned j, reg = regid(63,0), compmask = 0, maxcomp = 0;
- so->inputs[i].ncomp = 0;
- so->inputs[i].inloc = inloc;
+ unsigned j, reg = regid(63,0);
+ bool half = false;
for (j = 0; j < 4; j++) {
struct ir3_instruction *in = inputs[(i*4) + j];
- if (in && !(in->flags & IR3_INSTR_UNUSED)) {
- compmask |= (1 << j);
- reg = in->regs[0]->num - j;
- actual_in++;
- so->inputs[i].ncomp++;
- if ((so->type == MESA_SHADER_FRAGMENT) && so->inputs[i].bary) {
- /* assign inloc: */
- assert(in->regs[1]->flags & IR3_REG_IMMED);
- in->regs[1]->iim_val = inloc + j;
- maxcomp = j + 1;
- }
+
+ if (!in)
+ continue;
+
+ if (in->flags & IR3_INSTR_UNUSED)
+ continue;
+
+ reg = in->regs[0]->num - j;
+ if (half) {
+ compile_assert(ctx, in->regs[0]->flags & IR3_REG_HALF);
+ } else {
+ half = !!(in->regs[0]->flags & IR3_REG_HALF);
}
}
- if ((so->type == MESA_SHADER_FRAGMENT) && compmask && so->inputs[i].bary) {
- so->varying_in++;
- so->inputs[i].compmask = (1 << maxcomp) - 1;
- inloc += maxcomp;
- } else if (!so->inputs[i].sysval) {
- so->inputs[i].compmask = compmask;
- }
so->inputs[i].regid = reg;
+ so->inputs[i].half = half;
}
if (ctx->astc_srgb)
/* We need to do legalize after (for frag shader's) the "bary.f"
* offsets (inloc) have been assigned.
*/
- ir3_legalize(ir, &so->has_ssbo, &max_bary);
+ ir3_legalize(ir, &so->has_ssbo, &so->need_pixlod, &max_bary);
if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
printf("AFTER LEGALIZE:\n");
so->branchstack = ctx->max_stack;
-	/* Note that actual_in counts inputs that are not bary.f'd for FS: */
- if (so->type == MESA_SHADER_VERTEX)
- so->total_in = actual_in;
- else
+ if (so->type == MESA_SHADER_FRAGMENT)
so->total_in = max_bary + 1;
so->max_sun = ir->max_sun;