dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0);
break;
}
- case nir_op_bit_count:
- dst[0] = ir3_CBITS_B(b, src[0], 0);
+ case nir_op_bit_count: {
+		// TODO, we need to do this 16b at a time on a5xx+a6xx.. need to
+		// double check on earlier gens. Once half-precision support is
+		// in place, this should probably move to a NIR lowering pass:
+ struct ir3_instruction *hi, *lo;
+
+ hi = ir3_COV(b, ir3_SHR_B(b, src[0], 0, create_immed(b, 16), 0),
+ TYPE_U32, TYPE_U16);
+ lo = ir3_COV(b, src[0], TYPE_U32, TYPE_U16);
+
+ hi = ir3_CBITS_B(b, hi, 0);
+ lo = ir3_CBITS_B(b, lo, 0);
+
+ // TODO maybe the builders should default to making dst half-precision
+ // if the src's were half precision, to make this less awkward.. otoh
+ // we should probably just do this lowering in NIR.
+ hi->regs[0]->flags |= IR3_REG_HALF;
+ lo->regs[0]->flags |= IR3_REG_HALF;
+
+ dst[0] = ir3_ADD_S(b, hi, 0, lo, 0);
+ dst[0]->regs[0]->flags |= IR3_REG_HALF;
+ dst[0] = ir3_COV(b, dst[0], TYPE_U16, TYPE_U32);
break;
+ }
case nir_op_ifind_msb: {
struct ir3_instruction *cmp;
dst[0] = ir3_CLZ_S(b, src[0], 0);
struct ir3_block *b = ctx->block;
struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
nir_const_value *const_offset;
- /* UBO addresses are the first driver params: */
- unsigned ubo = regid(ctx->so->constbase.ubo, 0);
+ /* UBO addresses are the first driver params, but subtract 2 here to
+ * account for nir_lower_uniforms_to_ubo rebasing the UBOs such that UBO 0
+ * is the uniforms: */
+ unsigned ubo = regid(ctx->so->constbase.ubo, 0) - 2;
const unsigned ptrsz = ir3_pointer_size(ctx);
int off = 0;
base_lo = create_uniform(b, ubo + (src0->regs[1]->iim_val * ptrsz));
base_hi = create_uniform(b, ubo + (src0->regs[1]->iim_val * ptrsz) + 1);
} else {
- base_lo = create_uniform_indirect(b, ubo, ir3_get_addr(ctx, src0, 4));
- base_hi = create_uniform_indirect(b, ubo + 1, ir3_get_addr(ctx, src0, 4));
+ base_lo = create_uniform_indirect(b, ubo, ir3_get_addr(ctx, src0, ptrsz));
+ base_hi = create_uniform_indirect(b, ubo + 1, ir3_get_addr(ctx, src0, ptrsz));
}
/* note: on 32bit gpu's base_hi is ignored and DCE'd */
if (const_offset) {
idx += const_offset->u32[0];
for (int i = 0; i < intr->num_components; i++) {
- unsigned n = idx * 4 + i;
- dst[i] = create_uniform(b, n);
+ dst[i] = create_uniform(b, idx + i);
}
} else {
src = ir3_get_src(ctx, &intr->src[0]);
for (int i = 0; i < intr->num_components; i++) {
- int n = idx * 4 + i;
- dst[i] = create_uniform_indirect(b, n,
- ir3_get_addr(ctx, src[0], 4));
+ dst[i] = create_uniform_indirect(b, idx + i,
+ ir3_get_addr(ctx, src[0], 1));
}
/* NOTE: if relative addressing is used, we set
* constlen in the compiler (to worst-case value)
}
}
break;
- /* All SSBO intrinsics should have been lowered by 'lower_io_offsets'
- * pass and replaced by an ir3-specifc version that adds the
- * dword-offset in the last source.
- */
+	/* All SSBO intrinsics should have been lowered by 'lower_io_offsets'
+	 * pass and replaced by an ir3-specific version that adds the
+	 * dword-offset in the last source.
+	 */
case nir_intrinsic_load_ssbo_ir3:
ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
break;
case nir_intrinsic_store_ssbo_ir3:
+ if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
+ !ctx->s->info.fs.early_fragment_tests)
+ ctx->so->no_earlyz = true;
ctx->funcs->emit_intrinsic_store_ssbo(ctx, intr);
break;
case nir_intrinsic_get_buffer_size:
case nir_intrinsic_ssbo_atomic_xor_ir3:
case nir_intrinsic_ssbo_atomic_exchange_ir3:
case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
+ if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
+ !ctx->s->info.fs.early_fragment_tests)
+ ctx->so->no_earlyz = true;
dst[0] = ctx->funcs->emit_intrinsic_atomic_ssbo(ctx, intr);
break;
case nir_intrinsic_load_shared:
emit_intrinsic_load_image(ctx, intr, dst);
break;
case nir_intrinsic_image_deref_store:
+ if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
+ !ctx->s->info.fs.early_fragment_tests)
+ ctx->so->no_earlyz = true;
ctx->funcs->emit_intrinsic_store_image(ctx, intr);
break;
case nir_intrinsic_image_deref_size:
case nir_intrinsic_image_deref_atomic_xor:
case nir_intrinsic_image_deref_atomic_exchange:
case nir_intrinsic_image_deref_atomic_comp_swap:
+ if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
+ !ctx->s->info.fs.early_fragment_tests)
+ ctx->so->no_earlyz = true;
dst[0] = ctx->funcs->emit_intrinsic_atomic_image(ctx, intr);
break;
case nir_intrinsic_barrier:
array_insert(ctx->ir, ctx->ir->predicates, kill);
array_insert(b, b->keeps, kill);
- ctx->so->has_kill = true;
+ ctx->so->no_earlyz = true;
break;
}
*/
ninputs += max_sysvals[ctx->so->type];
- ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs);
+ ctx->ir = ir3_create(ctx->compiler, ctx->so->type, ninputs, noutputs);
/* Create inputs in first block: */
ctx->block = get_block(ctx, nir_start_block(fxn));
/* We need to do legalize after (for frag shader's) the "bary.f"
* offsets (inloc) have been assigned.
*/
- ir3_legalize(ir, &so->has_ssbo, &max_bary);
+ ir3_legalize(ir, &so->has_ssbo, &so->need_pixlod, &max_bary);
if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
printf("AFTER LEGALIZE:\n");