From 0e51082cfa733b3b8255bbd77fc4af46f4108c1d Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Thu, 9 Apr 2020 10:45:24 -0700 Subject: [PATCH] freedreno/ir3: Leave bools as 1-bit, storing them in full regs. If we use NIR's 1-bit bool representation, we get exactly the bool behavior the hardware provides: CMPS produces true or false, AND/OR/XOR work as intended without extra absnegs, and we can pass those half values directly to other CMPS. We emit an absneg for b2b1 ("turn a memory load into a 1-bit NIR boolean"), but we would have done so for the ir3_n2b() on the use of that value anyway. The most awkward bit is that inot(a@1) is now a sub(1, a), but we can encode the 1 as an immediate so it's fine. No significant changes to GL_TIME_ELAPSED on my set of traces (n=21). instructions in affected programs: 1570638 -> 1548702 (-1.40%) nops in affected programs: 624053 -> 611381 (-2.03%) non-nops in affected programs: 959061 -> 949797 (-0.97%) mov in affected programs: 5258 -> 5252 (-0.11%) cov in affected programs: 15099 -> 15902 (5.32%) dwords in affected programs: 469600 -> 452768 (-3.58%) last-baryf in affected programs: 162211 -> 154726 (-4.61%) full in affected programs: 4881 -> 4797 (-1.72%) sstall in affected programs: 173953 -> 174545 (0.34%) (ss) in affected programs: 10922 -> 10934 (0.11%) (sy) in affected programs: 728 -> 745 (2.34%) Part-of: --- src/freedreno/ir3/ir3_compiler_nir.c | 224 +++++++++++++-------------- src/freedreno/ir3/ir3_context.c | 6 +- 2 files changed, 111 insertions(+), 119 deletions(-) diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index 2027fb88c7d..5f55596bb63 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -107,48 +107,13 @@ create_driver_param(struct ir3_context *ctx, enum ir3_driver_param dp) } /* - * Adreno uses uint rather than having dedicated bool type, - * which (potentially) requires some conversion, in particular - * when using output of 
an bool instr to int input, or visa - * versa. - * - * | Adreno | NIR | - * -------+---------+-------+- - * true | 1 | ~0 | - * false | 0 | 0 | - * - * To convert from an adreno bool (uint) to nir, use: - * - * absneg.s dst, (neg)src - * - * To convert back in the other direction: - * - * absneg.s dst, (abs)arc - * - * The CP step can clean up the absneg.s that cancel each other - * out, and with a slight bit of extra cleverness (to recognize - * the instructions which produce either a 0 or 1) can eliminate - * the absneg.s's completely when an instruction that wants - * 0/1 consumes the result. For example, when a nir 'bcsel' - * consumes the result of 'feq'. So we should be able to get by - * without a boolean resolve step, and without incuring any - * extra penalty in instruction count. + * Adreno's comparisons produce a 1 for true and 0 for false, in either 16 or + * 32-bit registers. We use NIR's 1-bit integers to represent bools, and + * trust that we will only see and/or/xor on those 1-bit values, so we can + * safely store NIR i1s in a 32-bit reg while always containing either a 1 or + * 0. 
*/ -/* NIR bool -> native (adreno): */ -static struct ir3_instruction * -ir3_b2n(struct ir3_block *block, struct ir3_instruction *instr) -{ - return ir3_ABSNEG_S(block, instr, IR3_REG_SABS); -} - -/* native (adreno) -> NIR bool: */ -static struct ir3_instruction * -ir3_n2b(struct ir3_block *block, struct ir3_instruction *instr) -{ - return ir3_ABSNEG_S(block, instr, IR3_REG_SNEG); -} - /* * alu/sfu instructions: */ @@ -222,6 +187,14 @@ create_cov(struct ir3_context *ctx, struct ir3_instruction *src, } break; + case nir_op_b2f16: + case nir_op_b2f32: + case nir_op_b2i8: + case nir_op_b2i16: + case nir_op_b2i32: + src_type = TYPE_U32; + break; + default: ir3_context_error(ctx, "invalid conversion op: %u", op); } @@ -230,6 +203,7 @@ create_cov(struct ir3_context *ctx, struct ir3_instruction *src, case nir_op_f2f32: case nir_op_i2f32: case nir_op_u2f32: + case nir_op_b2f32: dst_type = TYPE_F32; break; @@ -238,21 +212,25 @@ create_cov(struct ir3_context *ctx, struct ir3_instruction *src, case nir_op_f2f16: case nir_op_i2f16: case nir_op_u2f16: + case nir_op_b2f16: dst_type = TYPE_F16; break; case nir_op_f2i32: case nir_op_i2i32: + case nir_op_b2i32: dst_type = TYPE_S32; break; case nir_op_f2i16: case nir_op_i2i16: + case nir_op_b2i16: dst_type = TYPE_S16; break; case nir_op_f2i8: case nir_op_i2i8: + case nir_op_b2i8: dst_type = TYPE_S8; break; @@ -275,6 +253,9 @@ create_cov(struct ir3_context *ctx, struct ir3_instruction *src, ir3_context_error(ctx, "invalid conversion op: %u", op); } + if (src_type == dst_type) + return src; + struct ir3_instruction *cov = ir3_COV(ctx->block, src, src_type, dst_type); @@ -292,7 +273,7 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) unsigned bs[info->num_inputs]; /* bit size */ struct ir3_block *b = ctx->block; unsigned dst_sz, wrmask; - type_t dst_type = nir_dest_bit_size(alu->dest.dest) < 32 ? + type_t dst_type = nir_dest_bit_size(alu->dest.dest) == 16 ? 
TYPE_U16 : TYPE_U32; if (alu->dest.dest.is_ssa) { @@ -383,43 +364,52 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) case nir_op_u2u32: case nir_op_u2u16: case nir_op_u2u8: + case nir_op_b2f16: + case nir_op_b2f32: + case nir_op_b2i8: + case nir_op_b2i16: + case nir_op_b2i32: dst[0] = create_cov(ctx, src[0], bs[0], alu->op); break; + case nir_op_fquantize2f16: dst[0] = create_cov(ctx, create_cov(ctx, src[0], 32, nir_op_f2f16), 16, nir_op_f2f32); break; - case nir_op_f2b16: { - struct ir3_instruction *zero = create_immed_typed(b, 0, TYPE_F16); - dst[0] = ir3_CMPS_F(b, src[0], 0, zero, 0); + case nir_op_f2b1: + dst[0] = ir3_CMPS_F(b, + src[0], 0, + create_immed_typed(b, 0, bs[0] == 16 ? TYPE_F16 : TYPE_F32), 0); dst[0]->cat2.condition = IR3_COND_NE; break; - } - case nir_op_f2b32: - dst[0] = ir3_CMPS_F(b, src[0], 0, create_immed(b, fui(0.0)), 0); + + case nir_op_i2b1: + /* i2b1 will appear when translating from nir_load_ubo or + * nir_intrinsic_load_ssbo, where any non-zero value is true. + */ + dst[0] = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0); dst[0]->cat2.condition = IR3_COND_NE; break; - case nir_op_b2f16: - dst[0] = ir3_COV(b, ir3_b2n(b, src[0]), TYPE_U32, TYPE_F16); - break; - case nir_op_b2f32: - dst[0] = ir3_COV(b, ir3_b2n(b, src[0]), TYPE_U32, TYPE_F32); - break; - case nir_op_b2i8: - case nir_op_b2i16: - case nir_op_b2i32: - dst[0] = ir3_b2n(b, src[0]); - break; - case nir_op_i2b16: { - struct ir3_instruction *zero = create_immed_typed(b, 0, TYPE_S16); - dst[0] = ir3_CMPS_S(b, src[0], 0, zero, 0); - dst[0]->cat2.condition = IR3_COND_NE; + + case nir_op_b2b1: + /* b2b1 will appear when translating from + * + * - nir_intrinsic_load_shared of a 32-bit 0/~0 value. + * - nir_intrinsic_load_constant of a 32-bit 0/~0 value + * + * A negate can turn those into a 1 or 0 for us. 
+ */ + dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG); break; - } - case nir_op_i2b32: - dst[0] = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0); - dst[0]->cat2.condition = IR3_COND_NE; + + case nir_op_b2b32: + /* b2b32 will appear when converting our 1-bit bools to a store_shared + * argument. + * + * A negate can turn those into a ~0 for us. + */ + dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG); break; case nir_op_fneg: @@ -486,23 +476,19 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) dst[0] = ir3_DSYPP_1(b, src[0], 0); dst[0]->cat5.type = TYPE_F32; break; - case nir_op_flt16: - case nir_op_flt32: + case nir_op_flt: dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0); dst[0]->cat2.condition = IR3_COND_LT; break; - case nir_op_fge16: - case nir_op_fge32: + case nir_op_fge: dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0); dst[0]->cat2.condition = IR3_COND_GE; break; - case nir_op_feq16: - case nir_op_feq32: + case nir_op_feq: dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0); dst[0]->cat2.condition = IR3_COND_EQ; break; - case nir_op_fne16: - case nir_op_fne32: + case nir_op_fne: dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0); dst[0]->cat2.condition = IR3_COND_NE; break; @@ -581,7 +567,11 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG); break; case nir_op_inot: - dst[0] = ir3_NOT_B(b, src[0], 0); + if (bs[0] == 1) { + dst[0] = ir3_SUB_U(b, create_immed(ctx->block, 1), 0, src[0], 0); + } else { + dst[0] = ir3_NOT_B(b, src[0], 0); + } break; case nir_op_ior: dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0); @@ -601,39 +591,32 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) case nir_op_ushr: dst[0] = ir3_SHR_B(b, src[0], 0, src[1], 0); break; - case nir_op_ilt16: - case nir_op_ilt32: + case nir_op_ilt: dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0); dst[0]->cat2.condition = IR3_COND_LT; break; - case nir_op_ige16: - case nir_op_ige32: + case nir_op_ige: dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0); 
dst[0]->cat2.condition = IR3_COND_GE; break; - case nir_op_ieq16: - case nir_op_ieq32: + case nir_op_ieq: dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0); dst[0]->cat2.condition = IR3_COND_EQ; break; - case nir_op_ine16: - case nir_op_ine32: + case nir_op_ine: dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0); dst[0]->cat2.condition = IR3_COND_NE; break; - case nir_op_ult16: - case nir_op_ult32: + case nir_op_ult: dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0); dst[0]->cat2.condition = IR3_COND_LT; break; - case nir_op_uge16: - case nir_op_uge32: + case nir_op_uge: dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0); dst[0]->cat2.condition = IR3_COND_GE; break; - case nir_op_b16csel: - case nir_op_b32csel: { + case nir_op_bcsel: { struct ir3_instruction *cond = src[0]; /* If src[0] is a negation (likely as a result of an ir3_b2n(cond)), @@ -647,24 +630,21 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) } compile_assert(ctx, bs[1] == bs[2]); - if (bs[1] != bs[0]) { + /* The condition's size has to match the other two arguments' size, so + * convert down if necessary. + */ + if (bs[1] == 16) { struct hash_entry *prev_entry = _mesa_hash_table_search(ctx->sel_cond_conversions, src[0]); if (prev_entry) { cond = prev_entry->data; } else { - /* Make sure the boolean condition has the same bit size as the other - * two arguments, adding a conversion if necessary. 
- */ - if (bs[1] < bs[0]) - cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16); - else if (bs[1] > bs[0]) - cond = ir3_COV(b, cond, TYPE_U16, TYPE_U32); + cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16); _mesa_hash_table_insert(ctx->sel_cond_conversions, src[0], cond); } } - if (bs[1] > 16) + if (bs[1] != 16) dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0); else dst[0] = ir3_SEL_B16(b, src[1], 0, cond, 0, src[2], 0); @@ -725,12 +705,23 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) } if (nir_alu_type_get_base_type(info->output_type) == nir_type_bool) { + assert(nir_dest_bit_size(alu->dest.dest) == 1 || + alu->op == nir_op_b2b32); assert(dst_sz == 1); - - if (nir_dest_bit_size(alu->dest.dest) < 32) - dst[0]->regs[0]->flags |= IR3_REG_HALF; - - dst[0] = ir3_n2b(b, dst[0]); + } else { + /* 1-bit values stored in 32-bit registers are only valid for certain + * ALU ops. + */ + switch (alu->op) { + case nir_op_iand: + case nir_op_ior: + case nir_op_ixor: + case nir_op_inot: + case nir_op_bcsel: + break; + default: + compile_assert(ctx, nir_dest_bit_size(alu->dest.dest) != 1); + } } ir3_put_dst(ctx, &alu->dest.dest); @@ -1218,7 +1209,7 @@ emit_intrinsic_image_size(struct ir3_context *ctx, nir_intrinsic_instr *intr, struct tex_src_info info = get_image_samp_tex_src(ctx, intr); struct ir3_instruction *sam, *lod; unsigned flags, ncoords = ir3_get_image_coords(intr, &flags); - type_t dst_type = nir_dest_bit_size(intr->dest) < 32 ? + type_t dst_type = nir_dest_bit_size(intr->dest) == 16 ? TYPE_U16 : TYPE_U32; info.flags |= flags; @@ -1477,7 +1468,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) idx += nir_src_as_uint(intr->src[0]); for (int i = 0; i < intr->num_components; i++) { dst[i] = create_uniform_typed(b, idx + i, - nir_dest_bit_size(intr->dest) < 32 ? TYPE_F16 : TYPE_F32); + nir_dest_bit_size(intr->dest) == 16 ? 
TYPE_F16 : TYPE_F32); } } else { src = ir3_get_src(ctx, &intr->src[0]); @@ -1868,8 +1859,10 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) /* for fragface, we get -1 for back and 0 for front. However this is * the inverse of what nir expects (where ~0 is true). */ - dst[0] = ir3_COV(b, ctx->frag_face, TYPE_S16, TYPE_S32); - dst[0] = ir3_NOT_B(b, dst[0], 0); + dst[0] = ir3_CMPS_S(b, + ctx->frag_face, 0, + create_immed_typed(b, 0, TYPE_U16), 0); + dst[0]->cat2.condition = IR3_COND_EQ; break; case nir_intrinsic_load_local_invocation_id: if (!ctx->local_invocation_id) { @@ -1903,7 +1896,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) if (intr->intrinsic == nir_intrinsic_discard_if) { /* conditional discard: */ src = ir3_get_src(ctx, &intr->src[0]); - cond = ir3_b2n(b, src[0]); + cond = src[0]; } else { /* unconditional discard: */ cond = create_immed(b, 1); @@ -1931,7 +1924,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) struct ir3_instruction *cond, *kill; src = ir3_get_src(ctx, &intr->src[0]); - cond = ir3_b2n(b, src[0]); + cond = src[0]; /* NOTE: only cmps.*.* can write p0.x: */ cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0); @@ -1975,7 +1968,7 @@ emit_load_const(struct ir3_context *ctx, nir_load_const_instr *instr) struct ir3_instruction **dst = ir3_get_dst_ssa(ctx, &instr->def, instr->def.num_components); - if (instr->def.bit_size < 32) { + if (instr->def.bit_size == 16) { for (int i = 0; i < instr->def.num_components; i++) dst[i] = create_immed_typed(ctx->block, instr->value[i].u16, @@ -1994,7 +1987,7 @@ emit_undef(struct ir3_context *ctx, nir_ssa_undef_instr *undef) { struct ir3_instruction **dst = ir3_get_dst_ssa(ctx, &undef->def, undef->def.num_components); - type_t type = (undef->def.bit_size < 32) ? TYPE_U16 : TYPE_U32; + type_t type = (undef->def.bit_size == 16) ? TYPE_U16 : TYPE_U32; /* backend doesn't want undefined instructions, so just plug * in 0.0.. 
@@ -2015,14 +2008,14 @@ get_tex_dest_type(nir_tex_instr *tex) switch (nir_alu_type_get_base_type(tex->dest_type)) { case nir_type_invalid: case nir_type_float: - type = nir_dest_bit_size(tex->dest) < 32 ? TYPE_F16 : TYPE_F32; + type = nir_dest_bit_size(tex->dest) == 16 ? TYPE_F16 : TYPE_F32; break; case nir_type_int: - type = nir_dest_bit_size(tex->dest) < 32 ? TYPE_S16 : TYPE_S32; + type = nir_dest_bit_size(tex->dest) == 16 ? TYPE_S16 : TYPE_S32; break; case nir_type_uint: case nir_type_bool: - type = nir_dest_bit_size(tex->dest) < 32 ? TYPE_U16 : TYPE_U32; + type = nir_dest_bit_size(tex->dest) == 16 ? TYPE_U16 : TYPE_U32; break; default: unreachable("bad dest_type"); @@ -2701,8 +2694,7 @@ emit_if(struct ir3_context *ctx, nir_if *nif) { struct ir3_instruction *condition = ir3_get_src(ctx, &nif->condition)[0]; - ctx->block->condition = - ir3_get_predicate(ctx, ir3_b2n(condition->block, condition)); + ctx->block->condition = ir3_get_predicate(ctx, condition); emit_cf_list(ctx, &nif->then_list); emit_cf_list(ctx, &nif->else_list); diff --git a/src/freedreno/ir3/ir3_context.c b/src/freedreno/ir3/ir3_context.c index f4511ec77da..38f870321a7 100644 --- a/src/freedreno/ir3/ir3_context.c +++ b/src/freedreno/ir3/ir3_context.c @@ -80,7 +80,6 @@ ir3_context_init(struct ir3_compiler *compiler, /* this needs to be the last pass run, so do this here instead of * in ir3_optimize_nir(): */ - NIR_PASS_V(ctx->s, nir_lower_bool_to_bitsize); bool progress = false; NIR_PASS(progress, ctx->s, nir_lower_locals_to_regs); @@ -216,7 +215,8 @@ ir3_put_dst(struct ir3_context *ctx, nir_dest *dst) } } - if (bit_size < 32) { + /* Note: 1-bit bools are stored in 32-bit regs */ + if (bit_size == 16) { for (unsigned i = 0; i < ctx->last_dst_n; i++) { struct ir3_instruction *dst = ctx->last_dst[i]; dst->regs[0]->flags |= IR3_REG_HALF; @@ -556,7 +556,7 @@ ir3_create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n, unsigned flags = 0; mov = ir3_instr_create(block, OPC_MOV); - if 
(bitsize < 32) { + if (bitsize == 16) { mov->cat1.src_type = TYPE_U16; mov->cat1.dst_type = TYPE_U16; flags |= IR3_REG_HALF; -- 2.30.2