From abcfb6437062f469335d27d5ef60ecf20272dc26 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Wed, 15 Apr 2020 14:18:03 +0200 Subject: [PATCH] ir3: Fix LDC offset units I had missed that LDC actually uses vec4 units for its offset. This means that we have to create a new intrinsic (load_ubo_ir3) and lower load_ubo to it in ir3_nir_lower_io_offsets, similar to the existing SSBO intrinsics. Unfortunately, we can't assume that loads are always vec4-aligned, so we have to use the alignment information that NIR gives us. Unfortunately, that alignment information is currently woefully inadequate, and will have to be improved to give us good codegen in the future. Part-of: --- src/compiler/nir/nir_intrinsics.py | 6 ++ src/freedreno/ir3/disasm-a3xx.c | 2 + src/freedreno/ir3/ir3.c | 2 +- src/freedreno/ir3/ir3.h | 2 +- src/freedreno/ir3/ir3_compiler_nir.c | 14 ++-- src/freedreno/ir3/ir3_nir_lower_io_offsets.c | 84 ++++++++++++++++++++ src/freedreno/ir3/ir3_print.c | 2 + 7 files changed, 101 insertions(+), 11 deletions(-) diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index ac8d39a2b41..b2cb0371efc 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -802,6 +802,12 @@ intrinsic("ssbo_atomic_xor_ir3", src_comp=[1, 1, 1, 1], dest_comp=1) intrinsic("ssbo_atomic_exchange_ir3", src_comp=[1, 1, 1, 1], dest_comp=1) intrinsic("ssbo_atomic_comp_swap_ir3", src_comp=[1, 1, 1, 1, 1], dest_comp=1) +# IR3-specific instruction for UBO loads using the ldc instruction. The second +# source is the indirect offset, in units of four dwords. The base is a +# component offset, in dword units. +intrinsic("load_ubo_ir3", src_comp=[1, 1], bit_sizes=[32], dest_comp=0, indices=[BASE], + flags=[CAN_REORDER, CAN_ELIMINATE]) + # System values for freedreno geometry shaders. 
system_value("vs_primitive_stride_ir3", 1) system_value("vs_vertex_stride_ir3", 1) diff --git a/src/freedreno/ir3/disasm-a3xx.c b/src/freedreno/ir3/disasm-a3xx.c index 674b4d57475..23c3bc76ab2 100644 --- a/src/freedreno/ir3/disasm-a3xx.c +++ b/src/freedreno/ir3/disasm-a3xx.c @@ -940,6 +940,8 @@ static void print_instr_cat6_a6xx(struct disasm_ctx *ctx, instr_t *instr) fprintf(ctx->out, ".%s", cat6->typed ? "typed" : "untyped"); fprintf(ctx->out, ".%dd", cat6->d + 1); fprintf(ctx->out, ".%s", type[cat6->type]); + } else { + fprintf(ctx->out, ".offset%d", cat6->d); } fprintf(ctx->out, ".%u", cat6->type_size + 1); diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c index dcd5a5082a6..9678389e8b5 100644 --- a/src/freedreno/ir3/ir3.c +++ b/src/freedreno/ir3/ir3.c @@ -561,7 +561,7 @@ static int emit_cat6_a6xx(struct ir3_instruction *instr, void *ptr, } cat6->type = instr->cat6.type; - cat6->d = instr->cat6.d - 1; + cat6->d = instr->cat6.d - (instr->opc == OPC_LDC ? 0 : 1); cat6->typed = instr->cat6.typed; cat6->type_size = instr->cat6.iim_val - 1; cat6->opc = instr->opc; diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index 54d740e75db..351490aecf7 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -267,7 +267,7 @@ struct ir3_instruction { int src_offset; int dst_offset; int iim_val : 3; /* for ldgb/stgb, # of components */ - unsigned d : 3; + unsigned d : 3; /* for ldc, component offset */ bool typed : 1; unsigned base : 3; } cat6; diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index 8c676b2f3ce..d9152416e4d 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -748,8 +748,8 @@ emit_intrinsic_load_ubo_ldc(struct ir3_context *ctx, nir_intrinsic_instr *intr, struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0]; struct ir3_instruction *ldc = ir3_LDC(b, idx, 0, offset, 0); ldc->regs[0]->wrmask = MASK(ncomp); - ldc->cat6.iim_val = 
intr->num_components; - ldc->cat6.d = 1; + ldc->cat6.iim_val = ncomp; + ldc->cat6.d = nir_intrinsic_base(intr); ldc->cat6.type = TYPE_U32; nir_intrinsic_instr *bindless = ir3_bindless_resource(intr->src[0]); @@ -768,13 +768,6 @@ static void emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr, struct ir3_instruction **dst) { - if (ir3_bindless_resource(intr->src[0])) { - /* TODO: We should be using ldc for non-bindless things on a6xx as - * well. - */ - emit_intrinsic_load_ubo_ldc(ctx, intr, dst); - return; - } struct ir3_block *b = ctx->block; struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1; /* UBO addresses are the first driver params, but subtract 2 here to @@ -1612,6 +1605,9 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) case nir_intrinsic_load_ubo: emit_intrinsic_load_ubo(ctx, intr, dst); break; + case nir_intrinsic_load_ubo_ir3: + emit_intrinsic_load_ubo_ldc(ctx, intr, dst); + break; case nir_intrinsic_load_frag_coord: ir3_split_dest(b, dst, get_frag_coord(ctx), 0, 4); break; diff --git a/src/freedreno/ir3/ir3_nir_lower_io_offsets.c b/src/freedreno/ir3/ir3_nir_lower_io_offsets.c index ba40a9f4194..8e80c40eeb8 100644 --- a/src/freedreno/ir3/ir3_nir_lower_io_offsets.c +++ b/src/freedreno/ir3/ir3_nir_lower_io_offsets.c @@ -250,6 +250,84 @@ lower_offset_for_ssbo(nir_intrinsic_instr *intrinsic, nir_builder *b, return true; } +static bool +lower_offset_for_ubo(nir_intrinsic_instr *intrinsic, nir_builder *b) +{ + /* We only need to lower offset if using LDC. Currently, we only use LDC + * in the bindless mode. Also, LDC is introduced on A6xx, but currently we + * only use bindless in turnip which is A6xx only. + * + * TODO: We should be using LDC always on A6xx+. 
+ */ + if (!ir3_bindless_resource(intrinsic->src[0])) + return false; + + /* TODO handle other bitsizes, including non-dword-aligned loads */ + assert(intrinsic->dest.ssa.bit_size == 32); + + b->cursor = nir_before_instr(&intrinsic->instr); + + nir_intrinsic_instr *new_intrinsic = + nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo_ir3); + + debug_assert(intrinsic->dest.is_ssa); + new_intrinsic->src[0] = nir_src_for_ssa(intrinsic->src[0].ssa); + + nir_ssa_def *offset = intrinsic->src[1].ssa; + nir_ssa_def *new_offset = ir3_nir_try_propagate_bit_shift(b, offset, -4); + + if (!new_offset) + new_offset = nir_ushr(b, offset, nir_imm_int(b, 4)); + + new_intrinsic->src[1] = nir_src_for_ssa(new_offset); + + unsigned align_mul = nir_intrinsic_align_mul(intrinsic); + unsigned align_offset = nir_intrinsic_align_offset(intrinsic); + + unsigned components = intrinsic->num_components; + + if (align_mul % 16 != 0) + components = 4; + + new_intrinsic->num_components = components; + + nir_ssa_dest_init(&new_intrinsic->instr, &new_intrinsic->dest, + components, 32, NULL); + + nir_builder_instr_insert(b, &new_intrinsic->instr); + + nir_ssa_def *new_dest; + if (align_mul % 16 == 0) { + /* We know that the low 4 bits of the offset are constant and equal to + * align_offset. Use the component offset. + */ + unsigned component = align_offset / 4; + nir_intrinsic_set_base(new_intrinsic, component); + new_dest = &new_intrinsic->dest.ssa; + } else { + /* We have to assume it isn't aligned, and extract the components + * dynamically. 
+ */ + nir_intrinsic_set_base(new_intrinsic, 0); + nir_ssa_def *component = + nir_iand(b, nir_ushr(b, offset, nir_imm_int(b, 2)), nir_imm_int(b, 3)); + nir_ssa_def *channels[NIR_MAX_VEC_COMPONENTS]; + for (unsigned i = 0; i < intrinsic->num_components; i++) { + nir_ssa_def *idx = nir_iadd(b, nir_imm_int(b, i), component); + channels[i] = nir_vector_extract(b, &new_intrinsic->dest.ssa, idx); + } + + new_dest = nir_vec(b, channels, intrinsic->num_components); + } + + nir_ssa_def_rewrite_uses(&intrinsic->dest.ssa, + nir_src_for_ssa(new_dest)); + + nir_instr_remove(&intrinsic->instr); + + return true; +} + static bool lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx) { @@ -261,6 +339,12 @@ lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx) nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + /* UBO */ + if (intr->intrinsic == nir_intrinsic_load_ubo) { + progress |= lower_offset_for_ubo(intr, b); + continue; + } + /* SSBO */ int ir3_intrinsic; uint8_t offset_src_idx; diff --git a/src/freedreno/ir3/ir3_print.c b/src/freedreno/ir3/ir3_print.c index 753a9919ca0..1b6908edefb 100644 --- a/src/freedreno/ir3/ir3_print.c +++ b/src/freedreno/ir3/ir3_print.c @@ -127,6 +127,8 @@ static void print_instr_name(struct ir3_instruction *instr, bool flags) printf(".s"); if (instr->flags & IR3_INSTR_A1EN) printf(".a1en"); + if (instr->opc == OPC_LDC) + printf(".offset%d", instr->cat6.d); if (instr->flags & IR3_INSTR_B) { printf(".base%d", is_tex(instr) ? instr->cat5.tex_base : instr->cat6.base); -- 2.30.2