From: Alyssa Rosenzweig Date: Wed, 14 Aug 2019 19:28:01 +0000 (-0700) Subject: pan/bifrost: Style format the disassembler X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=d8d8b08fe5847755ed0f5ec4723809414dcee290;p=mesa.git pan/bifrost: Style format the disassembler $ astyle *.c *.h --style=linux -s8 Signed-off-by: Alyssa Rosenzweig --- diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c new file mode 100644 index 00000000000..061eab11a9d --- /dev/null +++ b/src/panfrost/bifrost/bifrost_compile.c @@ -0,0 +1,1051 @@ +/* + * Copyright (C) 2019 Ryan Houdek + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "compiler/nir/nir_builder.h"
+#include "bifrost_compile.h"
+#include "bifrost_opts.h"
+#include "bifrost_sched.h"
+#include "compiler_defines.h"
+#include "disassemble.h"
+#include "bifrost_print.h"
+
+#define BI_DEBUG
+
+/* Type-size callback handed to nir_lower_io. Returns the attribute slot
+ * count of `type`. The `bindless` argument is ignored; the slot query is
+ * always made with `false`. */
+static int
+glsl_type_size(const struct glsl_type *type, bool bindless)
+{
+        return glsl_count_attribute_slots(type, false);
+}
+
+/* Runs the NIR lowering/optimisation pipeline on `nir` in place: I/O
+ * lowering and regs-to-SSA once, then a fixed-point loop that repeats
+ * until no pass reports progress, then a final copy-prop + DCE. */
+static void
+optimize_nir(nir_shader *nir)
+{
+        bool progress;
+
+        NIR_PASS_V(nir, nir_lower_io, nir_var_all, glsl_type_size, 0);
+        NIR_PASS(progress, nir, nir_lower_regs_to_ssa);
+
+        do {
+                progress = false;
+
+                NIR_PASS(progress, nir, nir_lower_io, nir_var_all, glsl_type_size, 0);
+
+                NIR_PASS(progress, nir, nir_lower_var_copies);
+                NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
+
+                NIR_PASS(progress, nir, nir_copy_prop);
+                NIR_PASS(progress, nir, nir_opt_constant_folding);
+
+                NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
+                NIR_PASS(progress, nir, nir_lower_alu_to_scalar, NULL);
+                NIR_PASS(progress, nir, nir_opt_if, true);
+
+        } while (progress);
+
+        NIR_PASS(progress, nir, nir_copy_prop);
+        NIR_PASS(progress, nir, nir_opt_dce);
+}
+
+/* Maps a NIR source to a single index space: SSA values keep their own
+ * index, registers are placed after all SSA values (ssa_alloc + reg index). */
+static unsigned
+nir_src_index(compiler_context *ctx, nir_src *src)
+{
+        if (src->is_ssa)
+                return src->ssa->index;
+        else
+                return ctx->func->impl->ssa_alloc + src->reg.reg->index;
+}
+
+/* Destination counterpart of nir_src_index; same index space. */
+static unsigned
+nir_dest_index(compiler_context *ctx, nir_dest *dst)
+{
+        if (dst->is_ssa)
+                return dst->ssa.index;
+        else
+                return ctx->func->impl->ssa_alloc + dst->reg.reg->index;
+}
+
+/* Index of the value behind an ALU source; the swizzle is not considered. */
+static unsigned
+nir_alu_src_index(compiler_context *ctx, nir_alu_src *src)
+{
+        return nir_src_index(ctx, &src->src);
+}
+
+/* Copies `instr` into a freshly malloc'd instruction and returns it.
+ * The allocation result is not checked; the caller owns the returned
+ * pointer (nothing in this file frees it). */
+struct bifrost_instruction *
+mir_alloc_ins(struct bifrost_instruction instr)
+{
+        struct bifrost_instruction *heap_ins = malloc(sizeof(instr));
+        memcpy(heap_ins, &instr, sizeof(instr));
+        return heap_ins;
+}
+
+/* Appends a heap copy of `instr` to the block currently being emitted. */
+static void
+emit_mir_instruction(struct compiler_context *ctx, struct bifrost_instruction instr)
+{
+        list_addtail(&(mir_alloc_ins(instr))->link, &ctx->current_block->instructions);
+}
+
+/* Records `successor` in the next free successor slot of `block`;
+ * asserts that a slot is still available. */
+static void
+bifrost_block_add_successor(bifrost_block *block, bifrost_block *successor)
+{
+        assert(block->num_successors < ARRAY_SIZE(block->successors));
+        block->successors[block->num_successors++] = successor;
+}
+
+/* Stores the f32 view of a load_const in ctx->ssa_constants, keyed by
+ * SSA index + 1 (the +1 keeps key 0 unused). */
+static void
+emit_load_const(struct compiler_context *ctx, nir_load_const_instr *instr)
+{
+        nir_ssa_def def = instr->def;
+
+        // nir_const_load_to_arr copies def.num_components values, so the
+        // buffer has to hold a full vector; a single float overflowed for
+        // any vecN constant. Unused lanes are zero-filled.
+        float *v = rzalloc_array(NULL, float, NIR_MAX_VEC_COMPONENTS);
+        nir_const_load_to_arr(v, instr, f32);
+        _mesa_hash_table_u64_insert(ctx->ssa_constants, def.index + 1, v);
+}
+
+/* Hands out the next MIR temporary, encoded with SSA_TEMP_VALUE. */
+static uint32_t
+alloc_mir_temp(struct compiler_context *ctx)
+{
+        return SSA_TEMP_VALUE(ctx->mir_temp++);
+}
+
+/* Emits op_ld_var_addr for varying `location` and returns the temporary
+ * that receives its three-component result. */
+static uint32_t
+emit_ld_vary_addr_constant(struct compiler_context *ctx, uint32_t location)
+{
+        // LD_VAR_ADDR.f32 {R0, T1}, R61, R62, location:1, R12
+        // ...
+        // ST_VAR.v4 T1, R12, R13, R14, R4
+
+        // R61-R62 is filled with information needed for varying interpolation
+        // This loads a vec3 with the information that ST_VAR needs to work
+
+        uint32_t mir_temp_location = alloc_mir_temp(ctx);
+        // This instruction loads a vec3 starting from the initial register
+        struct bifrost_instruction instr = {
+                .op = op_ld_var_addr,
+                .dest_components = 3,
+                .ssa_args = {
+                        .dest = mir_temp_location,
+                        .src0 = SSA_FIXED_REGISTER(61),
+                        .src1 = SSA_FIXED_REGISTER(62),
+                        .src2 = SSA_INVALID_VALUE,
+                        .src3 = SSA_INVALID_VALUE,
+                },
+                .literal_args[0] = location,
+        };
+        emit_mir_instruction(ctx, instr);
+
+        return mir_temp_location;
+}
+
+// XXX: Doesn't support duplicated values in the components!
+// RA WILL fail!
+static void
+emit_create_vector(struct compiler_context *ctx, unsigned dest, unsigned num_comps, uint32_t *comps)
+{
+        assert(num_comps <= 4 && "Can't make a vector larger than 4 components");
+
+        // Gathers the first num_comps entries of comps into the vector `dest`;
+        // the remaining source slots are marked invalid below
+        struct bifrost_instruction instr = {
+                .op = op_create_vector,
+                .dest_components = num_comps,
+                .ssa_args = {
+                        .dest = dest,
+                }
+        };
+
+        uint32_t *srcs[4] = {
+                &instr.ssa_args.src0,
+                &instr.ssa_args.src1,
+                &instr.ssa_args.src2,
+                &instr.ssa_args.src3,
+        };
+
+        for (unsigned i = 0; i < 4; ++i) {
+                if (i < num_comps)
+                        *srcs[i] = comps[i];
+                else
+                        *srcs[i] = SSA_INVALID_VALUE;
+        }
+        emit_mir_instruction(ctx, instr);
+}
+
+/* Emits op_extract_element pulling component `element` out of `ssa_vector`
+ * and returns the scalar temporary that receives it. */
+static uint32_t
+emit_extract_vector_element(struct compiler_context *ctx, unsigned ssa_vector, unsigned element)
+{
+        uint32_t mir_temp_location = alloc_mir_temp(ctx);
+        // The component to extract travels in literal_args[0]
+        struct bifrost_instruction instr = {
+                .op = op_extract_element,
+                .dest_components = 1,
+                .ssa_args = {
+                        .dest = mir_temp_location,
+                        .src0 = ssa_vector,
+                        .src1 = SSA_INVALID_VALUE,
+                        .src2 = SSA_INVALID_VALUE,
+                        .src3 = SSA_INVALID_VALUE,
+                },
+                .literal_args[0] = element,
+        };
+        emit_mir_instruction(ctx, instr);
+
+        return mir_temp_location;
+}
+
+/* Emits op_movi materialising the 32-bit immediate `literal` and returns
+ * the scalar temporary that receives it. */
+static uint32_t
+emit_movi(struct compiler_context *ctx, uint32_t literal)
+{
+        uint32_t mir_temp_location = alloc_mir_temp(ctx);
+        // No register sources; the immediate travels in literal_args[0]
+        struct bifrost_instruction instr = {
+                .op = op_movi,
+                .dest_components = 1,
+                .ssa_args = {
+                        .dest = mir_temp_location,
+                        .src0 = SSA_INVALID_VALUE,
+                        .src1 = SSA_INVALID_VALUE,
+                        .src2 = SSA_INVALID_VALUE,
+                        .src3 = SSA_INVALID_VALUE,
+                },
+                .literal_args[0] = literal,
+        };
+        emit_mir_instruction(ctx, instr);
+
+        return mir_temp_location;
+}
+
+/* Returns a scalar temporary holding the component of ALU source `src`
+ * selected by the swizzle of the first channel the instruction uses.
+ * Emits an extract instruction as a side effect. */
+static unsigned
+nir_alu_src_index_scalar(compiler_context *ctx, nir_alu_instr *nir_instr, unsigned src)
+{
+        // NIR uses a combination of single channels plus swizzles to determine which component is pulled out of a source
+        for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; c++) {
+                if (!nir_alu_instr_channel_used(nir_instr, src, c))
+                        continue;
+                // Pull the swizzle from this element that is active and use it as the source
+                unsigned element = nir_instr->src[src].swizzle[c];
+
+                // Create an op that extracts an element from a vector
+                return emit_extract_vector_element(ctx, nir_alu_src_index(ctx, &nir_instr->src[src]), element);
+        }
+        assert(0);
+        return 0;
+}
+
+/* Translates one NIR intrinsic into MIR. Handles load_ubo, store_ssbo,
+ * load_uniform, load_input and store_output; anything else is reported
+ * on stdout and dropped. Indirect (non-constant) offsets are asserted
+ * against. An unsupported component count asserts and emits nothing. */
+static void
+emit_intrinsic(struct compiler_context *ctx, nir_intrinsic_instr *nir_instr)
+{
+        nir_const_value *const_offset;
+        unsigned offset, reg;
+
+        switch (nir_instr->intrinsic) {
+        case nir_intrinsic_load_ubo: {
+                nir_const_value *location = nir_src_as_const_value(nir_instr->src[0]);
+                const_offset = nir_src_as_const_value(nir_instr->src[1]);
+                assert (location && "no indirect ubo selection");
+                assert (const_offset && "no indirect inputs");
+
+                enum bifrost_ir_ops op;
+
+                // NIR: load_ubo takes src[0] = block selection, src[1] = offset
+                // MIR: ld_ubo carries literal_args[0] = offset, literal_args[1] = block selection
+                switch (nir_dest_num_components(nir_instr->dest)) {
+                case 1:
+                        op = op_ld_ubo_v1;
+                        break;
+                case 2:
+                        op = op_ld_ubo_v2;
+                        break;
+                case 3:
+                        op = op_ld_ubo_v3;
+                        break;
+                case 4:
+                        op = op_ld_ubo_v4;
+                        break;
+                default:
+                        // Bail out rather than emit an uninitialised opcode when asserts are compiled out
+                        assert(0);
+                        return;
+                }
+
+                reg = nir_dest_index(ctx, &nir_instr->dest);
+                struct bifrost_instruction instr = {
+                        .op = op,
+                        .dest_components = nir_dest_num_components(nir_instr->dest),
+                        .ssa_args = {
+                                .dest = reg,
+                                .src0 = SSA_INVALID_VALUE,
+                                .src1 = SSA_INVALID_VALUE,
+                                .src2 = SSA_INVALID_VALUE,
+                                .src3 = SSA_INVALID_VALUE,
+                        },
+                        .literal_args[0] = nir_src_as_uint(nir_instr->src[1]),
+                        .literal_args[1] = nir_src_as_uint(nir_instr->src[0]),
+                };
+
+                emit_mir_instruction(ctx, instr);
+                break;
+        }
+        case nir_intrinsic_store_ssbo: {
+                nir_const_value *location = nir_src_as_const_value(nir_instr->src[1]);
+                const_offset = nir_src_as_const_value(nir_instr->src[2]);
+                assert (location && "no indirect ubo selection");
+                assert (const_offset && "no indirect inputs");
+
+                // NIR: store_ssbo takes src[0] = value, src[1] = block selection, src[2] = offset
+                // MIR: store_vN carries src0 = value, literal_args[0] = offset
+                reg = nir_src_index(ctx, &nir_instr->src[0]);
+
+                enum bifrost_ir_ops op;
+                switch (nir_src_num_components(nir_instr->src[0])) {
+                case 1:
+                        op = op_store_v1;
+                        break;
+                case 2:
+                        op = op_store_v2;
+                        break;
+                case 3:
+                        op = op_store_v3;
+                        break;
+                case 4:
+                        op = op_store_v4;
+                        break;
+                default:
+                        // Bail out rather than emit an uninitialised opcode when asserts are compiled out
+                        assert(0);
+                        return;
+                }
+
+                struct bifrost_instruction instr = {
+                        .op = op,
+                        .dest_components = 0,
+                        .ssa_args = {
+                                .dest = SSA_INVALID_VALUE,
+                                .src0 = reg,
+                                .src1 = SSA_INVALID_VALUE,
+                                .src2 = SSA_INVALID_VALUE,
+                                .src3 = SSA_INVALID_VALUE,
+                        },
+                        .literal_args[0] = nir_src_as_uint(nir_instr->src[2]),
+                };
+                emit_mir_instruction(ctx, instr);
+                break;
+        }
+        case nir_intrinsic_load_uniform: {
+                offset = nir_intrinsic_base(nir_instr);
+
+                if (nir_src_is_const(nir_instr->src[0])) {
+                        offset += nir_src_as_uint(nir_instr->src[0]);
+                } else {
+                        assert(0 && "Can't handle indirect load_uniform");
+                }
+
+                reg = nir_dest_index(ctx, &nir_instr->dest);
+
+                unsigned num_components = nir_dest_num_components(nir_instr->dest);
+                if (num_components == 1) {
+                        struct bifrost_instruction instr = {
+                                .op = op_mov,
+                                .dest_components = 1,
+                                .ssa_args = {
+                                        .dest = reg,
+                                        .src0 = SSA_FIXED_UREGISTER(offset),
+                                        .src1 = SSA_INVALID_VALUE,
+                                        .src2 = SSA_INVALID_VALUE,
+                                        .src3 = SSA_INVALID_VALUE,
+                                },
+                        };
+                        emit_mir_instruction(ctx, instr);
+                } else {
+                        uint32_t comps[4];
+
+                        // One scalar move per component, then gather them into the destination
+                        for (unsigned i = 0; i < num_components; ++i) {
+                                uint32_t temp_dest = alloc_mir_temp(ctx);
+                                comps[i] = temp_dest;
+                                struct bifrost_instruction instr = {
+                                        .op = op_mov,
+                                        .dest_components = 1,
+                                        .ssa_args = {
+                                                .dest = temp_dest,
+                                                .src0 = SSA_FIXED_UREGISTER(offset + (i * 4)),
+                                                .src1 = SSA_INVALID_VALUE,
+                                                .src2 = SSA_INVALID_VALUE,
+                                                .src3 = SSA_INVALID_VALUE,
+                                        },
+                                };
+                                emit_mir_instruction(ctx, instr);
+                        }
+
+                        emit_create_vector(ctx, reg, num_components, comps);
+                }
+                break;
+        }
+
+        case nir_intrinsic_load_input: {
+                const_offset = nir_src_as_const_value(nir_instr->src[0]);
+                assert (const_offset && "no indirect inputs");
+
+                offset = nir_intrinsic_base(nir_instr) + nir_src_as_uint(nir_instr->src[0]);
+
+                reg = nir_dest_index(ctx, &nir_instr->dest);
+
+                enum bifrost_ir_ops op;
+                switch (nir_dest_num_components(nir_instr->dest)) {
+                case 1:
+                        op = op_ld_attr_v1;
+                        break;
+                case 2:
+                        op = op_ld_attr_v2;
+                        break;
+                case 3:
+                        op = op_ld_attr_v3;
+                        break;
+                case 4:
+                        op = op_ld_attr_v4;
+                        break;
+                default:
+                        // Bail out rather than emit an uninitialised opcode when asserts are compiled out
+                        assert(0);
+                        return;
+                }
+
+                struct bifrost_instruction instr = {
+                        .op = op,
+                        .dest_components = nir_dest_num_components(nir_instr->dest),
+                        .ssa_args = {
+                                .dest = reg,
+                                .src0 = offset,
+                                .src1 = SSA_INVALID_VALUE,
+                                .src2 = SSA_INVALID_VALUE,
+                                .src3 = SSA_INVALID_VALUE,
+                        }
+                };
+
+                emit_mir_instruction(ctx, instr);
+                break;
+        }
+        case nir_intrinsic_store_output: {
+                const_offset = nir_src_as_const_value(nir_instr->src[1]);
+                assert(const_offset && "no indirect outputs");
+
+                offset = nir_intrinsic_base(nir_instr);
+                if (ctx->stage == MESA_SHADER_FRAGMENT) {
+                        int comp = nir_intrinsic_component(nir_instr);
+                        offset += comp;
+                        // XXX: Once we support more than colour output then this will need to change
+                        void *entry = _mesa_hash_table_u64_search(ctx->outputs_nir_to_bi, offset + FRAG_RESULT_DATA0 + 1);
+
+                        if (!entry) {
+                                printf("WARNING: skipping fragment output\n");
+                                break;
+                        }
+
+                        // Table values are stored as id + 1
+                        offset = (uintptr_t) (entry) - 1;
+                        reg = nir_src_index(ctx, &nir_instr->src[0]);
+
+                        enum bifrost_ir_ops op;
+                        switch (nir_src_num_components(nir_instr->src[0])) {
+                        case 1:
+                                op = op_store_v1;
+                                break;
+                        case 2:
+                                op = op_store_v2;
+                                break;
+                        case 3:
+                                op = op_store_v3;
+                                break;
+                        case 4:
+                                op = op_store_v4;
+                                break;
+                        default:
+                                // Bail out rather than emit an uninitialised opcode when asserts are compiled out
+                                assert(0);
+                                return;
+                        }
+
+                        // XXX: All offsets aren't vec4 aligned. Will need to adjust this in the future
+                        // XXX: This needs to offset correctly in to memory so the blend step can pick it up
+                        uint32_t movi = emit_movi(ctx, offset * 16);
+                        uint32_t movi2 = emit_movi(ctx, 0);
+
+                        uint32_t comps[2] = {
+                                movi, movi2,
+                        };
+                        uint32_t offset_val = alloc_mir_temp(ctx);
+                        emit_create_vector(ctx, offset_val, 2, comps);
+
+                        struct bifrost_instruction instr = {
+                                .op = op,
+                                .dest_components = 0,
+                                .ssa_args = {
+                                        .dest = SSA_INVALID_VALUE,
+                                        .src0 = offset_val,
+                                        .src1 = reg,
+                                        .src2 = SSA_INVALID_VALUE,
+                                        .src3 = SSA_INVALID_VALUE,
+                                }
+                        };
+                        emit_mir_instruction(ctx, instr);
+                } else if (ctx->stage == MESA_SHADER_VERTEX) {
+                        int comp = nir_intrinsic_component(nir_instr);
+                        offset += comp;
+                        // NOTE(review): lookup key is offset + 2; confirm it matches the keys
+                        // used when varying_nir_to_bi is populated
+                        void *entry = _mesa_hash_table_u64_search(ctx->varying_nir_to_bi, offset + 2);
+
+                        if (!entry) {
+                                printf("WARNING: skipping varying\n");
+                                break;
+                        }
+
+                        offset = (uintptr_t) (entry) - 1;
+
+                        reg = nir_src_index(ctx, &nir_instr->src[0]);
+                        // LD_VAR_ADDR.f32 {R0, T1}, R61, R62, location:1, R12
+                        // ...
+                        // ST_VAR.v4 T1, R12, R13, R14, R4
+
+                        offset = emit_ld_vary_addr_constant(ctx, offset);
+                        enum bifrost_ir_ops op;
+                        switch (nir_src_num_components(nir_instr->src[0])) {
+                        case 1:
+                                op = op_st_vary_v1;
+                                break;
+                        case 2:
+                                op = op_st_vary_v2;
+                                break;
+                        case 3:
+                                op = op_st_vary_v3;
+                                break;
+                        case 4:
+                                op = op_st_vary_v4;
+                                break;
+                        default:
+                                // Bail out rather than emit an uninitialised opcode when asserts are compiled out
+                                assert(0);
+                                return;
+                        }
+
+                        struct bifrost_instruction instr = {
+                                .op = op,
+                                .dest_components = 0,
+                                .ssa_args = {
+                                        .dest = SSA_INVALID_VALUE,
+                                        .src0 = offset,
+                                        .src1 = reg,
+                                        .src2 = SSA_INVALID_VALUE,
+                                        .src3 = SSA_INVALID_VALUE,
+                                }
+                        };
+                        emit_mir_instruction(ctx, instr);
+                } else {
+                        assert(0 && "Unknown store_output stage");
+                }
+                break;
+        }
+        default:
+                printf ("Unhandled intrinsic %s\n", nir_intrinsic_infos[nir_instr->intrinsic].name);
+                break;
+        }
+}
+
+/* Case arms for ALU ops that map one-to-one onto a MIR op: they set
+ * argument_count / op (and src_modifiers) for the shared emission path at
+ * the end of emit_alu. */
+#define ALU_CASE(arguments, nir, name) \
+        case nir_op_##nir: \
+                argument_count = arguments; \
+                op = op_##name; \
+                break
+#define ALU_CASE_MOD(arguments, nir, name, modifiers) \
+        case nir_op_##nir: \
+                argument_count = arguments; \
+                op = op_##name; \
+                src_modifiers = modifiers; \
+                break
+
+/* Translates one NIR ALU instruction into MIR. Simple ops go through the
+ * ALU_CASE table and the shared scalar emission path at the bottom; ops
+ * needing a custom sequence emit their instructions and return directly.
+ * Unhandled ops are reported on stdout and dropped. */
+static void
+emit_alu(struct compiler_context *ctx, nir_alu_instr *nir_instr)
+{
+        unsigned dest = nir_dest_index(ctx, &nir_instr->dest.dest);
+        unsigned op = ~0U, argument_count;
+        unsigned src_modifiers = 0;
+
+        switch (nir_instr->op) {
+                ALU_CASE(2, fmul, fmul_f32);
+                ALU_CASE(2, fadd, fadd_f32);
+                ALU_CASE_MOD(2, fsub, fadd_f32, SOURCE_MODIFIER(1, SRC_MOD_NEG));
+                ALU_CASE(1, ftrunc, trunc);
+                ALU_CASE(1, fceil, ceil);
+                ALU_CASE(1, ffloor, floor);
+                ALU_CASE(1, fround_even, roundeven);
+                ALU_CASE(1, frcp, frcp_fast_f32);
+                ALU_CASE(2, fmax, max_f32);
+                ALU_CASE(2, fmin, min_f32);
+                ALU_CASE(2, iadd, add_i32);
+                ALU_CASE(2, isub, sub_i32);
+                ALU_CASE(2, imul, mul_i32);
+                ALU_CASE(2, iand, and_i32);
+                ALU_CASE(2, ior, or_i32);
+                ALU_CASE(2, ixor, xor_i32);
+                ALU_CASE(2, ishl, lshift_i32);
+                ALU_CASE(2, ushr, rshift_i32);
+                ALU_CASE(2, ishr, arshift_i32);
+        case nir_op_ineg: {
+                // -x is emitted as 0 - x
+                unsigned src0 = nir_alu_src_index_scalar(ctx, nir_instr, 0);
+                printf("ineg 0x%08x\n", src0);
+                struct bifrost_instruction instr = {
+                        .op = op_sub_i32,
+                        .dest_components = 1,
+                        .ssa_args = {
+                                .dest = dest,
+                                .src0 = SSA_FIXED_CONST_0,
+                                .src1 = src0,
+                                .src2 = SSA_INVALID_VALUE,
+                                .src3 = SSA_INVALID_VALUE,
+                        },
+                };
+
+                emit_mir_instruction(ctx, instr);
+                return;
+
+        }
+        case nir_op_vec2: {
+                uint32_t comps[2] = {
+                        nir_alu_src_index(ctx, &nir_instr->src[0]),
+                        nir_alu_src_index(ctx, &nir_instr->src[1]),
+                };
+                emit_create_vector(ctx, dest, 2, comps);
+                return;
+        }
+        case nir_op_vec3: {
+                uint32_t comps[3] = {
+                        nir_alu_src_index(ctx, &nir_instr->src[0]),
+                        nir_alu_src_index(ctx, &nir_instr->src[1]),
+                        nir_alu_src_index(ctx, &nir_instr->src[2]),
+                };
+                emit_create_vector(ctx, dest, 3, comps);
+                return;
+        }
+        case nir_op_vec4: {
+                uint32_t comps[4] = {
+                        nir_alu_src_index(ctx, &nir_instr->src[0]),
+                        nir_alu_src_index(ctx, &nir_instr->src[1]),
+                        nir_alu_src_index(ctx, &nir_instr->src[2]),
+                        nir_alu_src_index(ctx, &nir_instr->src[3]),
+                };
+                emit_create_vector(ctx, dest, 4, comps);
+                return;
+        }
+        case nir_op_fdiv: {
+                // a / b is emitted as a * rcp(b)
+                unsigned src0 = nir_alu_src_index_scalar(ctx, nir_instr, 0);
+                unsigned src1 = nir_alu_src_index_scalar(ctx, nir_instr, 1);
+                uint32_t mir_temp_location = alloc_mir_temp(ctx);
+                {
+                        struct bifrost_instruction instr = {
+                                .op = op_frcp_fast_f32,
+                                .dest_components = 1,
+                                .ssa_args = {
+                                        .dest = mir_temp_location,
+                                        .src0 = src1,
+                                        .src1 = SSA_INVALID_VALUE,
+                                        .src2 = SSA_INVALID_VALUE,
+                                        .src3 = SSA_INVALID_VALUE,
+                                },
+                        };
+                        emit_mir_instruction(ctx, instr);
+                }
+
+                // Multiply by the reciprocal just computed; using src1 here
+                // produced a * b and left the reciprocal dead
+                struct bifrost_instruction instr = {
+                        .op = op_fmul_f32,
+                        .dest_components = 1,
+                        .ssa_args = {
+                                .dest = dest,
+                                .src0 = src0,
+                                .src1 = mir_temp_location,
+                                .src2 = SSA_INVALID_VALUE,
+                                .src3 = SSA_INVALID_VALUE,
+                        },
+                        .src_modifiers = src_modifiers,
+                };
+
+                emit_mir_instruction(ctx, instr);
+                return;
+        }
+        case nir_op_umin:
+        case nir_op_imin:
+        case nir_op_umax:
+        case nir_op_imax: {
+                // NOTE(review): all four ops currently emit the same csel with
+                // comparison literal 0 (see the XXX below)
+                unsigned src0 = nir_alu_src_index_scalar(ctx, nir_instr, 0);
+                unsigned src1 = nir_alu_src_index_scalar(ctx, nir_instr, 1);
+                struct bifrost_instruction instr = {
+                        .op = op_csel_i32,
+                        .dest_components = 1,
+                        .ssa_args = {
+                                .dest = dest,
+                                .src0 = src0,
+                                .src1 = src1,
+                                .src2 = src0,
+                                .src3 = src1,
+                        },
+                        .src_modifiers = src_modifiers,
+                        .literal_args[0] = 0, /* XXX: Comparison operator */
+                };
+
+                emit_mir_instruction(ctx, instr);
+                return;
+        }
+        case nir_op_umin3:
+        case nir_op_imin3:
+        case nir_op_umax3:
+        case nir_op_imax3: {
+                unsigned src0 = nir_alu_src_index_scalar(ctx, nir_instr, 0);
+                unsigned src1 = nir_alu_src_index_scalar(ctx, nir_instr, 1);
+                unsigned src2 = nir_alu_src_index_scalar(ctx, nir_instr, 2);
+
+                unsigned op = 0;
+                if (nir_instr->op == nir_op_umin3)
+                        op = op_umin3_i32;
+                else if (nir_instr->op == nir_op_imin3)
+                        op = op_imin3_i32;
+                else if (nir_instr->op == nir_op_umax3)
+                        op = op_umax3_i32;
+                else if (nir_instr->op == nir_op_imax3)
+                        op = op_imax3_i32;
+                struct bifrost_instruction instr = {
+                        .op = op,
+                        .dest_components = 1,
+                        .ssa_args = {
+                                .dest = dest,
+                                .src0 = src0,
+                                .src1 = src1,
+                                .src2 = src2,
+                                .src3 = SSA_INVALID_VALUE,
+                        },
+                        .src_modifiers = src_modifiers,
+                };
+
+                emit_mir_instruction(ctx, instr);
+
+                return;
+        }
+        case nir_op_ine: {
+                // NOTE(review): not-equal is emitted with CSEL_IEQ and (~0, 0) as the
+                // selected values; confirm the operand order gives ine rather than ieq
+                uint32_t movi = emit_movi(ctx, ~0U);
+                unsigned src0 = nir_alu_src_index(ctx, &nir_instr->src[0]);
+                unsigned src1 = nir_alu_src_index(ctx, &nir_instr->src[1]);
+                struct bifrost_instruction instr = {
+                        .op = op_csel_i32,
+                        .dest_components = 1,
+                        .ssa_args = {
+                                .dest = dest,
+                                .src0 = src0,
+                                .src1 = src1,
+                                .src2 = movi,
+                                .src3 = SSA_FIXED_CONST_0,
+                        },
+                        .src_modifiers = src_modifiers,
+                        .literal_args[0] = CSEL_IEQ, /* XXX: Comparison operator */
+                };
+
+                emit_mir_instruction(ctx, instr);
+                return;
+        }
+        default:
+                printf("Unhandled ALU op %s\n", nir_op_infos[nir_instr->op].name);
+                return;
+        }
+
+        // Shared path for the ALU_CASE ops: one scalar extract per used source
+        unsigned src0 = nir_alu_src_index_scalar(ctx, nir_instr, 0);
+        unsigned src1 = argument_count >= 2 ? nir_alu_src_index_scalar(ctx, nir_instr, 1) : SSA_INVALID_VALUE;
+        unsigned src2 = argument_count >= 3 ? nir_alu_src_index_scalar(ctx, nir_instr, 2) : SSA_INVALID_VALUE;
+        unsigned src3 = argument_count >= 4 ? nir_alu_src_index_scalar(ctx, nir_instr, 3) : SSA_INVALID_VALUE;
+
+        struct bifrost_instruction instr = {
+                .op = op,
+                .dest_components = 1,
+                .ssa_args = {
+                        .dest = dest,
+                        .src0 = src0,
+                        .src1 = src1,
+                        .src2 = src2,
+                        .src3 = src3,
+                },
+                .src_modifiers = src_modifiers,
+        };
+
+        emit_mir_instruction(ctx, instr);
+}
+
+/* Dispatches one NIR instruction to the matching emitter. tex, jump and
+ * ssa_undef (and any other type) are only reported on stdout. */
+static void
+emit_instr(struct compiler_context *ctx, struct nir_instr *instr)
+{
+        switch (instr->type) {
+        case nir_instr_type_load_const:
+                emit_load_const(ctx, nir_instr_as_load_const(instr));
+                break;
+        case nir_instr_type_intrinsic:
+                emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
+                break;
+        case nir_instr_type_alu:
+                emit_alu(ctx, nir_instr_as_alu(instr));
+                break;
+        case nir_instr_type_tex:
+                printf("Unhandled NIR inst tex\n");
+                break;
+        case nir_instr_type_jump:
+                printf("Unhandled NIR inst jump\n");
+                break;
+        case nir_instr_type_ssa_undef:
+                printf("Unhandled NIR inst ssa_undef\n");
+                break;
+        default:
+                printf("Unhandled instruction type\n");
+                break;
+        }
+
+}
+
+/* Creates a MIR block for `block`, appends it to ctx->blocks, links it as
+ * a successor of the previously current block, makes it current and emits
+ * every NIR instruction into it. Returns the new block. The allocation
+ * result is not checked. */
+static bifrost_block *
+emit_block(struct compiler_context *ctx, nir_block *block)
+{
+        bifrost_block *this_block = calloc(1, sizeof(*this_block));
+        list_addtail(&this_block->link, &ctx->blocks);
+
+        ++ctx->block_count;
+
+        /* Add this block to be a successor to the previous block */
+        if (ctx->current_block)
+                bifrost_block_add_successor(ctx->current_block, this_block);
+
+        /* Set up current block */
+        list_inithead(&this_block->instructions);
+        ctx->current_block = this_block;
+
+        nir_foreach_instr(instr, block) {
+                emit_instr(ctx, instr);
+                ++ctx->instruction_count;
+        }
+
+#ifdef BI_DEBUG
+        print_mir_block(this_block, false);
+#endif
+        return this_block;
+}
+
+/* Forward declaration: emit_cf_list and emit_if call each other. */
+void
+emit_if(struct compiler_context *ctx, nir_if *nir_inst);
+
+/* Emits every control-flow node in `list` in order. Blocks go through
+ * emit_block and ifs through emit_if; any other node type hits assert(0).
+ * Returns the first block emitted directly from this list, or NULL if the
+ * list produced none. */
+static struct bifrost_block *
+emit_cf_list(struct compiler_context *ctx, struct exec_list *list)
+{
+        struct bifrost_block *start_block = NULL;
+        foreach_list_typed(nir_cf_node, node, node, list) {
+                switch (node->type) {
+                case nir_cf_node_block: {
+                        bifrost_block *block = emit_block(ctx, nir_cf_node_as_block(node));
+
+                        if (!start_block)
+                                start_block = block;
+
+                        break;
+                }
+
+                case nir_cf_node_if:
+                        emit_if(ctx, nir_cf_node_as_if(node));
+                        break;
+
+                default:
+                case nir_cf_node_loop:
+                case nir_cf_node_function:
+                        assert(0);
+                        break;
+                }
+        }
+
+        return start_block;
+}
+
+/* Emits a NIR if: a conditional branch in the current block, the then
+ * list, an unconditional branch, then the else list. Branch targets
+ * (literal_args[1]) are written as block counts once both lists have been
+ * emitted. */
+void
+emit_if(struct compiler_context *ctx, nir_if *nir_inst)
+{
+
+        // XXX: Conditional branch instruction can do a variety of comparisons with the sources
+        // Merge the source instruction `ine` with our conditional branch
+        {
+                // The condition is compared against an all-ones immediate
+                uint32_t movi = emit_movi(ctx, ~0U);
+                struct bifrost_instruction instr = {
+                        .op = op_branch,
+                        .dest_components = 0,
+                        .ssa_args = {
+                                .dest = SSA_INVALID_VALUE,
+                                .src0 = nir_src_index(ctx, &nir_inst->condition),
+                                .src1 = movi,
+                                .src2 = SSA_INVALID_VALUE,
+                                .src3 = SSA_INVALID_VALUE,
+                        },
+                        .src_modifiers = 0,
+                        .literal_args[0] = BR_COND_EQ, /* XXX: Comparison Arg type */
+                        .literal_args[1] = 0, /* XXX: Branch target */
+                };
+
+                emit_mir_instruction(ctx, instr);
+        }
+
+        // Remember the branch just emitted so its target can be patched below
+        bifrost_instruction *true_branch = mir_last_instr_in_block(ctx->current_block);
+
+        bifrost_block *true_block = emit_cf_list(ctx, &nir_inst->then_list);
+
+        {
+                // Unconditional branch at the end of the then list
+                struct bifrost_instruction instr = {
+                        .op = op_branch,
+                        .dest_components = 0,
+                        .ssa_args = {
+                                .dest = SSA_INVALID_VALUE,
+                                .src0 = SSA_INVALID_VALUE,
+                                .src1 = SSA_INVALID_VALUE,
+                                .src2 = SSA_INVALID_VALUE,
+                                .src3 = SSA_INVALID_VALUE,
+                        },
+                        .src_modifiers = 0,
+                        .literal_args[0] = BR_ALWAYS, /* XXX: ALWAYS */
+                        .literal_args[1] = 0, /* XXX: Branch target */
+                };
+
+                emit_mir_instruction(ctx, instr);
+        }
+        bifrost_instruction *true_exit_branch = mir_last_instr_in_block(ctx->current_block);
+
+        // Snapshot the counters before the else list so its start index and
+        // whether it emitted anything can be determined afterwards
+        unsigned false_idx = ctx->block_count;
+        unsigned inst_count = ctx->instruction_count;
+
+        bifrost_block *false_block = emit_cf_list(ctx, &nir_inst->else_list);
+
+        unsigned if_footer_idx = ctx->block_count;
+        assert(true_block);
+        assert(false_block);
+
+
+        if (ctx->instruction_count == inst_count) {
+                // If the else branch didn't have anything in it then we can remove the dead jump
+                mir_remove_instr(true_exit_branch);
+        } else {
+                true_exit_branch->literal_args[1] = if_footer_idx;
+        }
+
+        true_branch->literal_args[1] = false_idx;
+}
+
+/* Entry point: compiles `nir` for Bifrost. Builds the uniform / varying /
+ * fragment-output lookup tables, runs optimize_nir, emits MIR for the
+ * first function only, initialises program->compiled and calls
+ * schedule_program. Always returns 0.
+ * NOTE(review): none of the hash tables created here are destroyed in this
+ * function; confirm whether ownership passes elsewhere. */
+int
+bifrost_compile_shader_nir(nir_shader *nir, struct bifrost_program *program)
+{
+        struct compiler_context ictx = {
+                .nir = nir,
+                .stage = nir->info.stage,
+        };
+
+        struct compiler_context *ctx = &ictx;
+
+        ctx->mir_temp = 0;
+
+        /* Initialize at a global (not block) level hash tables */
+        ctx->ssa_constants = _mesa_hash_table_u64_create(NULL);
+        ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);
+
+        /* Assign actual uniform location, skipping over samplers */
+        ctx->uniform_nir_to_bi = _mesa_hash_table_u64_create(NULL);
+
+        nir_foreach_variable(var, &nir->uniforms) {
+                if (glsl_get_base_type(var->type) == GLSL_TYPE_SAMPLER) continue;
+
+                for (int col = 0; col < glsl_get_matrix_columns(var->type); ++col) {
+                        int id = ctx->uniform_count++;
+                        // Keys and values are both stored with a +1 bias
+                        _mesa_hash_table_u64_insert(ctx->uniform_nir_to_bi, var->data.driver_location + col + 1, (void *) ((uintptr_t) (id + 1)));
+                }
+        }
+
+        if (ctx->stage == MESA_SHADER_VERTEX) {
+                ctx->varying_nir_to_bi = _mesa_hash_table_u64_create(NULL);
+                nir_foreach_variable(var, &nir->outputs) {
+                        if (var->data.location < VARYING_SLOT_VAR0) {
+                                // Only the increment is guarded by this if; the insert
+                                // below runs for every slot below VARYING_SLOT_VAR0
+                                if (var->data.location == VARYING_SLOT_POS)
+                                        ctx->varying_count++;
+                                _mesa_hash_table_u64_insert(ctx->varying_nir_to_bi, var->data.driver_location + 1, (void *) ((uintptr_t) (1)));
+
+                                continue;
+                        }
+
+                        for (int col = 0; col < glsl_get_matrix_columns(var->type); ++col) {
+                                for (int comp = 0; comp < 4; ++comp) {
+                                        // NOTE(review): id adds comp on top of a counter that is
+                                        // itself incremented per component, and the key does not
+                                        // scale col by 4; confirm both are intended
+                                        int id = comp + ctx->varying_count++;
+                                        _mesa_hash_table_u64_insert(ctx->varying_nir_to_bi, var->data.driver_location + col + comp + 1, (void *) ((uintptr_t) (id + 1)));
+                                }
+                        }
+                }
+
+        } else if (ctx->stage == MESA_SHADER_FRAGMENT) {
+                ctx->outputs_nir_to_bi = _mesa_hash_table_u64_create(NULL);
+                nir_foreach_variable(var, &nir->outputs) {
+                        if (var->data.location >= FRAG_RESULT_DATA0 && var->data.location <= FRAG_RESULT_DATA7) {
+                                int id = ctx->outputs_count++;
+                                printf("Driver location: %d with id %d\n", var->data.location + 1, id);
+                                _mesa_hash_table_u64_insert(ctx->outputs_nir_to_bi, var->data.location + 1, (void *) ((uintptr_t) (id + 1)));
+                        }
+                }
+        }
+
+        /* Optimisation passes */
+        optimize_nir(nir);
+
+#ifdef BI_DEBUG
+        nir_print_shader(nir, stdout);
+#endif
+
+        /* Generate machine IR for shader */
+        nir_foreach_function(func, nir) {
+                nir_builder _b;
+                ctx->b = &_b;
+                nir_builder_init(ctx->b, func->impl);
+
+                list_inithead(&ctx->blocks);
+                ctx->block_count = 0;
+                ctx->func = func;
+
+                emit_cf_list(ctx, &func->impl->body);
+
+                break; // XXX: Once we support multi function shaders then implement
+        }
+
+        util_dynarray_init(&program->compiled, NULL);
+
+        // MIR pre-RA optimizations
+
+        bool progress = false;
+
+        // With the only pass commented out, progress stays false and this
+        // loop body runs exactly once
+        do {
+                progress = false;
+                mir_foreach_block(ctx, block) {
+                        // XXX: Not yet working
+//                        progress |= bifrost_opt_branch_fusion(ctx, block);
+                }
+        } while (progress);
+
+        schedule_program(ctx);
+
+#ifdef BI_DEBUG
+        nir_print_shader(nir, stdout);
+        disassemble_bifrost(program->compiled.data, program->compiled.size, false);
+#endif
+        return 0;
+}
diff --git a/src/panfrost/bifrost/bifrost_compile.h b/src/panfrost/bifrost/bifrost_compile.h
new file mode 100644
index 00000000000..e687f64f7eb
--- /dev/null
+++ b/src/panfrost/bifrost/bifrost_compile.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2018 Ryan Houdek
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __bifrost_compile_h__
+#define __bifrost_compile_h__
+
+#include "compiler/nir/nir.h"
+#include "util/u_dynarray.h"
+
+/* Result of a compile: `compiled` is the dynarray initialised by
+ * bifrost_compile_shader_nir and later handed to the disassembler. */
+struct bifrost_program {
+        struct util_dynarray compiled;
+};
+
+/* Compiles `nir` into `program`; the implementation in bifrost_compile.c
+ * always returns 0. */
+int
+bifrost_compile_shader_nir(nir_shader *nir, struct bifrost_program *program);
+
+/* NIR compiler options for the Bifrost backend. */
+static const nir_shader_compiler_options bifrost_nir_options = {
+        .fuse_ffma = true,
+        .lower_flrp16 = true,
+        .lower_flrp32 = true,
+        .lower_flrp64 = true,
+        .lower_fmod = true,
+        .lower_bitfield_extract = true,
+        .lower_bitfield_extract_to_shifts = true,
+        .lower_bitfield_insert = true,
+        .lower_bitfield_insert_to_shifts = true,
+        .lower_bitfield_reverse = true,
+        .lower_idiv = true,
+        .lower_isign = true,
+        .lower_fsign = true,
+        .lower_ffract = true,
+        .lower_pack_half_2x16 = true,
+        .lower_pack_unorm_2x16 = true,
+        .lower_pack_snorm_2x16 = true,
+        .lower_pack_unorm_4x8 = true,
+        .lower_pack_snorm_4x8 = true,
+        .lower_unpack_half_2x16 = true,
+        .lower_unpack_unorm_2x16 = true,
+        .lower_unpack_snorm_2x16 = true,
+        .lower_unpack_unorm_4x8 = true,
+        .lower_unpack_snorm_4x8 = true,
+        .lower_extract_byte = true,
+        .lower_extract_word = true,
+        .lower_all_io_to_temps = true,
+        .lower_all_io_to_elements = true,
+        .vertex_id_zero_based = true,
+};
+
+#endif
diff --git a/src/panfrost/bifrost/cmdline.c b/src/panfrost/bifrost/cmdline.c
index 4a30d7ec9f6..1abd932530b 100644
--- a/src/panfrost/bifrost/cmdline.c
+++ b/src/panfrost/bifrost/cmdline.c
@@ -30,6 +30,8 @@
 #include "compiler/nir_types.h"
 #include "util/u_dynarray.h"
 
+#include "bifrost_compile.h"
+
 static void
 compile_shader(char **argv)
 {
@@ -100,8 +102,7 @@ main(int argc, char **argv)
         }
         if (strcmp(argv[1], "compile") == 0) {
                 compile_shader(&argv[2]);
-        }
-        else if (strcmp(argv[1], "disasm") == 0) {
+        } else if (strcmp(argv[1], "disasm") == 0) {
                 disassemble(argv[2]);
         }
         return 0;
diff --git a/src/panfrost/bifrost/disassemble.c b/src/panfrost/bifrost/disassemble.c
index 02a60448bd8..c7e131d5d5b 100644
--- a/src/panfrost/bifrost/disassemble.c
+++ b/src/panfrost/bifrost/disassemble.c
@@ -81,7 +81,7 @@ enum bifrost_reg_write_unit {
 };
 
 // this represents the decoded version of the ctrl register field.
-struct bifrost_reg_ctrl{ +struct bifrost_reg_ctrl { bool read_reg0; bool read_reg1; bool read_reg3; @@ -192,7 +192,8 @@ void dump_instr(const struct bifrost_alu_inst *instr, struct bifrost_regs next_r unsigned data_reg, unsigned offset, bool verbose); bool dump_clause(uint32_t *words, unsigned *size, unsigned offset, bool verbose); -void dump_header(struct bifrost_header header, bool verbose) { +void dump_header(struct bifrost_header header, bool verbose) +{ if (header.clause_type != 0) { printf("id(%du) ", header.scoreboard_index); } @@ -249,7 +250,7 @@ void dump_header(struct bifrost_header header, bool verbose) { if (verbose) { printf("# clause type %d, next clause type %d\n", - header.clause_type, header.next_clause_type); + header.clause_type, header.next_clause_type); } } @@ -266,49 +267,49 @@ static struct bifrost_reg_ctrl DecodeRegCtrl(struct bifrost_regs regs) decoded.read_reg0 = decoded.read_reg1 = true; } switch (ctrl) { - case 1: - decoded.fma_write_unit = REG_WRITE_TWO; - break; - case 2: - case 3: - decoded.fma_write_unit = REG_WRITE_TWO; - decoded.read_reg3 = true; - break; - case 4: - decoded.read_reg3 = true; - break; - case 5: - decoded.add_write_unit = REG_WRITE_TWO; - break; - case 6: - decoded.add_write_unit = REG_WRITE_TWO; - decoded.read_reg3 = true; - break; - case 8: - decoded.clause_start = true; - break; - case 9: - decoded.fma_write_unit = REG_WRITE_TWO; - decoded.clause_start = true; - break; - case 11: - break; - case 12: - decoded.read_reg3 = true; - decoded.clause_start = true; - break; - case 13: - decoded.add_write_unit = REG_WRITE_TWO; - decoded.clause_start = true; - break; - - case 7: - case 15: - decoded.fma_write_unit = REG_WRITE_THREE; - decoded.add_write_unit = REG_WRITE_TWO; - break; - default: - printf("# unknown reg ctrl %d\n", ctrl); + case 1: + decoded.fma_write_unit = REG_WRITE_TWO; + break; + case 2: + case 3: + decoded.fma_write_unit = REG_WRITE_TWO; + decoded.read_reg3 = true; + break; + case 4: + decoded.read_reg3 
= true; + break; + case 5: + decoded.add_write_unit = REG_WRITE_TWO; + break; + case 6: + decoded.add_write_unit = REG_WRITE_TWO; + decoded.read_reg3 = true; + break; + case 8: + decoded.clause_start = true; + break; + case 9: + decoded.fma_write_unit = REG_WRITE_TWO; + decoded.clause_start = true; + break; + case 11: + break; + case 12: + decoded.read_reg3 = true; + decoded.clause_start = true; + break; + case 13: + decoded.add_write_unit = REG_WRITE_TWO; + decoded.clause_start = true; + break; + + case 7: + case 15: + decoded.fma_write_unit = REG_WRITE_THREE; + decoded.add_write_unit = REG_WRITE_TWO; + break; + default: + printf("# unknown reg ctrl %d\n", ctrl); } return decoded; @@ -319,13 +320,13 @@ static struct bifrost_reg_ctrl DecodeRegCtrl(struct bifrost_regs regs) static unsigned GetRegToWrite(enum bifrost_reg_write_unit unit, struct bifrost_regs regs) { switch (unit) { - case REG_WRITE_TWO: - return regs.reg2; - case REG_WRITE_THREE: - return regs.reg3; - default: /* REG_WRITE_NONE */ - assert(0); - return 0; + case REG_WRITE_TWO: + return regs.reg2; + case REG_WRITE_THREE: + return regs.reg3; + default: /* REG_WRITE_NONE */ + assert(0); + return 0; } } @@ -373,13 +374,27 @@ static uint64_t get_const(uint64_t *consts, struct bifrost_regs srcs) unsigned low_bits = srcs.uniform_const & 0xf; uint64_t imm; switch (srcs.uniform_const >> 4) { - case 4: imm = consts[0]; break; - case 5: imm = consts[1]; break; - case 6: imm = consts[2]; break; - case 7: imm = consts[3]; break; - case 2: imm = consts[4]; break; - case 3: imm = consts[5]; break; - default: assert(0); break; + case 4: + imm = consts[0]; + break; + case 5: + imm = consts[1]; + break; + case 6: + imm = consts[2]; + break; + case 7: + imm = consts[3]; + break; + case 2: + imm = consts[4]; + break; + case 3: + imm = consts[5]; + break; + default: + assert(0); + break; } return imm | low_bits; } @@ -397,22 +412,28 @@ static void dump_uniform_const_src(struct bifrost_regs srcs, uint64_t *consts, b 
dump_const_imm(imm); } else { switch (srcs.uniform_const) { - case 0: printf("0"); break; - case 5: printf("atest-data"); break; - case 6: printf("sample-ptr"); break; - case 8: - case 9: - case 10: - case 11: - case 12: - case 13: - case 14: - case 15: - printf("blend-descriptor%u", (unsigned) srcs.uniform_const - 8); - break; - default: - printf("unkConst%u", (unsigned) srcs.uniform_const); - break; + case 0: + printf("0"); + break; + case 5: + printf("atest-data"); + break; + case 6: + printf("sample-ptr"); + break; + case 8: + case 9: + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: + printf("blend-descriptor%u", (unsigned) srcs.uniform_const - 8); + break; + default: + printf("unkConst%u", (unsigned) srcs.uniform_const); + break; } if (high32) @@ -425,94 +446,113 @@ static void dump_uniform_const_src(struct bifrost_regs srcs, uint64_t *consts, b static void dump_src(unsigned src, struct bifrost_regs srcs, uint64_t *consts, bool isFMA) { switch (src) { - case 0: printf("R%d", get_reg0(srcs)); break; - case 1: printf("R%d", get_reg1(srcs)); break; - case 2: printf("R%d", srcs.reg3); break; - case 3: - if (isFMA) - printf("0"); - else - printf("T"); // i.e. the output of FMA this cycle - break; - case 4: - dump_uniform_const_src(srcs, consts, false); - break; - case 5: - dump_uniform_const_src(srcs, consts, true); - break; - case 6: printf("T0"); break; - case 7: printf("T1"); break; + case 0: + printf("R%d", get_reg0(srcs)); + break; + case 1: + printf("R%d", get_reg1(srcs)); + break; + case 2: + printf("R%d", srcs.reg3); + break; + case 3: + if (isFMA) + printf("0"); + else + printf("T"); // i.e. 
the output of FMA this cycle + break; + case 4: + dump_uniform_const_src(srcs, consts, false); + break; + case 5: + dump_uniform_const_src(srcs, consts, true); + break; + case 6: + printf("T0"); + break; + case 7: + printf("T1"); + break; } } static void dump_output_mod(unsigned mod) { switch (mod) { - case 0: - break; - case 1: - printf(".clamp_0_inf"); break; // max(out, 0) - case 2: - printf(".clamp_m1_1"); break; // clamp(out, -1, 1) - case 3: - printf(".clamp_0_1"); break; // clamp(out, 0, 1) - default: - break; + case 0: + break; + case 1: + printf(".clamp_0_inf"); + break; // max(out, 0) + case 2: + printf(".clamp_m1_1"); + break; // clamp(out, -1, 1) + case 3: + printf(".clamp_0_1"); + break; // clamp(out, 0, 1) + default: + break; } } static void dump_minmax_mode(unsigned mod) { switch (mod) { - case 0: - /* Same as fmax() and fmin() -- return the other number if any - * number is NaN. Also always return +0 if one argument is +0 and - * the other is -0. - */ - break; - case 1: - /* Instead of never returning a NaN, always return one. The - * "greater"/"lesser" NaN is always returned, first by checking the - * sign and then the mantissa bits. - */ - printf(".nan_wins"); break; - case 2: - /* For max, implement src0 > src1 ? src0 : src1 - * For min, implement src0 < src1 ? src0 : src1 - * - * This includes handling NaN's and signedness of 0 differently - * from above, since +0 and -0 compare equal and comparisons always - * return false for NaN's. As a result, this mode is *not* - * commutative. - */ - printf(".src1_wins"); break; - case 3: - /* For max, implement src0 < src1 ? src1 : src0 - * For min, implement src0 > src1 ? src1 : src0 - */ - printf(".src0_wins"); break; - default: - break; + case 0: + /* Same as fmax() and fmin() -- return the other number if any + * number is NaN. Also always return +0 if one argument is +0 and + * the other is -0. + */ + break; + case 1: + /* Instead of never returning a NaN, always return one. 
The + * "greater"/"lesser" NaN is always returned, first by checking the + * sign and then the mantissa bits. + */ + printf(".nan_wins"); + break; + case 2: + /* For max, implement src0 > src1 ? src0 : src1 + * For min, implement src0 < src1 ? src0 : src1 + * + * This includes handling NaN's and signedness of 0 differently + * from above, since +0 and -0 compare equal and comparisons always + * return false for NaN's. As a result, this mode is *not* + * commutative. + */ + printf(".src1_wins"); + break; + case 3: + /* For max, implement src0 < src1 ? src1 : src0 + * For min, implement src0 > src1 ? src1 : src0 + */ + printf(".src0_wins"); + break; + default: + break; } } static void dump_round_mode(unsigned mod) { switch (mod) { - case 0: - /* roundTiesToEven, the IEEE default. */ - break; - case 1: - /* roundTowardPositive in the IEEE spec. */ - printf(".round_pos"); break; - case 2: - /* roundTowardNegative in the IEEE spec. */ - printf(".round_neg"); break; - case 3: - /* roundTowardZero in the IEEE spec. */ - printf(".round_zero"); break; - default: - break; + case 0: + /* roundTiesToEven, the IEEE default. */ + break; + case 1: + /* roundTowardPositive in the IEEE spec. */ + printf(".round_pos"); + break; + case 2: + /* roundTowardNegative in the IEEE spec. */ + printf(".round_neg"); + break; + case 3: + /* roundTowardZero in the IEEE spec. 
*/ + printf(".round_zero"); + break; + default: + break; } } @@ -753,39 +793,39 @@ static struct fma_op_info find_fma_op_info(unsigned op) for (unsigned i = 0; i < ARRAY_SIZE(FMAOpInfos); i++) { unsigned opCmp = ~0; switch (FMAOpInfos[i].src_type) { - case FMA_ONE_SRC: - opCmp = op; - break; - case FMA_TWO_SRC: - opCmp = op & ~0x7; - break; - case FMA_FCMP: - case FMA_FCMP16: - opCmp = op & ~0x1fff; - break; - case FMA_THREE_SRC: - case FMA_SHIFT_ADD64: - opCmp = op & ~0x3f; - break; - case FMA_FADD: - case FMA_FMINMAX: - case FMA_FADD16: - case FMA_FMINMAX16: - opCmp = op & ~0x3fff; - break; - case FMA_FMA: - case FMA_FMA16: - opCmp = op & ~0x3ffff; - break; - case FMA_FOUR_SRC: - opCmp = op & ~0x1ff; - break; - case FMA_FMA_MSCALE: - opCmp = op & ~0x7fff; - break; - default: - opCmp = ~0; - break; + case FMA_ONE_SRC: + opCmp = op; + break; + case FMA_TWO_SRC: + opCmp = op & ~0x7; + break; + case FMA_FCMP: + case FMA_FCMP16: + opCmp = op & ~0x1fff; + break; + case FMA_THREE_SRC: + case FMA_SHIFT_ADD64: + opCmp = op & ~0x3f; + break; + case FMA_FADD: + case FMA_FMINMAX: + case FMA_FADD16: + case FMA_FMINMAX16: + opCmp = op & ~0x3fff; + break; + case FMA_FMA: + case FMA_FMA16: + opCmp = op & ~0x3ffff; + break; + case FMA_FOUR_SRC: + opCmp = op & ~0x1ff; + break; + case FMA_FMA_MSCALE: + opCmp = op & ~0x7fff; + break; + default: + opCmp = ~0; + break; } if (FMAOpInfos[i].op == opCmp) return FMAOpInfos[i]; @@ -801,27 +841,27 @@ static struct fma_op_info find_fma_op_info(unsigned op) static void dump_fcmp(unsigned op) { switch (op) { - case 0: - printf(".OEQ"); - break; - case 1: - printf(".OGT"); - break; - case 2: - printf(".OGE"); - break; - case 3: - printf(".UNE"); - break; - case 4: - printf(".OLT"); - break; - case 5: - printf(".OLE"); - break; - default: - printf(".unk%d", op); - break; + case 0: + printf(".OEQ"); + break; + case 1: + printf(".OGT"); + break; + case 2: + printf(".OGE"); + break; + case 3: + printf(".UNE"); + break; + case 4: + printf(".OLT"); + 
break; + case 5: + printf(".OLE"); + break; + default: + printf(".unk%d", op); + break; } } @@ -835,44 +875,44 @@ static void dump_16swizzle(unsigned swiz) static void dump_fma_expand_src0(unsigned ctrl) { switch (ctrl) { - case 3: - case 4: - case 6: - printf(".x"); - break; - case 5: - case 7: - printf(".y"); - break; - case 0: - case 1: - case 2: - break; - default: - printf(".unk"); - break; + case 3: + case 4: + case 6: + printf(".x"); + break; + case 5: + case 7: + printf(".y"); + break; + case 0: + case 1: + case 2: + break; + default: + printf(".unk"); + break; } } static void dump_fma_expand_src1(unsigned ctrl) { switch (ctrl) { - case 1: - case 3: - printf(".x"); - break; - case 2: - case 4: - case 5: - printf(".y"); - break; - case 0: - case 6: - case 7: - break; - default: - printf(".unk"); - break; + case 1: + case 3: + printf(".x"); + break; + case 2: + case 4: + case 5: + printf(".y"); + break; + case 0: + case 6: + case 7: + break; + default: + printf(".unk"); + break; } } @@ -887,25 +927,25 @@ static void dump_fma(uint64_t word, struct bifrost_regs regs, struct bifrost_reg printf("%s", info.name); if (info.src_type == FMA_FADD || - info.src_type == FMA_FMINMAX || - info.src_type == FMA_FMA || - info.src_type == FMA_FADD16 || - info.src_type == FMA_FMINMAX16 || - info.src_type == FMA_FMA16) { + info.src_type == FMA_FMINMAX || + info.src_type == FMA_FMA || + info.src_type == FMA_FADD16 || + info.src_type == FMA_FMINMAX16 || + info.src_type == FMA_FMA16) { dump_output_mod(bits(FMA.op, 12, 14)); switch (info.src_type) { - case FMA_FADD: - case FMA_FMA: - case FMA_FADD16: - case FMA_FMA16: - dump_round_mode(bits(FMA.op, 10, 12)); - break; - case FMA_FMINMAX: - case FMA_FMINMAX16: - dump_minmax_mode(bits(FMA.op, 10, 12)); - break; - default: - assert(0); + case FMA_FADD: + case FMA_FMA: + case FMA_FADD16: + case FMA_FMA16: + dump_round_mode(bits(FMA.op, 10, 12)); + break; + case FMA_FMINMAX: + case FMA_FMINMAX16: + dump_minmax_mode(bits(FMA.op, 10, 12)); 
+ break; + default: + assert(0); } } else if (info.src_type == FMA_FCMP || info.src_type == FMA_FCMP16) { dump_fcmp(bits(FMA.op, 10, 13)); @@ -916,25 +956,25 @@ static void dump_fma(uint64_t word, struct bifrost_regs regs, struct bifrost_reg } else if (info.src_type == FMA_FMA_MSCALE) { if (FMA.op & (1 << 11)) { switch ((FMA.op >> 9) & 0x3) { - case 0: - /* This mode seems to do a few things: - * - Makes 0 * infinity (and incidentally 0 * nan) return 0, - * since generating a nan would poison the result of - * 1/infinity and 1/0. - * - Fiddles with which nan is returned in nan * nan, - * presumably to make sure that the same exact nan is - * returned for 1/nan. - */ - printf(".rcp_mode"); - break; - case 3: - /* Similar to the above, but src0 always wins when multiplying - * 0 by infinity. - */ - printf(".sqrt_mode"); - break; - default: - printf(".unk%d_mode", (int) (FMA.op >> 9) & 0x3); + case 0: + /* This mode seems to do a few things: + * - Makes 0 * infinity (and incidentally 0 * nan) return 0, + * since generating a nan would poison the result of + * 1/infinity and 1/0. + * - Fiddles with which nan is returned in nan * nan, + * presumably to make sure that the same exact nan is + * returned for 1/nan. + */ + printf(".rcp_mode"); + break; + case 3: + /* Similar to the above, but src0 always wins when multiplying + * 0 by infinity. 
+ */ + printf(".sqrt_mode"); + break; + default: + printf(".unk%d_mode", (int) (FMA.op >> 9) & 0x3); } } else { dump_output_mod(bits(FMA.op, 9, 11)); @@ -951,162 +991,162 @@ static void dump_fma(uint64_t word, struct bifrost_regs regs, struct bifrost_reg } switch (info.src_type) { - case FMA_ONE_SRC: - dump_src(FMA.src0, regs, consts, true); - break; - case FMA_TWO_SRC: - dump_src(FMA.src0, regs, consts, true); - printf(", "); - dump_src(FMA.op & 0x7, regs, consts, true); - break; - case FMA_FADD: - case FMA_FMINMAX: - if (FMA.op & 0x10) - printf("-"); - if (FMA.op & 0x200) - printf("abs("); - dump_src(FMA.src0, regs, consts, true); - dump_fma_expand_src0((FMA.op >> 6) & 0x7); - if (FMA.op & 0x200) - printf(")"); - printf(", "); - if (FMA.op & 0x20) - printf("-"); - if (FMA.op & 0x8) - printf("abs("); - dump_src(FMA.op & 0x7, regs, consts, true); - dump_fma_expand_src1((FMA.op >> 6) & 0x7); - if (FMA.op & 0x8) - printf(")"); - break; - case FMA_FADD16: - case FMA_FMINMAX16: { - bool abs1 = FMA.op & 0x8; - bool abs2 = (FMA.op & 0x7) < FMA.src0; - if (FMA.op & 0x10) - printf("-"); - if (abs1 || abs2) - printf("abs("); - dump_src(FMA.src0, regs, consts, true); - dump_16swizzle((FMA.op >> 6) & 0x3); - if (abs1 || abs2) - printf(")"); - printf(", "); - if (FMA.op & 0x20) - printf("-"); - if (abs1 && abs2) - printf("abs("); - dump_src(FMA.op & 0x7, regs, consts, true); - dump_16swizzle((FMA.op >> 8) & 0x3); - if (abs1 && abs2) - printf(")"); - break; - } - case FMA_FCMP: - if (FMA.op & 0x200) - printf("abs("); - dump_src(FMA.src0, regs, consts, true); - dump_fma_expand_src0((FMA.op >> 6) & 0x7); - if (FMA.op & 0x200) - printf(")"); - printf(", "); - if (FMA.op & 0x20) - printf("-"); - if (FMA.op & 0x8) - printf("abs("); - dump_src(FMA.op & 0x7, regs, consts, true); - dump_fma_expand_src1((FMA.op >> 6) & 0x7); - if (FMA.op & 0x8) - printf(")"); - break; - case FMA_FCMP16: - dump_src(FMA.src0, regs, consts, true); - // Note: this is kinda a guess, I haven't seen the blob 
set this to - // anything other than the identity, but it matches FMA_TWO_SRCFmod16 - dump_16swizzle((FMA.op >> 6) & 0x3); - printf(", "); - dump_src(FMA.op & 0x7, regs, consts, true); - dump_16swizzle((FMA.op >> 8) & 0x3); - break; - case FMA_SHIFT_ADD64: - dump_src(FMA.src0, regs, consts, true); - printf(", "); - dump_src(FMA.op & 0x7, regs, consts, true); - printf(", "); - printf("shift:%u", (FMA.op >> 3) & 0x7); - break; - case FMA_THREE_SRC: - dump_src(FMA.src0, regs, consts, true); - printf(", "); - dump_src(FMA.op & 0x7, regs, consts, true); - printf(", "); - dump_src((FMA.op >> 3) & 0x7, regs, consts, true); - break; - case FMA_FMA: - if (FMA.op & (1 << 14)) - printf("-"); - if (FMA.op & (1 << 9)) - printf("abs("); - dump_src(FMA.src0, regs, consts, true); - dump_fma_expand_src0((FMA.op >> 6) & 0x7); - if (FMA.op & (1 << 9)) - printf(")"); - printf(", "); - if (FMA.op & (1 << 16)) - printf("abs("); - dump_src(FMA.op & 0x7, regs, consts, true); - dump_fma_expand_src1((FMA.op >> 6) & 0x7); - if (FMA.op & (1 << 16)) - printf(")"); - printf(", "); - if (FMA.op & (1 << 15)) - printf("-"); - if (FMA.op & (1 << 17)) - printf("abs("); - dump_src((FMA.op >> 3) & 0x7, regs, consts, true); - if (FMA.op & (1 << 17)) - printf(")"); - break; - case FMA_FMA16: - if (FMA.op & (1 << 14)) - printf("-"); - dump_src(FMA.src0, regs, consts, true); - dump_16swizzle((FMA.op >> 6) & 0x3); - printf(", "); - dump_src(FMA.op & 0x7, regs, consts, true); - dump_16swizzle((FMA.op >> 8) & 0x3); - printf(", "); - if (FMA.op & (1 << 15)) - printf("-"); - dump_src((FMA.op >> 3) & 0x7, regs, consts, true); - dump_16swizzle((FMA.op >> 16) & 0x3); - break; - case FMA_FOUR_SRC: - dump_src(FMA.src0, regs, consts, true); - printf(", "); - dump_src(FMA.op & 0x7, regs, consts, true); - printf(", "); - dump_src((FMA.op >> 3) & 0x7, regs, consts, true); - printf(", "); - dump_src((FMA.op >> 6) & 0x7, regs, consts, true); - break; - case FMA_FMA_MSCALE: - if (FMA.op & (1 << 12)) - printf("abs("); - 
dump_src(FMA.src0, regs, consts, true); - if (FMA.op & (1 << 12)) - printf(")"); - printf(", "); - if (FMA.op & (1 << 13)) - printf("-"); - dump_src(FMA.op & 0x7, regs, consts, true); - printf(", "); - if (FMA.op & (1 << 14)) - printf("-"); - dump_src((FMA.op >> 3) & 0x7, regs, consts, true); - printf(", "); - dump_src((FMA.op >> 6) & 0x7, regs, consts, true); - break; + case FMA_ONE_SRC: + dump_src(FMA.src0, regs, consts, true); + break; + case FMA_TWO_SRC: + dump_src(FMA.src0, regs, consts, true); + printf(", "); + dump_src(FMA.op & 0x7, regs, consts, true); + break; + case FMA_FADD: + case FMA_FMINMAX: + if (FMA.op & 0x10) + printf("-"); + if (FMA.op & 0x200) + printf("abs("); + dump_src(FMA.src0, regs, consts, true); + dump_fma_expand_src0((FMA.op >> 6) & 0x7); + if (FMA.op & 0x200) + printf(")"); + printf(", "); + if (FMA.op & 0x20) + printf("-"); + if (FMA.op & 0x8) + printf("abs("); + dump_src(FMA.op & 0x7, regs, consts, true); + dump_fma_expand_src1((FMA.op >> 6) & 0x7); + if (FMA.op & 0x8) + printf(")"); + break; + case FMA_FADD16: + case FMA_FMINMAX16: { + bool abs1 = FMA.op & 0x8; + bool abs2 = (FMA.op & 0x7) < FMA.src0; + if (FMA.op & 0x10) + printf("-"); + if (abs1 || abs2) + printf("abs("); + dump_src(FMA.src0, regs, consts, true); + dump_16swizzle((FMA.op >> 6) & 0x3); + if (abs1 || abs2) + printf(")"); + printf(", "); + if (FMA.op & 0x20) + printf("-"); + if (abs1 && abs2) + printf("abs("); + dump_src(FMA.op & 0x7, regs, consts, true); + dump_16swizzle((FMA.op >> 8) & 0x3); + if (abs1 && abs2) + printf(")"); + break; + } + case FMA_FCMP: + if (FMA.op & 0x200) + printf("abs("); + dump_src(FMA.src0, regs, consts, true); + dump_fma_expand_src0((FMA.op >> 6) & 0x7); + if (FMA.op & 0x200) + printf(")"); + printf(", "); + if (FMA.op & 0x20) + printf("-"); + if (FMA.op & 0x8) + printf("abs("); + dump_src(FMA.op & 0x7, regs, consts, true); + dump_fma_expand_src1((FMA.op >> 6) & 0x7); + if (FMA.op & 0x8) + printf(")"); + break; + case FMA_FCMP16: + 
dump_src(FMA.src0, regs, consts, true); + // Note: this is kinda a guess, I haven't seen the blob set this to + // anything other than the identity, but it matches FMA_TWO_SRCFmod16 + dump_16swizzle((FMA.op >> 6) & 0x3); + printf(", "); + dump_src(FMA.op & 0x7, regs, consts, true); + dump_16swizzle((FMA.op >> 8) & 0x3); + break; + case FMA_SHIFT_ADD64: + dump_src(FMA.src0, regs, consts, true); + printf(", "); + dump_src(FMA.op & 0x7, regs, consts, true); + printf(", "); + printf("shift:%u", (FMA.op >> 3) & 0x7); + break; + case FMA_THREE_SRC: + dump_src(FMA.src0, regs, consts, true); + printf(", "); + dump_src(FMA.op & 0x7, regs, consts, true); + printf(", "); + dump_src((FMA.op >> 3) & 0x7, regs, consts, true); + break; + case FMA_FMA: + if (FMA.op & (1 << 14)) + printf("-"); + if (FMA.op & (1 << 9)) + printf("abs("); + dump_src(FMA.src0, regs, consts, true); + dump_fma_expand_src0((FMA.op >> 6) & 0x7); + if (FMA.op & (1 << 9)) + printf(")"); + printf(", "); + if (FMA.op & (1 << 16)) + printf("abs("); + dump_src(FMA.op & 0x7, regs, consts, true); + dump_fma_expand_src1((FMA.op >> 6) & 0x7); + if (FMA.op & (1 << 16)) + printf(")"); + printf(", "); + if (FMA.op & (1 << 15)) + printf("-"); + if (FMA.op & (1 << 17)) + printf("abs("); + dump_src((FMA.op >> 3) & 0x7, regs, consts, true); + if (FMA.op & (1 << 17)) + printf(")"); + break; + case FMA_FMA16: + if (FMA.op & (1 << 14)) + printf("-"); + dump_src(FMA.src0, regs, consts, true); + dump_16swizzle((FMA.op >> 6) & 0x3); + printf(", "); + dump_src(FMA.op & 0x7, regs, consts, true); + dump_16swizzle((FMA.op >> 8) & 0x3); + printf(", "); + if (FMA.op & (1 << 15)) + printf("-"); + dump_src((FMA.op >> 3) & 0x7, regs, consts, true); + dump_16swizzle((FMA.op >> 16) & 0x3); + break; + case FMA_FOUR_SRC: + dump_src(FMA.src0, regs, consts, true); + printf(", "); + dump_src(FMA.op & 0x7, regs, consts, true); + printf(", "); + dump_src((FMA.op >> 3) & 0x7, regs, consts, true); + printf(", "); + dump_src((FMA.op >> 6) & 0x7, 
regs, consts, true); + break; + case FMA_FMA_MSCALE: + if (FMA.op & (1 << 12)) + printf("abs("); + dump_src(FMA.src0, regs, consts, true); + if (FMA.op & (1 << 12)) + printf(")"); + printf(", "); + if (FMA.op & (1 << 13)) + printf("-"); + dump_src(FMA.op & 0x7, regs, consts, true); + printf(", "); + if (FMA.op & (1 << 14)) + printf("-"); + dump_src((FMA.op >> 3) & 0x7, regs, consts, true); + printf(", "); + dump_src((FMA.op >> 6) & 0x7, regs, consts, true); + break; } printf("\n"); } @@ -1332,50 +1372,50 @@ static struct add_op_info find_add_op_info(unsigned op) for (unsigned i = 0; i < ARRAY_SIZE(add_op_infos); i++) { unsigned opCmp = ~0; switch (add_op_infos[i].src_type) { - case ADD_ONE_SRC: - case ADD_BLENDING: - opCmp = op; - break; - case ADD_TWO_SRC: - opCmp = op & ~0x7; - break; - case ADD_THREE_SRC: - opCmp = op & ~0x3f; - break; - case ADD_TEX: - opCmp = op & ~0xf; - break; - case ADD_FADD: - case ADD_FMINMAX: - case ADD_FADD16: - opCmp = op & ~0x1fff; - break; - case ADD_FMINMAX16: - case ADD_FADDMscale: - opCmp = op & ~0xfff; - break; - case ADD_FCMP: - case ADD_FCMP16: - opCmp = op & ~0x7ff; - break; - case ADD_TEX_COMPACT: - opCmp = op & ~0x3ff; - break; - case ADD_VARYING_INTERP: - opCmp = op & ~0x7ff; - break; - case ADD_VARYING_ADDRESS: - opCmp = op & ~0xff; - break; - case ADD_LOAD_ATTR: - opCmp = op & ~0x7f; - break; - case ADD_BRANCH: - opCmp = op & ~0xfff; - break; - default: - opCmp = ~0; - break; + case ADD_ONE_SRC: + case ADD_BLENDING: + opCmp = op; + break; + case ADD_TWO_SRC: + opCmp = op & ~0x7; + break; + case ADD_THREE_SRC: + opCmp = op & ~0x3f; + break; + case ADD_TEX: + opCmp = op & ~0xf; + break; + case ADD_FADD: + case ADD_FMINMAX: + case ADD_FADD16: + opCmp = op & ~0x1fff; + break; + case ADD_FMINMAX16: + case ADD_FADDMscale: + opCmp = op & ~0xfff; + break; + case ADD_FCMP: + case ADD_FCMP16: + opCmp = op & ~0x7ff; + break; + case ADD_TEX_COMPACT: + opCmp = op & ~0x3ff; + break; + case ADD_VARYING_INTERP: + opCmp = op & ~0x7ff; + 
break; + case ADD_VARYING_ADDRESS: + opCmp = op & ~0xff; + break; + case ADD_LOAD_ATTR: + opCmp = op & ~0x7f; + break; + case ADD_BRANCH: + opCmp = op & ~0xfff; + break; + default: + opCmp = ~0; + break; } if (add_op_infos[i].op == opCmp) return add_op_infos[i]; @@ -1390,7 +1430,7 @@ static struct add_op_info find_add_op_info(unsigned op) } static void dump_add(uint64_t word, struct bifrost_regs regs, struct bifrost_regs next_regs, uint64_t *consts, - unsigned data_reg, unsigned offset, bool verbose) + unsigned data_reg, unsigned offset, bool verbose) { if (verbose) { printf("# ADD: %016" PRIx64 "\n", word); @@ -1417,18 +1457,29 @@ static void dump_add(uint64_t word, struct bifrost_regs regs, struct bifrost_reg printf(".v2f16"); } else if (info.src_type == ADD_FADDMscale) { switch ((ADD.op >> 6) & 0x7) { - case 0: break; - // causes GPU hangs on G71 - case 1: printf(".invalid"); break; - // Same as usual outmod value. - case 2: printf(".clamp_0_1"); break; - // If src0 is infinite or NaN, flush it to zero so that the other - // source is passed through unmodified. - case 3: printf(".flush_src0_inf_nan"); break; - // Vice versa. - case 4: printf(".flush_src1_inf_nan"); break; - // Every other case seems to behave the same as the above? - default: printf(".unk%d", (ADD.op >> 6) & 0x7); break; + case 0: + break; + // causes GPU hangs on G71 + case 1: + printf(".invalid"); + break; + // Same as usual outmod value. + case 2: + printf(".clamp_0_1"); + break; + // If src0 is infinite or NaN, flush it to zero so that the other + // source is passed through unmodified. + case 3: + printf(".flush_src0_inf_nan"); + break; + // Vice versa. + case 4: + printf(".flush_src1_inf_nan"); + break; + // Every other case seems to behave the same as the above? 
+ default: + printf(".unk%d", (ADD.op >> 6) & 0x7); + break; } } else if (info.src_type == ADD_VARYING_INTERP) { if (ADD.op & 0x200) @@ -1436,10 +1487,17 @@ static void dump_add(uint64_t word, struct bifrost_regs regs, struct bifrost_reg if (ADD.op & 0x400) printf(".flat"); switch ((ADD.op >> 7) & 0x3) { - case 0: printf(".per_frag"); break; - case 1: printf(".centroid"); break; - case 2: break; - case 3: printf(".explicit"); break; + case 0: + printf(".per_frag"); + break; + case 1: + printf(".centroid"); + break; + case 2: + break; + case 3: + printf(".explicit"); + break; } printf(".v%d", ((ADD.op >> 5) & 0x3) + 1); } else if (info.src_type == ADD_BRANCH) { @@ -1466,79 +1524,79 @@ static void dump_add(uint64_t word, struct bifrost_regs regs, struct bifrost_reg portSwapped = !(ADD.op & 1); switch (cond) { - case BR_COND_LT: - if (portSwapped) - printf(".LT.u"); - else - printf(".LT.i"); - break; - case BR_COND_LE: - if (size == BR_SIZE_32_AND_16X || size == BR_SIZE_32_AND_16Y) { - printf(".UNE.f"); - } else { - if (portSwapped) - printf(".LE.u"); - else - printf(".LE.i"); - } - break; - case BR_COND_GT: - if (portSwapped) - printf(".GT.u"); - else - printf(".GT.i"); - break; - case BR_COND_GE: - if (portSwapped) - printf(".GE.u"); - else - printf(".GE.i"); - break; - case BR_COND_EQ: - if (portSwapped) - printf(".NE.i"); - else - printf(".EQ.i"); - break; - case BR_COND_OEQ: - if (portSwapped) - printf(".UNE.f"); - else - printf(".OEQ.f"); - break; - case BR_COND_OGT: - if (portSwapped) - printf(".OGT.unk.f"); - else - printf(".OGT.f"); - break; - case BR_COND_OLT: + case BR_COND_LT: + if (portSwapped) + printf(".LT.u"); + else + printf(".LT.i"); + break; + case BR_COND_LE: + if (size == BR_SIZE_32_AND_16X || size == BR_SIZE_32_AND_16Y) { + printf(".UNE.f"); + } else { if (portSwapped) - printf(".OLT.unk.f"); + printf(".LE.u"); else - printf(".OLT.f"); - break; + printf(".LE.i"); + } + break; + case BR_COND_GT: + if (portSwapped) + printf(".GT.u"); + else + 
printf(".GT.i"); + break; + case BR_COND_GE: + if (portSwapped) + printf(".GE.u"); + else + printf(".GE.i"); + break; + case BR_COND_EQ: + if (portSwapped) + printf(".NE.i"); + else + printf(".EQ.i"); + break; + case BR_COND_OEQ: + if (portSwapped) + printf(".UNE.f"); + else + printf(".OEQ.f"); + break; + case BR_COND_OGT: + if (portSwapped) + printf(".OGT.unk.f"); + else + printf(".OGT.f"); + break; + case BR_COND_OLT: + if (portSwapped) + printf(".OLT.unk.f"); + else + printf(".OLT.f"); + break; } switch (size) { - case BR_SIZE_32: - case BR_SIZE_32_AND_16X: - case BR_SIZE_32_AND_16Y: - printf("32"); - break; - case BR_SIZE_16XX: - case BR_SIZE_16YY: - case BR_SIZE_16YX0: - case BR_SIZE_16YX1: - printf("16"); - break; - case BR_SIZE_ZERO: { - unsigned ctrl = (ADD.op >> 1) & 0x3; - if (ctrl == 0) - printf("32.Z"); - else - printf("16.Z"); - break; - } + case BR_SIZE_32: + case BR_SIZE_32_AND_16X: + case BR_SIZE_32_AND_16Y: + printf("32"); + break; + case BR_SIZE_16XX: + case BR_SIZE_16YY: + case BR_SIZE_16YX0: + case BR_SIZE_16YX1: + printf("16"); + break; + case BR_SIZE_ZERO: { + unsigned ctrl = (ADD.op >> 1) & 0x3; + if (ctrl == 0) + printf("32.Z"); + else + printf("16.Z"); + break; + } } } } @@ -1552,414 +1610,421 @@ static void dump_add(uint64_t word, struct bifrost_regs regs, struct bifrost_reg } switch (info.src_type) { - case ADD_BLENDING: - // Note: in this case, regs.uniform_const == location | 0x8 - // This probably means we can't load uniforms or immediates in the - // same instruction. This re-uses the encoding that normally means - // "disabled", where the low 4 bits are ignored. Perhaps the extra - // 0x8 or'd in indicates this is happening. - printf("location:%d, ", regs.uniform_const & 0x7); - // fallthrough - case ADD_ONE_SRC: - dump_src(ADD.src0, regs, consts, false); + case ADD_BLENDING: + // Note: in this case, regs.uniform_const == location | 0x8 + // This probably means we can't load uniforms or immediates in the + // same instruction. 
This re-uses the encoding that normally means + // "disabled", where the low 4 bits are ignored. Perhaps the extra + // 0x8 or'd in indicates this is happening. + printf("location:%d, ", regs.uniform_const & 0x7); + // fallthrough + case ADD_ONE_SRC: + dump_src(ADD.src0, regs, consts, false); + break; + case ADD_TEX: + case ADD_TEX_COMPACT: { + int tex_index; + int sampler_index; + bool dualTex = false; + if (info.src_type == ADD_TEX_COMPACT) { + tex_index = (ADD.op >> 3) & 0x7; + sampler_index = (ADD.op >> 7) & 0x7; + bool unknown = (ADD.op & 0x40); + // TODO: figure out if the unknown bit is ever 0 + if (!unknown) + printf("unknown "); + } else { + uint64_t constVal = get_const(consts, regs); + uint32_t controlBits = (ADD.op & 0x8) ? (constVal >> 32) : constVal; + struct bifrost_tex_ctrl ctrl; + memcpy((char *) &ctrl, (char *) &controlBits, sizeof(ctrl)); + + // TODO: figure out what actually triggers dual-tex + if (ctrl.result_type == 9) { + struct bifrost_dual_tex_ctrl dualCtrl; + memcpy((char *) &dualCtrl, (char *) &controlBits, sizeof(ctrl)); + printf("(dualtex) tex0:%d samp0:%d tex1:%d samp1:%d ", + dualCtrl.tex_index0, dualCtrl.sampler_index0, + dualCtrl.tex_index1, dualCtrl.sampler_index1); + if (dualCtrl.unk0 != 3) + printf("unk:%d ", dualCtrl.unk0); + dualTex = true; + } else { + if (ctrl.no_merge_index) { + tex_index = ctrl.tex_index; + sampler_index = ctrl.sampler_index; + } else { + tex_index = sampler_index = ctrl.tex_index; + unsigned unk = ctrl.sampler_index >> 2; + if (unk != 3) + printf("unk:%d ", unk); + if (ctrl.sampler_index & 1) + tex_index = -1; + if (ctrl.sampler_index & 2) + sampler_index = -1; + } + + if (ctrl.unk0 != 3) + printf("unk0:%d ", ctrl.unk0); + if (ctrl.unk1) + printf("unk1 "); + if (ctrl.unk2 != 0xf) + printf("unk2:%x ", ctrl.unk2); + + switch (ctrl.result_type) { + case 0x4: + printf("f32 "); + break; + case 0xe: + printf("i32 "); + break; + case 0xf: + printf("u32 "); + break; + default: + printf("unktype(%x) ", 
ctrl.result_type); + } + + switch (ctrl.tex_type) { + case 0: + printf("cube "); + break; + case 1: + printf("buffer "); + break; + case 2: + printf("2D "); + break; + case 3: + printf("3D "); + break; + } + + if (ctrl.is_shadow) + printf("shadow "); + if (ctrl.is_array) + printf("array "); + + if (!ctrl.filter) { + if (ctrl.calc_gradients) { + int comp = (controlBits >> 20) & 0x3; + printf("txg comp:%d ", comp); + } else { + printf("txf "); + } + } else { + if (!ctrl.not_supply_lod) { + if (ctrl.compute_lod) + printf("lod_bias "); + else + printf("lod "); + } + + if (!ctrl.calc_gradients) + printf("grad "); + } + + if (ctrl.texel_offset) + printf("offset "); + } + } + + if (!dualTex) { + if (tex_index == -1) + printf("tex:indirect "); + else + printf("tex:%d ", tex_index); + + if (sampler_index == -1) + printf("samp:indirect "); + else + printf("samp:%d ", sampler_index); + } + break; + } + case ADD_VARYING_INTERP: { + unsigned addr = ADD.op & 0x1f; + if (addr < 0b10100) { + // direct addr + printf("%d", addr); + } else if (addr < 0b11000) { + if (addr == 22) + printf("fragw"); + else if (addr == 23) + printf("fragz"); + else + printf("unk%d", addr); + } else { + dump_src(ADD.op & 0x7, regs, consts, false); + } + printf(", "); + dump_src(ADD.src0, regs, consts, false); + break; + } + case ADD_VARYING_ADDRESS: { + dump_src(ADD.src0, regs, consts, false); + printf(", "); + dump_src(ADD.op & 0x7, regs, consts, false); + printf(", "); + unsigned location = (ADD.op >> 3) & 0x1f; + if (location < 16) { + printf("location:%d", location); + } else if (location == 20) { + printf("location:%u", (uint32_t) get_const(consts, regs)); + } else if (location == 21) { + printf("location:%u", (uint32_t) (get_const(consts, regs) >> 32)); + } else { + printf("location:%d(unk)", location); + } + break; + } + case ADD_LOAD_ATTR: + printf("location:%d, ", (ADD.op >> 3) & 0xf); + case ADD_TWO_SRC: + dump_src(ADD.src0, regs, consts, false); + printf(", "); + dump_src(ADD.op & 0x7, regs, 
consts, false); + break; + case ADD_THREE_SRC: + dump_src(ADD.src0, regs, consts, false); + printf(", "); + dump_src(ADD.op & 0x7, regs, consts, false); + printf(", "); + dump_src((ADD.op >> 3) & 0x7, regs, consts, false); + break; + case ADD_FADD: + case ADD_FMINMAX: + if (ADD.op & 0x10) + printf("-"); + if (ADD.op & 0x1000) + printf("abs("); + dump_src(ADD.src0, regs, consts, false); + switch ((ADD.op >> 6) & 0x3) { + case 3: + printf(".x"); break; - case ADD_TEX: - case ADD_TEX_COMPACT: { - int tex_index; - int sampler_index; - bool dualTex = false; - if (info.src_type == ADD_TEX_COMPACT) { - tex_index = (ADD.op >> 3) & 0x7; - sampler_index = (ADD.op >> 7) & 0x7; - bool unknown = (ADD.op & 0x40); - // TODO: figure out if the unknown bit is ever 0 - if (!unknown) - printf("unknown "); - } else { - uint64_t constVal = get_const(consts, regs); - uint32_t controlBits = (ADD.op & 0x8) ? (constVal >> 32) : constVal; - struct bifrost_tex_ctrl ctrl; - memcpy((char *) &ctrl, (char *) &controlBits, sizeof(ctrl)); - - // TODO: figure out what actually triggers dual-tex - if (ctrl.result_type == 9) { - struct bifrost_dual_tex_ctrl dualCtrl; - memcpy((char *) &dualCtrl, (char *) &controlBits, sizeof(ctrl)); - printf("(dualtex) tex0:%d samp0:%d tex1:%d samp1:%d ", - dualCtrl.tex_index0, dualCtrl.sampler_index0, - dualCtrl.tex_index1, dualCtrl.sampler_index1); - if (dualCtrl.unk0 != 3) - printf("unk:%d ", dualCtrl.unk0); - dualTex = true; - } else { - if (ctrl.no_merge_index) { - tex_index = ctrl.tex_index; - sampler_index = ctrl.sampler_index; - } else { - tex_index = sampler_index = ctrl.tex_index; - unsigned unk = ctrl.sampler_index >> 2; - if (unk != 3) - printf("unk:%d ", unk); - if (ctrl.sampler_index & 1) - tex_index = -1; - if (ctrl.sampler_index & 2) - sampler_index = -1; - } - - if (ctrl.unk0 != 3) - printf("unk0:%d ", ctrl.unk0); - if (ctrl.unk1) - printf("unk1 "); - if (ctrl.unk2 != 0xf) - printf("unk2:%x ", ctrl.unk2); - - switch (ctrl.result_type) { - case 0x4: - 
printf("f32 "); break; - case 0xe: - printf("i32 "); break; - case 0xf: - printf("u32 "); break; - default: - printf("unktype(%x) ", ctrl.result_type); - } - - switch (ctrl.tex_type) { - case 0: - printf("cube "); break; - case 1: - printf("buffer "); break; - case 2: - printf("2D "); break; - case 3: - printf("3D "); break; - } - - if (ctrl.is_shadow) - printf("shadow "); - if (ctrl.is_array) - printf("array "); - - if (!ctrl.filter) { - if (ctrl.calc_gradients) { - int comp = (controlBits >> 20) & 0x3; - printf("txg comp:%d ", comp); - } else { - printf("txf "); - } - } else { - if (!ctrl.not_supply_lod) { - if (ctrl.compute_lod) - printf("lod_bias "); - else - printf("lod "); - } - - if (!ctrl.calc_gradients) - printf("grad "); - } - - if (ctrl.texel_offset) - printf("offset "); - } - } - - if (!dualTex) { - if (tex_index == -1) - printf("tex:indirect "); - else - printf("tex:%d ", tex_index); - - if (sampler_index == -1) - printf("samp:indirect "); - else - printf("samp:%d ", sampler_index); - } - break; - } - case ADD_VARYING_INTERP: { - unsigned addr = ADD.op & 0x1f; - if (addr < 0b10100) { - // direct addr - printf("%d", addr); - } else if (addr < 0b11000) { - if (addr == 22) - printf("fragw"); - else if (addr == 23) - printf("fragz"); - else - printf("unk%d", addr); - } else { - dump_src(ADD.op & 0x7, regs, consts, false); - } - printf(", "); - dump_src(ADD.src0, regs, consts, false); - break; - } - case ADD_VARYING_ADDRESS: { - dump_src(ADD.src0, regs, consts, false); - printf(", "); - dump_src(ADD.op & 0x7, regs, consts, false); - printf(", "); - unsigned location = (ADD.op >> 3) & 0x1f; - if (location < 16) { - printf("location:%d", location); - } else if (location == 20) { - printf("location:%u", (uint32_t) get_const(consts, regs)); - } else if (location == 21) { - printf("location:%u", (uint32_t) (get_const(consts, regs) >> 32)); - } else { - printf("location:%d(unk)", location); - } - break; - } - case ADD_LOAD_ATTR: - printf("location:%d, ", (ADD.op 
>> 3) & 0xf); - case ADD_TWO_SRC: - dump_src(ADD.src0, regs, consts, false); - printf(", "); - dump_src(ADD.op & 0x7, regs, consts, false); - break; - case ADD_THREE_SRC: - dump_src(ADD.src0, regs, consts, false); - printf(", "); - dump_src(ADD.op & 0x7, regs, consts, false); - printf(", "); - dump_src((ADD.op >> 3) & 0x7, regs, consts, false); - break; - case ADD_FADD: - case ADD_FMINMAX: - if (ADD.op & 0x10) - printf("-"); - if (ADD.op & 0x1000) - printf("abs("); - dump_src(ADD.src0, regs, consts, false); - switch ((ADD.op >> 6) & 0x3) { - case 3: - printf(".x"); - break; - default: - break; - } - if (ADD.op & 0x1000) - printf(")"); - printf(", "); - if (ADD.op & 0x20) - printf("-"); - if (ADD.op & 0x8) - printf("abs("); - dump_src(ADD.op & 0x7, regs, consts, false); - switch ((ADD.op >> 6) & 0x3) { - case 1: - case 3: - printf(".x"); - break; - case 2: - printf(".y"); - break; - case 0: - break; - default: - printf(".unk"); - break; - } - if (ADD.op & 0x8) - printf(")"); - break; - case ADD_FADD16: - if (ADD.op & 0x10) - printf("-"); - if (ADD.op & 0x1000) - printf("abs("); - dump_src(ADD.src0, regs, consts, false); - if (ADD.op & 0x1000) - printf(")"); - dump_16swizzle((ADD.op >> 6) & 0x3); - printf(", "); - if (ADD.op & 0x20) - printf("-"); - if (ADD.op & 0x8) - printf("abs("); - dump_src(ADD.op & 0x7, regs, consts, false); - dump_16swizzle((ADD.op >> 8) & 0x3); - if (ADD.op & 0x8) - printf(")"); - break; - case ADD_FMINMAX16: { - bool abs1 = ADD.op & 0x8; - bool abs2 = (ADD.op & 0x7) < ADD.src0; - if (ADD.op & 0x10) - printf("-"); - if (abs1 || abs2) - printf("abs("); - dump_src(ADD.src0, regs, consts, false); - dump_16swizzle((ADD.op >> 6) & 0x3); - if (abs1 || abs2) - printf(")"); - printf(", "); - if (ADD.op & 0x20) - printf("-"); - if (abs1 && abs2) - printf("abs("); - dump_src(ADD.op & 0x7, regs, consts, false); - dump_16swizzle((ADD.op >> 8) & 0x3); - if (abs1 && abs2) - printf(")"); - break; - } - case ADD_FADDMscale: { - if (ADD.op & 0x400) - 
printf("-"); - if (ADD.op & 0x200) - printf("abs("); - dump_src(ADD.src0, regs, consts, false); - if (ADD.op & 0x200) - printf(")"); - - printf(", "); - - if (ADD.op & 0x800) - printf("-"); - dump_src(ADD.op & 0x7, regs, consts, false); - - printf(", "); - - dump_src((ADD.op >> 3) & 0x7, regs, consts, false); - break; - } - case ADD_FCMP: - if (ADD.op & 0x400) { - printf("-"); - } - if (ADD.op & 0x100) { - printf("abs("); - } - dump_src(ADD.src0, regs, consts, false); - switch ((ADD.op >> 6) & 0x3) { - case 3: - printf(".x"); - break; - default: - break; - } - if (ADD.op & 0x100) { - printf(")"); - } - printf(", "); - if (ADD.op & 0x200) { - printf("abs("); - } - dump_src(ADD.op & 0x7, regs, consts, false); - switch ((ADD.op >> 6) & 0x3) { - case 1: - case 3: - printf(".x"); - break; - case 2: - printf(".y"); - break; - case 0: - break; - default: - printf(".unk"); - break; - } - if (ADD.op & 0x200) { - printf(")"); - } - break; - case ADD_FCMP16: - dump_src(ADD.src0, regs, consts, false); - dump_16swizzle((ADD.op >> 6) & 0x3); - printf(", "); - dump_src(ADD.op & 0x7, regs, consts, false); - dump_16swizzle((ADD.op >> 8) & 0x3); - break; - case ADD_BRANCH: { - enum branch_code code = (enum branch_code) ((ADD.op >> 6) & 0x3f); - enum branch_bit_size size = (enum branch_bit_size) ((ADD.op >> 9) & 0x7); - if (code != BR_ALWAYS) { - dump_src(ADD.src0, regs, consts, false); - switch (size) { - case BR_SIZE_16XX: - printf(".x"); - break; - case BR_SIZE_16YY: - case BR_SIZE_16YX0: - case BR_SIZE_16YX1: - printf(".y"); - break; - case BR_SIZE_ZERO: { - unsigned ctrl = (ADD.op >> 1) & 0x3; - switch (ctrl) { - case 1: - printf(".y"); - break; - case 2: - printf(".x"); - break; - default: - break; - } - } - default: - break; - } - printf(", "); - } - if (code != BR_ALWAYS && size != BR_SIZE_ZERO) { - dump_src(ADD.op & 0x7, regs, consts, false); - switch (size) { - case BR_SIZE_16XX: - case BR_SIZE_16YX0: - case BR_SIZE_16YX1: - case BR_SIZE_32_AND_16X: - printf(".x"); - break; 
- case BR_SIZE_16YY: - case BR_SIZE_32_AND_16Y: - printf(".y"); - break; - default: - break; - } - printf(", "); - } - // I haven't had the chance to test if this actually specifies the - // branch offset, since I couldn't get it to produce values other - // than 5 (uniform/const high), but these three bits are always - // consistent across branch instructions, so it makes sense... - int offsetSrc = (ADD.op >> 3) & 0x7; - if (offsetSrc == 4 || offsetSrc == 5) { - // If the offset is known/constant, we can decode it - uint32_t raw_offset; - if (offsetSrc == 4) - raw_offset = get_const(consts, regs); - else - raw_offset = get_const(consts, regs) >> 32; - // The high 4 bits are flags, while the rest is the - // twos-complement offset in bytes (here we convert to - // clauses). - int32_t branch_offset = ((int32_t) raw_offset << 4) >> 8; - - // If high4 is the high 4 bits of the last 64-bit constant, - // this is calculated as (high4 + 4) & 0xf, or 0 if the branch - // offset itself is the last constant. Not sure if this is - // actually used, or just garbage in unused bits, but in any - // case, we can just ignore it here since it's redundant. Note - // that if there is any padding, this will be 4 since the - // padding counts as the last constant. - unsigned flags = raw_offset >> 28; - (void) flags; - - // Note: the offset is in bytes, relative to the beginning of the - // current clause, so a zero offset would be a loop back to the - // same clause (annoyingly different from Midgard). 
- printf("clause_%d", offset + branch_offset); - } else { - dump_src(offsetSrc, regs, consts, false); - } - } + default: + break; + } + if (ADD.op & 0x1000) + printf(")"); + printf(", "); + if (ADD.op & 0x20) + printf("-"); + if (ADD.op & 0x8) + printf("abs("); + dump_src(ADD.op & 0x7, regs, consts, false); + switch ((ADD.op >> 6) & 0x3) { + case 1: + case 3: + printf(".x"); + break; + case 2: + printf(".y"); + break; + case 0: + break; + default: + printf(".unk"); + break; + } + if (ADD.op & 0x8) + printf(")"); + break; + case ADD_FADD16: + if (ADD.op & 0x10) + printf("-"); + if (ADD.op & 0x1000) + printf("abs("); + dump_src(ADD.src0, regs, consts, false); + if (ADD.op & 0x1000) + printf(")"); + dump_16swizzle((ADD.op >> 6) & 0x3); + printf(", "); + if (ADD.op & 0x20) + printf("-"); + if (ADD.op & 0x8) + printf("abs("); + dump_src(ADD.op & 0x7, regs, consts, false); + dump_16swizzle((ADD.op >> 8) & 0x3); + if (ADD.op & 0x8) + printf(")"); + break; + case ADD_FMINMAX16: { + bool abs1 = ADD.op & 0x8; + bool abs2 = (ADD.op & 0x7) < ADD.src0; + if (ADD.op & 0x10) + printf("-"); + if (abs1 || abs2) + printf("abs("); + dump_src(ADD.src0, regs, consts, false); + dump_16swizzle((ADD.op >> 6) & 0x3); + if (abs1 || abs2) + printf(")"); + printf(", "); + if (ADD.op & 0x20) + printf("-"); + if (abs1 && abs2) + printf("abs("); + dump_src(ADD.op & 0x7, regs, consts, false); + dump_16swizzle((ADD.op >> 8) & 0x3); + if (abs1 && abs2) + printf(")"); + break; + } + case ADD_FADDMscale: { + if (ADD.op & 0x400) + printf("-"); + if (ADD.op & 0x200) + printf("abs("); + dump_src(ADD.src0, regs, consts, false); + if (ADD.op & 0x200) + printf(")"); + + printf(", "); + + if (ADD.op & 0x800) + printf("-"); + dump_src(ADD.op & 0x7, regs, consts, false); + + printf(", "); + + dump_src((ADD.op >> 3) & 0x7, regs, consts, false); + break; + } + case ADD_FCMP: + if (ADD.op & 0x400) { + printf("-"); + } + if (ADD.op & 0x100) { + printf("abs("); + } + dump_src(ADD.src0, regs, consts, false); + 
switch ((ADD.op >> 6) & 0x3) { + case 3: + printf(".x"); + break; + default: + break; + } + if (ADD.op & 0x100) { + printf(")"); + } + printf(", "); + if (ADD.op & 0x200) { + printf("abs("); + } + dump_src(ADD.op & 0x7, regs, consts, false); + switch ((ADD.op >> 6) & 0x3) { + case 1: + case 3: + printf(".x"); + break; + case 2: + printf(".y"); + break; + case 0: + break; + default: + printf(".unk"); + break; + } + if (ADD.op & 0x200) { + printf(")"); + } + break; + case ADD_FCMP16: + dump_src(ADD.src0, regs, consts, false); + dump_16swizzle((ADD.op >> 6) & 0x3); + printf(", "); + dump_src(ADD.op & 0x7, regs, consts, false); + dump_16swizzle((ADD.op >> 8) & 0x3); + break; + case ADD_BRANCH: { + enum branch_code code = (enum branch_code) ((ADD.op >> 6) & 0x3f); + enum branch_bit_size size = (enum branch_bit_size) ((ADD.op >> 9) & 0x7); + if (code != BR_ALWAYS) { + dump_src(ADD.src0, regs, consts, false); + switch (size) { + case BR_SIZE_16XX: + printf(".x"); + break; + case BR_SIZE_16YY: + case BR_SIZE_16YX0: + case BR_SIZE_16YX1: + printf(".y"); + break; + case BR_SIZE_ZERO: { + unsigned ctrl = (ADD.op >> 1) & 0x3; + switch (ctrl) { + case 1: + printf(".y"); + break; + case 2: + printf(".x"); + break; + default: + break; + } + } + default: + break; + } + printf(", "); + } + if (code != BR_ALWAYS && size != BR_SIZE_ZERO) { + dump_src(ADD.op & 0x7, regs, consts, false); + switch (size) { + case BR_SIZE_16XX: + case BR_SIZE_16YX0: + case BR_SIZE_16YX1: + case BR_SIZE_32_AND_16X: + printf(".x"); + break; + case BR_SIZE_16YY: + case BR_SIZE_32_AND_16Y: + printf(".y"); + break; + default: + break; + } + printf(", "); + } + // I haven't had the chance to test if this actually specifies the + // branch offset, since I couldn't get it to produce values other + // than 5 (uniform/const high), but these three bits are always + // consistent across branch instructions, so it makes sense... 
+ int offsetSrc = (ADD.op >> 3) & 0x7; + if (offsetSrc == 4 || offsetSrc == 5) { + // If the offset is known/constant, we can decode it + uint32_t raw_offset; + if (offsetSrc == 4) + raw_offset = get_const(consts, regs); + else + raw_offset = get_const(consts, regs) >> 32; + // The high 4 bits are flags, while the rest is the + // twos-complement offset in bytes (here we convert to + // clauses). + int32_t branch_offset = ((int32_t) raw_offset << 4) >> 8; + + // If high4 is the high 4 bits of the last 64-bit constant, + // this is calculated as (high4 + 4) & 0xf, or 0 if the branch + // offset itself is the last constant. Not sure if this is + // actually used, or just garbage in unused bits, but in any + // case, we can just ignore it here since it's redundant. Note + // that if there is any padding, this will be 4 since the + // padding counts as the last constant. + unsigned flags = raw_offset >> 28; + (void) flags; + + // Note: the offset is in bytes, relative to the beginning of the + // current clause, so a zero offset would be a loop back to the + // same clause (annoyingly different from Midgard). 
+ printf("clause_%d", offset + branch_offset); + } else { + dump_src(offsetSrc, regs, consts, false); + } + } } if (info.has_data_reg) { printf(", R%d", data_reg); @@ -1981,7 +2046,8 @@ void dump_instr(const struct bifrost_alu_inst *instr, struct bifrost_regs next_r dump_add(instr->add_bits, regs, next_regs, consts, data_reg, offset, verbose); } -bool dump_clause(uint32_t *words, unsigned *size, unsigned offset, bool verbose) { +bool dump_clause(uint32_t *words, unsigned *size, unsigned offset, bool verbose) +{ // State for a decoded clause struct bifrost_alu_inst instrs[8] = {}; uint64_t consts[6] = {}; @@ -2027,128 +2093,128 @@ bool dump_clause(uint32_t *words, unsigned *size, unsigned offset, bool verbose) } else { bool done = false; switch ((tag >> 3) & 0x7) { - case 0x0: - switch (tag & 0x7) { - case 0x3: - main_instr.add_bits |= bits(words[3], 29, 32) << 17; - instrs[1] = main_instr; - num_instrs = 2; - done = stop; - break; - case 0x4: - instrs[2].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; - instrs[2].fma_bits |= bits(words[2], 19, 32) << 10; - consts[0] = const0; - num_instrs = 3; - num_consts = 1; - done = stop; - break; - case 0x1: - case 0x5: - instrs[2].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; - instrs[2].fma_bits |= bits(words[2], 19, 32) << 10; - main_instr.add_bits |= bits(words[3], 26, 29) << 17; - instrs[3] = main_instr; - if ((tag & 0x7) == 0x5) { - num_instrs = 4; - done = stop; - } - break; - case 0x6: - instrs[5].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; - instrs[5].fma_bits |= bits(words[2], 19, 32) << 10; - consts[0] = const0; - num_instrs = 6; - num_consts = 1; - done = stop; - break; - case 0x7: - instrs[5].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; - instrs[5].fma_bits |= bits(words[2], 19, 32) << 10; - main_instr.add_bits |= bits(words[3], 26, 29) << 17; - instrs[6] = main_instr; - num_instrs = 7; - done = stop; - break; - default: - printf("unknown 
tag bits 0x%02x\n", tag); - } + case 0x0: + switch (tag & 0x7) { + case 0x3: + main_instr.add_bits |= bits(words[3], 29, 32) << 17; + instrs[1] = main_instr; + num_instrs = 2; + done = stop; break; - case 0x2: - case 0x3: { - unsigned idx = ((tag >> 3) & 0x7) == 2 ? 4 : 7; - main_instr.add_bits |= (tag & 0x7) << 17; - instrs[idx] = main_instr; - consts[0] |= (bits(words[2], 19, 32) | ((uint64_t) words[3] << 13)) << 19; - num_consts = 1; - num_instrs = idx + 1; - done = stop; - break; - } - case 0x4: { - unsigned idx = stop ? 4 : 1; - main_instr.add_bits |= (tag & 0x7) << 17; - instrs[idx] = main_instr; - instrs[idx + 1].fma_bits |= bits(words[3], 22, 32); - instrs[idx + 1].reg_bits = bits(words[2], 19, 32) | (bits(words[3], 0, 22) << (32 - 19)); - break; - } - case 0x1: - // only constants can come after this - num_instrs = 1; + case 0x4: + instrs[2].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; + instrs[2].fma_bits |= bits(words[2], 19, 32) << 10; + consts[0] = const0; + num_instrs = 3; + num_consts = 1; done = stop; + break; + case 0x1: case 0x5: - header_bits = bits(words[2], 19, 32) | ((uint64_t) words[3] << (32 - 19)); - main_instr.add_bits |= (tag & 0x7) << 17; - instrs[0] = main_instr; + instrs[2].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; + instrs[2].fma_bits |= bits(words[2], 19, 32) << 10; + main_instr.add_bits |= bits(words[3], 26, 29) << 17; + instrs[3] = main_instr; + if ((tag & 0x7) == 0x5) { + num_instrs = 4; + done = stop; + } break; case 0x6: - case 0x7: { - unsigned pos = tag & 0xf; - // note that `pos' encodes both the total number of - // instructions and the position in the constant stream, - // presumably because decoded constants and instructions - // share a buffer in the decoder, but we only care about - // the position in the constant stream; the total number of - // instructions is redundant. 
- unsigned const_idx = 7; - switch (pos) { - case 0: - case 1: - case 2: - case 6: - const_idx = 0; - break; - case 3: - case 4: - case 7: - case 9: - const_idx = 1; - break; - case 5: - case 0xa: - const_idx = 2; - break; - case 8: - case 0xb: - case 0xc: - const_idx = 3; - break; - case 0xd: - const_idx = 4; - break; - default: - printf("# unknown pos 0x%x\n", pos); - } - if (num_consts < const_idx + 2) - num_consts = const_idx + 2; - consts[const_idx] = const0; - consts[const_idx + 1] = const1; - done = stop; - break; - } + instrs[5].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; + instrs[5].fma_bits |= bits(words[2], 19, 32) << 10; + consts[0] = const0; + num_instrs = 6; + num_consts = 1; + done = stop; + break; + case 0x7: + instrs[5].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17; + instrs[5].fma_bits |= bits(words[2], 19, 32) << 10; + main_instr.add_bits |= bits(words[3], 26, 29) << 17; + instrs[6] = main_instr; + num_instrs = 7; + done = stop; + break; + default: + printf("unknown tag bits 0x%02x\n", tag); + } + break; + case 0x2: + case 0x3: { + unsigned idx = ((tag >> 3) & 0x7) == 2 ? 4 : 7; + main_instr.add_bits |= (tag & 0x7) << 17; + instrs[idx] = main_instr; + consts[0] |= (bits(words[2], 19, 32) | ((uint64_t) words[3] << 13)) << 19; + num_consts = 1; + num_instrs = idx + 1; + done = stop; + break; + } + case 0x4: { + unsigned idx = stop ? 
4 : 1; + main_instr.add_bits |= (tag & 0x7) << 17; + instrs[idx] = main_instr; + instrs[idx + 1].fma_bits |= bits(words[3], 22, 32); + instrs[idx + 1].reg_bits = bits(words[2], 19, 32) | (bits(words[3], 0, 22) << (32 - 19)); + break; + } + case 0x1: + // only constants can come after this + num_instrs = 1; + done = stop; + case 0x5: + header_bits = bits(words[2], 19, 32) | ((uint64_t) words[3] << (32 - 19)); + main_instr.add_bits |= (tag & 0x7) << 17; + instrs[0] = main_instr; + break; + case 0x6: + case 0x7: { + unsigned pos = tag & 0xf; + // note that `pos' encodes both the total number of + // instructions and the position in the constant stream, + // presumably because decoded constants and instructions + // share a buffer in the decoder, but we only care about + // the position in the constant stream; the total number of + // instructions is redundant. + unsigned const_idx = 7; + switch (pos) { + case 0: + case 1: + case 2: + case 6: + const_idx = 0; + break; + case 3: + case 4: + case 7: + case 9: + const_idx = 1; + break; + case 5: + case 0xa: + const_idx = 2; + break; + case 8: + case 0xb: + case 0xc: + const_idx = 3; + break; + case 0xd: + const_idx = 4; + break; default: - break; + printf("# unknown pos 0x%x\n", pos); + } + if (num_consts < const_idx + 2) + num_consts = const_idx + 2; + consts[const_idx] = const0; + consts[const_idx + 1] = const1; + done = stop; + break; + } + default: + break; } if (done) @@ -2173,10 +2239,10 @@ bool dump_clause(uint32_t *words, unsigned *size, unsigned offset, bool verbose) struct bifrost_regs next_regs; if (i + 1 == num_instrs) { memcpy((char *) &next_regs, (char *) &instrs[0].reg_bits, - sizeof(next_regs)); + sizeof(next_regs)); } else { memcpy((char *) &next_regs, (char *) &instrs[i + 1].reg_bits, - sizeof(next_regs)); + sizeof(next_regs)); } dump_instr(&instrs[i], next_regs, consts, header.datareg, offset, verbose); @@ -2198,8 +2264,7 @@ void disassemble_bifrost(uint8_t *code, size_t size, bool verbose) uint32_t 
*words_end = words + (size / 4); // used for displaying branch targets unsigned offset = 0; - while (words != words_end) - { + while (words != words_end) { // we don't know what the program-end bit is quite yet, so for now just // assume that an all-0 quadword is padding uint32_t zero[4] = {};