From 67610a0323ddfe0d7cced121abb43286b862b495 Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Tue, 18 Dec 2018 20:15:57 -0500 Subject: [PATCH] freedreno: a2xx: NIR backend This patch replaces the a2xx TGSI compiler with a NIR compiler. It also adds several new features: -gl_FrontFacing, gl_FragCoord, gl_PointCoord, gl_PointSize -control flow (including loops) -texture related features (LOD/bias, cubemaps) -filling scalar ALU slot when possible Signed-off-by: Jonathan Marek --- .../drivers/freedreno/Makefile.sources | 10 +- .../drivers/freedreno/a2xx/fd2_compiler.c | 1119 ---------------- .../drivers/freedreno/a2xx/fd2_compiler.h | 36 - src/gallium/drivers/freedreno/a2xx/fd2_draw.c | 2 +- src/gallium/drivers/freedreno/a2xx/fd2_emit.c | 20 +- src/gallium/drivers/freedreno/a2xx/fd2_gmem.c | 14 +- .../drivers/freedreno/a2xx/fd2_program.c | 521 +++----- .../drivers/freedreno/a2xx/fd2_program.h | 46 +- .../drivers/freedreno/a2xx/instr-a2xx.h | 19 +- src/gallium/drivers/freedreno/a2xx/ir-a2xx.c | 809 ------------ src/gallium/drivers/freedreno/a2xx/ir-a2xx.h | 188 --- src/gallium/drivers/freedreno/a2xx/ir2.c | 304 +++++ src/gallium/drivers/freedreno/a2xx/ir2.h | 94 ++ .../drivers/freedreno/a2xx/ir2_assemble.c | 548 ++++++++ src/gallium/drivers/freedreno/a2xx/ir2_nir.c | 1173 +++++++++++++++++ .../drivers/freedreno/a2xx/ir2_private.h | 392 ++++++ src/gallium/drivers/freedreno/a2xx/ir2_ra.c | 226 ++++ .../drivers/freedreno/freedreno_context.h | 8 - .../drivers/freedreno/freedreno_program.c | 9 +- .../drivers/freedreno/freedreno_screen.c | 14 +- src/gallium/drivers/freedreno/meson.build | 10 +- 21 files changed, 3033 insertions(+), 2529 deletions(-) delete mode 100644 src/gallium/drivers/freedreno/a2xx/fd2_compiler.c delete mode 100644 src/gallium/drivers/freedreno/a2xx/fd2_compiler.h delete mode 100644 src/gallium/drivers/freedreno/a2xx/ir-a2xx.c delete mode 100644 src/gallium/drivers/freedreno/a2xx/ir-a2xx.h create mode 100644 src/gallium/drivers/freedreno/a2xx/ir2.c 
create mode 100644 src/gallium/drivers/freedreno/a2xx/ir2.h create mode 100644 src/gallium/drivers/freedreno/a2xx/ir2_assemble.c create mode 100644 src/gallium/drivers/freedreno/a2xx/ir2_nir.c create mode 100644 src/gallium/drivers/freedreno/a2xx/ir2_private.h create mode 100644 src/gallium/drivers/freedreno/a2xx/ir2_ra.c diff --git a/src/gallium/drivers/freedreno/Makefile.sources b/src/gallium/drivers/freedreno/Makefile.sources index 7bb033ab875..119b3147c5c 100644 --- a/src/gallium/drivers/freedreno/Makefile.sources +++ b/src/gallium/drivers/freedreno/Makefile.sources @@ -42,8 +42,6 @@ a2xx_SOURCES := \ a2xx/disasm-a2xx.c \ a2xx/fd2_blend.c \ a2xx/fd2_blend.h \ - a2xx/fd2_compiler.c \ - a2xx/fd2_compiler.h \ a2xx/fd2_context.c \ a2xx/fd2_context.h \ a2xx/fd2_draw.c \ @@ -67,8 +65,12 @@ a2xx_SOURCES := \ a2xx/fd2_zsa.c \ a2xx/fd2_zsa.h \ a2xx/instr-a2xx.h \ - a2xx/ir-a2xx.c \ - a2xx/ir-a2xx.h + a2xx/ir2.c \ + a2xx/ir2.h \ + a2xx/ir2_assemble.c \ + a2xx/ir2_nir.c \ + a2xx/ir2_private.h \ + a2xx/ir2_ra.c a3xx_SOURCES := \ a3xx/fd3_blend.c \ diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c deleted file mode 100644 index 156bfc247c2..00000000000 --- a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c +++ /dev/null @@ -1,1119 +0,0 @@ -/* - * Copyright (C) 2012 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark - */ - -#include "pipe/p_state.h" -#include "util/u_string.h" -#include "util/u_memory.h" -#include "util/u_inlines.h" -#include "tgsi/tgsi_parse.h" -#include "tgsi/tgsi_ureg.h" -#include "tgsi/tgsi_info.h" -#include "tgsi/tgsi_strings.h" -#include "tgsi/tgsi_dump.h" - -#include "fd2_compiler.h" -#include "fd2_program.h" -#include "fd2_util.h" - -#include "instr-a2xx.h" -#include "ir-a2xx.h" - -struct fd2_compile_context { - struct fd_program_stateobj *prog; - struct fd2_shader_stateobj *so; - - struct tgsi_parse_context parser; - unsigned type; - - /* predicate stack: */ - int pred_depth; - enum ir2_pred pred_stack[8]; - - /* Internal-Temporary and Predicate register assignment: - * - * Some TGSI instructions which translate into multiple actual - * instructions need one or more temporary registers, which are not - * assigned from TGSI perspective (ie. not TGSI_FILE_TEMPORARY). - * And some instructions (texture fetch) cannot write directly to - * output registers. We could be more clever and re-use dst or a - * src register in some cases. But for now don't try to be clever. - * Eventually we should implement an optimization pass that re- - * juggles the register usage and gets rid of unneeded temporaries. - * - * The predicate register must be valid across multiple TGSI - * instructions, but internal temporary's do not. 
For this reason, - * once the predicate register is requested, until it is no longer - * needed, it gets the first register slot after after the TGSI - * assigned temporaries (ie. num_regs[TGSI_FILE_TEMPORARY]), and the - * internal temporaries get the register slots above this. - */ - - int pred_reg; - int num_internal_temps; - - uint8_t num_regs[TGSI_FILE_COUNT]; - - /* maps input register idx to prog->export_linkage idx: */ - uint8_t input_export_idx[64]; - - /* maps output register idx to prog->export_linkage idx: */ - uint8_t output_export_idx[64]; - - /* idx/slot for last compiler generated immediate */ - unsigned immediate_idx; - - // TODO we can skip emit exports in the VS that the FS doesn't need.. - // and get rid perhaps of num_param.. - unsigned num_position, num_param; - unsigned position, psize; - - uint64_t need_sync; -}; - -static int -semantic_idx(struct tgsi_declaration_semantic *semantic) -{ - int idx = semantic->Name; - if (idx == TGSI_SEMANTIC_GENERIC) - idx = TGSI_SEMANTIC_COUNT + semantic->Index; - return idx; -} - -/* assign/get the input/export register # for given semantic idx as - * returned by semantic_idx(): - */ -static int -export_linkage(struct fd2_compile_context *ctx, int idx) -{ - struct fd_program_stateobj *prog = ctx->prog; - - /* if first time we've seen this export, assign the next available slot: */ - if (prog->export_linkage[idx] == 0xff) - prog->export_linkage[idx] = prog->num_exports++; - - return prog->export_linkage[idx]; -} - -static unsigned -compile_init(struct fd2_compile_context *ctx, struct fd_program_stateobj *prog, - struct fd2_shader_stateobj *so) -{ - unsigned ret; - - ctx->prog = prog; - ctx->so = so; - ctx->pred_depth = 0; - - ret = tgsi_parse_init(&ctx->parser, so->tokens); - if (ret != TGSI_PARSE_OK) - return ret; - - ctx->type = ctx->parser.FullHeader.Processor.Processor; - ctx->position = ~0; - ctx->psize = ~0; - ctx->num_position = 0; - ctx->num_param = 0; - ctx->need_sync = 0; - ctx->immediate_idx = 0; 
- ctx->pred_reg = -1; - ctx->num_internal_temps = 0; - - memset(ctx->num_regs, 0, sizeof(ctx->num_regs)); - memset(ctx->input_export_idx, 0, sizeof(ctx->input_export_idx)); - memset(ctx->output_export_idx, 0, sizeof(ctx->output_export_idx)); - - /* do first pass to extract declarations: */ - while (!tgsi_parse_end_of_tokens(&ctx->parser)) { - tgsi_parse_token(&ctx->parser); - - switch (ctx->parser.FullToken.Token.Type) { - case TGSI_TOKEN_TYPE_DECLARATION: { - struct tgsi_full_declaration *decl = - &ctx->parser.FullToken.FullDeclaration; - if (decl->Declaration.File == TGSI_FILE_OUTPUT) { - unsigned name = decl->Semantic.Name; - - assert(decl->Declaration.Semantic); // TODO is this ever not true? - - ctx->output_export_idx[decl->Range.First] = - semantic_idx(&decl->Semantic); - - if (ctx->type == PIPE_SHADER_VERTEX) { - switch (name) { - case TGSI_SEMANTIC_POSITION: - ctx->position = ctx->num_regs[TGSI_FILE_OUTPUT]; - ctx->num_position++; - break; - case TGSI_SEMANTIC_PSIZE: - ctx->psize = ctx->num_regs[TGSI_FILE_OUTPUT]; - ctx->num_position++; - break; - case TGSI_SEMANTIC_COLOR: - case TGSI_SEMANTIC_GENERIC: - ctx->num_param++; - break; - default: - DBG("unknown VS semantic name: %s", - tgsi_semantic_names[name]); - assert(0); - } - } else { - switch (name) { - case TGSI_SEMANTIC_COLOR: - case TGSI_SEMANTIC_GENERIC: - ctx->num_param++; - break; - default: - DBG("unknown PS semantic name: %s", - tgsi_semantic_names[name]); - assert(0); - } - } - } else if (decl->Declaration.File == TGSI_FILE_INPUT) { - ctx->input_export_idx[decl->Range.First] = - semantic_idx(&decl->Semantic); - } - ctx->num_regs[decl->Declaration.File] = - MAX2(ctx->num_regs[decl->Declaration.File], decl->Range.Last + 1); - break; - } - case TGSI_TOKEN_TYPE_IMMEDIATE: { - struct tgsi_full_immediate *imm = - &ctx->parser.FullToken.FullImmediate; - unsigned n = ctx->so->num_immediates++; - memcpy(ctx->so->immediates[n].val, imm->u, 16); - break; - } - default: - break; - } - } - - /* TGSI generated 
immediates are always entire vec4's, ones we - * generate internally are not: - */ - ctx->immediate_idx = ctx->so->num_immediates * 4; - - ctx->so->first_immediate = ctx->num_regs[TGSI_FILE_CONSTANT]; - - tgsi_parse_free(&ctx->parser); - - return tgsi_parse_init(&ctx->parser, so->tokens); -} - -static void -compile_free(struct fd2_compile_context *ctx) -{ - tgsi_parse_free(&ctx->parser); -} - -static void -compile_vtx_fetch(struct fd2_compile_context *ctx) -{ - struct ir2_instruction **vfetch_instrs = ctx->so->vfetch_instrs; - int i; - for (i = 0; i < ctx->num_regs[TGSI_FILE_INPUT]; i++) { - struct ir2_instruction *instr = ir2_instr_create( - ctx->so->ir, IR2_FETCH); - instr->fetch.opc = VTX_FETCH; - - ctx->need_sync |= 1 << (i+1); - - ir2_dst_create(instr, i+1, "xyzw", 0); - ir2_reg_create(instr, 0, "x", IR2_REG_INPUT); - - if (i == 0) - instr->sync = true; - - vfetch_instrs[i] = instr; - } - ctx->so->num_vfetch_instrs = i; -} - -/* - * For vertex shaders (VS): - * --- ------ ------------- - * - * Inputs: R1-R(num_input) - * Constants: C0-C(num_const-1) - * Immediates: C(num_const)-C(num_const+num_imm-1) - * Outputs: export0-export(n) and export62, export63 - * n is # of outputs minus gl_Position (export62) and gl_PointSize (export63) - * Temps: R(num_input+1)-R(num_input+num_temps) - * - * R0 could be clobbered after the vertex fetch instructions.. so we - * could use it for one of the temporaries. - * - * TODO: maybe the vertex fetch part could fetch first input into R0 as - * the last vtx fetch instruction, which would let us use the same - * register layout in either case.. although this is not what the blob - * compiler does. - * - * - * For frag shaders (PS): - * --- ---- ------------- - * - * Inputs: R0-R(num_input-1) - * Constants: same as VS - * Immediates: same as VS - * Outputs: export0-export(num_outputs) - * Temps: R(num_input)-R(num_input+num_temps-1) - * - * In either case, immediates are are postpended to the constants - * (uniforms). 
- * - */ - -static unsigned -get_temp_gpr(struct fd2_compile_context *ctx, int idx) -{ - unsigned num = idx + ctx->num_regs[TGSI_FILE_INPUT]; - if (ctx->type == PIPE_SHADER_VERTEX) - num++; - return num; -} - -static struct ir2_dst_register * -add_dst_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu, - const struct tgsi_dst_register *dst) -{ - unsigned flags = 0, num = 0; - char swiz[5]; - - switch (dst->File) { - case TGSI_FILE_OUTPUT: - flags |= IR2_REG_EXPORT; - if (ctx->type == PIPE_SHADER_VERTEX) { - if (dst->Index == ctx->position) { - num = 62; - } else if (dst->Index == ctx->psize) { - num = 63; - } else { - num = export_linkage(ctx, - ctx->output_export_idx[dst->Index]); - } - } else { - num = dst->Index; - } - break; - case TGSI_FILE_TEMPORARY: - num = get_temp_gpr(ctx, dst->Index); - break; - default: - DBG("unsupported dst register file: %s", - tgsi_file_name(dst->File)); - assert(0); - break; - } - - swiz[0] = (dst->WriteMask & TGSI_WRITEMASK_X) ? 'x' : '_'; - swiz[1] = (dst->WriteMask & TGSI_WRITEMASK_Y) ? 'y' : '_'; - swiz[2] = (dst->WriteMask & TGSI_WRITEMASK_Z) ? 'z' : '_'; - swiz[3] = (dst->WriteMask & TGSI_WRITEMASK_W) ? 
'w' : '_'; - swiz[4] = '\0'; - - return ir2_dst_create(alu, num, swiz, flags); -} - -static struct ir2_src_register * -add_src_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu, - const struct tgsi_src_register *src) -{ - static const char swiz_vals[] = { - 'x', 'y', 'z', 'w', - }; - char swiz[5]; - unsigned flags = 0, num = 0; - - switch (src->File) { - case TGSI_FILE_CONSTANT: - num = src->Index; - flags |= IR2_REG_CONST; - break; - case TGSI_FILE_INPUT: - if (ctx->type == PIPE_SHADER_VERTEX) { - num = src->Index + 1; - } else { - flags |= IR2_REG_INPUT; - num = export_linkage(ctx, - ctx->input_export_idx[src->Index]); - } - break; - case TGSI_FILE_TEMPORARY: - num = get_temp_gpr(ctx, src->Index); - break; - case TGSI_FILE_IMMEDIATE: - num = src->Index + ctx->num_regs[TGSI_FILE_CONSTANT]; - flags |= IR2_REG_CONST; - break; - default: - DBG("unsupported src register file: %s", - tgsi_file_name(src->File)); - assert(0); - break; - } - - if (src->Absolute) - flags |= IR2_REG_ABS; - if (src->Negate) - flags |= IR2_REG_NEGATE; - - swiz[0] = swiz_vals[src->SwizzleX]; - swiz[1] = swiz_vals[src->SwizzleY]; - swiz[2] = swiz_vals[src->SwizzleZ]; - swiz[3] = swiz_vals[src->SwizzleW]; - swiz[4] = '\0'; - - if ((ctx->need_sync & ((uint64_t)1 << num)) && - !(flags & IR2_REG_CONST)) { - alu->sync = true; - ctx->need_sync &= ~((uint64_t)1 << num); - } - - return ir2_reg_create(alu, num, swiz, flags); -} - -static void -add_vector_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu) -{ - if (inst->Instruction.Saturate) { - alu->alu_vector.clamp = true; - } -} - -static void -add_scalar_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu) -{ - if (inst->Instruction.Saturate) { - alu->alu_scalar.clamp = true; - } -} - -static void -add_regs_vector_1(struct fd2_compile_context *ctx, - struct tgsi_full_instruction *inst, struct ir2_instruction *alu) -{ - assert(inst->Instruction.NumSrcRegs == 1); - assert(inst->Instruction.NumDstRegs 
== 1); - - add_dst_reg(ctx, alu, &inst->Dst[0].Register); - add_src_reg(ctx, alu, &inst->Src[0].Register); - add_src_reg(ctx, alu, &inst->Src[0].Register); - add_vector_clamp(inst, alu); -} - -static void -add_regs_vector_2(struct fd2_compile_context *ctx, - struct tgsi_full_instruction *inst, struct ir2_instruction *alu) -{ - assert(inst->Instruction.NumSrcRegs == 2); - assert(inst->Instruction.NumDstRegs == 1); - - add_dst_reg(ctx, alu, &inst->Dst[0].Register); - add_src_reg(ctx, alu, &inst->Src[0].Register); - add_src_reg(ctx, alu, &inst->Src[1].Register); - add_vector_clamp(inst, alu); -} - -static void -add_regs_vector_3(struct fd2_compile_context *ctx, - struct tgsi_full_instruction *inst, struct ir2_instruction *alu) -{ - assert(inst->Instruction.NumSrcRegs == 3); - assert(inst->Instruction.NumDstRegs == 1); - - add_dst_reg(ctx, alu, &inst->Dst[0].Register); - add_src_reg(ctx, alu, &inst->Src[0].Register); - add_src_reg(ctx, alu, &inst->Src[1].Register); - add_src_reg(ctx, alu, &inst->Src[2].Register); - add_vector_clamp(inst, alu); -} - -static void -add_regs_scalar_1(struct fd2_compile_context *ctx, - struct tgsi_full_instruction *inst, struct ir2_instruction *alu) -{ - assert(inst->Instruction.NumSrcRegs == 1); - assert(inst->Instruction.NumDstRegs == 1); - - add_dst_reg(ctx, alu, &inst->Dst[0].Register); - add_src_reg(ctx, alu, &inst->Src[0].Register); - add_scalar_clamp(inst, alu); -} - -/* - * Helpers for TGSI instructions that don't map to a single shader instr: - */ - -static void -src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst) -{ - src->File = dst->File; - src->Indirect = dst->Indirect; - src->Dimension = dst->Dimension; - src->Index = dst->Index; - src->Absolute = 0; - src->Negate = 0; - src->SwizzleX = TGSI_SWIZZLE_X; - src->SwizzleY = TGSI_SWIZZLE_Y; - src->SwizzleZ = TGSI_SWIZZLE_Z; - src->SwizzleW = TGSI_SWIZZLE_W; -} - -/* Get internal-temp src/dst to use for a sequence of instructions - * generated by a single TGSI 
op. - */ -static void -get_internal_temp(struct fd2_compile_context *ctx, - struct tgsi_dst_register *tmp_dst, - struct tgsi_src_register *tmp_src) -{ - int n; - - tmp_dst->File = TGSI_FILE_TEMPORARY; - tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW; - tmp_dst->Indirect = 0; - tmp_dst->Dimension = 0; - - /* assign next temporary: */ - n = ctx->num_internal_temps++; - if (ctx->pred_reg != -1) - n++; - - tmp_dst->Index = ctx->num_regs[TGSI_FILE_TEMPORARY] + n; - - src_from_dst(tmp_src, tmp_dst); -} - -static void -get_predicate(struct fd2_compile_context *ctx, struct tgsi_dst_register *dst, - struct tgsi_src_register *src) -{ - assert(ctx->pred_reg != -1); - - dst->File = TGSI_FILE_TEMPORARY; - dst->WriteMask = TGSI_WRITEMASK_W; - dst->Indirect = 0; - dst->Dimension = 0; - dst->Index = get_temp_gpr(ctx, ctx->pred_reg); - - if (src) { - src_from_dst(src, dst); - src->SwizzleX = TGSI_SWIZZLE_W; - src->SwizzleY = TGSI_SWIZZLE_W; - src->SwizzleZ = TGSI_SWIZZLE_W; - src->SwizzleW = TGSI_SWIZZLE_W; - } -} - -static void -push_predicate(struct fd2_compile_context *ctx, struct tgsi_src_register *src) -{ - struct ir2_instruction *alu; - struct tgsi_dst_register pred_dst; - - if (ctx->pred_depth == 0) { - /* assign predicate register: */ - ctx->pred_reg = ctx->num_regs[TGSI_FILE_TEMPORARY]; - - get_predicate(ctx, &pred_dst, NULL); - - alu = ir2_instr_create_alu_s(ctx->so->ir, PRED_SETNEs); - add_dst_reg(ctx, alu, &pred_dst); - add_src_reg(ctx, alu, src); - } else { - struct tgsi_src_register pred_src; - - get_predicate(ctx, &pred_dst, &pred_src); - - alu = ir2_instr_create_alu_v(ctx->so->ir, MULv); - add_dst_reg(ctx, alu, &pred_dst); - add_src_reg(ctx, alu, &pred_src); - add_src_reg(ctx, alu, src); - - // XXX need to make PRED_SETE_PUSHv IR2_PRED_NONE.. but need to make - // sure src reg is valid if it was calculated with a predicate - // condition.. 
- alu->pred = IR2_PRED_NONE; - } - - /* save previous pred state to restore in pop_predicate(): */ - ctx->pred_stack[ctx->pred_depth++] = ctx->so->ir->pred; -} - -static void -pop_predicate(struct fd2_compile_context *ctx) -{ - /* restore previous predicate state: */ - ctx->so->ir->pred = ctx->pred_stack[--ctx->pred_depth]; - - if (ctx->pred_depth != 0) { - struct ir2_instruction *alu; - struct tgsi_dst_register pred_dst; - struct tgsi_src_register pred_src; - - get_predicate(ctx, &pred_dst, &pred_src); - - alu = ir2_instr_create_alu_s(ctx->so->ir, PRED_SET_POPs); - add_dst_reg(ctx, alu, &pred_dst); - add_src_reg(ctx, alu, &pred_src); - alu->pred = IR2_PRED_NONE; - } else { - /* predicate register no longer needed: */ - ctx->pred_reg = -1; - } -} - -static void -get_immediate(struct fd2_compile_context *ctx, - struct tgsi_src_register *reg, uint32_t val) -{ - unsigned neg, swiz, idx, i; - /* actually maps 1:1 currently.. not sure if that is safe to rely on: */ - static const unsigned swiz2tgsi[] = { - TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W, - }; - - for (i = 0; i < ctx->immediate_idx; i++) { - swiz = i % 4; - idx = i / 4; - - if (ctx->so->immediates[idx].val[swiz] == val) { - neg = 0; - break; - } - - if (ctx->so->immediates[idx].val[swiz] == -val) { - neg = 1; - break; - } - } - - if (i == ctx->immediate_idx) { - /* need to generate a new immediate: */ - swiz = i % 4; - idx = i / 4; - neg = 0; - ctx->so->immediates[idx].val[swiz] = val; - ctx->so->num_immediates = idx + 1; - ctx->immediate_idx++; - } - - reg->File = TGSI_FILE_IMMEDIATE; - reg->Indirect = 0; - reg->Dimension = 0; - reg->Index = idx; - reg->Absolute = 0; - reg->Negate = neg; - reg->SwizzleX = swiz2tgsi[swiz]; - reg->SwizzleY = swiz2tgsi[swiz]; - reg->SwizzleZ = swiz2tgsi[swiz]; - reg->SwizzleW = swiz2tgsi[swiz]; -} - -/* POW(a,b) = EXP2(b * LOG2(a)) */ -static void -translate_pow(struct fd2_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct 
tgsi_dst_register tmp_dst; - struct tgsi_src_register tmp_src; - struct ir2_instruction *alu; - - get_internal_temp(ctx, &tmp_dst, &tmp_src); - - alu = ir2_instr_create_alu_s(ctx->so->ir, LOG_CLAMP); - add_dst_reg(ctx, alu, &tmp_dst); - add_src_reg(ctx, alu, &inst->Src[0].Register); - - alu = ir2_instr_create_alu_v(ctx->so->ir, MULv); - add_dst_reg(ctx, alu, &tmp_dst); - add_src_reg(ctx, alu, &tmp_src); - add_src_reg(ctx, alu, &inst->Src[1].Register); - - /* NOTE: some of the instructions, like EXP_IEEE, seem hard- - * coded to take their input from the w component. - */ - switch(inst->Dst[0].Register.WriteMask) { - case TGSI_WRITEMASK_X: - tmp_src.SwizzleW = TGSI_SWIZZLE_X; - break; - case TGSI_WRITEMASK_Y: - tmp_src.SwizzleW = TGSI_SWIZZLE_Y; - break; - case TGSI_WRITEMASK_Z: - tmp_src.SwizzleW = TGSI_SWIZZLE_Z; - break; - case TGSI_WRITEMASK_W: - tmp_src.SwizzleW = TGSI_SWIZZLE_W; - break; - default: - DBG("invalid writemask!"); - assert(0); - break; - } - - alu = ir2_instr_create_alu_s(ctx->so->ir, EXP_IEEE); - add_dst_reg(ctx, alu, &inst->Dst[0].Register); - add_src_reg(ctx, alu, &tmp_src); - add_scalar_clamp(inst, alu); -} - -static void -translate_tex(struct fd2_compile_context *ctx, - struct tgsi_full_instruction *inst, unsigned opc) -{ - struct ir2_instruction *instr; - struct ir2_src_register *reg; - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register tmp_src; - const struct tgsi_src_register *coord; - bool using_temp = (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) || - inst->Instruction.Saturate; - int idx; - - if (using_temp || (opc == TGSI_OPCODE_TXP)) - get_internal_temp(ctx, &tmp_dst, &tmp_src); - - if (opc == TGSI_OPCODE_TXP) { - static const char *swiz[] = { - [TGSI_SWIZZLE_X] = "xxxx", - [TGSI_SWIZZLE_Y] = "yyyy", - [TGSI_SWIZZLE_Z] = "zzzz", - [TGSI_SWIZZLE_W] = "wwww", - }; - - /* TXP - Projective Texture Lookup: - * - * coord.x = src0.x / src.w - * coord.y = src0.y / src.w - * coord.z = src0.z / src.w - * coord.w = src0.w - * bias = 
0.0 - * - * dst = texture_sample(unit, coord, bias) - */ - - instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv); - add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "___w"; - add_src_reg(ctx, instr, &inst->Src[0].Register); - add_src_reg(ctx, instr, &inst->Src[0].Register); - - instr = ir2_instr_create_alu_s(ctx->so->ir, RECIP_IEEE); - add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "x___"; - memcpy(add_src_reg(ctx, instr, &inst->Src[0].Register)->swizzle, - swiz[inst->Src[0].Register.SwizzleW], 4); - - instr = ir2_instr_create_alu_v(ctx->so->ir, MULv); - add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "xyz_"; - add_src_reg(ctx, instr, &tmp_src)->swizzle = "xxxx"; - add_src_reg(ctx, instr, &inst->Src[0].Register); - - coord = &tmp_src; - } else { - coord = &inst->Src[0].Register; - } - - instr = ir2_instr_create(ctx->so->ir, IR2_FETCH); - instr->fetch.opc = TEX_FETCH; - instr->fetch.is_cube = (inst->Texture.Texture == TGSI_TEXTURE_3D); - instr->fetch.is_rect = (inst->Texture.Texture == TGSI_TEXTURE_RECT); - assert(inst->Texture.NumOffsets <= 1); // TODO what to do in other cases? - - /* save off the tex fetch to be patched later with correct const_idx: */ - idx = ctx->so->num_tfetch_instrs++; - ctx->so->tfetch_instrs[idx].samp_id = inst->Src[1].Register.Index; - ctx->so->tfetch_instrs[idx].instr = instr; - - add_dst_reg(ctx, instr, using_temp ? &tmp_dst : &inst->Dst[0].Register); - reg = add_src_reg(ctx, instr, coord); - - /* blob compiler always sets 3rd component to same as 1st for 2d: */ - if (inst->Texture.Texture == TGSI_TEXTURE_2D || inst->Texture.Texture == TGSI_TEXTURE_RECT) - reg->swizzle[2] = reg->swizzle[0]; - - /* dst register needs to be marked for sync: */ - ctx->need_sync |= 1 << instr->dst_reg.num; - - /* TODO we need some way to know if the tex fetch needs to sync on alu pipe.. 
*/ - instr->sync = true; - - if (using_temp) { - /* texture fetch can't write directly to export, so if tgsi - * is telling us the dst register is in output file, we load - * the texture to a temp and the use ALU instruction to move - * to output - */ - instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv); - - add_dst_reg(ctx, instr, &inst->Dst[0].Register); - add_src_reg(ctx, instr, &tmp_src); - add_src_reg(ctx, instr, &tmp_src); - add_vector_clamp(inst, instr); - } -} - -/* SGE(a,b) = GTE((b - a), 1.0, 0.0) */ -/* SLT(a,b) = GTE((b - a), 0.0, 1.0) */ -/* SEQ(a,b) = EQU((b - a), 1.0, 0.0) */ -/* SNE(a,b) = EQU((b - a), 0.0, 1.0) */ -static void -translate_sge_slt_seq_sne(struct fd2_compile_context *ctx, - struct tgsi_full_instruction *inst, unsigned opc) -{ - struct ir2_instruction *instr; - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register tmp_src; - struct tgsi_src_register tmp_const; - float c0, c1; - instr_vector_opc_t vopc; - - switch (opc) { - default: - assert(0); - case TGSI_OPCODE_SGE: - c0 = 1.0; - c1 = 0.0; - vopc = CNDGTEv; - break; - case TGSI_OPCODE_SLT: - c0 = 0.0; - c1 = 1.0; - vopc = CNDGTEv; - break; - case TGSI_OPCODE_SEQ: - c0 = 0.0; - c1 = 1.0; - vopc = CNDEv; - break; - case TGSI_OPCODE_SNE: - c0 = 1.0; - c1 = 0.0; - vopc = CNDEv; - break; - } - - get_internal_temp(ctx, &tmp_dst, &tmp_src); - - instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv); - add_dst_reg(ctx, instr, &tmp_dst); - add_src_reg(ctx, instr, &inst->Src[0].Register)->flags |= IR2_REG_NEGATE; - add_src_reg(ctx, instr, &inst->Src[1].Register); - - instr = ir2_instr_create_alu_v(ctx->so->ir, vopc); - add_dst_reg(ctx, instr, &inst->Dst[0].Register); - add_src_reg(ctx, instr, &tmp_src); - get_immediate(ctx, &tmp_const, fui(c1)); - add_src_reg(ctx, instr, &tmp_const); - get_immediate(ctx, &tmp_const, fui(c0)); - add_src_reg(ctx, instr, &tmp_const); -} - -/* LRP(a,b,c) = (a * b) + ((1 - a) * c) */ -static void -translate_lrp(struct fd2_compile_context *ctx, - struct 
tgsi_full_instruction *inst, - unsigned opc) -{ - struct ir2_instruction *instr; - struct tgsi_dst_register tmp_dst1, tmp_dst2; - struct tgsi_src_register tmp_src1, tmp_src2; - struct tgsi_src_register tmp_const; - - get_internal_temp(ctx, &tmp_dst1, &tmp_src1); - get_internal_temp(ctx, &tmp_dst2, &tmp_src2); - - get_immediate(ctx, &tmp_const, fui(1.0)); - - /* tmp1 = (a * b) */ - instr = ir2_instr_create_alu_v(ctx->so->ir, MULv); - add_dst_reg(ctx, instr, &tmp_dst1); - add_src_reg(ctx, instr, &inst->Src[0].Register); - add_src_reg(ctx, instr, &inst->Src[1].Register); - - /* tmp2 = (1 - a) */ - instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv); - add_dst_reg(ctx, instr, &tmp_dst2); - add_src_reg(ctx, instr, &tmp_const); - add_src_reg(ctx, instr, &inst->Src[0].Register)->flags |= IR2_REG_NEGATE; - - /* tmp2 = tmp2 * c */ - instr = ir2_instr_create_alu_v(ctx->so->ir, MULv); - add_dst_reg(ctx, instr, &tmp_dst2); - add_src_reg(ctx, instr, &tmp_src2); - add_src_reg(ctx, instr, &inst->Src[2].Register); - - /* dst = tmp1 + tmp2 */ - instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv); - add_dst_reg(ctx, instr, &inst->Dst[0].Register); - add_src_reg(ctx, instr, &tmp_src1); - add_src_reg(ctx, instr, &tmp_src2); -} - -static void -translate_trig(struct fd2_compile_context *ctx, - struct tgsi_full_instruction *inst, - unsigned opc) -{ - struct ir2_instruction *instr; - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register tmp_src; - struct tgsi_src_register tmp_const; - instr_scalar_opc_t op; - - switch (opc) { - default: - assert(0); - case TGSI_OPCODE_SIN: - op = SIN; - break; - case TGSI_OPCODE_COS: - op = COS; - break; - } - - get_internal_temp(ctx, &tmp_dst, &tmp_src); - - tmp_dst.WriteMask = TGSI_WRITEMASK_X; - tmp_src.SwizzleX = tmp_src.SwizzleY = - tmp_src.SwizzleZ = tmp_src.SwizzleW = TGSI_SWIZZLE_X; - - instr = ir2_instr_create_alu_v(ctx->so->ir, MULADDv); - add_dst_reg(ctx, instr, &tmp_dst); - add_src_reg(ctx, instr, &inst->Src[0].Register); - 
get_immediate(ctx, &tmp_const, fui(0.159155)); - add_src_reg(ctx, instr, &tmp_const); - get_immediate(ctx, &tmp_const, fui(0.5)); - add_src_reg(ctx, instr, &tmp_const); - - instr = ir2_instr_create_alu_v(ctx->so->ir, FRACv); - add_dst_reg(ctx, instr, &tmp_dst); - add_src_reg(ctx, instr, &tmp_src); - add_src_reg(ctx, instr, &tmp_src); - - instr = ir2_instr_create_alu_v(ctx->so->ir, MULADDv); - add_dst_reg(ctx, instr, &tmp_dst); - add_src_reg(ctx, instr, &tmp_src); - get_immediate(ctx, &tmp_const, fui(6.283185)); - add_src_reg(ctx, instr, &tmp_const); - get_immediate(ctx, &tmp_const, fui(-3.141593)); - add_src_reg(ctx, instr, &tmp_const); - - instr = ir2_instr_create_alu_s(ctx->so->ir, op); - add_dst_reg(ctx, instr, &inst->Dst[0].Register); - add_src_reg(ctx, instr, &tmp_src); -} - -static void -translate_dp2(struct fd2_compile_context *ctx, - struct tgsi_full_instruction *inst, - unsigned opc) -{ - struct tgsi_src_register tmp_const; - struct ir2_instruction *instr; - /* DP2ADD c,a,b -> dot2(a,b) + c */ - /* for c we use the constant 0.0 */ - instr = ir2_instr_create_alu_v(ctx->so->ir, DOT2ADDv); - add_dst_reg(ctx, instr, &inst->Dst[0].Register); - add_src_reg(ctx, instr, &inst->Src[0].Register); - add_src_reg(ctx, instr, &inst->Src[1].Register); - get_immediate(ctx, &tmp_const, fui(0.0f)); - add_src_reg(ctx, instr, &tmp_const); - add_vector_clamp(inst, instr); -} - -/* - * Main part of compiler/translator: - */ - -static void -translate_instruction(struct fd2_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - unsigned opc = inst->Instruction.Opcode; - struct ir2_instruction *instr; - - if (opc == TGSI_OPCODE_END) - return; - - /* TODO turn this into a table: */ - switch (opc) { - case TGSI_OPCODE_MOV: - instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv); - add_regs_vector_1(ctx, inst, instr); - break; - case TGSI_OPCODE_RCP: - instr = ir2_instr_create_alu_s(ctx->so->ir, RECIP_IEEE); - add_regs_scalar_1(ctx, inst, instr); - break; - case 
TGSI_OPCODE_RSQ: - instr = ir2_instr_create_alu_s(ctx->so->ir, RECIPSQ_IEEE); - add_regs_scalar_1(ctx, inst, instr); - break; - case TGSI_OPCODE_SQRT: - instr = ir2_instr_create_alu_s(ctx->so->ir, SQRT_IEEE); - add_regs_scalar_1(ctx, inst, instr); - break; - case TGSI_OPCODE_MUL: - instr = ir2_instr_create_alu_v(ctx->so->ir, MULv); - add_regs_vector_2(ctx, inst, instr); - break; - case TGSI_OPCODE_ADD: - instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv); - add_regs_vector_2(ctx, inst, instr); - break; - case TGSI_OPCODE_DP2: - translate_dp2(ctx, inst, opc); - break; - case TGSI_OPCODE_DP3: - instr = ir2_instr_create_alu_v(ctx->so->ir, DOT3v); - add_regs_vector_2(ctx, inst, instr); - break; - case TGSI_OPCODE_DP4: - instr = ir2_instr_create_alu_v(ctx->so->ir, DOT4v); - add_regs_vector_2(ctx, inst, instr); - break; - case TGSI_OPCODE_MIN: - instr = ir2_instr_create_alu_v(ctx->so->ir, MINv); - add_regs_vector_2(ctx, inst, instr); - break; - case TGSI_OPCODE_MAX: - instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv); - add_regs_vector_2(ctx, inst, instr); - break; - case TGSI_OPCODE_SLT: - case TGSI_OPCODE_SGE: - case TGSI_OPCODE_SEQ: - case TGSI_OPCODE_SNE: - translate_sge_slt_seq_sne(ctx, inst, opc); - break; - case TGSI_OPCODE_MAD: - instr = ir2_instr_create_alu_v(ctx->so->ir, MULADDv); - add_regs_vector_3(ctx, inst, instr); - break; - case TGSI_OPCODE_LRP: - translate_lrp(ctx, inst, opc); - break; - case TGSI_OPCODE_FRC: - instr = ir2_instr_create_alu_v(ctx->so->ir, FRACv); - add_regs_vector_1(ctx, inst, instr); - break; - case TGSI_OPCODE_FLR: - instr = ir2_instr_create_alu_v(ctx->so->ir, FLOORv); - add_regs_vector_1(ctx, inst, instr); - break; - case TGSI_OPCODE_EX2: - instr = ir2_instr_create_alu_s(ctx->so->ir, EXP_IEEE); - add_regs_scalar_1(ctx, inst, instr); - break; - case TGSI_OPCODE_POW: - translate_pow(ctx, inst); - break; - case TGSI_OPCODE_COS: - case TGSI_OPCODE_SIN: - translate_trig(ctx, inst, opc); - break; - case TGSI_OPCODE_TEX: - case 
TGSI_OPCODE_TXP: - translate_tex(ctx, inst, opc); - break; - case TGSI_OPCODE_CMP: - instr = ir2_instr_create_alu_v(ctx->so->ir, CNDGTEv); - add_regs_vector_3(ctx, inst, instr); - instr->src_reg[0].flags ^= IR2_REG_NEGATE; /* src1 */ - break; - case TGSI_OPCODE_IF: - push_predicate(ctx, &inst->Src[0].Register); - ctx->so->ir->pred = IR2_PRED_EQ; - break; - case TGSI_OPCODE_ELSE: - ctx->so->ir->pred = IR2_PRED_NE; - break; - case TGSI_OPCODE_ENDIF: - pop_predicate(ctx); - break; - case TGSI_OPCODE_F2I: - instr = ir2_instr_create_alu_v(ctx->so->ir, TRUNCv); - add_regs_vector_1(ctx, inst, instr); - break; - default: - DBG("unknown TGSI opc: %s", tgsi_get_opcode_name(opc)); - tgsi_dump(ctx->so->tokens, 0); - assert(0); - break; - } - - /* internal temporaries are only valid for the duration of a single - * TGSI instruction: - */ - ctx->num_internal_temps = 0; -} - -static void -compile_instructions(struct fd2_compile_context *ctx) -{ - while (!tgsi_parse_end_of_tokens(&ctx->parser)) { - tgsi_parse_token(&ctx->parser); - - switch (ctx->parser.FullToken.Token.Type) { - case TGSI_TOKEN_TYPE_INSTRUCTION: - translate_instruction(ctx, - &ctx->parser.FullToken.FullInstruction); - break; - default: - break; - } - } -} - -int -fd2_compile_shader(struct fd_program_stateobj *prog, - struct fd2_shader_stateobj *so) -{ - struct fd2_compile_context ctx; - - ir2_shader_destroy(so->ir); - so->ir = ir2_shader_create(); - so->num_vfetch_instrs = so->num_tfetch_instrs = so->num_immediates = 0; - - if (compile_init(&ctx, prog, so) != TGSI_PARSE_OK) - return -1; - - if (ctx.type == PIPE_SHADER_VERTEX) { - compile_vtx_fetch(&ctx); - } else if (ctx.type == PIPE_SHADER_FRAGMENT) { - prog->num_exports = 0; - memset(prog->export_linkage, 0xff, - sizeof(prog->export_linkage)); - } - - compile_instructions(&ctx); - - compile_free(&ctx); - - return 0; -} - diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.h b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.h deleted file mode 100644 
index f26bb2ffc4d..00000000000 --- a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (C) 2012 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Authors: - * Rob Clark - */ - -#ifndef FD2_COMPILER_H_ -#define FD2_COMPILER_H_ - -#include "fd2_program.h" -#include "fd2_util.h" - -int fd2_compile_shader(struct fd_program_stateobj *prog, - struct fd2_shader_stateobj *so); - -#endif /* FD2_COMPILER_H_ */ diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c index 00381df5034..f15d57cf0e0 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c @@ -216,7 +216,7 @@ fd2_clear(struct fd_context *ctx, unsigned buffers, OUT_RING(ring, 0x0000028f); } - fd2_program_emit(ring, &ctx->solid_prog); + fd2_program_emit(ctx, ring, &ctx->solid_prog); OUT_PKT0(ring, REG_A2XX_TC_CNTL_STATUS, 1); OUT_RING(ring, A2XX_TC_CNTL_STATUS_L2_INVALIDATE); diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c index ac2a02dfae9..9628f267365 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c @@ -272,13 +272,25 @@ fd2_emit_state(struct fd_context *ctx, const enum fd_dirty_3d_state dirty) OUT_RING(ring, fui(ctx->viewport.translate[1])); /* PA_CL_VPORT_YOFFSET */ OUT_RING(ring, fui(ctx->viewport.scale[2])); /* PA_CL_VPORT_ZSCALE */ OUT_RING(ring, fui(ctx->viewport.translate[2])); /* PA_CL_VPORT_ZOFFSET */ - } - if (dirty & (FD_DIRTY_PROG | FD_DIRTY_VTXSTATE | FD_DIRTY_TEXSTATE)) { - fd2_program_validate(ctx); - fd2_program_emit(ring, &ctx->prog); + /* set viewport in C65/C66, for a20x hw binning and fragcoord.z */ + OUT_PKT3(ring, CP_SET_CONSTANT, 9); + OUT_RING(ring, 0x00000184); + + OUT_RING(ring, fui(ctx->viewport.translate[0])); + OUT_RING(ring, fui(ctx->viewport.translate[1])); + OUT_RING(ring, fui(ctx->viewport.translate[2])); + OUT_RING(ring, fui(0.0f)); + + OUT_RING(ring, fui(ctx->viewport.scale[0])); + OUT_RING(ring, fui(ctx->viewport.scale[1])); + OUT_RING(ring, fui(ctx->viewport.scale[2])); + OUT_RING(ring, 
fui(0.0f)); } + if (dirty & (FD_DIRTY_PROG | FD_DIRTY_VTXSTATE | FD_DIRTY_TEXSTATE)) + fd2_program_emit(ctx, ring, &ctx->prog); + if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONST)) { emit_constants(ring, VS_CONST_BASE * 4, &ctx->constbuf[PIPE_SHADER_VERTEX], diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c index ca00f3c8a60..56db5608c28 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c @@ -137,7 +137,7 @@ fd2_emit_tile_gmem2mem(struct fd_batch *batch, struct fd_tile *tile) OUT_RING(ring, 0x0000028f); } - fd2_program_emit(ring, &ctx->solid_prog); + fd2_program_emit(ctx, ring, &ctx->solid_prog); OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_MASK)); @@ -285,7 +285,7 @@ fd2_emit_tile_mem2gmem(struct fd_batch *batch, struct fd_tile *tile) OUT_RING(ring, CP_REG(REG_A2XX_VGT_INDX_OFFSET)); OUT_RING(ring, 0); - fd2_program_emit(ring, &ctx->blit_prog[0]); + fd2_program_emit(ctx, ring, &ctx->blit_prog[0]); OUT_PKT0(ring, REG_A2XX_TC_CNTL_STATUS, 1); OUT_RING(ring, A2XX_TC_CNTL_STATUS_L2_INVALIDATE); @@ -476,6 +476,16 @@ fd2_emit_tile_renderprep(struct fd_batch *batch, struct fd_tile *tile) OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_OFFSET)); OUT_RING(ring, A2XX_PA_SC_WINDOW_OFFSET_X(-tile->xoff) | A2XX_PA_SC_WINDOW_OFFSET_Y(-tile->yoff)); + + /* tile offset for gl_FragCoord on a20x (C64 in fragment shader) */ + if (is_a20x(batch->ctx->screen)) { + OUT_PKT3(ring, CP_SET_CONSTANT, 5); + OUT_RING(ring, 0x00000580); + OUT_RING(ring, fui(tile->xoff)); + OUT_RING(ring, fui(tile->yoff)); + OUT_RING(ring, fui(0.0f)); + OUT_RING(ring, fui(0.0f)); + } } void diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_program.c b/src/gallium/drivers/freedreno/a2xx/fd2_program.c index 56b3ab2aaeb..da020443bd9 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_program.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_program.c @@ -22,6 +22,7 @@ * * 
Authors: * Rob Clark + * Jonathan Marek */ #include "pipe/p_state.h" @@ -34,18 +35,20 @@ #include "freedreno_program.h" +#include "ir2.h" #include "fd2_program.h" -#include "fd2_compiler.h" #include "fd2_texture.h" #include "fd2_util.h" +#include "instr-a2xx.h" static struct fd2_shader_stateobj * -create_shader(gl_shader_stage type) +create_shader(struct pipe_context *pctx, gl_shader_stage type) { struct fd2_shader_stateobj *so = CALLOC_STRUCT(fd2_shader_stateobj); if (!so) return NULL; so->type = type; + so->is_a20x = is_a20x(fd_context(pctx)->screen); return so; } @@ -54,88 +57,66 @@ delete_shader(struct fd2_shader_stateobj *so) { if (!so) return; - ir2_shader_destroy(so->ir); - free(so->tokens); - free(so->bin); + ralloc_free(so->nir); + for (int i = 0; i < ARRAY_SIZE(so->variant); i++) + free(so->variant[i].info.dwords); free(so); } -static struct fd2_shader_stateobj * -assemble(struct fd2_shader_stateobj *so) +static void +emit(struct fd_ringbuffer *ring, gl_shader_stage type, + struct ir2_shader_info *info) { - free(so->bin); - so->bin = ir2_shader_assemble(so->ir, &so->info); - if (!so->bin) - goto fail; + unsigned i; - if (fd_mesa_debug & FD_DBG_DISASM) { - DBG("disassemble: type=%d", so->type); - disasm_a2xx(so->bin, so->info.sizedwords, 0, so->type); - } + assert(info->sizedwords); - return so; + OUT_PKT3(ring, CP_IM_LOAD_IMMEDIATE, 2 + info->sizedwords); + OUT_RING(ring, type == MESA_SHADER_FRAGMENT); + OUT_RING(ring, info->sizedwords); + for (i = 0; i < info->sizedwords; i++) + OUT_RING(ring, info->dwords[i]); +} -fail: - debug_error("assemble failed!"); - delete_shader(so); - return NULL; +static int +ir2_glsl_type_size(const struct glsl_type *type) +{ + return glsl_count_attribute_slots(type, false); } -static struct fd2_shader_stateobj * -compile(struct fd_program_stateobj *prog, struct fd2_shader_stateobj *so) +static void * +fd2_fp_state_create(struct pipe_context *pctx, + const struct pipe_shader_state *cso) { - int ret; + struct 
fd2_shader_stateobj *so = create_shader(pctx, MESA_SHADER_FRAGMENT); + if (!so) + return NULL; - if (fd_mesa_debug & FD_DBG_DISASM) { - DBG("dump tgsi: type=%d", so->type); - tgsi_dump(so->tokens, 0); + if (cso->type == PIPE_SHADER_IR_NIR) { + so->nir = cso->ir.nir; + NIR_PASS_V(so->nir, nir_lower_io, nir_var_all, ir2_glsl_type_size, + (nir_lower_io_options)0); + } else { + assert(cso->type == PIPE_SHADER_IR_TGSI); + so->nir = ir2_tgsi_to_nir(cso->tokens); } - ret = fd2_compile_shader(prog, so); - if (ret) + if (ir2_optimize_nir(so->nir, true)) goto fail; - /* NOTE: we don't assemble yet because for VS we don't know the - * type information for vertex fetch yet.. so those need to be - * patched up later before assembling. - */ + so->first_immediate = so->nir->num_uniforms; - so->info.sizedwords = 0; + ir2_compile(so, 0, NULL); + ralloc_free(so->nir); + so->nir = NULL; return so; fail: - debug_error("compile failed!"); delete_shader(so); return NULL; } -static void -emit(struct fd_ringbuffer *ring, struct fd2_shader_stateobj *so) -{ - unsigned i; - - if (so->info.sizedwords == 0) - assemble(so); - - OUT_PKT3(ring, CP_IM_LOAD_IMMEDIATE, 2 + so->info.sizedwords); - OUT_RING(ring, (so->type == MESA_SHADER_VERTEX) ? 
0 : 1); - OUT_RING(ring, so->info.sizedwords); - for (i = 0; i < so->info.sizedwords; i++) - OUT_RING(ring, so->bin[i]); -} - -static void * -fd2_fp_state_create(struct pipe_context *pctx, - const struct pipe_shader_state *cso) -{ - struct fd2_shader_stateobj *so = create_shader(MESA_SHADER_FRAGMENT); - if (!so) - return NULL; - so->tokens = tgsi_dup_tokens(cso->tokens); - return so; -} - static void fd2_fp_state_delete(struct pipe_context *pctx, void *hwcso) { @@ -147,11 +128,32 @@ static void * fd2_vp_state_create(struct pipe_context *pctx, const struct pipe_shader_state *cso) { - struct fd2_shader_stateobj *so = create_shader(MESA_SHADER_VERTEX); + struct fd2_shader_stateobj *so = create_shader(pctx, MESA_SHADER_VERTEX); if (!so) return NULL; - so->tokens = tgsi_dup_tokens(cso->tokens); + + if (cso->type == PIPE_SHADER_IR_NIR) { + so->nir = cso->ir.nir; + NIR_PASS_V(so->nir, nir_lower_io, nir_var_all, ir2_glsl_type_size, + (nir_lower_io_options)0); + } else { + assert(cso->type == PIPE_SHADER_IR_TGSI); + so->nir = ir2_tgsi_to_nir(cso->tokens); + } + + if (ir2_optimize_nir(so->nir, true)) + goto fail; + + so->first_immediate = so->nir->num_uniforms; + + /* compile binning variant now */ + ir2_compile(so, 0, NULL); + return so; + +fail: + delete_shader(so); + return NULL; } static void @@ -162,277 +164,145 @@ fd2_vp_state_delete(struct pipe_context *pctx, void *hwcso) } static void -patch_vtx_fetches(struct fd_context *ctx, struct fd2_shader_stateobj *so, - struct fd_vertex_stateobj *vtx) +patch_vtx_fetch(struct fd_context *ctx, struct pipe_vertex_element *elem, + instr_fetch_vtx_t *instr, uint16_t dst_swiz) { - unsigned i; - - assert(so->num_vfetch_instrs == vtx->num_elements); - - /* update vtx fetch instructions: */ - for (i = 0; i < so->num_vfetch_instrs; i++) { - struct ir2_instruction *instr = so->vfetch_instrs[i]; - struct pipe_vertex_element *elem = &vtx->pipe[i]; - struct pipe_vertex_buffer *vb = + struct pipe_vertex_buffer *vb = 
&ctx->vtx.vertexbuf.vb[elem->vertex_buffer_index]; - enum pipe_format format = elem->src_format; - const struct util_format_description *desc = - util_format_description(format); - unsigned j; - - /* Find the first non-VOID channel. */ - for (j = 0; j < 4; j++) - if (desc->channel[j].type != UTIL_FORMAT_TYPE_VOID) - break; - - /* CI/CIS can probably be set in compiler instead: */ - instr->fetch.const_idx = 20 + (i / 3); - instr->fetch.const_idx_sel = i % 3; - - instr->fetch.fmt = fd2_pipe2surface(format); - instr->fetch.is_normalized = desc->channel[j].normalized; - instr->fetch.is_signed = - desc->channel[j].type == UTIL_FORMAT_TYPE_SIGNED; - instr->fetch.stride = vb->stride ? : 1; - instr->fetch.offset = elem->src_offset; - - for (j = 0; j < 4; j++) - instr->dst_reg.swizzle[j] = "xyzw01__"[desc->swizzle[j]]; - - assert(instr->fetch.fmt != ~0); - - DBG("vtx[%d]: %s (%d), ci=%d, cis=%d, id=%d, swizzle=%s, " - "stride=%d, offset=%d", - i, util_format_name(format), - instr->fetch.fmt, - instr->fetch.const_idx, - instr->fetch.const_idx_sel, - elem->instance_divisor, - instr->dst_reg.swizzle, - instr->fetch.stride, - instr->fetch.offset); + enum pipe_format format = elem->src_format; + const struct util_format_description *desc = + util_format_description(format); + unsigned j; + + /* Find the first non-VOID channel. */ + for (j = 0; j < 4; j++) + if (desc->channel[j].type != UTIL_FORMAT_TYPE_VOID) + break; + + instr->format = fd2_pipe2surface(format); + instr->num_format_all = !desc->channel[j].normalized; + instr->format_comp_all = desc->channel[j].type == UTIL_FORMAT_TYPE_SIGNED; + instr->stride = vb->stride; + instr->offset = elem->src_offset; + + unsigned swiz = 0; + for (int i = 0; i < 4; i++) { + unsigned s = dst_swiz >> i*3 & 7; + swiz |= (s >= 4 ? 
s : desc->swizzle[s]) << i*3; } - - /* trigger re-assemble: */ - so->info.sizedwords = 0; + instr->dst_swiz = swiz; } static void -patch_tex_fetches(struct fd_context *ctx, struct fd2_shader_stateobj *so, - struct fd_texture_stateobj *tex) +patch_fetches(struct fd_context *ctx, struct ir2_shader_info *info, + struct fd_vertex_stateobj *vtx, struct fd_texture_stateobj *tex) { - unsigned i; - - /* update tex fetch instructions: */ - for (i = 0; i < so->num_tfetch_instrs; i++) { - struct ir2_instruction *instr = so->tfetch_instrs[i].instr; - unsigned samp_id = so->tfetch_instrs[i].samp_id; - unsigned const_idx = fd2_get_const_idx(ctx, tex, samp_id); + for (int i = 0; i < info->num_fetch_instrs; i++) { + struct ir2_fetch_info *fi = &info->fetch_info[i]; + + instr_fetch_t *instr = (instr_fetch_t*) &info->dwords[fi->offset]; + if (instr->opc == VTX_FETCH) { + unsigned idx = (instr->vtx.const_index - 20) * 3 + + instr->vtx.const_index_sel; + patch_vtx_fetch(ctx, &vtx->pipe[idx], &instr->vtx, fi->vtx.dst_swiz); + continue; + } - if (const_idx != instr->fetch.const_idx) { - instr->fetch.const_idx = const_idx; - /* trigger re-assemble: */ - so->info.sizedwords = 0; + assert(instr->opc == TEX_FETCH); + instr->tex.const_idx = fd2_get_const_idx(ctx, tex, fi->tex.samp_id); + instr->tex.src_swiz = fi->tex.src_swiz; + if (fd2_texture_swap_xy(tex, fi->tex.samp_id)) { + unsigned x = instr->tex.src_swiz; + instr->tex.src_swiz = (x & 0x30) | (x & 3) << 2 | (x >> 2 & 3); } } } void -fd2_program_validate(struct fd_context *ctx) +fd2_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, + struct fd_program_stateobj *prog) { - struct fd_program_stateobj *prog = &ctx->prog; - bool dirty_fp = !!(ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & FD_DIRTY_SHADER_PROG); - bool dirty_vp = !!(ctx->dirty_shader[PIPE_SHADER_VERTEX] & FD_DIRTY_SHADER_PROG); - - /* if vertex or frag shader is dirty, we may need to recompile. 
Compile - * frag shader first, as that assigns the register slots for exports - * from the vertex shader. And therefore if frag shader has changed we - * need to recompile both vert and frag shader. - */ - if (dirty_fp) - compile(prog, prog->fp); - - if (dirty_fp || dirty_vp) - compile(prog, prog->vp); - - /* if necessary, fix up vertex fetch instructions: */ - if (ctx->dirty & (FD_DIRTY_VTXSTATE | FD_DIRTY_PROG)) - patch_vtx_fetches(ctx, prog->vp, ctx->vtx.vtx); - - /* if necessary, fix up texture fetch instructions: */ - if (ctx->dirty & (FD_DIRTY_TEXSTATE | FD_DIRTY_PROG)) { - patch_tex_fetches(ctx, prog->vp, &ctx->tex[PIPE_SHADER_VERTEX]); - patch_tex_fetches(ctx, prog->fp, &ctx->tex[PIPE_SHADER_FRAGMENT]); + struct fd2_shader_stateobj *fp = NULL, *vp; + struct ir2_shader_info *fpi, *vpi; + struct ir2_frag_linkage *f; + uint8_t vs_gprs, fs_gprs = 0, vs_export = 0; + enum a2xx_sq_ps_vtx_mode mode = POSITION_1_VECTOR; + bool binning = (ctx->batch && ring == ctx->batch->binning); + unsigned variant = 0; + + vp = prog->vp; + + /* find variant matching the linked fragment shader */ + if (!binning) { + fp = prog->fp; + for (variant = 1; variant < ARRAY_SIZE(vp->variant); variant++) { + /* if checked all variants, compile a new variant */ + if (!vp->variant[variant].info.sizedwords) { + ir2_compile(vp, variant, fp); + break; + } + + /* check if fragment shader linkage matches */ + if (!memcmp(&vp->variant[variant].f, &fp->variant[0].f, + sizeof(struct ir2_frag_linkage))) + break; + } + assert(variant < ARRAY_SIZE(vp->variant)); } -} -void -fd2_program_emit(struct fd_ringbuffer *ring, - struct fd_program_stateobj *prog) -{ - struct ir2_shader_info *vsi = - &((struct fd2_shader_stateobj *)prog->vp)->info; - struct ir2_shader_info *fsi = - &((struct fd2_shader_stateobj *)prog->fp)->info; - uint8_t vs_gprs, fs_gprs, vs_export; + vpi = &vp->variant[variant].info; + fpi = &fp->variant[0].info; + f = &fp->variant[0].f; + + /* clear/gmem2mem/mem2gmem need to be changed to 
remove this condition */ + if (prog != &ctx->solid_prog && prog != &ctx->blit_prog[0]) { + patch_fetches(ctx, vpi, ctx->vtx.vtx, &ctx->tex[PIPE_SHADER_VERTEX]); + if (fp) + patch_fetches(ctx, fpi, NULL, &ctx->tex[PIPE_SHADER_FRAGMENT]); + } - emit(ring, prog->vp); - emit(ring, prog->fp); + emit(ring, MESA_SHADER_VERTEX, vpi); - vs_gprs = (vsi->max_reg < 0) ? 0x80 : vsi->max_reg; - fs_gprs = (fsi->max_reg < 0) ? 0x80 : fsi->max_reg; - vs_export = MAX2(1, prog->num_exports) - 1; + if (fp) { + emit(ring, MESA_SHADER_FRAGMENT, fpi); + fs_gprs = (fpi->max_reg < 0) ? 0x80 : fpi->max_reg; + vs_export = MAX2(1, f->inputs_count) - 1; + } + + vs_gprs = (vpi->max_reg < 0) ? 0x80 : vpi->max_reg; + + if (vp->writes_psize && !binning) + mode = POSITION_2_VECTORS_SPRITE; + + /* set register to use for param (fragcoord/pointcoord/frontfacing) */ + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_SQ_CONTEXT_MISC)); + OUT_RING(ring, A2XX_SQ_CONTEXT_MISC_SC_SAMPLE_CNTL(CENTERS_ONLY) | + COND(fp, A2XX_SQ_CONTEXT_MISC_PARAM_GEN_POS(f->inputs_count)) | + /* we need SCREEN_XY for both fragcoord and frontfacing */ + A2XX_SQ_CONTEXT_MISC_SC_OUTPUT_SCREEN_XY); OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_SQ_PROGRAM_CNTL)); - OUT_RING(ring, A2XX_SQ_PROGRAM_CNTL_PS_EXPORT_MODE(POSITION_2_VECTORS_SPRITE) | + OUT_RING(ring, A2XX_SQ_PROGRAM_CNTL_PS_EXPORT_MODE(2) | + A2XX_SQ_PROGRAM_CNTL_VS_EXPORT_MODE(mode) | A2XX_SQ_PROGRAM_CNTL_VS_RESOURCE | A2XX_SQ_PROGRAM_CNTL_PS_RESOURCE | A2XX_SQ_PROGRAM_CNTL_VS_EXPORT_COUNT(vs_export) | A2XX_SQ_PROGRAM_CNTL_PS_REGS(fs_gprs) | - A2XX_SQ_PROGRAM_CNTL_VS_REGS(vs_gprs)); -} - -/* Creates shader: - * EXEC ADDR(0x2) CNT(0x1) - * (S)FETCH: SAMPLE R0.xyzw = R0.xyx CONST(0) LOCATION(CENTER) - * ALLOC PARAM/PIXEL SIZE(0x0) - * EXEC_END ADDR(0x3) CNT(0x1) - * ALU: MAXv export0 = R0, R0 ; gl_FragColor - * NOP - */ -static struct fd2_shader_stateobj * -create_blit_fp(void) -{ - struct fd2_shader_stateobj *so = 
create_shader(MESA_SHADER_FRAGMENT); - struct ir2_instruction *instr; - - if (!so) - return NULL; - - so->ir = ir2_shader_create(); - - instr = ir2_instr_create_tex_fetch(so->ir, 0); - ir2_dst_create(instr, 0, "xyzw", 0); - ir2_reg_create(instr, 0, "xyx", IR2_REG_INPUT); - instr->sync = true; - - instr = ir2_instr_create_alu_v(so->ir, MAXv); - ir2_dst_create(instr, 0, NULL, IR2_REG_EXPORT); - ir2_reg_create(instr, 0, NULL, 0); - ir2_reg_create(instr, 0, NULL, 0); - - return assemble(so); -} - -/* Creates shader: -* EXEC ADDR(0x3) CNT(0x2) -* FETCH: VERTEX R1.xy01 = R0.x FMT_32_32_FLOAT UNSIGNED STRIDE(8) CONST(26, 1) -* FETCH: VERTEX R2.xyz1 = R0.x FMT_32_32_32_FLOAT UNSIGNED STRIDE(12) CONST(26, 0) -* ALLOC POSITION SIZE(0x0) -* EXEC ADDR(0x5) CNT(0x1) -* ALU: MAXv export62 = R2, R2 ; gl_Position -* ALLOC PARAM/PIXEL SIZE(0x0) -* EXEC_END ADDR(0x6) CNT(0x1) -* ALU: MAXv export0 = R1, R1 -* NOP - */ -static struct fd2_shader_stateobj * -create_blit_vp(void) -{ - struct fd2_shader_stateobj *so = create_shader(MESA_SHADER_VERTEX); - struct ir2_instruction *instr; - - if (!so) - return NULL; - - so->ir = ir2_shader_create(); - - instr = ir2_instr_create_vtx_fetch(so->ir, 26, 1, FMT_32_32_FLOAT, false, 8); - instr->fetch.is_normalized = true; - ir2_dst_create(instr, 1, "xy01", 0); - ir2_reg_create(instr, 0, "x", IR2_REG_INPUT); - - instr = ir2_instr_create_vtx_fetch(so->ir, 26, 0, FMT_32_32_32_FLOAT, false, 12); - instr->fetch.is_normalized = true; - ir2_dst_create(instr, 2, "xyz1", 0); - ir2_reg_create(instr, 0, "x", IR2_REG_INPUT); - - instr = ir2_instr_create_alu_v(so->ir, MAXv); - ir2_dst_create(instr, 62, NULL, IR2_REG_EXPORT); - ir2_reg_create(instr, 2, NULL, 0); - ir2_reg_create(instr, 2, NULL, 0); - - instr = ir2_instr_create_alu_v(so->ir, MAXv); - ir2_dst_create(instr, 0, NULL, IR2_REG_EXPORT); - ir2_reg_create(instr, 1, NULL, 0); - ir2_reg_create(instr, 1, NULL, 0); - - return assemble(so); -} - -/* Creates shader: - * ALLOC PARAM/PIXEL SIZE(0x0) - * EXEC_END 
ADDR(0x1) CNT(0x1) - * ALU: MAXv export0 = C0, C0 ; gl_FragColor - */ -static struct fd2_shader_stateobj * -create_solid_fp(void) -{ - struct fd2_shader_stateobj *so = create_shader(MESA_SHADER_FRAGMENT); - struct ir2_instruction *instr; - - if (!so) - return NULL; - - so->ir = ir2_shader_create(); - - instr = ir2_instr_create_alu_v(so->ir, MAXv); - ir2_dst_create(instr, 0, NULL, IR2_REG_EXPORT); - ir2_reg_create(instr, 0, NULL, IR2_REG_CONST); - ir2_reg_create(instr, 0, NULL, IR2_REG_CONST); - - return assemble(so); -} - -/* Creates shader: - * EXEC ADDR(0x3) CNT(0x1) - * (S)FETCH: VERTEX R1.xyz1 = R0.x FMT_32_32_32_FLOAT - * UNSIGNED STRIDE(12) CONST(26, 0) - * ALLOC POSITION SIZE(0x0) - * EXEC ADDR(0x4) CNT(0x1) - * ALU: MAXv export62 = R1, R1 ; gl_Position - * ALLOC PARAM/PIXEL SIZE(0x0) - * EXEC_END ADDR(0x5) CNT(0x0) - */ -static struct fd2_shader_stateobj * -create_solid_vp(void) -{ - struct fd2_shader_stateobj *so = create_shader(MESA_SHADER_VERTEX); - struct ir2_instruction *instr; - - if (!so) - return NULL; - - so->ir = ir2_shader_create(); - - instr = ir2_instr_create_vtx_fetch(so->ir, 26, 0, FMT_32_32_32_FLOAT, false, 12); - ir2_dst_create(instr, 1, "xyz1", 0); - ir2_reg_create(instr, 0, "x", IR2_REG_INPUT); - - instr = ir2_instr_create_alu_v(so->ir, MAXv); - ir2_dst_create(instr, 62, NULL, IR2_REG_EXPORT); - ir2_reg_create(instr, 1, NULL, 0); - ir2_reg_create(instr, 1, NULL, 0); - - - return assemble(so); + A2XX_SQ_PROGRAM_CNTL_VS_REGS(vs_gprs) | + COND(fp && fp->need_param, A2XX_SQ_PROGRAM_CNTL_PARAM_GEN) | + COND(!fp, A2XX_SQ_PROGRAM_CNTL_GEN_INDEX_VTX)); } void fd2_prog_init(struct pipe_context *pctx) { struct fd_context *ctx = fd_context(pctx); + struct fd_program_stateobj *prog; + struct fd2_shader_stateobj *so; + struct ir2_shader_info *info; + instr_fetch_vtx_t *instr; pctx->create_fs_state = fd2_fp_state_create; pctx->delete_fs_state = fd2_fp_state_delete; @@ -442,8 +312,47 @@ fd2_prog_init(struct pipe_context *pctx) fd_prog_init(pctx); - 
ctx->solid_prog.fp = create_solid_fp(); - ctx->solid_prog.vp = create_solid_vp(); - ctx->blit_prog[0].fp = create_blit_fp(); - ctx->blit_prog[0].vp = create_blit_vp(); + /* XXX maybe its possible to reuse patch_vtx_fetch somehow? */ + + prog = &ctx->solid_prog; + so = prog->vp; + ir2_compile(prog->vp, 1, prog->fp); + +#define IR2_FETCH_SWIZ_XY01 0xb08 +#define IR2_FETCH_SWIZ_XYZ1 0xa88 + + info = &so->variant[1].info; + + instr = (instr_fetch_vtx_t*) &info->dwords[info->fetch_info[0].offset]; + instr->const_index = 26; + instr->const_index_sel = 0; + instr->format = FMT_32_32_32_FLOAT; + instr->format_comp_all = false; + instr->stride = 12; + instr->num_format_all = true; + instr->dst_swiz = IR2_FETCH_SWIZ_XYZ1; + + prog = &ctx->blit_prog[0]; + so = prog->vp; + ir2_compile(prog->vp, 1, prog->fp); + + info = &so->variant[1].info; + + instr = (instr_fetch_vtx_t*) &info->dwords[info->fetch_info[0].offset]; + instr->const_index = 26; + instr->const_index_sel = 1; + instr->format = FMT_32_32_FLOAT; + instr->format_comp_all = false; + instr->stride = 8; + instr->num_format_all = false; + instr->dst_swiz = IR2_FETCH_SWIZ_XY01; + + instr = (instr_fetch_vtx_t*) &info->dwords[info->fetch_info[1].offset]; + instr->const_index = 26; + instr->const_index_sel = 0; + instr->format = FMT_32_32_32_FLOAT; + instr->format_comp_all = false; + instr->stride = 12; + instr->num_format_all = false; + instr->dst_swiz = IR2_FETCH_SWIZ_XYZ1; } diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_program.h b/src/gallium/drivers/freedreno/a2xx/fd2_program.h index 01e9983555e..d4ac93bfed3 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_program.h +++ b/src/gallium/drivers/freedreno/a2xx/fd2_program.h @@ -31,48 +31,38 @@ #include "freedreno_context.h" -#include "ir-a2xx.h" +#include "ir2.h" #include "disasm.h" struct fd2_shader_stateobj { + nir_shader *nir; gl_shader_stage type; + bool is_a20x; - uint32_t *bin; - - struct tgsi_token *tokens; - - /* note that we defer compiling shader until we 
know both vs and ps.. - * and if one changes, we potentially need to recompile in order to - * get varying linkages correct: - */ - struct ir2_shader_info info; - struct ir2_shader *ir; - - /* for vertex shaders, the fetch instructions which need to be - * patched up before assembly: - */ - unsigned num_vfetch_instrs; - struct ir2_instruction *vfetch_instrs[64]; - - /* for all shaders, any tex fetch instructions which need to be - * patched before assembly: + /* note: using same set of immediates for all variants + * it doesn't matter, other than the slightly larger command stream */ - unsigned num_tfetch_instrs; - struct { - unsigned samp_id; - struct ir2_instruction *instr; - } tfetch_instrs[64]; - unsigned first_immediate; /* const reg # of first immediate */ unsigned num_immediates; struct { uint32_t val[4]; + unsigned ncomp; } immediates[64]; + + bool writes_psize; + bool need_param; + + /* note: + * fragment shader only has one variant + * first vertex shader variant is always binning shader + * we should use a dynamic array but in normal case there is + * only 2 variants (and 3 sometimes with GALLIUM_HUD) + */ + struct ir2_shader_variant variant[8]; }; -void fd2_program_emit(struct fd_ringbuffer *ring, +void fd2_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, struct fd_program_stateobj *prog); -void fd2_program_validate(struct fd_context *ctx); void fd2_prog_init(struct pipe_context *pctx); diff --git a/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h b/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h index 5a9f93ec794..2591062ee3c 100644 --- a/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h +++ b/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h @@ -87,6 +87,7 @@ typedef enum { SIN = 48, COS = 49, RETAIN_PREV = 50, + SCALAR_NONE = 63, } instr_scalar_opc_t; typedef enum { @@ -120,6 +121,7 @@ typedef enum { KILLNEv = 27, DSTv = 28, MOVAv = 29, + VECTOR_NONE = 31, } instr_vector_opc_t; typedef struct PACKED { @@ -161,9 +163,9 @@ typedef struct 
PACKED { }; /* constants have full 8-bit index */ struct { - uint8_t src3_reg_const : 8; - uint8_t src2_reg_const : 8; - uint8_t src1_reg_const : 8; + uint8_t src3_reg_byte : 8; + uint8_t src2_reg_byte : 8; + uint8_t src1_reg_byte : 8; }; }; instr_vector_opc_t vector_opc : 5; @@ -389,10 +391,17 @@ typedef union PACKED { instr_fetch_opc_t opc : 5; uint32_t dummy0 : 27; /* dword1: */ - uint32_t dummy1 : 32; + uint32_t dummy1 : 31; + uint8_t pred_select : 1; /* dword2: */ - uint32_t dummy2 : 32; + uint32_t dummy2 : 31; + uint8_t pred_condition : 1; }; } instr_fetch_t; +typedef union PACKED { + instr_alu_t alu; + instr_fetch_t fetch; +} instr_t; + #endif /* INSTR_H_ */ diff --git a/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c b/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c deleted file mode 100644 index af9811864ff..00000000000 --- a/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c +++ /dev/null @@ -1,809 +0,0 @@ -/* - * Copyright (c) 2012 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "ir-a2xx.h" - -#include -#include -#include -#include - -#include "freedreno_util.h" -#include "instr-a2xx.h" - -#define DEBUG_MSG(f, ...) do { if (0) DBG(f, ##__VA_ARGS__); } while (0) -#define WARN_MSG(f, ...) DBG("WARN: "f, ##__VA_ARGS__) -#define ERROR_MSG(f, ...) DBG("ERROR: "f, ##__VA_ARGS__) - -static int instr_emit(struct ir2_instruction *instr, uint32_t *dwords, - uint32_t idx, struct ir2_shader_info *info); - -static uint32_t reg_fetch_src_swiz(struct ir2_src_register *reg, uint32_t n); -static uint32_t reg_fetch_dst_swiz(struct ir2_dst_register *reg); -static uint32_t reg_alu_dst_swiz(struct ir2_dst_register *reg); -static uint32_t reg_alu_src_swiz(struct ir2_src_register *reg); - -/* simple allocator to carve allocations out of an up-front allocated heap, - * so that we can free everything easily in one shot. 
- */ -static void * ir2_alloc(struct ir2_shader *shader, int sz) -{ - void *ptr = &shader->heap[shader->heap_idx]; - shader->heap_idx += align(sz, 4) / 4; - return ptr; -} - -static char * ir2_strdup(struct ir2_shader *shader, const char *str) -{ - char *ptr = NULL; - if (str) { - int len = strlen(str); - ptr = ir2_alloc(shader, len+1); - memcpy(ptr, str, len); - ptr[len] = '\0'; - } - return ptr; -} - -struct ir2_shader * ir2_shader_create(void) -{ - DEBUG_MSG(""); - struct ir2_shader *shader = calloc(1, sizeof(struct ir2_shader)); - shader->max_reg = -1; - return shader; -} - -void ir2_shader_destroy(struct ir2_shader *shader) -{ - DEBUG_MSG(""); - free(shader); -} - -/* check if an instruction is a simple MOV - */ -static struct ir2_instruction * simple_mov(struct ir2_instruction *instr, - bool output) -{ - struct ir2_src_register *src_reg = instr->src_reg; - struct ir2_dst_register *dst_reg = &instr->dst_reg; - struct ir2_register *reg; - unsigned i; - - /* MAXv used for MOV */ - if (instr->instr_type != IR2_ALU_VECTOR || - instr->alu_vector.opc != MAXv) - return NULL; - - /* non identical srcs */ - if (src_reg[0].num != src_reg[1].num) - return NULL; - - /* flags */ - int flags = IR2_REG_NEGATE | IR2_REG_ABS; - if (output) - flags |= IR2_REG_INPUT | IR2_REG_CONST; - if ((src_reg[0].flags & flags) || (src_reg[1].flags & flags)) - return NULL; - - /* clamping */ - if (instr->alu_vector.clamp) - return NULL; - - /* swizzling */ - for (i = 0; i < 4; i++) { - char swiz = (dst_reg->swizzle ? dst_reg->swizzle : "xyzw")[i]; - if (swiz == '_') - continue; - - if (swiz != (src_reg[0].swizzle ? src_reg[0].swizzle : "xyzw")[i] || - swiz != (src_reg[1].swizzle ? 
src_reg[1].swizzle : "xyzw")[i]) - return NULL; - } - - if (output) - reg = &instr->shader->reg[src_reg[0].num]; - else - reg = &instr->shader->reg[dst_reg->num]; - - assert(reg->write_idx >= 0); - if (reg->write_idx != reg->write_idx2) - return NULL; - - if (!output) - return instr; - - instr = instr->shader->instr[reg->write_idx]; - return instr->instr_type != IR2_ALU_VECTOR ? NULL : instr; -} - -static int src_to_reg(struct ir2_instruction *instr, - struct ir2_src_register *reg) -{ - if (reg->flags & IR2_REG_CONST) - return reg->num; - - return instr->shader->reg[reg->num].reg; -} - -static int dst_to_reg(struct ir2_instruction *instr, - struct ir2_dst_register *reg) -{ - if (reg->flags & IR2_REG_EXPORT) - return reg->num; - - return instr->shader->reg[reg->num].reg; -} - -static bool mask_get(uint32_t *mask, unsigned index) -{ - return !!(mask[index / 32] & 1 << index % 32); -} - -static void mask_set(uint32_t *mask, struct ir2_register *reg, int index) -{ - if (reg) { - unsigned i; - for (i = 0; i < ARRAY_SIZE(reg->regmask); i++) - mask[i] |= reg->regmask[i]; - } - if (index >= 0) - mask[index / 32] |= 1 << index % 32; -} - -static bool sets_pred(struct ir2_instruction *instr) -{ - return instr->instr_type == IR2_ALU_SCALAR && - instr->alu_scalar.opc >= PRED_SETEs && - instr->alu_scalar.opc <= PRED_SET_RESTOREs; -} - - - -void* ir2_shader_assemble(struct ir2_shader *shader, - struct ir2_shader_info *info) -{ - /* NOTES - * blob compiler seems to always puts PRED_* instrs in a CF by - * themselves, and wont combine EQ/NE in the same CF - * (not doing this - doesn't seem to make a difference) - * - * TODO: implement scheduling for combining vector+scalar instructions - * -some vector instructions can be replaced by scalar - */ - - /* first step: - * 1. remove "NOP" MOV instructions generated by TGSI for input/output: - * 2. track information for register allocation, and to remove - * the dead code when some exports are not needed - * 3. 
add additional instructions for a20x hw binning if needed - * NOTE: modifies the shader instrs - * this step could be done as instructions are added by compiler instead - */ - - /* mask of exports that must be generated - * used to avoid calculating ps exports with hw binning - */ - uint64_t export = ~0ull; - /* bitmask of variables required for exports defined by "export" */ - uint32_t export_mask[REG_MASK/32+1] = {}; - - unsigned idx, reg_idx; - unsigned max_input = 0; - int export_size = -1; - - for (idx = 0; idx < shader->instr_count; idx++) { - struct ir2_instruction *instr = shader->instr[idx], *prev; - struct ir2_dst_register dst_reg = instr->dst_reg; - - if (dst_reg.flags & IR2_REG_EXPORT) { - if (dst_reg.num < 32) - export_size++; - - if ((prev = simple_mov(instr, true))) { - /* copy instruction but keep dst */ - *instr = *prev; - instr->dst_reg = dst_reg; - } - } - - for (reg_idx = 0; reg_idx < instr->src_reg_count; reg_idx++) { - struct ir2_src_register *src_reg = &instr->src_reg[reg_idx]; - struct ir2_register *reg; - int num; - - if (src_reg->flags & IR2_REG_CONST) - continue; - - num = src_reg->num; - reg = &shader->reg[num]; - reg->read_idx = idx; - - if (src_reg->flags & IR2_REG_INPUT) { - max_input = MAX2(max_input, num); - } else { - /* bypass simple mov used to set src_reg */ - assert(reg->write_idx >= 0); - prev = shader->instr[reg->write_idx]; - if (simple_mov(prev, false)) { - *src_reg = prev->src_reg[0]; - /* process same src_reg again */ - reg_idx -= 1; - continue; - } - } - - /* update dependencies */ - uint32_t *mask = (dst_reg.flags & IR2_REG_EXPORT) ? 
- export_mask : shader->reg[dst_reg.num].regmask; - mask_set(mask, reg, num); - if (sets_pred(instr)) - mask_set(export_mask, reg, num); - } - } - - /* second step: - * emit instructions (with CFs) + RA - */ - instr_cf_t cfs[128], *cf = cfs; - uint32_t alufetch[3*256], *af = alufetch; - - /* RA is done on write, so inputs must be allocated here */ - for (reg_idx = 0; reg_idx <= max_input; reg_idx++) - shader->reg[reg_idx].reg = reg_idx; - info->max_reg = max_input; - - /* CF instr state */ - instr_cf_exec_t exec = { .opc = EXEC }; - instr_cf_alloc_t alloc = { .opc = ALLOC }; - bool need_alloc = 0; - bool pos_export = 0; - - export_size = MAX2(export_size, 0); - - for (idx = 0; idx < shader->instr_count; idx++) { - struct ir2_instruction *instr = shader->instr[idx]; - struct ir2_dst_register *dst_reg = &instr->dst_reg; - unsigned num = dst_reg->num; - struct ir2_register *reg; - - /* a2xx only has 64 registers, so we can use a single 64-bit mask */ - uint64_t regmask = 0ull; - - /* compute the current regmask */ - for (reg_idx = 0; (int) reg_idx <= shader->max_reg; reg_idx++) { - reg = &shader->reg[reg_idx]; - if ((int) idx > reg->write_idx && idx < reg->read_idx) - regmask |= (1ull << reg->reg); - } - - if (dst_reg->flags & IR2_REG_EXPORT) { - /* skip if export is not needed */ - if (!(export & (1ull << num))) - continue; - - /* ALLOC CF: - * want to alloc all < 32 at once - * 32/33 and 62/63 come in pairs - * XXX assuming all 3 types are never interleaved - */ - if (num < 32) { - alloc.size = export_size; - alloc.buffer_select = SQ_PARAMETER_PIXEL; - need_alloc = export_size >= 0; - export_size = -1; - } else if (num == 32 || num == 33) { - alloc.size = 0; - alloc.buffer_select = SQ_MEMORY; - need_alloc = num != 33; - } else { - alloc.size = 0; - alloc.buffer_select = SQ_POSITION; - need_alloc = !pos_export; - pos_export = true; - } - - } else { - /* skip if dst register not needed to compute exports */ - if (!mask_get(export_mask, num)) - continue; - - /* RA on 
first write */ - reg = &shader->reg[num]; - if (reg->write_idx == idx) { - reg->reg = ffsll(~regmask) - 1; - info->max_reg = MAX2(info->max_reg, reg->reg); - } - } - - if (exec.count == 6 || (exec.count && need_alloc)) { - *cf++ = *(instr_cf_t*) &exec; - exec.address += exec.count; - exec.serialize = 0; - exec.count = 0; - } - - if (need_alloc) { - *cf++ = *(instr_cf_t*) &alloc; - need_alloc = false; - } - - int ret = instr_emit(instr, af, idx, info); af += 3; - assert(!ret); - - if (instr->instr_type == IR2_FETCH) - exec.serialize |= 0x1 << exec.count * 2; - if (instr->sync) - exec.serialize |= 0x2 << exec.count * 2; - exec.count += 1; - } - - - exec.opc = !export_size ? EXEC : EXEC_END; - *cf++ = *(instr_cf_t*) &exec; - exec.address += exec.count; - exec.serialize = 0; - exec.count = 0; - - /* GPU will hang without at least one pixel alloc */ - if (!export_size) { - alloc.size = 0; - alloc.buffer_select = SQ_PARAMETER_PIXEL; - *cf++ = *(instr_cf_t*) &alloc; - - exec.opc = EXEC_END; - *cf++ = *(instr_cf_t*) &exec; - } - - unsigned num_cfs = cf - cfs; - - /* insert nop to get an even # of CFs */ - if (num_cfs % 2) { - *cf++ = (instr_cf_t) { .opc = NOP }; - num_cfs++; - } - - /* offset cf addrs */ - for (idx = 0; idx < num_cfs; idx++) { - switch (cfs[idx].opc) { - case EXEC: - case EXEC_END: - cfs[idx].exec.address += num_cfs / 2; - break; - default: - break; - /* XXX and any other address using cf that gets implemented */ - } - } - - /* concatenate cfs+alufetchs */ - uint32_t cfdwords = num_cfs / 2 * 3; - uint32_t alufetchdwords = exec.address * 3; - info->sizedwords = cfdwords + alufetchdwords; - uint32_t *dwords = malloc(info->sizedwords * 4); - assert(dwords); - memcpy(dwords, cfs, cfdwords * 4); - memcpy(&dwords[cfdwords], alufetch, alufetchdwords * 4); - return dwords; -} - -struct ir2_instruction * ir2_instr_create(struct ir2_shader *shader, - int instr_type) -{ - struct ir2_instruction *instr = - ir2_alloc(shader, sizeof(struct ir2_instruction)); - 
DEBUG_MSG("%d", instr_type); - instr->shader = shader; - instr->idx = shader->instr_count; - instr->pred = shader->pred; - instr->instr_type = instr_type; - shader->instr[shader->instr_count++] = instr; - return instr; -} - - -/* - * FETCH instructions: - */ - -static int instr_emit_fetch(struct ir2_instruction *instr, - uint32_t *dwords, uint32_t idx, - struct ir2_shader_info *info) -{ - instr_fetch_t *fetch = (instr_fetch_t *)dwords; - struct ir2_dst_register *dst_reg = &instr->dst_reg; - struct ir2_src_register *src_reg = &instr->src_reg[0]; - - memset(fetch, 0, sizeof(*fetch)); - - fetch->opc = instr->fetch.opc; - - if (instr->fetch.opc == VTX_FETCH) { - instr_fetch_vtx_t *vtx = &fetch->vtx; - - assert(instr->fetch.stride <= 0xff); - assert(instr->fetch.fmt <= 0x3f); - assert(instr->fetch.const_idx <= 0x1f); - assert(instr->fetch.const_idx_sel <= 0x3); - - vtx->src_reg = src_to_reg(instr, src_reg); - vtx->src_swiz = reg_fetch_src_swiz(src_reg, 1); - vtx->dst_reg = dst_to_reg(instr, dst_reg); - vtx->dst_swiz = reg_fetch_dst_swiz(dst_reg); - vtx->must_be_one = 1; - vtx->const_index = instr->fetch.const_idx; - vtx->const_index_sel = instr->fetch.const_idx_sel; - vtx->format_comp_all = !!instr->fetch.is_signed; - vtx->num_format_all = !instr->fetch.is_normalized; - vtx->format = instr->fetch.fmt; - vtx->stride = instr->fetch.stride; - vtx->offset = instr->fetch.offset; - - if (instr->pred != IR2_PRED_NONE) { - vtx->pred_select = 1; - vtx->pred_condition = (instr->pred == IR2_PRED_EQ) ? 1 : 0; - } - - /* XXX seems like every FETCH but the first has - * this bit set: - */ - vtx->reserved3 = (idx > 0) ? 0x1 : 0x0; - vtx->reserved0 = (idx > 0) ? 
0x2 : 0x3; - } else if (instr->fetch.opc == TEX_FETCH) { - instr_fetch_tex_t *tex = &fetch->tex; - - assert(instr->fetch.const_idx <= 0x1f); - - tex->src_reg = src_to_reg(instr, src_reg); - tex->src_swiz = reg_fetch_src_swiz(src_reg, 3); - tex->dst_reg = dst_to_reg(instr, dst_reg); - tex->dst_swiz = reg_fetch_dst_swiz(dst_reg); - tex->const_idx = instr->fetch.const_idx; - tex->mag_filter = TEX_FILTER_USE_FETCH_CONST; - tex->min_filter = TEX_FILTER_USE_FETCH_CONST; - tex->mip_filter = TEX_FILTER_USE_FETCH_CONST; - tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST; - tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST; - tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST; - tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST; - tex->use_comp_lod = 1; - tex->use_reg_lod = !instr->fetch.is_cube; - tex->sample_location = SAMPLE_CENTER; - tex->tx_coord_denorm = instr->fetch.is_rect; - - if (instr->pred != IR2_PRED_NONE) { - tex->pred_select = 1; - tex->pred_condition = (instr->pred == IR2_PRED_EQ) ? 1 : 0; - } - - } else { - ERROR_MSG("invalid fetch opc: %d\n", instr->fetch.opc); - return -1; - } - - return 0; -} - -/* - * ALU instructions: - */ - -static int instr_emit_alu(struct ir2_instruction *instr_v, - struct ir2_instruction *instr_s, uint32_t *dwords, - struct ir2_shader_info *info) -{ - instr_alu_t *alu = (instr_alu_t *)dwords; - struct ir2_dst_register *vdst_reg, *sdst_reg; - struct ir2_src_register *src1_reg, *src2_reg, *src3_reg; - struct ir2_shader *shader = instr_v ? 
instr_v->shader : instr_s->shader; - enum ir2_pred pred = IR2_PRED_NONE; - - memset(alu, 0, sizeof(*alu)); - - vdst_reg = NULL; - sdst_reg = NULL; - src1_reg = NULL; - src2_reg = NULL; - src3_reg = NULL; - - if (instr_v) { - vdst_reg = &instr_v->dst_reg; - assert(instr_v->src_reg_count >= 2); - src1_reg = &instr_v->src_reg[0]; - src2_reg = &instr_v->src_reg[1]; - if (instr_v->src_reg_count > 2) - src3_reg = &instr_v->src_reg[2]; - pred = instr_v->pred; - } - - if (instr_s) { - sdst_reg = &instr_s->dst_reg; - assert(instr_s->src_reg_count == 1); - assert(!instr_v || vdst_reg->flags == sdst_reg->flags); - assert(!instr_v || pred == instr_s->pred); - if (src3_reg) { - assert(src3_reg->flags == instr_s->src_reg[0].flags); - assert(src3_reg->num == instr_s->src_reg[0].num); - assert(!strcmp(src3_reg->swizzle, instr_s->src_reg[0].swizzle)); - } - src3_reg = &instr_s->src_reg[0]; - pred = instr_s->pred; - } - - if (vdst_reg) { - assert((vdst_reg->flags & ~IR2_REG_EXPORT) == 0); - assert(!vdst_reg->swizzle || (strlen(vdst_reg->swizzle) == 4)); - alu->vector_opc = instr_v->alu_vector.opc; - alu->vector_write_mask = reg_alu_dst_swiz(vdst_reg); - alu->vector_dest = dst_to_reg(instr_v, vdst_reg); - } else { - alu->vector_opc = MAXv; - } - - if (sdst_reg) { - alu->scalar_opc = instr_s->alu_scalar.opc; - alu->scalar_write_mask = reg_alu_dst_swiz(sdst_reg); - alu->scalar_dest = dst_to_reg(instr_s, sdst_reg); - } else { - /* not sure if this is required, but adreno compiler seems - * to always set scalar opc to MAXs if it is not used: - */ - alu->scalar_opc = MAXs; - } - - alu->export_data = - !!((instr_v ? vdst_reg : sdst_reg)->flags & IR2_REG_EXPORT); - - /* export32 has this bit set.. 
it seems to do more than just set - * the base address of the constants used to zero - * TODO make this less of a hack - */ - if (alu->export_data && alu->vector_dest == 32) { - assert(!instr_s); - alu->relative_addr = 1; - } - - if (src1_reg) { - if (src1_reg->flags & IR2_REG_CONST) { - assert(!(src1_reg->flags & IR2_REG_ABS)); - alu->src1_reg_const = src1_reg->num; - } else { - alu->src1_reg = shader->reg[src1_reg->num].reg; - alu->src1_reg_abs = !!(src1_reg->flags & IR2_REG_ABS); - } - alu->src1_swiz = reg_alu_src_swiz(src1_reg); - alu->src1_reg_negate = !!(src1_reg->flags & IR2_REG_NEGATE); - alu->src1_sel = !(src1_reg->flags & IR2_REG_CONST); - } else { - alu->src1_sel = 1; - } - - if (src2_reg) { - if (src2_reg->flags & IR2_REG_CONST) { - assert(!(src2_reg->flags & IR2_REG_ABS)); - alu->src2_reg_const = src2_reg->num; - } else { - alu->src2_reg = shader->reg[src2_reg->num].reg; - alu->src2_reg_abs = !!(src2_reg->flags & IR2_REG_ABS); - } - alu->src2_swiz = reg_alu_src_swiz(src2_reg); - alu->src2_reg_negate = !!(src2_reg->flags & IR2_REG_NEGATE); - alu->src2_sel = !(src2_reg->flags & IR2_REG_CONST); - } else { - alu->src2_sel = 1; - } - - if (src3_reg) { - if (src3_reg->flags & IR2_REG_CONST) { - assert(!(src3_reg->flags & IR2_REG_ABS)); - alu->src3_reg_const = src3_reg->num; - } else { - alu->src3_reg = shader->reg[src3_reg->num].reg; - alu->src3_reg_abs = !!(src3_reg->flags & IR2_REG_ABS); - } - alu->src3_swiz = reg_alu_src_swiz(src3_reg); - alu->src3_reg_negate = !!(src3_reg->flags & IR2_REG_NEGATE); - alu->src3_sel = !(src3_reg->flags & IR2_REG_CONST); - } else { - /* not sure if this is required, but adreno compiler seems - * to always set register bank for 3rd src if unused: - */ - alu->src3_sel = 1; - } - - alu->vector_clamp = instr_v ? instr_v->alu_vector.clamp : 0; - alu->scalar_clamp = instr_s ? instr_s->alu_scalar.clamp : 0; - - if (pred != IR2_PRED_NONE) - alu->pred_select = (pred == IR2_PRED_EQ) ? 
3 : 2; - - return 0; -} - -static int instr_emit(struct ir2_instruction *instr, uint32_t *dwords, - uint32_t idx, struct ir2_shader_info *info) -{ - switch (instr->instr_type) { - case IR2_FETCH: return instr_emit_fetch(instr, dwords, idx, info); - case IR2_ALU_VECTOR: return instr_emit_alu(instr, NULL, dwords, info); - case IR2_ALU_SCALAR: return instr_emit_alu(NULL, instr, dwords, info); - } - return -1; -} - -struct ir2_dst_register * ir2_dst_create(struct ir2_instruction *instr, - int num, const char *swizzle, int flags) -{ - if (!(flags & IR2_REG_EXPORT)) { - struct ir2_register *reg = &instr->shader->reg[num]; - - unsigned i; - for (i = instr->shader->max_reg + 1; i <= num; i++) - instr->shader->reg[i].write_idx = -1; - instr->shader->max_reg = i - 1; - - if (reg->write_idx < 0) - reg->write_idx = instr->idx; - reg->write_idx2 = instr->idx; - } - - struct ir2_dst_register *reg = &instr->dst_reg; - reg->flags = flags; - reg->num = num; - reg->swizzle = ir2_strdup(instr->shader, swizzle); - return reg; -} - -struct ir2_src_register * ir2_reg_create(struct ir2_instruction *instr, - int num, const char *swizzle, int flags) -{ - assert(instr->src_reg_count + 1 <= ARRAY_SIZE(instr->src_reg)); - if (!(flags & IR2_REG_CONST)) { - struct ir2_register *reg = &instr->shader->reg[num]; - - reg->read_idx = instr->idx; - - unsigned i; - for (i = instr->shader->max_reg + 1; i <= num; i++) - instr->shader->reg[i].write_idx = -1; - instr->shader->max_reg = i - 1; - } - - struct ir2_src_register *reg = &instr->src_reg[instr->src_reg_count++]; - reg->flags = flags; - reg->num = num; - reg->swizzle = ir2_strdup(instr->shader, swizzle); - return reg; -} - -static uint32_t reg_fetch_src_swiz(struct ir2_src_register *reg, uint32_t n) -{ - uint32_t swiz = 0; - int i; - - assert((reg->flags & ~IR2_REG_INPUT) == 0); - assert(reg->swizzle); - - DEBUG_MSG("fetch src R%d.%s", reg->num, reg->swizzle); - - for (i = n-1; i >= 0; i--) { - swiz <<= 2; - switch (reg->swizzle[i]) { - default: - 
ERROR_MSG("invalid fetch src swizzle: %s", reg->swizzle); - case 'x': swiz |= 0x0; break; - case 'y': swiz |= 0x1; break; - case 'z': swiz |= 0x2; break; - case 'w': swiz |= 0x3; break; - } - } - - return swiz; -} - -static uint32_t reg_fetch_dst_swiz(struct ir2_dst_register *reg) -{ - uint32_t swiz = 0; - int i; - - assert(reg->flags == 0); - assert(!reg->swizzle || (strlen(reg->swizzle) == 4)); - - DEBUG_MSG("fetch dst R%d.%s", reg->num, reg->swizzle); - - if (reg->swizzle) { - for (i = 3; i >= 0; i--) { - swiz <<= 3; - switch (reg->swizzle[i]) { - default: - ERROR_MSG("invalid dst swizzle: %s", reg->swizzle); - case 'x': swiz |= 0x0; break; - case 'y': swiz |= 0x1; break; - case 'z': swiz |= 0x2; break; - case 'w': swiz |= 0x3; break; - case '0': swiz |= 0x4; break; - case '1': swiz |= 0x5; break; - case '_': swiz |= 0x7; break; - } - } - } else { - swiz = 0x688; - } - - return swiz; -} - -/* actually, a write-mask */ -static uint32_t reg_alu_dst_swiz(struct ir2_dst_register *reg) -{ - uint32_t swiz = 0; - int i; - - assert((reg->flags & ~IR2_REG_EXPORT) == 0); - assert(!reg->swizzle || (strlen(reg->swizzle) == 4)); - - DEBUG_MSG("alu dst R%d.%s", reg->num, reg->swizzle); - - if (reg->swizzle) { - for (i = 3; i >= 0; i--) { - swiz <<= 1; - if (reg->swizzle[i] == "xyzw"[i]) { - swiz |= 0x1; - } else if (reg->swizzle[i] != '_') { - ERROR_MSG("invalid dst swizzle: %s", reg->swizzle); - break; - } - } - } else { - swiz = 0xf; - } - - return swiz; -} - -static uint32_t reg_alu_src_swiz(struct ir2_src_register *reg) -{ - uint32_t swiz = 0; - int i; - - assert(!reg->swizzle || (strlen(reg->swizzle) == 4)); - - DEBUG_MSG("vector src R%d.%s", reg->num, reg->swizzle); - - if (reg->swizzle) { - for (i = 3; i >= 0; i--) { - swiz <<= 2; - switch (reg->swizzle[i]) { - default: - ERROR_MSG("invalid vector src swizzle: %s", reg->swizzle); - case 'x': swiz |= (0x0 - i) & 0x3; break; - case 'y': swiz |= (0x1 - i) & 0x3; break; - case 'z': swiz |= (0x2 - i) & 0x3; break; - case 
'w': swiz |= (0x3 - i) & 0x3; break; - } - } - } else { - swiz = 0x0; - } - - return swiz; -} diff --git a/src/gallium/drivers/freedreno/a2xx/ir-a2xx.h b/src/gallium/drivers/freedreno/a2xx/ir-a2xx.h deleted file mode 100644 index ac2931266d4..00000000000 --- a/src/gallium/drivers/freedreno/a2xx/ir-a2xx.h +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright (c) 2012 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef IR2_H_ -#define IR2_H_ - -#include -#include - -#include "instr-a2xx.h" - -/* low level intermediate representation of an adreno a2xx shader program */ - -struct ir2_shader; - -#define REG_MASK 0xff - -struct ir2_shader_info { - uint16_t sizedwords; - int8_t max_reg; /* highest GPR # used by shader */ -}; - -struct ir2_register { - int16_t write_idx, write_idx2, read_idx, reg; - /* bitmask of variables on which this one depends - * XXX: use bitmask util? 
- */ - uint32_t regmask[REG_MASK/32+1]; -}; - -struct ir2_src_register { - enum { - IR2_REG_INPUT = 0x1, - IR2_REG_CONST = 0x2, - IR2_REG_NEGATE = 0x4, - IR2_REG_ABS = 0x8, - } flags; - int num; - char *swizzle; -}; - -struct ir2_dst_register { - enum { - IR2_REG_EXPORT = 0x1, - } flags; - int num; - char *swizzle; -}; - -enum ir2_pred { - IR2_PRED_NONE = 0, - IR2_PRED_EQ = 1, - IR2_PRED_NE = 2, -}; - -struct ir2_instruction { - struct ir2_shader *shader; - unsigned idx; - enum { - IR2_FETCH, - IR2_ALU_VECTOR, - IR2_ALU_SCALAR, - } instr_type; - enum ir2_pred pred; - int sync; - unsigned src_reg_count; - struct ir2_dst_register dst_reg; - struct ir2_src_register src_reg[3]; - union { - /* FETCH specific: */ - struct { - instr_fetch_opc_t opc; - unsigned const_idx; - /* texture fetch specific: */ - bool is_cube : 1; - bool is_rect : 1; - /* vertex fetch specific: */ - unsigned const_idx_sel; - enum a2xx_sq_surfaceformat fmt; - bool is_signed : 1; - bool is_normalized : 1; - uint32_t stride; - uint32_t offset; - } fetch; - /* ALU-Vector specific: */ - struct { - instr_vector_opc_t opc; - bool clamp; - } alu_vector; - /* ALU-Scalar specific: */ - struct { - instr_scalar_opc_t opc; - bool clamp; - } alu_scalar; - }; -}; - -struct ir2_shader { - unsigned instr_count; - int max_reg; - struct ir2_register reg[REG_MASK+1]; - - struct ir2_instruction *instr[0x200]; - uint32_t heap[100 * 4096]; - unsigned heap_idx; - - enum ir2_pred pred; /* pred inherited by newly created instrs */ -}; - -struct ir2_shader * ir2_shader_create(void); -void ir2_shader_destroy(struct ir2_shader *shader); -void * ir2_shader_assemble(struct ir2_shader *shader, - struct ir2_shader_info *info); - -struct ir2_instruction * ir2_instr_create(struct ir2_shader *shader, - int instr_type); - -struct ir2_dst_register * ir2_dst_create(struct ir2_instruction *instr, - int num, const char *swizzle, int flags); -struct ir2_src_register * ir2_reg_create(struct ir2_instruction *instr, - int num, const char 
*swizzle, int flags); - -/* some helper fxns: */ - -static inline struct ir2_instruction * -ir2_instr_create_alu_v(struct ir2_shader *shader, instr_vector_opc_t vop) -{ - struct ir2_instruction *instr = ir2_instr_create(shader, IR2_ALU_VECTOR); - if (!instr) - return instr; - instr->alu_vector.opc = vop; - return instr; -} - -static inline struct ir2_instruction * -ir2_instr_create_alu_s(struct ir2_shader *shader, instr_scalar_opc_t sop) -{ - struct ir2_instruction *instr = ir2_instr_create(shader, IR2_ALU_SCALAR); - if (!instr) - return instr; - instr->alu_scalar.opc = sop; - return instr; -} - -static inline struct ir2_instruction * -ir2_instr_create_vtx_fetch(struct ir2_shader *shader, int ci, int cis, - enum a2xx_sq_surfaceformat fmt, bool is_signed, int stride) -{ - struct ir2_instruction *instr = ir2_instr_create(shader, IR2_FETCH); - instr->fetch.opc = VTX_FETCH; - instr->fetch.const_idx = ci; - instr->fetch.const_idx_sel = cis; - instr->fetch.fmt = fmt; - instr->fetch.is_signed = is_signed; - instr->fetch.stride = stride; - return instr; -} -static inline struct ir2_instruction * -ir2_instr_create_tex_fetch(struct ir2_shader *shader, int ci) -{ - struct ir2_instruction *instr = ir2_instr_create(shader, IR2_FETCH); - instr->fetch.opc = TEX_FETCH; - instr->fetch.const_idx = ci; - return instr; -} - - -#endif /* IR2_H_ */ diff --git a/src/gallium/drivers/freedreno/a2xx/ir2.c b/src/gallium/drivers/freedreno/a2xx/ir2.c new file mode 100644 index 00000000000..e7e69966f15 --- /dev/null +++ b/src/gallium/drivers/freedreno/a2xx/ir2.c @@ -0,0 +1,304 @@ +/* + * Copyright (C) 2018 Jonathan Marek + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to 
whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Jonathan Marek + */ + +#include "ir2_private.h" + +static bool scalar_possible(struct ir2_instr *instr) +{ + if (instr->alu.scalar_opc == SCALAR_NONE) + return false; + + return src_ncomp(instr) == 1; +} + +static bool is_alu_compatible(struct ir2_instr *a, struct ir2_instr *b) +{ + if (!a) + return true; + + /* don't use the same instruction twice */ + if (a == b) + return false; + + /* PRED_SET must be alone */ + if (b->alu.scalar_opc >= PRED_SETEs && + b->alu.scalar_opc <= PRED_SET_RESTOREs) + return false; + + /* must write to same export (issues otherwise?) */ + return a->alu.export == b->alu.export; +} + +/* priority of vector instruction for scheduling (lower value = higher priority, ~0u = no vector form) */ +static unsigned alu_vector_prio(struct ir2_instr *instr) +{ + if (instr->alu.vector_opc == VECTOR_NONE) + return ~0u; + + if (is_export(instr)) + return 4; + + /* TODO check src type and ncomps */ + if (instr->src_count == 3) + return 0; + + if (!scalar_possible(instr)) + return 1; + + return instr->src_count == 2 ?
2 : 3; +} + +/* priority of scalar instruction for scheduling (lower value = higher priority, ~0u = not eligible) */ +static unsigned alu_scalar_prio(struct ir2_instr *instr) +{ + if (!scalar_possible(instr)) + return ~0u; + + /* this case is dealt with later */ + if (instr->src_count > 1) + return ~0u; + + if (is_export(instr)) + return 4; + + /* PRED to end of block */ + if (instr->alu.scalar_opc >= PRED_SETEs && + instr->alu.scalar_opc <= PRED_SET_RESTOREs) + return 5; + + /* scalar-only instructions have the highest priority */ + return instr->alu.vector_opc == VECTOR_NONE ? 0 : 3; +} + +/* fill sched with next fetch or (vector and/or scalar) alu instruction; returns the block index, or -1 when nothing can be scheduled */ +static int sched_next(struct ir2_context *ctx, struct ir2_sched_instr *sched) +{ + struct ir2_instr *avail[0x100], *instr_v = NULL, *instr_s = NULL; + unsigned avail_count = 0; + + instr_alloc_type_t export = ~0u; + int block_idx = -1; + + /* XXX merge this loop with the other one somehow? */ + ir2_foreach_instr(instr, ctx) { + if (!instr->need_emit) + continue; + if (is_export(instr)) + export = MIN2(export, export_buf(instr->alu.export)); + } + + ir2_foreach_instr(instr, ctx) { + if (!instr->need_emit) + continue; + + /* don't mix exports */ + if (is_export(instr) && export_buf(instr->alu.export) != export) + continue; + + if (block_idx < 0) + block_idx = instr->block_idx; + else if (block_idx != instr->block_idx || /* must be same block */ + instr->type == IR2_CF || /* CF/MEM must be alone */ + (is_export(instr) && export == SQ_MEMORY)) + break; + /* it works because IR2_CF is always at end of block + * and somewhat same idea with MEM exports, which might not be alone + * but will end up in-order at least + */ + + /* check if dependencies are satisfied */ + bool is_ok = true; + ir2_foreach_src(src, instr) { + if (src->type == IR2_SRC_REG) { + /* need to check if all previous instructions in the block + * which write the reg have been emitted + * slow..
+ * XXX: check components instead of whole register + */ + struct ir2_reg *reg = get_reg_src(ctx, src); + ir2_foreach_instr(p, ctx) { + if (!p->is_ssa && p->reg == reg && p->idx < instr->idx) + is_ok &= !p->need_emit; + } + } else if (src->type == IR2_SRC_SSA) { + /* in this case it's easy, just check need_emit */ + is_ok &= !ctx->instr[src->num].need_emit; + } + } + if (!is_ok) + continue; + + avail[avail_count++] = instr; + } + + if (!avail_count) { + assert(block_idx == -1); + return -1; + } + + /* non-ALU instructions (e.g. FETCH) get priority */ + ir2_foreach_avail(instr) { + if (instr->type == IR2_ALU) + continue; + + ra_src_free(ctx, instr); + ra_reg(ctx, get_reg(instr), -1, false, 0); + + instr->need_emit = false; + sched->instr = instr; + sched->instr_s = NULL; + return block_idx; + } + + /* TODO precompute priorities */ + + unsigned prio_v = ~0u, prio_s = ~0u, prio; + ir2_foreach_avail(instr) { + prio = alu_vector_prio(instr); + if (prio < prio_v) { + instr_v = instr; + prio_v = prio; + } + } + + /* TODO can still insert scalar if src_count=3, if smart about it */ + if (!instr_v || instr_v->src_count < 3) { + ir2_foreach_avail(instr) { + bool compat = is_alu_compatible(instr_v, instr); + + prio = alu_scalar_prio(instr); + if (prio >= prio_v && !compat) + continue; + + if (prio < prio_s) { + instr_s = instr; + prio_s = prio; + if (!compat) + instr_v = NULL; + } + } + } + + assert(instr_v || instr_s); + + /* free src registers */ + if (instr_v) { + instr_v->need_emit = false; + ra_src_free(ctx, instr_v); + } + + if (instr_s) { + instr_s->need_emit = false; + ra_src_free(ctx, instr_s); + } + + /* allocate dst registers */ + if (instr_v) + ra_reg(ctx, get_reg(instr_v), -1, is_export(instr_v), instr_v->alu.write_mask); + + if (instr_s) + ra_reg(ctx, get_reg(instr_s), -1, is_export(instr_s), instr_s->alu.write_mask); + + sched->instr = instr_v; + sched->instr_s = instr_s; + return block_idx; +} + +/* scheduling: determine order of instructions */ +static void
schedule_instrs(struct ir2_context *ctx) +{ + struct ir2_sched_instr *sched; + int block_idx; + + /* allocate input registers */ + for (unsigned idx = 0; idx < ARRAY_SIZE(ctx->input); idx++) + if (ctx->input[idx].initialized) + ra_reg(ctx, &ctx->input[idx], idx, false, 0); + + for (;;) { + sched = &ctx->instr_sched[ctx->instr_sched_count++]; + block_idx = sched_next(ctx, sched); + if (block_idx < 0) + break; + memcpy(sched->reg_state, ctx->reg_state, sizeof(ctx->reg_state)); + + /* catch texture fetch after scheduling and insert the + * SET_TEX_LOD right before it if necessary + * TODO clean this up + */ + struct ir2_instr *instr = sched->instr, *tex_lod; + if (instr && instr->type == IR2_FETCH && + instr->fetch.opc == TEX_FETCH && instr->src_count == 2) { + /* generate the SET_TEX_LOD instruction */ + tex_lod = &ctx->instr[ctx->instr_count++]; + tex_lod->type = IR2_FETCH; + tex_lod->block_idx = instr->block_idx; + tex_lod->pred = instr->pred; + tex_lod->fetch.opc = TEX_SET_TEX_LOD; + tex_lod->src[0] = instr->src[1]; + tex_lod->src_count = 1; + + sched[1] = sched[0]; + sched->instr = tex_lod; + ctx->instr_sched_count++; + } + + bool free_block = true; + ir2_foreach_instr(instr, ctx) + free_block &= instr->block_idx != block_idx; + if (free_block) + ra_block_free(ctx, block_idx); + }; + ctx->instr_sched_count--; +} + +void +ir2_compile(struct fd2_shader_stateobj *so, unsigned variant, + struct fd2_shader_stateobj *fp) +{ + struct ir2_context ctx = { }; + bool binning = !fp && so->type == MESA_SHADER_VERTEX; + + if (fp) + so->variant[variant].f = fp->variant[0].f; + + ctx.so = so; + ctx.info = &so->variant[variant].info; + ctx.f = &so->variant[variant].f; + ctx.info->max_reg = -1; + + /* convert nir to internal representation */ + ir2_nir_compile(&ctx, binning); + + /* get ref_counts and kill non-needed instructions */ + ra_count_refs(&ctx); + + /* instruction order..
and vector->scalar conversions */ + schedule_instrs(&ctx); + + /* finally, assemble to bitcode */ + assemble(&ctx, binning); +} diff --git a/src/gallium/drivers/freedreno/a2xx/ir2.h b/src/gallium/drivers/freedreno/a2xx/ir2.h new file mode 100644 index 00000000000..f381fdfff16 --- /dev/null +++ b/src/gallium/drivers/freedreno/a2xx/ir2.h @@ -0,0 +1,94 @@ +/* + * Copyright (C) 2018 Jonathan Marek + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ * + * Authors: + * Jonathan Marek + */ + +#ifndef IR2_H_ +#define IR2_H_ + +#include "compiler/nir/nir.h" + +struct ir2_fetch_info { + /* dword offset of the fetch instruction */ + uint16_t offset; + union { + /* swizzle to merge with tgsi swizzle */ + struct { + uint16_t dst_swiz; + } vtx; + /* sampler id to patch const_idx */ + struct { + uint16_t samp_id; + uint8_t src_swiz; + } tex; + }; +}; + +struct ir2_shader_info { + /* compiled shader */ + uint32_t *dwords; + + /* size of the compiled shader in dwords */ + uint16_t sizedwords; + + /* highest GPR # used by shader */ + int8_t max_reg; + + /* offset in dwords of first MEMORY export CF (for a20x hw binning) */ + int16_t mem_export_ptr; + + /* fetch instruction info for patching */ + uint16_t num_fetch_instrs; + struct ir2_fetch_info fetch_info[64]; +}; + +struct ir2_frag_linkage { + unsigned inputs_count; + struct { + uint8_t slot; + uint8_t ncomp; + } inputs[16]; + + /* driver_location of fragcoord.zw, -1 if not used */ + int fragcoord; +}; + +struct ir2_shader_variant { + struct ir2_shader_info info; + struct ir2_frag_linkage f; +}; + +struct fd2_shader_stateobj; +struct tgsi_token; + +void ir2_compile(struct fd2_shader_stateobj *so, unsigned variant, + struct fd2_shader_stateobj *fp); + +struct nir_shader *ir2_tgsi_to_nir(const struct tgsi_token *tokens); + +const nir_shader_compiler_options *ir2_get_compiler_options(void); + +int ir2_optimize_nir(nir_shader *s, bool lower); + +#endif /* IR2_H_ */ diff --git a/src/gallium/drivers/freedreno/a2xx/ir2_assemble.c b/src/gallium/drivers/freedreno/a2xx/ir2_assemble.c new file mode 100644 index 00000000000..e786a2cdd11 --- /dev/null +++ b/src/gallium/drivers/freedreno/a2xx/ir2_assemble.c @@ -0,0 +1,548 @@ +/* + * Copyright (C) 2018 Jonathan Marek + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including
without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Jonathan Marek + */ + +#include "ir2_private.h" + +static unsigned +src_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp) +{ + struct ir2_reg_component *comps; + unsigned swiz = 0; + + switch (src->type) { + case IR2_SRC_SSA: + case IR2_SRC_REG: + break; + default: + return src->swizzle; + } + /* we need to take into account where the components were allocated */ + comps = get_reg_src(ctx, src)->comp; + for (int i = 0; i < ncomp; i++) { + swiz |= swiz_set(comps[swiz_get(src->swizzle, i)].c, i); + } + return swiz; +} + +/* alu instrs need to take into account how the output components are allocated */ + +/* scalar doesn't need to take into account dest swizzle */ + +static unsigned +alu_swizzle_scalar(struct ir2_context *ctx, struct ir2_src *reg) +{ + /* hardware seems to take from W, but swizzle everywhere just in case */ + return swiz_merge(src_swizzle(ctx, reg, 1), IR2_SWIZZLE_XXXX); +} + +static unsigned +alu_swizzle(struct ir2_context *ctx, struct ir2_instr *instr, struct ir2_src *src) +{ + struct ir2_reg_component *comp =
get_reg(instr)->comp; + unsigned swiz0 = src_swizzle(ctx, src, src_ncomp(instr)); + unsigned swiz = 0; + + /* non per component special cases */ + switch (instr->alu.vector_opc) { + case PRED_SETE_PUSHv ... PRED_SETGTE_PUSHv: + return alu_swizzle_scalar(ctx, src); + case DOT2ADDv: + case DOT3v: + case DOT4v: + case CUBEv: + return swiz0; + default: + break; + } + + for (int i = 0, j = 0; i < dst_ncomp(instr); j++) { + if (instr->alu.write_mask & 1 << j) { + if (comp[j].c != 7) + swiz |= swiz_set(i, comp[j].c); + i++; + } + } + return swiz_merge(swiz0, swiz); +} + +static unsigned +alu_swizzle_scalar2(struct ir2_context *ctx, struct ir2_src *src, unsigned s1) +{ + /* hardware seems to take from ZW, but swizzle everywhere (ABAB) */ + unsigned s0 = swiz_get(src_swizzle(ctx, src, 1), 0); + return swiz_merge(swiz_set(s0, 0) | swiz_set(s1, 1), IR2_SWIZZLE_XYXY); +} + +/* write_mask needs to be transformed by allocation information */ + +static unsigned +alu_write_mask(struct ir2_context *ctx, struct ir2_instr *instr) +{ + struct ir2_reg_component *comp = get_reg(instr)->comp; + unsigned write_mask = 0; + + for (int i = 0; i < 4; i++) { + if (instr->alu.write_mask & 1 << i) + write_mask |= 1 << comp[i].c; + } + + return write_mask; +} + +/* fetch instructions can swizzle dest, but src swizzle needs conversion */ + +static unsigned +fetch_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp) +{ + unsigned alu_swiz = src_swizzle(ctx, src, ncomp); + unsigned swiz = 0; + for (int i = 0; i < ncomp; i++) + swiz |= swiz_get(alu_swiz, i) << i * 2; + return swiz; +} + +static unsigned +fetch_dst_swiz(struct ir2_context *ctx, struct ir2_instr *instr) +{ + struct ir2_reg_component *comp = get_reg(instr)->comp; + unsigned dst_swiz = 0xfff; + for (int i = 0; i < dst_ncomp(instr); i++) { + dst_swiz &= ~(7 << comp[i].c * 3); + dst_swiz |= i << comp[i].c * 3; + } + return dst_swiz; +} + +/* register / export # for instr */ +static unsigned +dst_to_reg(struct ir2_context 
*ctx, struct ir2_instr *instr) +{ + if (is_export(instr)) + return instr->alu.export; + + return get_reg(instr)->idx; +} + +/* register # for src */ +static unsigned src_to_reg(struct ir2_context *ctx, struct ir2_src *src) +{ + return get_reg_src(ctx, src)->idx; +} + +static unsigned src_reg_byte(struct ir2_context *ctx, struct ir2_src *src) +{ + if (src->type == IR2_SRC_CONST) { + assert(!src->abs); /* no abs bit for const */ + return src->num; + } + return src_to_reg(ctx, src) | (src->abs ? 0x80 : 0); +} + +/* produce the 12 byte binary instruction for a given sched_instr */ +static void +fill_instr(struct ir2_context *ctx, struct ir2_sched_instr *sched, + instr_t *bc, bool * is_fetch) +{ + struct ir2_instr *instr = sched->instr, *instr_s, *instr_v; + + *bc = (instr_t) {}; + + if (instr && instr->type == IR2_FETCH) { + *is_fetch = true; + + bc->fetch.opc = instr->fetch.opc; + bc->fetch.pred_select = !!instr->pred; + bc->fetch.pred_condition = instr->pred & 1; + + struct ir2_src *src = instr->src; + + if (instr->fetch.opc == VTX_FETCH) { + instr_fetch_vtx_t *vtx = &bc->fetch.vtx; + + assert(instr->fetch.vtx.const_idx <= 0x1f); + assert(instr->fetch.vtx.const_idx_sel <= 0x3); + + vtx->src_reg = src_to_reg(ctx, src); + vtx->src_swiz = fetch_swizzle(ctx, src, 1); + vtx->dst_reg = dst_to_reg(ctx, instr); + vtx->dst_swiz = fetch_dst_swiz(ctx, instr); + + vtx->must_be_one = 1; + vtx->const_index = instr->fetch.vtx.const_idx; + vtx->const_index_sel = instr->fetch.vtx.const_idx_sel; + + /* other fields will be patched */ + + /* XXX seems like every FETCH but the first has + * this bit set: + */ + vtx->reserved3 = instr->idx ? 0x1 : 0x0; + vtx->reserved0 = instr->idx ? 
0x2 : 0x3; + } else if (instr->fetch.opc == TEX_FETCH) { + instr_fetch_tex_t *tex = &bc->fetch.tex; + + tex->src_reg = src_to_reg(ctx, src); + tex->src_swiz = fetch_swizzle(ctx, src, 3); + tex->dst_reg = dst_to_reg(ctx, instr); + tex->dst_swiz = fetch_dst_swiz(ctx, instr); + /* tex->const_idx = patch_fetches */ + tex->mag_filter = TEX_FILTER_USE_FETCH_CONST; + tex->min_filter = TEX_FILTER_USE_FETCH_CONST; + tex->mip_filter = TEX_FILTER_USE_FETCH_CONST; + tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST; + tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST; + tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST; + tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST; + tex->use_comp_lod = ctx->so->type == MESA_SHADER_FRAGMENT; + tex->use_reg_lod = instr->src_count == 2; + tex->sample_location = SAMPLE_CENTER; + tex->tx_coord_denorm = instr->fetch.tex.is_rect; + } else if (instr->fetch.opc == TEX_SET_TEX_LOD) { + instr_fetch_tex_t *tex = &bc->fetch.tex; + + tex->src_reg = src_to_reg(ctx, src); + tex->src_swiz = fetch_swizzle(ctx, src, 1); + tex->dst_reg = 0; + tex->dst_swiz = 0xfff; + + tex->mag_filter = TEX_FILTER_USE_FETCH_CONST; + tex->min_filter = TEX_FILTER_USE_FETCH_CONST; + tex->mip_filter = TEX_FILTER_USE_FETCH_CONST; + tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST; + tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST; + tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST; + tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST; + tex->use_comp_lod = 1; + tex->use_reg_lod = 0; + tex->sample_location = SAMPLE_CENTER; + } else { + assert(0); + } + return; + } + + instr_v = sched->instr; + instr_s = sched->instr_s; + + if (instr_v) { + struct ir2_src src1, src2, *src3; + + src1 = instr_v->src[0]; + src2 = instr_v->src[instr_v->src_count > 1]; + src3 = instr_v->src_count == 3 ? 
&instr_v->src[2] : NULL; + + bc->alu.vector_opc = instr_v->alu.vector_opc; + bc->alu.vector_write_mask = alu_write_mask(ctx, instr_v); + bc->alu.vector_dest = dst_to_reg(ctx, instr_v); + bc->alu.vector_clamp = instr_v->alu.saturate; + bc->alu.export_data = instr_v->alu.export >= 0; + + /* single operand SETEv, use 0.0f as src2 */ + if (instr_v->src_count == 1 && + (bc->alu.vector_opc == SETEv || + bc->alu.vector_opc == SETNEv || + bc->alu.vector_opc == SETGTv || + bc->alu.vector_opc == SETGTEv)) + src2 = ir2_zero(ctx); + + /* export32 instr for a20x hw binning has this bit set.. + * it seems to do more than change the base address of constants + * XXX this is a hack + */ + bc->alu.relative_addr = + (bc->alu.export_data && bc->alu.vector_dest == 32); + + bc->alu.src1_reg_byte = src_reg_byte(ctx, &src1); + bc->alu.src1_swiz = alu_swizzle(ctx, instr_v, &src1); + bc->alu.src1_reg_negate = src1.negate; + bc->alu.src1_sel = src1.type != IR2_SRC_CONST; + + bc->alu.src2_reg_byte = src_reg_byte(ctx, &src2); + bc->alu.src2_swiz = alu_swizzle(ctx, instr_v, &src2); + bc->alu.src2_reg_negate = src2.negate; + bc->alu.src2_sel = src2.type != IR2_SRC_CONST; + + if (src3) { + bc->alu.src3_reg_byte = src_reg_byte(ctx, src3); + bc->alu.src3_swiz = alu_swizzle(ctx, instr_v, src3); + bc->alu.src3_reg_negate = src3->negate; + bc->alu.src3_sel = src3->type != IR2_SRC_CONST; + } + + bc->alu.pred_select = instr_v->pred; + } + + if (instr_s) { + struct ir2_src *src = instr_s->src; + + bc->alu.scalar_opc = instr_s->alu.scalar_opc; + bc->alu.scalar_write_mask = alu_write_mask(ctx, instr_s); + bc->alu.scalar_dest = dst_to_reg(ctx, instr_s); + bc->alu.scalar_clamp = instr_s->alu.saturate; + bc->alu.export_data = instr_s->alu.export >= 0; + + if (instr_s->src_count == 1) { + bc->alu.src3_reg_byte = src_reg_byte(ctx, src); + bc->alu.src3_swiz = alu_swizzle_scalar(ctx, src); + bc->alu.src3_reg_negate = src->negate; + bc->alu.src3_sel = src->type != IR2_SRC_CONST; + } else { + 
assert(instr_s->src_count == 2); + + bc->alu.src3_reg_byte = src_reg_byte(ctx, src); + bc->alu.src3_swiz = alu_swizzle_scalar2(ctx, src, instr_s->alu.src1_swizzle); + bc->alu.src3_reg_negate = src->negate; + bc->alu.src3_sel = src->type != IR2_SRC_CONST; + } + + if (instr_v) + assert(instr_s->pred == instr_v->pred); + bc->alu.pred_select = instr_s->pred; + } + + *is_fetch = false; + return; +} + +static unsigned +write_cfs(struct ir2_context *ctx, instr_cf_t * cfs, unsigned cf_idx, + instr_cf_alloc_t *alloc, instr_cf_exec_t *exec) +{ + assert(exec->count); + + if (alloc) + cfs[cf_idx++].alloc = *alloc; + + /* record the offset of the first memory export alloc, for patching */ + if (alloc && alloc->buffer_select == SQ_MEMORY && + ctx->info->mem_export_ptr == -1) + ctx->info->mem_export_ptr = cf_idx / 2 * 3; + + cfs[cf_idx++].exec = *exec; + exec->address += exec->count; + exec->serialize = 0; + exec->count = 0; + + return cf_idx; +} + +/* assemble the final shader */ +void assemble(struct ir2_context *ctx, bool binning) +{ + /* hw seems to have a limit of 384 (num_cf/2+num_instr <= 384) + * address is 9 bits so could it be 512 ?
+ */ + instr_cf_t cfs[384]; + instr_t bytecode[384], bc; + unsigned block_addr[128]; + unsigned num_cf = 0; + + /* CF instr state */ + instr_cf_exec_t exec = {.opc = EXEC}; + instr_cf_alloc_t alloc = {.opc = ALLOC}; + + int sync_id, sync_id_prev = -1; + bool is_fetch = false; + bool need_sync = true; + bool need_alloc = false; + unsigned block_idx = 0; + + ctx->info->mem_export_ptr = -1; + ctx->info->num_fetch_instrs = 0; + + /* vertex shader always needs to allocate at least one parameter; + * if the fragment linkage has no inputs, emit the alloc up front + */ + if (ctx->so->type == MESA_SHADER_VERTEX && ctx->f->inputs_count == 0) { + alloc.buffer_select = SQ_PARAMETER_PIXEL; + cfs[num_cf++].alloc = alloc; + } + + block_addr[0] = 0; + + for (int i = 0, j = 0; j < ctx->instr_sched_count; j++) { + struct ir2_instr *instr = ctx->instr_sched[j].instr; + + /* catch IR2_CF since it isn't a regular instruction */ + if (instr && instr->type == IR2_CF) { + assert(!need_alloc); /* XXX */ + + /* flush any exec cf before inserting jmp */ + if (exec.count) + num_cf = write_cfs(ctx, cfs, num_cf, NULL, &exec); + + cfs[num_cf++].jmp_call = (instr_cf_jmp_call_t) { + .opc = COND_JMP, + .address = instr->cf.block_idx, /* will be fixed later */ + .force_call = !instr->pred, + .predicated_jmp = 1, + .direction = instr->cf.block_idx > instr->block_idx, + .condition = instr->pred & 1, + }; + continue; + } + + /* fill the 3 dwords for the instruction */ + fill_instr(ctx, &ctx->instr_sched[j], &bc, &is_fetch); + + /* we need to sync between ALU/VTX_FETCH/TEX_FETCH types */ + sync_id = 0; + if (is_fetch) + sync_id = bc.fetch.opc == VTX_FETCH ?
1 : 2; + + need_sync = sync_id != sync_id_prev; + sync_id_prev = sync_id; + + unsigned block; + { + + if (ctx->instr_sched[j].instr) + block = ctx->instr_sched[j].instr->block_idx; + else + block = ctx->instr_sched[j].instr_s->block_idx; + + assert(block_idx <= block); + } + + /* info for patching */ + if (is_fetch) { + struct ir2_fetch_info *info = + &ctx->info->fetch_info[ctx->info->num_fetch_instrs++]; + info->offset = i * 3; /* add cf offset later */ + + if (bc.fetch.opc == VTX_FETCH) { + info->vtx.dst_swiz = bc.fetch.vtx.dst_swiz; + } else if (bc.fetch.opc == TEX_FETCH) { + info->tex.samp_id = instr->fetch.tex.samp_id; + info->tex.src_swiz = bc.fetch.tex.src_swiz; + } else { + ctx->info->num_fetch_instrs--; + } + } + + /* exec cf after 6 instr or when switching between fetch / alu */ + if (exec.count == 6 || (exec.count && (need_sync || block != block_idx))) { + num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec); + need_alloc = false; + } + + /* update block_addrs for jmp patching */ + while (block_idx < block) + block_addr[++block_idx] = num_cf; + + /* export - fill alloc cf */ + if (!is_fetch && bc.alu.export_data) { + /* get the export buffer from either vector/scalar dest */ + instr_alloc_type_t buffer = + export_buf(bc.alu.vector_dest); + if (bc.alu.scalar_write_mask) { + if (bc.alu.vector_write_mask) + assert(buffer == export_buf(bc.alu.scalar_dest)); + buffer = export_buf(bc.alu.scalar_dest); + } + + /* flush previous alloc if the buffer changes */ + bool need_new_alloc = buffer != alloc.buffer_select; + + /* memory export always in 32/33 pair, new alloc on 32 */ + if (bc.alu.vector_dest == 32) + need_new_alloc = true; + + if (need_new_alloc && exec.count) { + num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? 
&alloc : NULL, &exec); + need_alloc = false; + } + + need_alloc |= need_new_alloc; + + alloc.size = 0; + alloc.buffer_select = buffer; + + if (buffer == SQ_PARAMETER_PIXEL && ctx->so->type == MESA_SHADER_VERTEX) + alloc.size = ctx->f->inputs_count - 1; + + if (buffer == SQ_POSITION) + alloc.size = ctx->so->writes_psize; + } + + if (is_fetch) + exec.serialize |= 0x1 << exec.count * 2; + if (need_sync) + exec.serialize |= 0x2 << exec.count * 2; + + need_sync = false; + exec.count += 1; + bytecode[i++] = bc; + } + + /* final exec cf */ + exec.opc = EXEC_END; + num_cf = + write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec); + + /* insert nop to get an even # of CFs */ + if (num_cf % 2) + cfs[num_cf++] = (instr_cf_t) { + .opc = NOP}; + + /* patch cf addrs */ + for (int idx = 0; idx < num_cf; idx++) { + switch (cfs[idx].opc) { + case NOP: + case ALLOC: + break; + case EXEC: + case EXEC_END: + cfs[idx].exec.address += num_cf / 2; + break; + case COND_JMP: + cfs[idx].jmp_call.address = block_addr[cfs[idx].jmp_call.address]; + break; + default: + assert(0); + } + } + + /* concatenate cfs and alu/fetch */ + uint32_t cfdwords = num_cf / 2 * 3; + uint32_t alufetchdwords = exec.address * 3; + uint32_t sizedwords = cfdwords + alufetchdwords; + uint32_t *dwords = malloc(sizedwords * 4); + assert(dwords); + memcpy(dwords, cfs, cfdwords * 4); + memcpy(&dwords[cfdwords], bytecode, alufetchdwords * 4); + + /* finalize ir2_shader_info */ + ctx->info->dwords = dwords; + ctx->info->sizedwords = sizedwords; + for (int i = 0; i < ctx->info->num_fetch_instrs; i++) + ctx->info->fetch_info[i].offset += cfdwords; + + if (fd_mesa_debug & FD_DBG_DISASM) { + DBG("disassemble: type=%d", ctx->so->type); + disasm_a2xx(dwords, sizedwords, 0, ctx->so->type); + } +} diff --git a/src/gallium/drivers/freedreno/a2xx/ir2_nir.c b/src/gallium/drivers/freedreno/a2xx/ir2_nir.c new file mode 100644 index 00000000000..ef9c5e0c4df --- /dev/null +++ b/src/gallium/drivers/freedreno/a2xx/ir2_nir.c @@ 
-0,0 +1,1173 @@ +/* + * Copyright (C) 2018 Jonathan Marek + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Jonathan Marek + */ + +#include "ir2_private.h" +#include "nir/tgsi_to_nir.h" + +#include "freedreno_util.h" +#include "fd2_program.h" + +static const nir_shader_compiler_options options = { + .lower_fpow = true, + .lower_flrp32 = true, + .lower_fmod32 = true, + .lower_fdiv = true, + .lower_fceil = true, + .fuse_ffma = true, + /* .fdot_replicates = true, it is replicated, but it makes things worse */ + .lower_all_io_to_temps = true, + .vertex_id_zero_based = true, /* its not implemented anyway */ +}; + +struct nir_shader * +ir2_tgsi_to_nir(const struct tgsi_token *tokens) +{ + return tgsi_to_nir(tokens, &options); +} + +const nir_shader_compiler_options * +ir2_get_compiler_options(void) +{ + return &options; +} + +#define OPT(nir, pass, ...) 
({ \ + bool this_progress = false; \ + NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \ + this_progress; \ +}) +#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__) + +static void +ir2_optimize_loop(nir_shader *s) +{ + bool progress; + do { + progress = false; + + OPT_V(s, nir_lower_vars_to_ssa); + progress |= OPT(s, nir_opt_copy_prop_vars); + progress |= OPT(s, nir_copy_prop); + progress |= OPT(s, nir_opt_dce); + progress |= OPT(s, nir_opt_cse); + /* progress |= OPT(s, nir_opt_gcm, true); */ + progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true); + progress |= OPT(s, nir_opt_intrinsics); + progress |= OPT(s, nir_opt_algebraic); + progress |= OPT(s, nir_opt_constant_folding); + progress |= OPT(s, nir_opt_dead_cf); + if (OPT(s, nir_opt_trivial_continues)) { + progress |= true; + /* If nir_opt_trivial_continues makes progress, then we need to clean + * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll + * to make progress. + */ + OPT(s, nir_copy_prop); + OPT(s, nir_opt_dce); + } + progress |= OPT(s, nir_opt_loop_unroll, nir_var_all); + progress |= OPT(s, nir_opt_if); + progress |= OPT(s, nir_opt_remove_phis); + progress |= OPT(s, nir_opt_undef); + + } + while (progress); +} + +/* trig workarounds is the same as ir3.. 
but we don't want to include ir3 */ +bool ir3_nir_apply_trig_workarounds(nir_shader * shader); + +int +ir2_optimize_nir(nir_shader *s, bool lower) +{ + struct nir_lower_tex_options tex_options = { + .lower_txp = ~0u, + .lower_rect = 0, + }; + + if (fd_mesa_debug & FD_DBG_DISASM) { + debug_printf("----------------------\n"); + nir_print_shader(s, stdout); + debug_printf("----------------------\n"); + } + + OPT_V(s, nir_opt_global_to_local); + OPT_V(s, nir_lower_regs_to_ssa); + OPT_V(s, nir_lower_vars_to_ssa); + OPT_V(s, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out); + + if (lower) { + OPT_V(s, ir3_nir_apply_trig_workarounds); + OPT_V(s, nir_lower_tex, &tex_options); + } + + ir2_optimize_loop(s); + + OPT_V(s, nir_remove_dead_variables, nir_var_function_temp); + OPT_V(s, nir_move_load_const); + + /* TODO we dont want to get shaders writing to depth for depth textures */ + if (s->info.stage == MESA_SHADER_FRAGMENT) { + nir_foreach_variable(var, &s->outputs) { + if (var->data.location == FRAG_RESULT_DEPTH) + return -1; + } + } + + return 0; +} + +static struct ir2_src +load_const(struct ir2_context *ctx, float *value_f, unsigned ncomp) +{ + struct fd2_shader_stateobj *so = ctx->so; + unsigned imm_ncomp, swiz, idx, i, j; + uint32_t *value = (uint32_t*) value_f; + + /* try to merge with existing immediate (TODO: try with neg) */ + for (idx = 0; idx < so->num_immediates; idx++) { + swiz = 0; + imm_ncomp = so->immediates[idx].ncomp; + for (i = 0; i < ncomp; i++) { + for (j = 0; j < imm_ncomp; j++) { + if (value[i] == so->immediates[idx].val[j]) + break; + } + if (j == imm_ncomp) { + if (j == 4) + break; + so->immediates[idx].val[imm_ncomp++] = value[i]; + } + swiz |= swiz_set(j, i); + } + /* matched all components */ + if (i == ncomp) + break; + } + + /* need to allocate new immediate */ + if (idx == so->num_immediates) { + swiz = 0; + imm_ncomp = 0; + for (i = 0; i < ncomp; i++) { + for (j = 0; j < imm_ncomp; j++) { + if (value[i] == 
ctx->so->immediates[idx].val[j]) + break; + } + if (j == imm_ncomp) { + so->immediates[idx].val[imm_ncomp++] = value[i]; + } + swiz |= swiz_set(j, i); + } + so->num_immediates++; + } + so->immediates[idx].ncomp = imm_ncomp; + + if (ncomp == 1) + swiz = swiz_merge(swiz, IR2_SWIZZLE_XXXX); + + return ir2_src(so->first_immediate + idx, swiz, IR2_SRC_CONST); +} + +struct ir2_src +ir2_zero(struct ir2_context *ctx) +{ + return load_const(ctx, (float[]) {0.0f}, 1); +} + +static void +update_range(struct ir2_context *ctx, struct ir2_reg *reg) +{ + if (!reg->initialized) { + reg->initialized = true; + reg->loop_depth = ctx->loop_depth; + } + + if (ctx->loop_depth > reg->loop_depth) { + reg->block_idx_free = ctx->loop_last_block[reg->loop_depth + 1]; + } else { + reg->loop_depth = ctx->loop_depth; + reg->block_idx_free = -1; + } + + /* for regs we want to free at the end of the loop in any case + * XXX dont do this for ssa + */ + if (reg->loop_depth) + reg->block_idx_free = ctx->loop_last_block[reg->loop_depth]; +} + +static struct ir2_src +make_src(struct ir2_context *ctx, nir_src src) +{ + struct ir2_src res = {}; + struct ir2_reg *reg; + + nir_const_value *const_value = nir_src_as_const_value(src); + + if (const_value) { + assert(src.is_ssa); + return load_const(ctx, &const_value->f32[0], src.ssa->num_components); + } + + if (!src.is_ssa) { + res.num = src.reg.reg->index; + res.type = IR2_SRC_REG; + reg = &ctx->reg[res.num]; + } else { + assert(ctx->ssa_map[src.ssa->index] >= 0); + res.num = ctx->ssa_map[src.ssa->index]; + res.type = IR2_SRC_SSA; + reg = &ctx->instr[res.num].ssa; + } + + update_range(ctx, reg); + return res; +} + +static void +set_index(struct ir2_context *ctx, nir_dest * dst, + struct ir2_instr *instr) +{ + struct ir2_reg *reg = &instr->ssa; + + if (dst->is_ssa) { + ctx->ssa_map[dst->ssa.index] = instr->idx; + } else { + assert(instr->is_ssa); + reg = &ctx->reg[dst->reg.reg->index]; + + instr->is_ssa = false; + instr->reg = reg; + } + update_range(ctx, 
reg); +} + +static struct ir2_instr * +ir2_instr_create(struct ir2_context *ctx, int type) +{ + struct ir2_instr *instr; + + instr = &ctx->instr[ctx->instr_count++]; + instr->idx = ctx->instr_count - 1; + instr->type = type; + instr->block_idx = ctx->block_idx; + instr->pred = ctx->pred; + instr->is_ssa = true; + return instr; +} + +static struct ir2_instr * +instr_create_alu(struct ir2_context *ctx, nir_op opcode, unsigned ncomp) +{ + /* emit_alu will fixup instrs that don't map directly */ + static const struct ir2_opc { + int8_t scalar, vector; + } nir_ir2_opc[nir_num_opcodes+1] = { + [0 ... nir_num_opcodes - 1] = {-1, -1}, + + [nir_op_fmov] = {MAXs, MAXv}, + [nir_op_fsign] = {-1, CNDGTEv}, + [nir_op_fnot] = {SETEs, SETEv}, + [nir_op_f2b32] = {SETNEs, SETNEv}, + [nir_op_for] = {MAXs, MAXv}, + [nir_op_fand] = {MINs, MINv}, + [nir_op_fxor] = {-1, SETNEv}, + [nir_op_fadd] = {ADDs, ADDv}, + [nir_op_fsub] = {ADDs, ADDv}, + [nir_op_fmul] = {MULs, MULv}, + [nir_op_ffma] = {-1, MULADDv}, + [nir_op_fmax] = {MAXs, MAXv}, + [nir_op_fmin] = {MINs, MINv}, + [nir_op_ffloor] = {FLOORs, FLOORv}, + [nir_op_ffract] = {FRACs, FRACv}, + [nir_op_ftrunc] = {TRUNCs, TRUNCv}, + [nir_op_fdot2] = {-1, DOT2ADDv}, + [nir_op_fdot3] = {-1, DOT3v}, + [nir_op_fdot4] = {-1, DOT4v}, + [nir_op_sge] = {-1, SETGTEv}, + [nir_op_slt] = {-1, SETGTv}, + [nir_op_sne] = {-1, SETNEv}, + [nir_op_seq] = {-1, SETEv}, + [nir_op_fcsel] = {-1, CNDEv}, + [nir_op_frsq] = {RECIPSQ_IEEE, -1}, + [nir_op_frcp] = {RECIP_IEEE, -1}, + [nir_op_flog2] = {LOG_IEEE, -1}, + [nir_op_fexp2] = {EXP_IEEE, -1}, + [nir_op_fsqrt] = {SQRT_IEEE, -1}, + [nir_op_fcos] = {COS, -1}, + [nir_op_fsin] = {SIN, -1}, + /* no fsat, fneg, fabs since source mods deal with those */ + + /* some nir passes still generate nir_op_imov */ + [nir_op_imov] = {MAXs, MAXv}, + + /* so we can use this function with non-nir op */ +#define ir2_op_cube nir_num_opcodes + [ir2_op_cube] = {-1, CUBEv}, + }; + + struct ir2_opc op = nir_ir2_opc[opcode]; + 
assert(op.vector >= 0 || op.scalar >= 0); + + struct ir2_instr *instr = ir2_instr_create(ctx, IR2_ALU); + instr->alu.vector_opc = op.vector; + instr->alu.scalar_opc = op.scalar; + instr->alu.export = -1; + instr->alu.write_mask = (1 << ncomp) - 1; + instr->src_count = opcode == ir2_op_cube ? 2 : + nir_op_infos[opcode].num_inputs; + instr->ssa.ncomp = ncomp; + return instr; +} + +static struct ir2_instr * +instr_create_alu_reg(struct ir2_context *ctx, nir_op opcode, + uint8_t write_mask, struct ir2_instr *share_reg) +{ + struct ir2_instr *instr; + struct ir2_reg *reg; + /* reuse share_reg's register if given, otherwise take a new one */ + + reg = share_reg ? share_reg->reg : &ctx->reg[ctx->reg_count++]; + reg->ncomp = MAX2(reg->ncomp, util_logbase2(write_mask) + 1); + + instr = instr_create_alu(ctx, opcode, util_bitcount(write_mask)); + instr->alu.write_mask = write_mask; + instr->reg = reg; + instr->is_ssa = false; + return instr; +} + + +static struct ir2_instr * +instr_create_alu_dest(struct ir2_context *ctx, nir_op opcode, nir_dest *dst) +{ + struct ir2_instr *instr; + instr = instr_create_alu(ctx, opcode, nir_dest_num_components(*dst)); + set_index(ctx, dst, instr); + return instr; +} + +static struct ir2_instr * +ir2_instr_create_fetch(struct ir2_context *ctx, nir_dest *dst, + instr_fetch_opc_t opc) +{ + struct ir2_instr *instr = ir2_instr_create(ctx, IR2_FETCH); + instr->fetch.opc = opc; + instr->src_count = 1; + instr->ssa.ncomp = nir_dest_num_components(*dst); + set_index(ctx, dst, instr); + return instr; +} + +static struct ir2_src +make_src_noconst(struct ir2_context *ctx, nir_src src) +{ + struct ir2_instr *instr; + + if (nir_src_as_const_value(src)) { + assert(src.is_ssa); + instr = instr_create_alu(ctx, nir_op_fmov, src.ssa->num_components); + instr->src[0] = make_src(ctx, src); + return ir2_src(instr->idx, 0, IR2_SRC_SSA); + } + + return make_src(ctx, src); +} + +static void +emit_alu(struct ir2_context *ctx, nir_alu_instr * alu) +{ + const nir_op_info *info = &nir_op_infos[alu->op]; +
nir_dest *dst = &alu->dest.dest; + struct ir2_instr *instr; + struct ir2_src tmp; + unsigned ncomp; + + /* get the number of dst components */ + if (dst->is_ssa) { + ncomp = dst->ssa.num_components; + } else { + ncomp = 0; + for (int i = 0; i < 4; i++) + ncomp += !!(alu->dest.write_mask & 1 << i); + } + + instr = instr_create_alu(ctx, alu->op, ncomp); + set_index(ctx, dst, instr); + instr->alu.saturate = alu->dest.saturate; + instr->alu.write_mask = alu->dest.write_mask; + + for (int i = 0; i < info->num_inputs; i++) { + nir_alu_src *src = &alu->src[i]; + + /* compress swizzle with writemask when applicable (c = dst channel) */ + unsigned swiz = 0, j = 0; + for (int c = 0; c < 4; c++) { + if (!(alu->dest.write_mask & 1 << c) && !info->output_size) + continue; + swiz |= swiz_set(src->swizzle[c], j++); + } + + instr->src[i] = make_src(ctx, src->src); + instr->src[i].swizzle = swiz_merge(instr->src[i].swizzle, swiz); + instr->src[i].negate = src->negate; + instr->src[i].abs = src->abs; + } + + /* workarounds for NIR ops that don't map directly to a2xx ops */ + switch (alu->op) { + case nir_op_slt: + tmp = instr->src[0]; + instr->src[0] = instr->src[1]; + instr->src[1] = tmp; + break; + case nir_op_fcsel: + case nir_op_bcsel: + tmp = instr->src[1]; + instr->src[1] = instr->src[2]; + instr->src[2] = tmp; + break; + case nir_op_fsub: + instr->src[1].negate = !instr->src[1].negate; + break; + case nir_op_fdot2: + instr->src_count = 3; + instr->src[2] = ir2_zero(ctx); + break; + case nir_op_fsign: { + /* we need an extra instruction to deal with the zero case */ + struct ir2_instr *tmp; + + /* tmp = x == 0 ? 0 : 1 */ + tmp = instr_create_alu(ctx, nir_op_fcsel, ncomp); + tmp->src[0] = instr->src[0]; + tmp->src[1] = ir2_zero(ctx); + tmp->src[2] = load_const(ctx, (float[]) {1.0f}, 1); + + /* result = x >= 0 ?
tmp : -tmp */ + instr->src[1] = ir2_src(tmp->idx, 0, IR2_SRC_SSA); + instr->src[2] = instr->src[1]; + instr->src[2].negate = true; + instr->src_count = 3; + } break; + default: + break; + } +} + +static void +load_input(struct ir2_context *ctx, nir_dest *dst, unsigned idx) +{ + struct ir2_instr *instr; + int slot = -1; + + if (ctx->so->type == MESA_SHADER_VERTEX) { + instr = ir2_instr_create_fetch(ctx, dst, 0); + instr->src[0] = ir2_src(0, 0, IR2_SRC_INPUT); + instr->fetch.vtx.const_idx = 20 + (idx / 3); + instr->fetch.vtx.const_idx_sel = idx % 3; + return; + } + + /* get slot from idx */ + nir_foreach_variable(var, &ctx->nir->inputs) { + if (var->data.driver_location == idx) { + slot = var->data.location; + break; + } + } + assert(slot >= 0); + + switch (slot) { + case VARYING_SLOT_PNTC: + /* need to extract with abs and invert y */ + instr = instr_create_alu_dest(ctx, nir_op_ffma, dst); + instr->src[0] = ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT); + instr->src[0].abs = true; + instr->src[1] = load_const(ctx, (float[]) {1.0f, -1.0f}, 2); + instr->src[2] = load_const(ctx, (float[]) {0.0f, 1.0f}, 2); + break; + case VARYING_SLOT_POS: + /* need to extract xy with abs and add tile offset on a20x + * zw from fragcoord input (w inverted in fragment shader) + * TODO: only components that are required by fragment shader + */ + instr = instr_create_alu_reg(ctx, + ctx->so->is_a20x ? 
nir_op_fadd : nir_op_fmov, 3, NULL); + instr->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT); + instr->src[0].abs = true; + /* on a20x, C64 contains the tile offset */ + instr->src[1] = ir2_src(64, 0, IR2_SRC_CONST); + + instr = instr_create_alu_reg(ctx, nir_op_fmov, 4, instr); + instr->src[0] = ir2_src(ctx->f->fragcoord, 0, IR2_SRC_INPUT); + + instr = instr_create_alu_reg(ctx, nir_op_frcp, 8, instr); + instr->src[0] = ir2_src(ctx->f->fragcoord, IR2_SWIZZLE_Y, IR2_SRC_INPUT); + + unsigned reg_idx = instr->reg - ctx->reg; /* XXX */ + instr = instr_create_alu_dest(ctx, nir_op_fmov, dst); + instr->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG); + break; + default: + instr = instr_create_alu_dest(ctx, nir_op_fmov, dst); + instr->src[0] = ir2_src(idx, 0, IR2_SRC_INPUT); + break; + } +} + +static unsigned +output_slot(struct ir2_context *ctx, nir_intrinsic_instr *intr) +{ + int slot = -1; + unsigned idx = nir_intrinsic_base(intr); + nir_foreach_variable(var, &ctx->nir->outputs) { + if (var->data.driver_location == idx) { + slot = var->data.location; + break; + } + } + assert(slot != -1); + return slot; +} + +static void +store_output(struct ir2_context *ctx, nir_src src, unsigned slot, unsigned ncomp) +{ + struct ir2_instr *instr; + unsigned idx = 0; + + if (ctx->so->type == MESA_SHADER_VERTEX) { + switch (slot) { + case VARYING_SLOT_POS: + ctx->position = make_src(ctx, src); + idx = 62; + break; + case VARYING_SLOT_PSIZ: + ctx->so->writes_psize = true; + idx = 63; + break; + default: + /* find matching slot from fragment shader input */ + for (idx = 0; idx < ctx->f->inputs_count; idx++) + if (ctx->f->inputs[idx].slot == slot) + break; + if (idx == ctx->f->inputs_count) + return; + } + } else if (slot != FRAG_RESULT_COLOR && slot != FRAG_RESULT_DATA0) { + /* only color output is implemented */ + return; + } + + instr = instr_create_alu(ctx, nir_op_fmov, ncomp); + instr->src[0] = make_src(ctx, src); + instr->alu.export = idx; +} + +static void +emit_intrinsic(struct 
ir2_context *ctx, nir_intrinsic_instr *intr) +{ + struct ir2_instr *instr; + nir_const_value *const_offset; + nir_deref_instr *deref; + unsigned idx; + + switch (intr->intrinsic) { + case nir_intrinsic_load_input: + load_input(ctx, &intr->dest, nir_intrinsic_base(intr)); + break; + case nir_intrinsic_store_output: + store_output(ctx, intr->src[0], output_slot(ctx, intr), intr->num_components); + break; + case nir_intrinsic_load_deref: + deref = nir_src_as_deref(intr->src[0]); + assert(deref->deref_type == nir_deref_type_var); + load_input(ctx, &intr->dest, deref->var->data.driver_location); + break; + case nir_intrinsic_store_deref: + deref = nir_src_as_deref(intr->src[0]); + assert(deref->deref_type == nir_deref_type_var); + store_output(ctx, intr->src[1], deref->var->data.location, intr->num_components); + break; + case nir_intrinsic_load_uniform: + const_offset = nir_src_as_const_value(intr->src[0]); + assert(const_offset); /* TODO can be false in ES2? */ + idx = nir_intrinsic_base(intr); + idx += (uint32_t) nir_src_as_const_value(intr->src[0])->f32[0]; + instr = instr_create_alu_dest(ctx, nir_op_fmov, &intr->dest); + instr->src[0] = ir2_src(idx, 0, IR2_SRC_CONST); + break; + case nir_intrinsic_discard: + case nir_intrinsic_discard_if: + instr = ir2_instr_create(ctx, IR2_ALU); + instr->alu.vector_opc = VECTOR_NONE; + if (intr->intrinsic == nir_intrinsic_discard_if) { + instr->alu.scalar_opc = KILLNEs; + instr->src[0] = make_src(ctx, intr->src[0]); + } else { + instr->alu.scalar_opc = KILLEs; + instr->src[0] = ir2_zero(ctx); + } + instr->alu.export = -1; + instr->src_count = 1; + break; + case nir_intrinsic_load_front_face: + /* gl_FrontFacing is in the sign of param.x + * rcp required because otherwise we can't differentiate -0.0 and +0.0 + */ + ctx->so->need_param = true; + + struct ir2_instr *tmp = instr_create_alu(ctx, nir_op_frcp, 1); + tmp->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT); + + instr = instr_create_alu_dest(ctx, nir_op_sge, 
&intr->dest); + instr->src[0] = ir2_src(tmp->idx, 0, IR2_SRC_SSA); + instr->src[1] = ir2_zero(ctx); + break; + default: + compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic); + break; + } +} + +static void +emit_tex(struct ir2_context *ctx, nir_tex_instr * tex) +{ + bool is_rect = false, is_cube = false; + struct ir2_instr *instr; + nir_src *coord, *lod_bias; + + coord = lod_bias = NULL; + + for (unsigned i = 0; i < tex->num_srcs; i++) { + switch (tex->src[i].src_type) { + case nir_tex_src_coord: + coord = &tex->src[i].src; + break; + case nir_tex_src_bias: + case nir_tex_src_lod: + assert(!lod_bias); + lod_bias = &tex->src[i].src; + break; + default: + compile_error(ctx, "Unhandled NIR tex src type: %d\n", + tex->src[i].src_type); + return; + } + } + + switch (tex->op) { + case nir_texop_tex: + case nir_texop_txb: + case nir_texop_txl: + break; + default: + compile_error(ctx, "unimplemented texop %d\n", tex->op); + return; + } + + switch (tex->sampler_dim) { + case GLSL_SAMPLER_DIM_2D: + break; + case GLSL_SAMPLER_DIM_RECT: + is_rect = true; + break; + case GLSL_SAMPLER_DIM_CUBE: + is_cube = true; + break; + default: + compile_error(ctx, "unimplemented sampler %d\n", tex->sampler_dim); + return; + } + + struct ir2_src src_coord = make_src_noconst(ctx, *coord); + + /* for cube maps + * tmp = cube(coord) + * tmp.xy = tmp.xy / |tmp.z| + 1.5 + * coord = tmp.xyw + */ + if (is_cube) { + struct ir2_instr *rcp, *coord_xy; + unsigned reg_idx; + + instr = instr_create_alu_reg(ctx, ir2_op_cube, 15, NULL); + instr->src[0] = src_coord; + instr->src[0].swizzle = IR2_SWIZZLE_ZZXY; + instr->src[1] = src_coord; + instr->src[1].swizzle = IR2_SWIZZLE_YXZZ; + + reg_idx = instr->reg - ctx->reg; /* hacky */ + + rcp = instr_create_alu(ctx, nir_op_frcp, 1); + rcp->src[0] = ir2_src(reg_idx, IR2_SWIZZLE_Z, IR2_SRC_REG); + rcp->src[0].abs = true; + + coord_xy = instr_create_alu_reg(ctx, nir_op_ffma, 3, instr); + coord_xy->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG); + 
coord_xy->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA); + coord_xy->src[2] = load_const(ctx, (float[]) {1.5f}, 1); + + src_coord = ir2_src(reg_idx, 0, IR2_SRC_REG); + /* TODO: lod/bias transformed by src_coord.z ? */ + } + + instr = ir2_instr_create_fetch(ctx, &tex->dest, TEX_FETCH); + instr->src[0] = src_coord; + instr->src[0].swizzle = is_cube ? IR2_SWIZZLE_XYW : 0; + instr->fetch.tex.is_cube = is_cube; + instr->fetch.tex.is_rect = is_rect; + instr->fetch.tex.samp_id = tex->sampler_index; + + /* for lod/bias, we insert an extra src for the backend to deal with */ + if (lod_bias) { + instr->src[1] = make_src_noconst(ctx, *lod_bias); + /* backend will use 2-3 components so apply swizzle */ + swiz_merge_p(&instr->src[1].swizzle, IR2_SWIZZLE_XXXX); + instr->src_count = 2; + } +} + +static void +setup_input(struct ir2_context *ctx, nir_variable * in) +{ + struct fd2_shader_stateobj *so = ctx->so; + unsigned array_len = MAX2(glsl_get_length(in->type), 1); + unsigned n = in->data.driver_location; + unsigned slot = in->data.location; + + assert(array_len == 1); + + /* handle later */ + if (ctx->so->type == MESA_SHADER_VERTEX) + return; + + if (ctx->so->type != MESA_SHADER_FRAGMENT) + compile_error(ctx, "unknown shader type: %d\n", ctx->so->type); + + if (slot == VARYING_SLOT_PNTC) { + so->need_param = true; + return; + } + + n = ctx->f->inputs_count++; + + /* half of fragcoord from param reg, half from a varying */ + if (slot == VARYING_SLOT_POS) { + ctx->f->fragcoord = n; + so->need_param = true; + } + + ctx->f->inputs[n].slot = slot; + ctx->f->inputs[n].ncomp = glsl_get_components(in->type); + + /* in->data.interpolation? 
+ * opengl ES 2.0 can't do flat mode, but we still get it from GALLIUM_HUD + */ +} + +static void +emit_undef(struct ir2_context *ctx, nir_ssa_undef_instr * undef) +{ + /* TODO we don't want to emit anything for undefs */ + + struct ir2_instr *instr; + + instr = instr_create_alu_dest(ctx, nir_op_fmov, + &(nir_dest) {.ssa = undef->def,.is_ssa = true}); + instr->src[0] = ir2_src(0, 0, IR2_SRC_CONST); +} + +static void +emit_instr(struct ir2_context *ctx, nir_instr * instr) +{ + switch (instr->type) { + case nir_instr_type_alu: + emit_alu(ctx, nir_instr_as_alu(instr)); + break; + case nir_instr_type_deref: + /* ignored, handled as part of the intrinsic they are src to */ + break; + case nir_instr_type_intrinsic: + emit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); + break; + case nir_instr_type_load_const: + /* dealt with when using nir_src */ + break; + case nir_instr_type_tex: + emit_tex(ctx, nir_instr_as_tex(instr)); + break; + case nir_instr_type_jump: + ctx->block_has_jump[ctx->block_idx] = true; + break; + case nir_instr_type_ssa_undef: + emit_undef(ctx, nir_instr_as_ssa_undef(instr)); + break; + default: + break; + } +} + +/* fragcoord.zw and a20x hw binning outputs */ +static void +extra_position_exports(struct ir2_context *ctx, bool binning) +{ + struct ir2_instr *instr, *rcp, *sc, *wincoord, *off; + + if (ctx->f->fragcoord < 0 && !binning) + return; + + instr = instr_create_alu(ctx, nir_op_fmax, 1); + instr->src[0] = ctx->position; + instr->src[0].swizzle = IR2_SWIZZLE_W; + instr->src[1] = ir2_zero(ctx); + + rcp = instr_create_alu(ctx, nir_op_frcp, 1); + rcp->src[0] = ir2_src(instr->idx, 0, IR2_SRC_SSA); + + sc = instr_create_alu(ctx, nir_op_fmul, 4); + sc->src[0] = ctx->position; + sc->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA); + + wincoord = instr_create_alu(ctx, nir_op_ffma, 4); + wincoord->src[0] = ir2_src(66, 0, IR2_SRC_CONST); + wincoord->src[1] = ir2_src(sc->idx, 0, IR2_SRC_SSA); + wincoord->src[2] = ir2_src(65, 0, IR2_SRC_CONST); + + 
/* fragcoord z/w */ + if (ctx->f->fragcoord >= 0 && !binning) { + instr = instr_create_alu(ctx, nir_op_fmov, 1); + instr->src[0] = ir2_src(wincoord->idx, IR2_SWIZZLE_Z, IR2_SRC_SSA); + instr->alu.export = ctx->f->fragcoord; + + instr = instr_create_alu(ctx, nir_op_fmov, 1); + instr->src[0] = ctx->position; + instr->src[0].swizzle = IR2_SWIZZLE_W; + instr->alu.export = ctx->f->fragcoord; + instr->alu.write_mask = 2; + } + + if (!binning) + return; + + off = instr_create_alu(ctx, nir_op_fadd, 1); + off->src[0] = ir2_src(64, 0, IR2_SRC_CONST); + off->src[1] = ir2_src(2, 0, IR2_SRC_INPUT); + + /* 8 max set in freedreno_screen.. unneeded instrs patched out */ + for (int i = 0; i < 8; i++) { + instr = instr_create_alu(ctx, nir_op_ffma, 4); + instr->src[0] = ir2_src(1, IR2_SWIZZLE_WYWW, IR2_SRC_CONST); + instr->src[1] = ir2_src(off->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA); + instr->src[2] = ir2_src(3 + i, 0, IR2_SRC_CONST); + instr->alu.export = 32; + + instr = instr_create_alu(ctx, nir_op_ffma, 4); + instr->src[0] = ir2_src(68 + i * 2, 0, IR2_SRC_CONST); + instr->src[1] = ir2_src(wincoord->idx, 0, IR2_SRC_SSA); + instr->src[2] = ir2_src(67 + i * 2, 0, IR2_SRC_CONST); + instr->alu.export = 33; + } +} + +static bool emit_cf_list(struct ir2_context *ctx, struct exec_list *list); + +static bool +emit_block(struct ir2_context *ctx, nir_block * block) +{ + struct ir2_instr *instr; + nir_block *succs = block->successors[0]; + + ctx->block_idx = block->index; + + nir_foreach_instr(instr, block) + emit_instr(ctx, instr); + + if (!succs || !succs->index) + return false; + + /* we want to be smart and always jump and have the backend cleanup + * but we are not, so there are two cases where jump is needed: + * loops (succs index lower) + * jumps (jump instruction seen in block) + */ + if (succs->index > block->index && !ctx->block_has_jump[block->index]) + return false; + + assert(block->successors[1] == NULL); + + instr = ir2_instr_create(ctx, IR2_CF); + instr->cf.block_idx = 
succs->index; + /* XXX can't jump to a block with different predicate */ + return true; +} + +static void +emit_if(struct ir2_context *ctx, nir_if * nif) +{ + unsigned pred = ctx->pred, pred_idx = ctx->pred_idx; + struct ir2_instr *instr; + + /* XXX: blob seems to always use same register for condition */ + + instr = ir2_instr_create(ctx, IR2_ALU); + instr->src[0] = make_src(ctx, nif->condition); + instr->src_count = 1; + instr->ssa.ncomp = 1; + instr->alu.vector_opc = VECTOR_NONE; + instr->alu.scalar_opc = SCALAR_NONE; + instr->alu.export = -1; + instr->alu.write_mask = 1; + instr->pred = 0; + + /* if nested, use PRED_SETNE_PUSHv */ + if (pred) { + instr->alu.vector_opc = PRED_SETNE_PUSHv; + instr->src[1] = instr->src[0]; + instr->src[0] = ir2_src(pred_idx, 0, IR2_SRC_SSA); + instr->src[0].swizzle = IR2_SWIZZLE_XXXX; + instr->src[1].swizzle = IR2_SWIZZLE_XXXX; + instr->src_count = 2; + } else { + instr->alu.scalar_opc = PRED_SETNEs; + } + + ctx->pred_idx = instr->idx; + ctx->pred = 3; + + emit_cf_list(ctx, &nif->then_list); + + /* TODO: if there is no else branch we don't need this + * and if the else branch is simple, can just flip ctx->pred instead + */ + instr = ir2_instr_create(ctx, IR2_ALU); + instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA); + instr->src_count = 1; + instr->ssa.ncomp = 1; + instr->alu.vector_opc = VECTOR_NONE; + instr->alu.scalar_opc = PRED_SET_INVs; + instr->alu.export = -1; + instr->alu.write_mask = 1; + instr->pred = 0; + ctx->pred_idx = instr->idx; + + emit_cf_list(ctx, &nif->else_list); + + /* restore predicate for nested predicates */ + if (pred) { + instr = ir2_instr_create(ctx, IR2_ALU); + instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA); + instr->src_count = 1; + instr->ssa.ncomp = 1; + instr->alu.vector_opc = VECTOR_NONE; + instr->alu.scalar_opc = PRED_SET_POPs; + instr->alu.export = -1; + instr->alu.write_mask = 1; + instr->pred = 0; + ctx->pred_idx = instr->idx; + } + + /* restore ctx->pred */ + ctx->pred = pred; +} + 
+/* get the highest block idx in the loop, so we know when + * we can free registers that are allocated outside the loop + */ +static unsigned +loop_last_block(struct exec_list *list) +{ + nir_cf_node *node = + exec_node_data(nir_cf_node, exec_list_get_tail(list), node); + switch (node->type) { + case nir_cf_node_block: + return nir_cf_node_as_block(node)->index; + case nir_cf_node_if: + assert(0); /* XXX could this ever happen? */ + return 0; + case nir_cf_node_loop: + return loop_last_block(&nir_cf_node_as_loop(node)->body); + default: + compile_error(ctx, "Not supported\n"); + return 0; + } +} + +static void +emit_loop(struct ir2_context *ctx, nir_loop *nloop) +{ + ctx->loop_last_block[++ctx->loop_depth] = loop_last_block(&nloop->body); + emit_cf_list(ctx, &nloop->body); + ctx->loop_depth--; +} + +static bool +emit_cf_list(struct ir2_context *ctx, struct exec_list *list) +{ + bool ret = false; + foreach_list_typed(nir_cf_node, node, node, list) { + ret = false; + switch (node->type) { + case nir_cf_node_block: + ret = emit_block(ctx, nir_cf_node_as_block(node)); + break; + case nir_cf_node_if: + emit_if(ctx, nir_cf_node_as_if(node)); + break; + case nir_cf_node_loop: + emit_loop(ctx, nir_cf_node_as_loop(node)); + break; + case nir_cf_node_function: + compile_error(ctx, "Not supported\n"); + break; + } + } + return ret; +} + +static void cleanup_binning(struct ir2_context *ctx) +{ + assert(ctx->so->type == MESA_SHADER_VERTEX); + + /* kill non-position outputs for binning variant */ + nir_foreach_block(block, nir_shader_get_entrypoint(ctx->nir)) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + unsigned slot; + switch (intr->intrinsic) { + case nir_intrinsic_store_deref: { + nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); + assert(deref->deref_type == nir_deref_type_var); + slot = deref->var->data.location; + } break; + case 
nir_intrinsic_store_output: + slot = output_slot(ctx, intr); + break; + default: + continue; + } + + if (slot != VARYING_SLOT_POS) + nir_instr_remove(instr); + } + } + + ir2_optimize_nir(ctx->nir, false); +} + +void +ir2_nir_compile(struct ir2_context *ctx, bool binning) +{ + struct fd2_shader_stateobj *so = ctx->so; + + memset(ctx->ssa_map, 0xff, sizeof(ctx->ssa_map)); + + ctx->nir = nir_shader_clone(NULL, so->nir); + + if (binning) + cleanup_binning(ctx); + + /* postprocess */ + OPT_V(ctx->nir, nir_opt_algebraic_late); + + OPT_V(ctx->nir, nir_lower_to_source_mods, nir_lower_all_source_mods); + OPT_V(ctx->nir, nir_copy_prop); + OPT_V(ctx->nir, nir_opt_dce); + OPT_V(ctx->nir, nir_opt_move_comparisons); + + OPT_V(ctx->nir, nir_lower_bool_to_float); + + OPT_V(ctx->nir, nir_lower_locals_to_regs); + + OPT_V(ctx->nir, nir_convert_from_ssa, true); + + OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest); + OPT_V(ctx->nir, nir_lower_vec_to_movs); + + OPT_V(ctx->nir, nir_opt_dce); + + nir_sweep(ctx->nir); + + if (fd_mesa_debug & FD_DBG_DISASM) { + debug_printf("----------------------\n"); + nir_print_shader(ctx->nir, stdout); + debug_printf("----------------------\n"); + } + + /* fd2_shader_stateobj init */ + if (so->type == MESA_SHADER_FRAGMENT) { + ctx->f->fragcoord = -1; + ctx->f->inputs_count = 0; + memset(ctx->f->inputs, 0, sizeof(ctx->f->inputs)); + } + + /* Setup inputs: */ + nir_foreach_variable(in, &ctx->nir->inputs) + setup_input(ctx, in); + + if (so->type == MESA_SHADER_FRAGMENT) { + unsigned idx; + for (idx = 0; idx < ctx->f->inputs_count; idx++) { + ctx->input[idx].ncomp = ctx->f->inputs[idx].ncomp; + update_range(ctx, &ctx->input[idx]); + } + /* assume we have param input and kill it later if not */ + ctx->input[idx].ncomp = 4; + update_range(ctx, &ctx->input[idx]); + } else { + ctx->input[0].ncomp = 1; + ctx->input[2].ncomp = 1; + update_range(ctx, &ctx->input[0]); + update_range(ctx, &ctx->input[2]); + } + + /* And emit the body: */ + nir_function_impl *fxn = 
nir_shader_get_entrypoint(ctx->nir); + + nir_foreach_register(reg, &fxn->registers) { + ctx->reg[reg->index].ncomp = reg->num_components; + ctx->reg_count = MAX2(ctx->reg_count, reg->index + 1); + } + + nir_metadata_require(fxn, nir_metadata_block_index); + emit_cf_list(ctx, &fxn->body); + /* TODO emit_block(ctx, fxn->end_block); */ + + if (so->type == MESA_SHADER_VERTEX) + extra_position_exports(ctx, binning); + + ralloc_free(ctx->nir); + + /* kill unused param input */ + if (so->type == MESA_SHADER_FRAGMENT && !so->need_param) + ctx->input[ctx->f->inputs_count].initialized = false; +} diff --git a/src/gallium/drivers/freedreno/a2xx/ir2_private.h b/src/gallium/drivers/freedreno/a2xx/ir2_private.h new file mode 100644 index 00000000000..d1fbacd908f --- /dev/null +++ b/src/gallium/drivers/freedreno/a2xx/ir2_private.h @@ -0,0 +1,392 @@ +/* + * Copyright (C) 2018 Jonathan Marek + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Jonathan Marek + */ + +#include +#include +#include +#include +#include + +#include "ir2.h" +#include "fd2_program.h" +#include "instr-a2xx.h" + +enum ir2_src_type { + IR2_SRC_SSA, + IR2_SRC_REG, + IR2_SRC_INPUT, + IR2_SRC_CONST, +}; + +struct ir2_src { + /* num can mean different things + * ssa: index of instruction + * reg: index in ctx->reg array + * input: index in ctx->input array + * const: constant index (C0, C1, etc) + */ + uint16_t num; + uint8_t swizzle; + enum ir2_src_type type : 2; + uint8_t abs : 1; + uint8_t negate : 1; + uint8_t : 4; +}; + +struct ir2_reg_component { + uint8_t c : 3; /* assigned x/y/z/w (7=dont write, for fetch instr) */ + bool alloc : 1; /* is it currently allocated */ + uint8_t ref_count; /* for ra */ +}; + +struct ir2_reg { + uint8_t idx; /* assigned hardware register */ + uint8_t ncomp; + + uint8_t loop_depth; + bool initialized; + /* block_idx to free on (-1 = free on ref_count==0) */ + int block_idx_free; + struct ir2_reg_component comp[4]; +}; + +struct ir2_instr { + unsigned idx; + + unsigned block_idx; + + enum { + IR2_NONE, + IR2_FETCH, + IR2_ALU, + IR2_CF, + } type : 2; + + /* instruction needs to be emitted (for scheduling) */ + bool need_emit : 1; + + /* predicate value - (usually) same for entire block */ + uint8_t pred : 2; + + /* src */ + uint8_t src_count; + struct ir2_src src[4]; + + /* dst */ + bool is_ssa; + union { + struct ir2_reg ssa; + struct ir2_reg *reg; + }; + + /* type-specific */ + union { + struct { + instr_fetch_opc_t opc : 5; + union { + struct { + uint8_t const_idx; + uint8_t const_idx_sel; + } vtx; + struct { + bool is_cube : 1; + bool is_rect : 1; + uint8_t samp_id; + } tex; + }; + } fetch; + struct { + /* 
store possible opcs, then we can choose vector/scalar instr */ + instr_scalar_opc_t scalar_opc : 6; + instr_vector_opc_t vector_opc : 5; + /* same as nir */ + uint8_t write_mask : 4; + bool saturate : 1; + + /* export idx (-1 no export) */ + int8_t export; + + /* for scalarized 2 src instruction */ + uint8_t src1_swizzle; + } alu; + struct { + /* jmp dst block_idx */ + uint8_t block_idx; + } cf; + }; +}; + +struct ir2_sched_instr { + uint32_t reg_state[8]; + struct ir2_instr *instr, *instr_s; +}; + +struct ir2_context { + struct fd2_shader_stateobj *so; + + unsigned block_idx, pred_idx; + uint8_t pred; + bool block_has_jump[64]; + + unsigned loop_last_block[64]; + unsigned loop_depth; + + nir_shader *nir; + + /* ssa index of position output */ + struct ir2_src position; + + /* to translate SSA ids to instruction ids */ + int16_t ssa_map[1024]; + + struct ir2_shader_info *info; + struct ir2_frag_linkage *f; + + int prev_export; + + /* RA state */ + struct ir2_reg* live_regs[64]; + uint32_t reg_state[256/32]; /* 64*4 bits */ + + /* inputs */ + struct ir2_reg input[16 + 1]; /* 16 + param */ + + /* non-ssa regs */ + struct ir2_reg reg[64]; + unsigned reg_count; + + struct ir2_instr instr[0x300]; + unsigned instr_count; + + struct ir2_sched_instr instr_sched[0x180]; + unsigned instr_sched_count; +}; + +void assemble(struct ir2_context *ctx, bool binning); + +void ir2_nir_compile(struct ir2_context *ctx, bool binning); + +void ra_count_refs(struct ir2_context *ctx); +void ra_reg(struct ir2_context *ctx, struct ir2_reg *reg, int force_idx, + bool export, uint8_t export_writemask); +void ra_src_free(struct ir2_context *ctx, struct ir2_instr *instr); +void ra_block_free(struct ir2_context *ctx, unsigned block); + +/* utils */ +enum { + IR2_SWIZZLE_Y = 1 << 0, + IR2_SWIZZLE_Z = 2 << 0, + IR2_SWIZZLE_W = 3 << 0, + + IR2_SWIZZLE_ZW = 2 << 0 | 2 << 2, + + IR2_SWIZZLE_XYW = 0 << 0 | 0 << 2 | 1 << 4, + + IR2_SWIZZLE_XXXX = 0 << 0 | 3 << 2 | 2 << 4 | 1 << 6, + IR2_SWIZZLE_YYYY = 1 
<< 0 | 0 << 2 | 3 << 4 | 2 << 6, + IR2_SWIZZLE_ZZZZ = 2 << 0 | 1 << 2 | 0 << 4 | 3 << 6, + IR2_SWIZZLE_WWWW = 3 << 0 | 2 << 2 | 1 << 4 | 0 << 6, + IR2_SWIZZLE_WYWW = 3 << 0 | 0 << 2 | 1 << 4 | 0 << 6, + IR2_SWIZZLE_XYXY = 0 << 0 | 0 << 2 | 2 << 4 | 2 << 6, + IR2_SWIZZLE_ZZXY = 2 << 0 | 1 << 2 | 2 << 4 | 2 << 6, + IR2_SWIZZLE_YXZZ = 1 << 0 | 3 << 2 | 0 << 4 | 3 << 6, +}; + +#define compile_error(ctx, args...) ({ \ + printf(args); \ + assert(0); \ +}) + +static inline struct ir2_src +ir2_src(uint16_t num, uint8_t swizzle, enum ir2_src_type type) +{ + return (struct ir2_src) { + .num = num, + .swizzle = swizzle, + .type = type + }; +} + +/* ir2_assemble uses it .. */ +struct ir2_src ir2_zero(struct ir2_context *ctx); + +#define ir2_foreach_instr(it, ctx) \ + for (struct ir2_instr *it = (ctx)->instr; ({ \ + while (it != &(ctx)->instr[(ctx)->instr_count] && it->type == IR2_NONE) it++; \ + it != &(ctx)->instr[(ctx)->instr_count]; }); it++) + +#define ir2_foreach_live_reg(it, ctx) \ + for (struct ir2_reg **__ptr = (ctx)->live_regs, *it; ({ \ + while (__ptr != &(ctx)->live_regs[64] && *__ptr == NULL) __ptr++; \ + __ptr != &(ctx)->live_regs[64] ? (it=*__ptr) : NULL; }); it++) + +#define ir2_foreach_avail(it) \ + for (struct ir2_instr **__instrp = avail, *it; \ + it = *__instrp, __instrp != &avail[avail_count]; __instrp++) + +#define ir2_foreach_src(it, instr) \ + for (struct ir2_src *it = instr->src; \ + it != &instr->src[instr->src_count]; it++) + +/* mask for register allocation + * 64 registers with 4 components each = 256 bits + */ +/* typedef struct { + uint64_t data[4]; +} regmask_t; */ + +static inline bool mask_isset(uint32_t * mask, unsigned num) +{ + return ! 
!(mask[num / 32] & 1 << num % 32); +} + +static inline void mask_set(uint32_t * mask, unsigned num) +{ + mask[num / 32] |= 1 << num % 32; +} + +static inline void mask_unset(uint32_t * mask, unsigned num) +{ + mask[num / 32] &= ~(1 << num % 32); +} + +static inline unsigned mask_reg(uint32_t * mask, unsigned num) +{ + return mask[num / 8] >> num % 8 * 4 & 0xf; +} + +static inline bool is_export(struct ir2_instr *instr) +{ + return instr->type == IR2_ALU && instr->alu.export >= 0; +} + +static inline instr_alloc_type_t export_buf(unsigned num) +{ + return num < 32 ? SQ_PARAMETER_PIXEL : + num >= 62 ? SQ_POSITION : SQ_MEMORY; +} + +/* component c for channel i */ +static inline unsigned swiz_set(unsigned c, unsigned i) +{ + return ((c - i) & 3) << i * 2; +} + +/* get swizzle in channel i */ +static inline unsigned swiz_get(unsigned swiz, unsigned i) +{ + return ((swiz >> i * 2) + i) & 3; +} + +static inline unsigned swiz_merge(unsigned swiz0, unsigned swiz1) +{ + unsigned swiz = 0; + for (int i = 0; i < 4; i++) + swiz |= swiz_set(swiz_get(swiz0, swiz_get(swiz1, i)), i); + return swiz; +} + +static inline void swiz_merge_p(uint8_t *swiz0, unsigned swiz1) +{ + unsigned swiz = 0; + for (int i = 0; i < 4; i++) + swiz |= swiz_set(swiz_get(*swiz0, swiz_get(swiz1, i)), i); + *swiz0 = swiz; +} + +static inline struct ir2_reg * get_reg(struct ir2_instr *instr) +{ + return instr->is_ssa ? 
&instr->ssa : instr->reg; +} + +static inline struct ir2_reg * +get_reg_src(struct ir2_context *ctx, struct ir2_src *src) +{ + switch (src->type) { + case IR2_SRC_INPUT: + return &ctx->input[src->num]; + case IR2_SRC_SSA: + return &ctx->instr[src->num].ssa; + case IR2_SRC_REG: + return &ctx->reg[src->num]; + default: + return NULL; + } +} + +/* gets a ncomp value for the dst */ +static inline unsigned dst_ncomp(struct ir2_instr *instr) +{ + if (instr->is_ssa) + return instr->ssa.ncomp; + + if (instr->type == IR2_FETCH) + return instr->reg->ncomp; + + assert(instr->type == IR2_ALU); + + unsigned ncomp = 0; + for (int i = 0; i < instr->reg->ncomp; i++) + ncomp += !!(instr->alu.write_mask & 1 << i); + return ncomp; +} + +/* gets a ncomp value for the src registers */ +static inline unsigned src_ncomp(struct ir2_instr *instr) +{ + if (instr->type == IR2_FETCH) { + switch (instr->fetch.opc) { + case VTX_FETCH: + return 1; + case TEX_FETCH: + return instr->fetch.tex.is_cube ? 3 : 2; + case TEX_SET_TEX_LOD: + return 1; + default: + assert(0); + } + } + + switch (instr->alu.scalar_opc) { + case PRED_SETEs ... 
KILLONEs: + return 1; + default: + break; + } + + switch (instr->alu.vector_opc) { + case DOT2ADDv: + return 2; + case DOT3v: + return 3; + case DOT4v: + case CUBEv: + case PRED_SETE_PUSHv: + return 4; + default: + return dst_ncomp(instr); + } +} diff --git a/src/gallium/drivers/freedreno/a2xx/ir2_ra.c b/src/gallium/drivers/freedreno/a2xx/ir2_ra.c new file mode 100644 index 00000000000..f37eb36b4b0 --- /dev/null +++ b/src/gallium/drivers/freedreno/a2xx/ir2_ra.c @@ -0,0 +1,226 @@ +/* + * Copyright (C) 2018 Jonathan Marek + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ * Authors:
+ *    Jonathan Marek
+ */
+
+#include "ir2_private.h"
+
+/* if an instruction has side effects, we should never kill it
+ *
+ * CF instructions always count as having side effects and fetch
+ * instructions never do. For the remaining (ALU) instructions the answer
+ * is true when either the scalar or the vector opcode lies in the
+ * PRED_SETE.. / KILL.. opcode range, or when the result is exported
+ * (alu.export >= 0).
+ *
+ * note: "case A ... B" is the GNU C case-range extension
+ */
+static bool has_side_effects(struct ir2_instr *instr)
+{
+	if (instr->type == IR2_CF)
+		return true;
+	else if (instr->type == IR2_FETCH)
+		return false;
+
+	switch (instr->alu.scalar_opc) {
+	case PRED_SETEs ... KILLONEs:
+		return true;
+	default:
+		break;
+	}
+
+	switch (instr->alu.vector_opc) {
+	case PRED_SETE_PUSHv ... KILLNEv:
+		return true;
+	default:
+		break;
+	}
+
+	return instr->alu.export >= 0;
+}
+
+/* mark an instruction as required, and all its sources recursively
+ *
+ * SSA sources name their producer directly (ctx->instr[src->num]).
+ * Register sources do not, so every non-SSA instruction that writes the
+ * same register is marked instead.
+ */
+static void set_need_emit(struct ir2_context *ctx, struct ir2_instr *instr)
+{
+	struct ir2_reg *reg;
+
+	/* don't repeat work already done (this also terminates the recursion) */
+	if (instr->need_emit)
+		return;
+
+	instr->need_emit = true;
+
+	ir2_foreach_src(src, instr) {
+		switch (src->type) {
+		case IR2_SRC_SSA:
+			set_need_emit(ctx, &ctx->instr[src->num]);
+			break;
+		case IR2_SRC_REG:
+			/* slow .. scans every instruction for each register source */
+			reg = get_reg_src(ctx, src);
+			/* iterator deliberately not named "instr": that would shadow
+			 * the parameter the enclosing ir2_foreach_src() walks over
+			 */
+			ir2_foreach_instr(writer, ctx) {
+				if (!writer->is_ssa && writer->reg == reg)
+					set_need_emit(ctx, writer);
+			}
+			break;
+		default:
+			break;
+		}
+	}
+}
+
+/* get current bit mask of allocated components for a register
+ *
+ * reg_state is used as a bitmap with 4 bits per register (one bit per
+ * component), i.e. 8 registers per array element; this is the same layout
+ * reg_setmask()/reg_freemask() address as bit (idx * 4 + c).
+ */
+static unsigned reg_mask(struct ir2_context *ctx, unsigned idx)
+{
+	return (ctx->reg_state[idx / 8] >> ((idx % 8) * 4)) & 0xf;
+}
+
+/* mark component c of register idx as allocated */
+static void reg_setmask(struct ir2_context *ctx, unsigned idx, unsigned c)
+{
+	idx = idx * 4 + c;
+	/* unsigned literal: the shift count can be 31, and shifting a signed 1
+	 * into the sign bit is undefined behaviour
+	 */
+	ctx->reg_state[idx / 32] |= 1u << (idx % 32);
+}
+
+/* mark component c of register idx as free again */
+static void reg_freemask(struct ir2_context *ctx, unsigned idx, unsigned c)
+{
+	idx = idx * 4 + c;
+	/* unsigned literal for the same reason as in reg_setmask() */
+	ctx->reg_state[idx / 32] &= ~(1u << (idx % 32));
+}
+
+/* Decide which instructions have to be emitted and count, per register
+ * component, how many times it is read.
+ *
+ * Instructions that are not needed get their type set to IR2_NONE.
+ * The counts are what ra_src_free() later decrements.
+ */
+void ra_count_refs(struct ir2_context *ctx)
+{
+	struct ir2_reg *reg;
+
+	/* mark instructions as needed
+	 * need to do this because "substitutions" pass makes many movs not needed
+	 */
+	ir2_foreach_instr(instr, ctx) {
+		if (has_side_effects(instr))
+			set_need_emit(ctx, instr);
+	}
+
+	/* compute ref_counts */
+	ir2_foreach_instr(instr, ctx) {
+		/* kill non-needed so they can be skipped */
+		if (!instr->need_emit) {
+			instr->type = IR2_NONE;
+			continue;
+		}
+
+		ir2_foreach_src(src, instr) {
+			/* constants do not occupy a register */
+			if (src->type == IR2_SRC_CONST)
+				continue;
+
+			reg = get_reg_src(ctx, src);
+			/* one reference per component selected through the swizzle */
+			for (int i = 0; i < src_ncomp(instr); i++)
+				reg->comp[swiz_get(src->swizzle, i)].ref_count++;
+		}
+	}
+}
+
+/* Allocate a hardware register for reg.
+ *
+ * force_idx: register index to use, or a negative value to pick the first
+ *            register (0..63) that has no component allocated
+ * export: the value is an export; nothing is allocated, only the identity
+ *         component layout is set
+ * export_writemask: not used by this function
+ *
+ * Does nothing if any component of reg is already allocated.
+ */
+void ra_reg(struct ir2_context *ctx, struct ir2_reg *reg, int force_idx,
+	bool export, uint8_t export_writemask)
+{
+	/* for export, don't allocate anything but set component layout */
+	if (export) {
+		for (int i = 0; i < 4; i++)
+			reg->comp[i].c = i;
+		return;
+	}
+
+	unsigned idx = force_idx;
+
+	/* TODO: allocate into the same register if there's room
+	 * note: the blob doesn't do it, so verify that it is indeed better
+	 * also, doing it would conflict with scalar mov insertion
+	 */
+
+	/* check if already allocated */
+	for (int i = 0; i < reg->ncomp; i++) {
+		if (reg->comp[i].alloc)
+			return;
+	}
+
+	if (force_idx < 0) {
+		/* first register with all four components free */
+		for (idx = 0; idx < 64; idx++) {
+			if (reg_mask(ctx, idx) == 0)
+				break;
+		}
+	}
+	assert(idx != 64); /* TODO ran out of register space.. */
+
+	/* update max_reg value */
+	ctx->info->max_reg = MAX2(ctx->info->max_reg, (int) idx);
+
+	unsigned mask = reg_mask(ctx, idx);
+
+	for (int i = 0; i < reg->ncomp; i++) {
+		/* don't allocate never used values
+		 * NOTE(review): 7 is presumably an "unused component" marker,
+		 * confirm against the code consuming comp[].c
+		 */
+		if (reg->comp[i].ref_count == 0) {
+			reg->comp[i].c = 7;
+			continue;
+		}
+
+		/* TODO: the constant condition always selects i, so component i
+		 * currently stays in slot i; the ffs() based packing is disabled
+		 */
+		unsigned c = 1 ? i : (ffs(~mask) - 1);
+		mask |= 1 << c;
+		reg->comp[i].c = c;
+		reg_setmask(ctx, idx, c);
+		reg->comp[i].alloc = true;
+	}
+
+	reg->idx = idx;
+	ctx->live_regs[reg->idx] = reg;
+}
+
+/* reduce srcs ref_count and free if needed
+ *
+ * A component is released as soon as its count reaches zero, unless the
+ * register has block_idx_free >= 0; those are left for ra_block_free().
+ */
+void ra_src_free(struct ir2_context *ctx, struct ir2_instr *instr)
+{
+	struct ir2_reg *reg;
+	struct ir2_reg_component *comp;
+
+	ir2_foreach_src(src, instr) {
+		if (src->type == IR2_SRC_CONST)
+			continue;
+
+		reg = get_reg_src(ctx, src);
+		/* XXX use before write case */
+
+		for (int i = 0; i < src_ncomp(instr); i++) {
+			comp = &reg->comp[swiz_get(src->swizzle, i)];
+			if (!--comp->ref_count && reg->block_idx_free < 0) {
+				reg_freemask(ctx, reg->idx, comp->c);
+				comp->alloc = false;
+			}
+		}
+	}
+}
+
+/* free any regs left for a block
+ *
+ * Releases every still-allocated component of each live register whose
+ * block_idx_free equals block, and drops the register from live_regs.
+ */
+void ra_block_free(struct ir2_context *ctx, unsigned block)
+{
+	ir2_foreach_live_reg(reg, ctx) {
+		if (reg->block_idx_free != block)
+			continue;
+
+		for (int i = 0; i < reg->ncomp; i++) {
+			if (!reg->comp[i].alloc) /* XXX should never be true? */
+				continue;
+
+			reg_freemask(ctx, reg->idx, reg->comp[i].c);
+			reg->comp[i].alloc = false;
+		}
+		ctx->live_regs[reg->idx] = NULL;
+	}
+}
diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h index f44738fc398..85f17c81a9d 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.h +++ b/src/gallium/drivers/freedreno/freedreno_context.h @@ -56,14 +56,6 @@ struct fd_texture_stateobj { struct fd_program_stateobj { void *vp, *fp; - - /* rest only used by fd2.. split out: */ - uint8_t num_exports; - /* Indexed by semantic name or TGSI_SEMANTIC_COUNT + semantic index - * for TGSI_SEMANTIC_GENERIC. 
Special vs exports (position and point- - * size) are not included in this - */ - uint8_t export_linkage[63]; }; struct fd_constbuf_stateobj { diff --git a/src/gallium/drivers/freedreno/freedreno_program.c b/src/gallium/drivers/freedreno/freedreno_program.c index 989ccd1838f..3fa09ce0c48 100644 --- a/src/gallium/drivers/freedreno/freedreno_program.c +++ b/src/gallium/drivers/freedreno/freedreno_program.c @@ -129,15 +129,14 @@ void fd_prog_init(struct pipe_context *pctx) pctx->bind_fs_state = fd_fp_state_bind; pctx->bind_vs_state = fd_vp_state_bind; - // XXX for now, let a2xx keep it's own hand-rolled shaders - // for solid and blit progs: - if (ctx->screen->gpu_id < 300) - return; - ctx->solid_prog.fp = assemble_tgsi(pctx, solid_fp, true); ctx->solid_prog.vp = assemble_tgsi(pctx, solid_vp, false); ctx->blit_prog[0].vp = assemble_tgsi(pctx, blit_vp, false); ctx->blit_prog[0].fp = fd_prog_blit(pctx, 1, false); + + if (ctx->screen->gpu_id < 300) + return; + for (i = 1; i < ctx->screen->max_rts; i++) { ctx->blit_prog[i].vp = ctx->blit_prog[0].vp; ctx->blit_prog[i].fp = fd_prog_blit(pctx, i + 1, false); diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index 03b358782c1..e59922cba47 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -58,6 +58,7 @@ #include "ir3/ir3_nir.h" +#include "a2xx/ir2.h" /* XXX this should go away */ #include "state_tracker/drm_driver.h" @@ -496,16 +497,9 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: return 16; case PIPE_SHADER_CAP_PREFERRED_IR: - if (is_ir3(screen)) - return PIPE_SHADER_IR_NIR; - return PIPE_SHADER_IR_TGSI; + return PIPE_SHADER_IR_NIR; case PIPE_SHADER_CAP_SUPPORTED_IRS: - if (is_ir3(screen)) { - return (1 << PIPE_SHADER_IR_NIR) | (1 << PIPE_SHADER_IR_TGSI); - } else { - return (1 << PIPE_SHADER_IR_TGSI); - } - return 0; + return (1 << 
PIPE_SHADER_IR_NIR) | (1 << PIPE_SHADER_IR_TGSI); case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: return 32; case PIPE_SHADER_CAP_SCALAR_ISA: @@ -636,7 +630,7 @@ fd_get_compiler_options(struct pipe_screen *pscreen, if (is_ir3(screen)) return ir3_get_compiler_options(screen->compiler); - return NULL; + return ir2_get_compiler_options(); } boolean diff --git a/src/gallium/drivers/freedreno/meson.build b/src/gallium/drivers/freedreno/meson.build index 7afdf5a746f..40b55ad491d 100644 --- a/src/gallium/drivers/freedreno/meson.build +++ b/src/gallium/drivers/freedreno/meson.build @@ -60,8 +60,6 @@ files_libfreedreno = files( 'a2xx/disasm-a2xx.c', 'a2xx/fd2_blend.c', 'a2xx/fd2_blend.h', - 'a2xx/fd2_compiler.c', - 'a2xx/fd2_compiler.h', 'a2xx/fd2_context.c', 'a2xx/fd2_context.h', 'a2xx/fd2_draw.c', @@ -85,8 +83,12 @@ files_libfreedreno = files( 'a2xx/fd2_zsa.c', 'a2xx/fd2_zsa.h', 'a2xx/instr-a2xx.h', - 'a2xx/ir-a2xx.c', - 'a2xx/ir-a2xx.h', + 'a2xx/ir2.c', + 'a2xx/ir2.h', + 'a2xx/ir2_assemble.c', + 'a2xx/ir2_nir.c', + 'a2xx/ir2_private.h', + 'a2xx/ir2_ra.c', 'a3xx/fd3_blend.c', 'a3xx/fd3_blend.h', 'a3xx/fd3_context.c', -- 2.30.2