+++ /dev/null
-/*
- * Copyright (C) 2019 Connor Abbott <cwabbott0@gmail.com>
- * Copyright (C) 2019 Lyude Paul <thatslyude@gmail.com>
- * Copyright (C) 2019 Ryan Houdek <Sonicadvance1@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __bifrost_h__
-#define __bifrost_h__
-
-#include <stdint.h>
-#include <stdbool.h>
-
-struct bifrost_header {
- unsigned unk0 : 7;
- // If true, convert any infinite result of any floating-point operation to
- // the biggest representable number.
- unsigned suppress_inf: 1;
- // Convert any NaN results to 0.
- unsigned suppress_nan : 1;
- unsigned unk1 : 2;
- // true if the execution mask of the next clause is the same as the mask of
- // the current clause.
- unsigned back_to_back : 1;
- unsigned no_end_of_shader: 1;
- unsigned unk2 : 2;
- // Set to true for fragment shaders, to implement this bit of spec text
- // from section 7.1.5 of the GLSL ES spec:
- //
- // "Stores to image and buffer variables performed by helper invocations
- // have no effect on the underlying image or buffer memory."
- //
- // Helper invocations are threads (invocations) corresponding to pixels in
- // a quad that aren't actually part of the triangle, but are included to
- // make derivatives work correctly. They're usually turned on, but they
- // need to be masked off for GLSL-level stores. This bit seems to be the
- // only bit that's actually different between fragment shaders and other
- // shaders, so this is probably what it's doing.
- unsigned elide_writes : 1;
- // If back_to_back is off:
- // - true for conditional branches and fallthrough
- // - false for unconditional branches
- // The blob seems to always set it to true if back-to-back is on.
- unsigned branch_cond : 1;
- // This bit is set when the next clause writes to the data register of some
- // previous clause.
- unsigned datareg_writebarrier: 1;
- unsigned datareg : 6;
- unsigned scoreboard_deps: 8;
- unsigned scoreboard_index: 3;
- unsigned clause_type: 4;
- unsigned unk3 : 1; // part of clause_type?
- unsigned next_clause_type: 4;
- unsigned unk4 : 1; // part of next_clause_type?
-};
-
-struct bifrost_fma_inst {
- unsigned src0 : 3;
- unsigned op : 20;
-};
-
-struct bifrost_add_inst {
- unsigned src0 : 3;
- unsigned op : 17;
-};
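-
-// Bit-count note: the FMA slot above is 3 + 20 = 23 bits and the ADD slot is
-// 3 + 17 = 20 bits; together with the 35-bit register/control word (struct
-// bifrost_regs in disassemble.c), each instruction pair comes to 78 bits
-// before clause packing.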
-
-#endif
+++ /dev/null
-/*
- * Copyright (C) 2019 Ryan Houdek <Sonicadvance1@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "main/mtypes.h"
-#include "compiler/glsl/standalone.h"
-#include "compiler/glsl/glsl_to_nir.h"
-#include "compiler/nir_types.h"
-#include "disassemble.h"
-#include "util/u_dynarray.h"
-
-static void
-disassemble(const char *filename)
-{
- FILE *fp = fopen(filename, "rb");
- assert(fp);
-
- fseek(fp, 0, SEEK_END);
- int filesize = ftell(fp);
- rewind(fp);
-
- unsigned char *code = malloc(filesize);
- int res = fread(code, 1, filesize, fp);
- if (res != filesize) {
- printf("Couldn't read full file\n");
- }
- fclose(fp);
-
- disassemble_bifrost(code, filesize, false);
- free(code);
-}
-
-int
-main(int argc, char **argv)
-{
- if (argc < 3) {
- printf("Usage: %s disasm <filename>\n", argv[0]);
- exit(1);
- }
- if (strcmp(argv[1], "disasm") == 0) {
- disassemble(argv[2]);
- }
- return 0;
-}
+++ /dev/null
-/*
- * Copyright (C) 2019 Connor Abbott <cwabbott0@gmail.com>
- * Copyright (C) 2019 Lyude Paul <thatslyude@gmail.com>
- * Copyright (C) 2019 Ryan Houdek <Sonicadvance1@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <stdbool.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <assert.h>
-#include <inttypes.h>
-#include <string.h>
-
-#include "bifrost.h"
-#include "disassemble.h"
-#include "util/macros.h"
-
-// return bits [lo, high) of word
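-// e.g. bits(0xabcd, 4, 12) == 0xbc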
-static uint64_t bits(uint32_t word, unsigned lo, unsigned high)
-{
- if (high == 32)
- return word >> lo;
- return (word & ((1 << high) - 1)) >> lo;
-}
-
-// each of these structs represents an instruction that's dispatched in one
-// cycle. Note that these instructions are packed in funny ways within the
-// clause, hence the need for a separate struct.
-struct bifrost_alu_inst {
- uint32_t fma_bits;
- uint32_t add_bits;
- uint64_t reg_bits;
-};
-
-struct bifrost_regs {
- unsigned uniform_const : 8;
- unsigned reg2 : 6;
- unsigned reg3 : 6;
- unsigned reg0 : 5;
- unsigned reg1 : 6;
- unsigned ctrl : 4;
-};
-
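-// When ctrl != 0, the ordering of the raw reg0/reg1 fields acts as an extra
-// encoding bit: reg0 <= reg1 means both ports are taken as-is, while
-// reg0 > reg1 means each port decodes to 63 minus its field. For example,
-// reg0 = 5 with reg1 = 3 decodes to ports R58 and R60.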
-static unsigned get_reg0(struct bifrost_regs regs)
-{
- if (regs.ctrl == 0)
- return regs.reg0 | ((regs.reg1 & 0x1) << 5);
-
- return regs.reg0 <= regs.reg1 ? regs.reg0 : 63 - regs.reg0;
-}
-
-static unsigned get_reg1(struct bifrost_regs regs)
-{
- return regs.reg0 <= regs.reg1 ? regs.reg1 : 63 - regs.reg1;
-}
-
-enum bifrost_reg_write_unit {
- REG_WRITE_NONE = 0, // don't write
- REG_WRITE_TWO, // write using reg2
- REG_WRITE_THREE, // write using reg3
-};
-
-// this represents the decoded version of the ctrl register field.
-struct bifrost_reg_ctrl{
- bool read_reg0;
- bool read_reg1;
- bool read_reg3;
- enum bifrost_reg_write_unit fma_write_unit;
- enum bifrost_reg_write_unit add_write_unit;
- bool clause_start;
-};
-
-enum fma_src_type {
- FMA_ONE_SRC,
- FMA_TWO_SRC,
- FMA_FADD,
- FMA_FMINMAX,
- FMA_FADD16,
- FMA_FMINMAX16,
- FMA_FCMP,
- FMA_FCMP16,
- FMA_THREE_SRC,
- FMA_FMA,
- FMA_FMA16,
- FMA_FOUR_SRC,
- FMA_FMA_MSCALE,
- FMA_SHIFT_ADD64,
-};
-
-struct fma_op_info {
- unsigned op;
- char name[30];
- enum fma_src_type src_type;
-};
-
-enum add_src_type {
- ADD_ONE_SRC,
- ADD_TWO_SRC,
- ADD_FADD,
- ADD_FMINMAX,
- ADD_FADD16,
- ADD_FMINMAX16,
- ADD_THREE_SRC,
- ADD_FADDMscale,
- ADD_FCMP,
- ADD_FCMP16,
- ADD_TEX_COMPACT, // texture instruction with embedded sampler
- ADD_TEX, // texture instruction with sampler/etc. in uniform port
- ADD_VARYING_INTERP,
- ADD_BLENDING,
- ADD_LOAD_ATTR,
- ADD_VARYING_ADDRESS,
- ADD_BRANCH,
-};
-
-struct add_op_info {
- unsigned op;
- char name[30];
- enum add_src_type src_type;
- bool has_data_reg;
-};
-
-struct bifrost_tex_ctrl {
- unsigned sampler_index : 4; // also used to signal indirects
- unsigned tex_index : 7;
- bool no_merge_index : 1; // whether the (direct) sampler & texture indices are given separately rather than merged
- bool filter : 1; // use the usual filtering pipeline (0 for texelFetch & textureGather)
- unsigned unk0 : 2;
- bool texel_offset : 1; // *Offset()
- bool is_shadow : 1;
- bool is_array : 1;
- unsigned tex_type : 2; // 2D, 3D, Cube, Buffer
- bool compute_lod : 1; // 0 for *Lod()
- bool not_supply_lod : 1; // 0 for *Lod() or when a bias is applied
- bool calc_gradients : 1; // 0 for *Grad()
- unsigned unk1 : 1;
- unsigned result_type : 4; // integer, unsigned, float TODO: why is this 4 bits?
- unsigned unk2 : 4;
-};
-
-struct bifrost_dual_tex_ctrl {
- unsigned sampler_index0 : 2;
- unsigned unk0 : 2;
- unsigned tex_index0 : 2;
- unsigned sampler_index1 : 2;
- unsigned tex_index1 : 2;
- unsigned unk1 : 22;
-};
-
-enum branch_cond {
- BR_COND_LT = 0,
- BR_COND_LE = 1,
- BR_COND_GE = 2,
- BR_COND_GT = 3,
- // Equal vs. not-equal determined by src0/src1 comparison
- BR_COND_EQ = 4,
- // floating-point comparisons
- // Becomes UNE when you flip the arguments
- BR_COND_OEQ = 5,
- // TODO what happens when you flip the arguments?
- BR_COND_OGT = 6,
- BR_COND_OLT = 7,
-};
-
-enum branch_bit_size {
- BR_SIZE_32 = 0,
- BR_SIZE_16XX = 1,
- BR_SIZE_16YY = 2,
- // For the above combinations of bitsize and location, an extra bit is
- // encoded via comparing the sources. The only possible source of ambiguity
- // would be if the sources were the same, but then the branch condition
- // would be always true or always false anyways, so we can ignore it. But
- // this no longer works when comparing the y component to the x component,
- // since it's valid to compare the y component of a source against its own
- // x component. Instead, the extra bit is encoded via an extra bitsize.
- BR_SIZE_16YX0 = 3,
- BR_SIZE_16YX1 = 4,
- BR_SIZE_32_AND_16X = 5,
- BR_SIZE_32_AND_16Y = 6,
- // Used for comparisons with zero and always-true, see below. I think this
- // only works for integer comparisons.
- BR_SIZE_ZERO = 7,
-};
-
-enum branch_code {
- BR_ALWAYS = 63,
-};
-
-void dump_header(struct bifrost_header header, bool verbose);
-void dump_instr(const struct bifrost_alu_inst *instr, struct bifrost_regs next_regs, uint64_t *consts,
- unsigned data_reg, unsigned offset, bool verbose);
-bool dump_clause(uint32_t *words, unsigned *size, unsigned offset, bool verbose);
-
-void dump_header(struct bifrost_header header, bool verbose) {
- if (header.clause_type != 0) {
- printf("id(%du) ", header.scoreboard_index);
- }
-
- if (header.scoreboard_deps != 0) {
- printf("next-wait(");
- bool first = true;
- for (unsigned i = 0; i < 8; i++) {
- if (header.scoreboard_deps & (1 << i)) {
- if (!first) {
- printf(", ");
- }
- printf("%d", i);
- first = false;
- }
- }
- printf(") ");
- }
-
- if (header.datareg_writebarrier)
- printf("data-reg-barrier ");
-
- if (!header.no_end_of_shader)
- printf("eos ");
-
- if (!header.back_to_back) {
- printf("nbb ");
- if (header.branch_cond)
- printf("branch-cond ");
- else
- printf("branch-uncond ");
- }
-
- if (header.elide_writes)
- printf("we ");
-
- if (header.suppress_inf)
- printf("suppress-inf ");
- if (header.suppress_nan)
- printf("suppress-nan ");
-
- if (header.unk0)
- printf("unk0 ");
- if (header.unk1)
- printf("unk1 ");
- if (header.unk2)
- printf("unk2 ");
- if (header.unk3)
- printf("unk3 ");
- if (header.unk4)
- printf("unk4 ");
-
- printf("\n");
-
- if (verbose) {
- printf("# clause type %d, next clause type %d\n",
- header.clause_type, header.next_clause_type);
- }
-}
-
-static struct bifrost_reg_ctrl DecodeRegCtrl(struct bifrost_regs regs)
-{
- struct bifrost_reg_ctrl decoded = {};
- unsigned ctrl;
- if (regs.ctrl == 0) {
- ctrl = regs.reg1 >> 2;
- decoded.read_reg0 = !(regs.reg1 & 0x2);
- decoded.read_reg1 = false;
- } else {
- ctrl = regs.ctrl;
- decoded.read_reg0 = decoded.read_reg1 = true;
- }
- switch (ctrl) {
- case 1:
- decoded.fma_write_unit = REG_WRITE_TWO;
- break;
- case 3:
- decoded.fma_write_unit = REG_WRITE_TWO;
- decoded.read_reg3 = true;
- break;
- case 4:
- decoded.read_reg3 = true;
- break;
- case 5:
- decoded.add_write_unit = REG_WRITE_TWO;
- break;
- case 6:
- decoded.add_write_unit = REG_WRITE_TWO;
- decoded.read_reg3 = true;
- break;
- case 8:
- decoded.clause_start = true;
- break;
- case 9:
- decoded.fma_write_unit = REG_WRITE_TWO;
- decoded.clause_start = true;
- break;
- case 11:
- break;
- case 12:
- decoded.read_reg3 = true;
- decoded.clause_start = true;
- break;
- case 13:
- decoded.add_write_unit = REG_WRITE_TWO;
- decoded.clause_start = true;
- break;
- case 15:
- decoded.fma_write_unit = REG_WRITE_THREE;
- decoded.add_write_unit = REG_WRITE_TWO;
- break;
- default:
- printf("# unknown reg ctrl %d\n", ctrl);
- }
-
- return decoded;
-}
-
-// Pass in the add_write_unit or fma_write_unit, and this returns which register
-// the ADD/FMA units are writing to
-static unsigned GetRegToWrite(enum bifrost_reg_write_unit unit, struct bifrost_regs regs)
-{
- switch (unit) {
- case REG_WRITE_TWO:
- return regs.reg2;
- case REG_WRITE_THREE:
- return regs.reg3;
- default: /* REG_WRITE_NONE */
- assert(0);
- return 0;
- }
-}
-
-static void dump_regs(struct bifrost_regs srcs)
-{
- struct bifrost_reg_ctrl ctrl = DecodeRegCtrl(srcs);
- printf("# ");
- if (ctrl.read_reg0)
- printf("port 0: R%d ", get_reg0(srcs));
- if (ctrl.read_reg1)
- printf("port 1: R%d ", get_reg1(srcs));
-
- if (ctrl.fma_write_unit == REG_WRITE_TWO)
- printf("port 2: R%d (write FMA) ", srcs.reg2);
- else if (ctrl.add_write_unit == REG_WRITE_TWO)
- printf("port 2: R%d (write ADD) ", srcs.reg2);
-
- if (ctrl.fma_write_unit == REG_WRITE_THREE)
- printf("port 3: R%d (write FMA) ", srcs.reg3);
- else if (ctrl.add_write_unit == REG_WRITE_THREE)
- printf("port 3: R%d (write ADD) ", srcs.reg3);
- else if (ctrl.read_reg3)
- printf("port 3: R%d (read) ", srcs.reg3);
-
- if (srcs.uniform_const) {
- if (srcs.uniform_const & 0x80) {
- printf("uniform: U%d", (srcs.uniform_const & 0x7f) * 2);
- }
- }
-
- printf("\n");
-}
-static void dump_const_imm(uint32_t imm)
-{
- union {
- float f;
- uint32_t i;
- } fi;
- fi.i = imm;
- printf("0x%08x /* %f */", imm, fi.f);
-}
-
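-// Decode one of the clause's embedded 64-bit constants: the top nibble of
-// uniform_const selects which constant (e.g. 0x5X picks consts[1]) and the
-// low nibble is OR'd into the value.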
-static uint64_t get_const(uint64_t *consts, struct bifrost_regs srcs)
-{
- unsigned low_bits = srcs.uniform_const & 0xf;
- uint64_t imm;
- switch (srcs.uniform_const >> 4) {
- case 4: imm = consts[0]; break;
- case 5: imm = consts[1]; break;
- case 6: imm = consts[2]; break;
- case 7: imm = consts[3]; break;
- case 2: imm = consts[4]; break;
- case 3: imm = consts[5]; break;
- default: assert(0); break;
- }
- return imm | low_bits;
-}
-
-static void dump_uniform_const_src(struct bifrost_regs srcs, uint64_t *consts, bool high32)
-{
- if (srcs.uniform_const & 0x80) {
- unsigned uniform = (srcs.uniform_const & 0x7f) * 2;
- printf("U%d", uniform + (high32 ? 1 : 0));
- } else if (srcs.uniform_const >= 0x20) {
- uint64_t imm = get_const(consts, srcs);
- if (high32)
- dump_const_imm(imm >> 32);
- else
- dump_const_imm(imm);
- } else {
- switch (srcs.uniform_const) {
- case 0: printf("0"); break;
- case 5: printf("atest-data"); break;
- case 6: printf("sample-ptr"); break;
- case 8:
- case 9:
- case 10:
- case 11:
- case 12:
- case 13:
- case 14:
- case 15:
- printf("blend-descriptor%u", (unsigned) srcs.uniform_const - 8);
- break;
- default:
- printf("unkConst%u", (unsigned) srcs.uniform_const);
- break;
- }
-
- if (high32)
- printf(".y");
- else
- printf(".x");
- }
-}
-
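-// Sources 6 and 7 (T0/T1) read the results of the FMA and ADD slots
-// (presumably from the previous instruction in the clause), while source 3
-// in the ADD slot ("T") is the FMA output of the same instruction and reads
-// as 0 in the FMA slot.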
-static void dump_src(unsigned src, struct bifrost_regs srcs, uint64_t *consts, bool isFMA)
-{
- switch (src) {
- case 0: printf("R%d", get_reg0(srcs)); break;
- case 1: printf("R%d", get_reg1(srcs)); break;
- case 2: printf("R%d", srcs.reg3); break;
- case 3:
- if (isFMA)
- printf("0");
- else
- printf("T"); // i.e. the output of FMA this cycle
- break;
- case 4:
- dump_uniform_const_src(srcs, consts, false);
- break;
- case 5:
- dump_uniform_const_src(srcs, consts, true);
- break;
- case 6: printf("T0"); break;
- case 7: printf("T1"); break;
- }
-}
-
-static void dump_output_mod(unsigned mod)
-{
- switch (mod) {
- case 0:
- break;
- case 1:
- printf(".clamp_0_inf"); break; // max(out, 0)
- case 2:
- printf(".clamp_m1_1"); break; // clamp(out, -1, 1)
- case 3:
- printf(".clamp_0_1"); break; // clamp(out, 0, 1)
- default:
- break;
- }
-}
-
-static void dump_minmax_mode(unsigned mod)
-{
- switch (mod) {
- case 0:
- /* Same as fmax() and fmin() -- return the other number if any
- * number is NaN. Also always return +0 if one argument is +0 and
- * the other is -0.
- */
- break;
- case 1:
- /* Instead of never returning a NaN, always return one. The
- * "greater"/"lesser" NaN is always returned, first by checking the
- * sign and then the mantissa bits.
- */
- printf(".nan_wins"); break;
- case 2:
- /* For max, implement src0 > src1 ? src0 : src1
- * For min, implement src0 < src1 ? src0 : src1
- *
- * This includes handling NaN's and signedness of 0 differently
- * from above, since +0 and -0 compare equal and comparisons always
- * return false for NaN's. As a result, this mode is *not*
- * commutative.
- */
- printf(".src1_wins"); break;
- case 3:
- /* For max, implement src0 < src1 ? src1 : src0
- * For min, implement src0 > src1 ? src1 : src0
- */
- printf(".src0_wins"); break;
- default:
- break;
- }
-}
-
-static void dump_round_mode(unsigned mod)
-{
- switch (mod) {
- case 0:
- /* roundTiesToEven, the IEEE default. */
- break;
- case 1:
- /* roundTowardPositive in the IEEE spec. */
- printf(".round_pos"); break;
- case 2:
- /* roundTowardNegative in the IEEE spec. */
- printf(".round_neg"); break;
- case 3:
- /* roundTowardZero in the IEEE spec. */
- printf(".round_zero"); break;
- default:
- break;
- }
-}
-
-static const struct fma_op_info FMAOpInfos[] = {
- { 0x00000, "FMA.f32", FMA_FMA },
- { 0x40000, "MAX.f32", FMA_FMINMAX },
- { 0x44000, "MIN.f32", FMA_FMINMAX },
- { 0x48000, "FCMP.GL", FMA_FCMP },
- { 0x4c000, "FCMP.D3D", FMA_FCMP },
- { 0x4ff98, "ADD.i32", FMA_TWO_SRC },
- { 0x4ffd8, "SUB.i32", FMA_TWO_SRC },
- { 0x4fff0, "SUBB.i32", FMA_TWO_SRC },
- { 0x50000, "FMA_MSCALE", FMA_FMA_MSCALE },
- { 0x58000, "ADD.f32", FMA_FADD },
- { 0x5c000, "CSEL.FEQ.f32", FMA_FOUR_SRC },
- { 0x5c200, "CSEL.FGT.f32", FMA_FOUR_SRC },
- { 0x5c400, "CSEL.FGE.f32", FMA_FOUR_SRC },
- { 0x5c600, "CSEL.IEQ.f32", FMA_FOUR_SRC },
- { 0x5c800, "CSEL.IGT.i32", FMA_FOUR_SRC },
- { 0x5ca00, "CSEL.IGE.i32", FMA_FOUR_SRC },
- { 0x5cc00, "CSEL.UGT.i32", FMA_FOUR_SRC },
- { 0x5ce00, "CSEL.UGE.i32", FMA_FOUR_SRC },
- { 0x5d8d0, "ICMP.D3D.GT.v2i16", FMA_TWO_SRC },
- { 0x5d9d0, "UCMP.D3D.GT.v2i16", FMA_TWO_SRC },
- { 0x5dad0, "ICMP.D3D.GE.v2i16", FMA_TWO_SRC },
- { 0x5dbd0, "UCMP.D3D.GE.v2i16", FMA_TWO_SRC },
- { 0x5dcd0, "ICMP.D3D.EQ.v2i16", FMA_TWO_SRC },
- { 0x5de40, "ICMP.GL.GT.i32", FMA_TWO_SRC }, // src0 > src1 ? 1 : 0
- { 0x5de48, "ICMP.GL.GE.i32", FMA_TWO_SRC },
- { 0x5de50, "UCMP.GL.GT.i32", FMA_TWO_SRC },
- { 0x5de58, "UCMP.GL.GE.i32", FMA_TWO_SRC },
- { 0x5de60, "ICMP.GL.EQ.i32", FMA_TWO_SRC },
- { 0x5dec0, "ICMP.D3D.GT.i32", FMA_TWO_SRC }, // src0 > src1 ? ~0 : 0
- { 0x5dec8, "ICMP.D3D.GE.i32", FMA_TWO_SRC },
- { 0x5ded0, "UCMP.D3D.GT.i32", FMA_TWO_SRC },
- { 0x5ded8, "UCMP.D3D.GE.i32", FMA_TWO_SRC },
- { 0x5dee0, "ICMP.D3D.EQ.i32", FMA_TWO_SRC },
- { 0x60200, "RSHIFT_NAND.i32", FMA_THREE_SRC },
- { 0x603c0, "RSHIFT_NAND.v2i16", FMA_THREE_SRC },
- { 0x60e00, "RSHIFT_OR.i32", FMA_THREE_SRC },
- { 0x60fc0, "RSHIFT_OR.v2i16", FMA_THREE_SRC },
- { 0x61200, "RSHIFT_AND.i32", FMA_THREE_SRC },
- { 0x613c0, "RSHIFT_AND.v2i16", FMA_THREE_SRC },
- { 0x61e00, "RSHIFT_NOR.i32", FMA_THREE_SRC }, // ~((src0 << src2) | src1)
- { 0x61fc0, "RSHIFT_NOR.v2i16", FMA_THREE_SRC }, // ~((src0 << src2) | src1)
- { 0x62200, "LSHIFT_NAND.i32", FMA_THREE_SRC },
- { 0x623c0, "LSHIFT_NAND.v2i16", FMA_THREE_SRC },
- { 0x62e00, "LSHIFT_OR.i32", FMA_THREE_SRC }, // (src0 << src2) | src1
- { 0x62fc0, "LSHIFT_OR.v2i16", FMA_THREE_SRC }, // (src0 << src2) | src1
- { 0x63200, "LSHIFT_AND.i32", FMA_THREE_SRC }, // (src0 << src2) & src1
- { 0x633c0, "LSHIFT_AND.v2i16", FMA_THREE_SRC },
- { 0x63e00, "LSHIFT_NOR.i32", FMA_THREE_SRC },
- { 0x63fc0, "LSHIFT_NOR.v2i16", FMA_THREE_SRC },
- { 0x64200, "RSHIFT_XOR.i32", FMA_THREE_SRC },
- { 0x643c0, "RSHIFT_XOR.v2i16", FMA_THREE_SRC },
- { 0x64600, "RSHIFT_XNOR.i32", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1)
- { 0x647c0, "RSHIFT_XNOR.v2i16", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1)
- { 0x64a00, "LSHIFT_XOR.i32", FMA_THREE_SRC },
- { 0x64bc0, "LSHIFT_XOR.v2i16", FMA_THREE_SRC },
- { 0x64e00, "LSHIFT_XNOR.i32", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1)
- { 0x64fc0, "LSHIFT_XNOR.v2i16", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1)
- { 0x65200, "LSHIFT_ADD.i32", FMA_THREE_SRC },
- { 0x65600, "LSHIFT_SUB.i32", FMA_THREE_SRC }, // (src0 << src2) - src1
- { 0x65a00, "LSHIFT_RSUB.i32", FMA_THREE_SRC }, // src1 - (src0 << src2)
- { 0x65e00, "RSHIFT_ADD.i32", FMA_THREE_SRC },
- { 0x66200, "RSHIFT_SUB.i32", FMA_THREE_SRC },
- { 0x66600, "RSHIFT_RSUB.i32", FMA_THREE_SRC },
- { 0x66a00, "ARSHIFT_ADD.i32", FMA_THREE_SRC },
- { 0x66e00, "ARSHIFT_SUB.i32", FMA_THREE_SRC },
- { 0x67200, "ARSHIFT_RSUB.i32", FMA_THREE_SRC },
- { 0x80000, "FMA.v2f16", FMA_FMA16 },
- { 0xc0000, "MAX.v2f16", FMA_FMINMAX16 },
- { 0xc4000, "MIN.v2f16", FMA_FMINMAX16 },
- { 0xc8000, "FCMP.GL", FMA_FCMP16 },
- { 0xcc000, "FCMP.D3D", FMA_FCMP16 },
- { 0xcf900, "ADD.v2i16", FMA_TWO_SRC },
- { 0xcfc10, "ADDC.i32", FMA_TWO_SRC },
- { 0xcfd80, "ADD.i32.i16.X", FMA_TWO_SRC },
- { 0xcfd90, "ADD.i32.u16.X", FMA_TWO_SRC },
- { 0xcfdc0, "ADD.i32.i16.Y", FMA_TWO_SRC },
- { 0xcfdd0, "ADD.i32.u16.Y", FMA_TWO_SRC },
- { 0xd8000, "ADD.v2f16", FMA_FADD16 },
- { 0xdc000, "CSEL.FEQ.v2f16", FMA_FOUR_SRC },
- { 0xdc200, "CSEL.FGT.v2f16", FMA_FOUR_SRC },
- { 0xdc400, "CSEL.FGE.v2f16", FMA_FOUR_SRC },
- { 0xdc600, "CSEL.IEQ.v2f16", FMA_FOUR_SRC },
- { 0xdc800, "CSEL.IGT.v2i16", FMA_FOUR_SRC },
- { 0xdca00, "CSEL.IGE.v2i16", FMA_FOUR_SRC },
- { 0xdcc00, "CSEL.UGT.v2i16", FMA_FOUR_SRC },
- { 0xdce00, "CSEL.UGE.v2i16", FMA_FOUR_SRC },
- { 0xdd000, "F32_TO_F16", FMA_TWO_SRC },
- { 0xe0046, "F16_TO_I16.XX", FMA_ONE_SRC },
- { 0xe0047, "F16_TO_U16.XX", FMA_ONE_SRC },
- { 0xe004e, "F16_TO_I16.YX", FMA_ONE_SRC },
- { 0xe004f, "F16_TO_U16.YX", FMA_ONE_SRC },
- { 0xe0056, "F16_TO_I16.XY", FMA_ONE_SRC },
- { 0xe0057, "F16_TO_U16.XY", FMA_ONE_SRC },
- { 0xe005e, "F16_TO_I16.YY", FMA_ONE_SRC },
- { 0xe005f, "F16_TO_U16.YY", FMA_ONE_SRC },
- { 0xe00c0, "I16_TO_F16.XX", FMA_ONE_SRC },
- { 0xe00c1, "U16_TO_F16.XX", FMA_ONE_SRC },
- { 0xe00c8, "I16_TO_F16.YX", FMA_ONE_SRC },
- { 0xe00c9, "U16_TO_F16.YX", FMA_ONE_SRC },
- { 0xe00d0, "I16_TO_F16.XY", FMA_ONE_SRC },
- { 0xe00d1, "U16_TO_F16.XY", FMA_ONE_SRC },
- { 0xe00d8, "I16_TO_F16.YY", FMA_ONE_SRC },
- { 0xe00d9, "U16_TO_F16.YY", FMA_ONE_SRC },
- { 0xe0136, "F32_TO_I32", FMA_ONE_SRC },
- { 0xe0137, "F32_TO_U32", FMA_ONE_SRC },
- { 0xe0178, "I32_TO_F32", FMA_ONE_SRC },
- { 0xe0179, "U32_TO_F32", FMA_ONE_SRC },
- { 0xe0198, "I16_TO_I32.X", FMA_ONE_SRC },
- { 0xe0199, "U16_TO_U32.X", FMA_ONE_SRC },
- { 0xe019a, "I16_TO_I32.Y", FMA_ONE_SRC },
- { 0xe019b, "U16_TO_U32.Y", FMA_ONE_SRC },
- { 0xe019c, "I16_TO_F32.X", FMA_ONE_SRC },
- { 0xe019d, "U16_TO_F32.X", FMA_ONE_SRC },
- { 0xe019e, "I16_TO_F32.Y", FMA_ONE_SRC },
- { 0xe019f, "U16_TO_F32.Y", FMA_ONE_SRC },
- { 0xe01a2, "F16_TO_F32.X", FMA_ONE_SRC },
- { 0xe01a3, "F16_TO_F32.Y", FMA_ONE_SRC },
- { 0xe032c, "NOP", FMA_ONE_SRC },
- { 0xe032d, "MOV", FMA_ONE_SRC },
- { 0xe032f, "SWZ.YY.v2i16", FMA_ONE_SRC },
- // From the ARM patent US20160364209A1:
- // "Decompose v (the input) into numbers x1 and s such that v = x1 * 2^s,
- // and x1 is a floating point value in a predetermined range where the
- // value 1 is within the range and not at one extremity of the range (e.g.
- // choose a range where 1 is towards middle of range)."
- //
- // This computes x1.
- { 0xe0345, "LOG_FREXPM", FMA_ONE_SRC },
- // Given a floating point number m * 2^e, returns m * 2^{-1}. This is
- // exactly the same as the mantissa part of frexp().
- { 0xe0365, "FRCP_FREXPM", FMA_ONE_SRC },
- // Given a floating point number m * 2^e, returns m * 2^{-2} if e is even,
- // and m * 2^{-1} if e is odd. In other words, scales by powers of 4 until
- // within the range [0.25, 1). Used for square-root and reciprocal
- // square-root.
- { 0xe0375, "FSQRT_FREXPM", FMA_ONE_SRC },
- // Given a floating point number m * 2^e, computes -e - 1 as an integer.
- // Zero and infinity/NaN return 0.
- { 0xe038d, "FRCP_FREXPE", FMA_ONE_SRC },
- // Computes floor(e/2) + 1.
- { 0xe03a5, "FSQRT_FREXPE", FMA_ONE_SRC },
- // Given a floating point number m * 2^e, computes -floor(e/2) - 1 as an
- // integer.
- { 0xe03ad, "FRSQ_FREXPE", FMA_ONE_SRC },
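- // Putting the *_FREXPM/*_FREXPE pairs together (this follows directly from
- // the definitions above; the exact blob sequences are not shown here):
- // 1/x = (1 / FRCP_FREXPM(x)) * 2^FRCP_FREXPE(x)
- // sqrt(x) = sqrt(FSQRT_FREXPM(x)) * 2^FSQRT_FREXPE(x)
- // 1/sqrt(x) = (1 / sqrt(FSQRT_FREXPM(x))) * 2^FRSQ_FREXPE(x)
- // i.e. the FREXPM op reduces the argument into a small fixed range and the
- // FREXPE op produces the power-of-two factor that scales the result back.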
- { 0xe03c5, "LOG_FREXPE", FMA_ONE_SRC },
- { 0xe0b80, "IMAX3", FMA_THREE_SRC },
- { 0xe0bc0, "UMAX3", FMA_THREE_SRC },
- { 0xe0c00, "IMIN3", FMA_THREE_SRC },
- { 0xe0c40, "UMIN3", FMA_THREE_SRC },
- { 0xe0f40, "CSEL", FMA_THREE_SRC }, // src2 != 0 ? src1 : src0
- { 0xe0fc0, "MUX.i32", FMA_THREE_SRC }, // see ADD comment
- { 0xe1845, "CEIL", FMA_ONE_SRC },
- { 0xe1885, "FLOOR", FMA_ONE_SRC },
- { 0xe19b0, "ATAN_LDEXP.Y.f32", FMA_TWO_SRC },
- { 0xe19b8, "ATAN_LDEXP.X.f32", FMA_TWO_SRC },
- // These instructions in the FMA slot, together with LSHIFT_ADD_HIGH32.i32
- // in the ADD slot, allow one to do a 64-bit addition with an extra small
- // shift on one of the sources. There are three possible scenarios:
- //
- // 1) Full 64-bit addition. Do:
- // out.x = LSHIFT_ADD_LOW32.i64 src1.x, src2.x, shift
- // out.y = LSHIFT_ADD_HIGH32.i32 src1.y, src2.y
- //
- // The shift amount is applied to src2 before adding. The shift amount, and
- // any extra bits from src2 plus the overflow bit, are sent directly from
- // FMA to ADD instead of being passed explicitly. Hence, these two must be
- // bundled together into the same instruction.
- //
- // 2) Add a 64-bit value src1 to a zero-extended 32-bit value src2. Do:
- // out.x = LSHIFT_ADD_LOW32.u32 src1.x, src2, shift
- // out.y = LSHIFT_ADD_HIGH32.i32 src1.x, 0
- //
- // Note that in this case, the second argument to LSHIFT_ADD_HIGH32 is
- // ignored, so it can actually be anything. As before, the shift is applied
- // to src2 before adding.
- //
- // 3) Add a 64-bit value to a sign-extended 32-bit value src2. Do:
- // out.x = LSHIFT_ADD_LOW32.i32 src1.x, src2, shift
- // out.y = LSHIFT_ADD_HIGH32.i32 src1.x, 0
- //
- // The only difference is the .i32 instead of .u32. Otherwise, this is
- // exactly the same as before.
- //
- // In all these instructions, the shift amount is stored where the third
- // source would be, so the shift has to be a small immediate from 0 to 7.
- // This is fine for the expected use-case of these instructions, which is
- // manipulating 64-bit pointers.
- //
- // These instructions can also be combined with various load/store
- // instructions which normally take a 64-bit pointer in order to add a
- // 32-bit or 64-bit offset to the pointer before doing the operation,
- // optionally shifting the offset. The load/store op implicitly does
- // LSHIFT_ADD_HIGH32.i32 internally. Letting ptr be the pointer, and offset
- // the desired offset, the cases go as follows:
- //
- // 1) Add a 64-bit offset:
- // LSHIFT_ADD_LOW32.i64 ptr.x, offset.x, shift
- // ld_st_op ptr.y, offset.y, ...
- //
- // Note that the output of LSHIFT_ADD_LOW32.i64 is not used, instead being
- // implicitly sent to the load/store op to serve as the low 32 bits of the
- // pointer.
- //
- // 2) Add a 32-bit unsigned offset:
- // temp = LSHIFT_ADD_LOW32.u32 ptr.x, offset, shift
- // ld_st_op temp, ptr.y, ...
- //
- // Now, the low 32 bits of offset << shift + ptr are passed explicitly to
- // the ld_st_op, to match the case where there is no offset and ld_st_op is
- // called directly.
- //
- // 3) Add a 32-bit signed offset:
- // temp = LSHIFT_ADD_LOW32.i32 ptr.x, offset, shift
- // ld_st_op temp, ptr.y, ...
- //
- // Again, the same as the unsigned case except for the offset.
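- //
- // In plain C terms, case 1 above computes roughly
- // (uint64_t) src1 + ((uint64_t) src2 << shift),
- // with FMA producing the low 32 bits and ADD the high 32 bits; cases 2 and
- // 3 are the same with src2 first zero- or sign-extended from 32 bits.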
- { 0xe1c80, "LSHIFT_ADD_LOW32.u32", FMA_SHIFT_ADD64 },
- { 0xe1cc0, "LSHIFT_ADD_LOW32.i64", FMA_SHIFT_ADD64 },
- { 0xe1d80, "LSHIFT_ADD_LOW32.i32", FMA_SHIFT_ADD64 },
- { 0xe1e00, "SEL.XX.i16", FMA_TWO_SRC },
- { 0xe1e08, "SEL.YX.i16", FMA_TWO_SRC },
- { 0xe1e10, "SEL.XY.i16", FMA_TWO_SRC },
- { 0xe1e18, "SEL.YY.i16", FMA_TWO_SRC },
- { 0xe7800, "IMAD", FMA_THREE_SRC },
- { 0xe78db, "POPCNT", FMA_ONE_SRC },
-};
-
-static struct fma_op_info find_fma_op_info(unsigned op)
-{
- for (unsigned i = 0; i < ARRAY_SIZE(FMAOpInfos); i++) {
- unsigned opCmp = ~0;
- switch (FMAOpInfos[i].src_type) {
- case FMA_ONE_SRC:
- opCmp = op;
- break;
- case FMA_TWO_SRC:
- opCmp = op & ~0x7;
- break;
- case FMA_FCMP:
- case FMA_FCMP16:
- opCmp = op & ~0x1fff;
- break;
- case FMA_THREE_SRC:
- case FMA_SHIFT_ADD64:
- opCmp = op & ~0x3f;
- break;
- case FMA_FADD:
- case FMA_FMINMAX:
- case FMA_FADD16:
- case FMA_FMINMAX16:
- opCmp = op & ~0x3fff;
- break;
- case FMA_FMA:
- case FMA_FMA16:
- opCmp = op & ~0x3ffff;
- break;
- case FMA_FOUR_SRC:
- opCmp = op & ~0x1ff;
- break;
- case FMA_FMA_MSCALE:
- opCmp = op & ~0x7fff;
- break;
- default:
- opCmp = ~0;
- break;
- }
- if (FMAOpInfos[i].op == opCmp)
- return FMAOpInfos[i];
- }
-
- struct fma_op_info info;
- snprintf(info.name, sizeof(info.name), "op%04x", op);
- info.op = op;
- info.src_type = FMA_THREE_SRC;
- return info;
-}
-
-static void dump_fcmp(unsigned op)
-{
- switch (op) {
- case 0:
- printf(".OEQ");
- break;
- case 1:
- printf(".OGT");
- break;
- case 2:
- printf(".OGE");
- break;
- case 3:
- printf(".UNE");
- break;
- case 4:
- printf(".OLT");
- break;
- case 5:
- printf(".OLE");
- break;
- default:
- printf(".unk%d", op);
- break;
- }
-}
-
-static void dump_16swizzle(unsigned swiz)
-{
- if (swiz == 2)
- return;
- printf(".%c%c", "xy"[swiz & 1], "xy"[(swiz >> 1) & 1]);
-}
-
-static void dump_fma_expand_src0(unsigned ctrl)
-{
- switch (ctrl) {
- case 3:
- case 4:
- case 6:
- printf(".x");
- break;
- case 5:
- case 7:
- printf(".y");
- break;
- case 0:
- case 1:
- case 2:
- break;
- default:
- printf(".unk");
- break;
- }
-}
-
-static void dump_fma_expand_src1(unsigned ctrl)
-{
- switch (ctrl) {
- case 1:
- case 3:
- printf(".x");
- break;
- case 2:
- case 4:
- case 5:
- printf(".y");
- break;
- case 0:
- case 6:
- case 7:
- break;
- default:
- printf(".unk");
- break;
- }
-}
-
-static void dump_fma(uint64_t word, struct bifrost_regs regs, struct bifrost_regs next_regs, uint64_t *consts, bool verbose)
-{
- if (verbose) {
- printf("# FMA: %016" PRIx64 "\n", word);
- }
- struct bifrost_fma_inst FMA;
- memcpy((char *) &FMA, (char *) &word, sizeof(struct bifrost_fma_inst));
- struct fma_op_info info = find_fma_op_info(FMA.op);
-
- printf("%s", info.name);
- if (info.src_type == FMA_FADD ||
- info.src_type == FMA_FMINMAX ||
- info.src_type == FMA_FMA ||
- info.src_type == FMA_FADD16 ||
- info.src_type == FMA_FMINMAX16 ||
- info.src_type == FMA_FMA16) {
- dump_output_mod(bits(FMA.op, 12, 14));
- switch (info.src_type) {
- case FMA_FADD:
- case FMA_FMA:
- case FMA_FADD16:
- case FMA_FMA16:
- dump_round_mode(bits(FMA.op, 10, 12));
- break;
- case FMA_FMINMAX:
- case FMA_FMINMAX16:
- dump_minmax_mode(bits(FMA.op, 10, 12));
- break;
- default:
- assert(0);
- }
- } else if (info.src_type == FMA_FCMP || info.src_type == FMA_FCMP16) {
- dump_fcmp(bits(FMA.op, 10, 13));
- if (info.src_type == FMA_FCMP)
- printf(".f32");
- else
- printf(".v2f16");
- } else if (info.src_type == FMA_FMA_MSCALE) {
- if (FMA.op & (1 << 11)) {
- switch ((FMA.op >> 9) & 0x3) {
- case 0:
- /* This mode seems to do a few things:
- * - Makes 0 * infinity (and incidentally 0 * nan) return 0,
- * since generating a nan would poison the result of
- * 1/infinity and 1/0.
- * - Fiddles with which nan is returned in nan * nan,
- * presumably to make sure that the same exact nan is
- * returned for 1/nan.
- */
- printf(".rcp_mode");
- break;
- case 3:
- /* Similar to the above, but src0 always wins when multiplying
- * 0 by infinity.
- */
- printf(".sqrt_mode");
- break;
- default:
- printf(".unk%d_mode", (int) (FMA.op >> 9) & 0x3);
- }
- } else {
- dump_output_mod(bits(FMA.op, 9, 11));
- }
- }
-
- printf(" ");
-
- struct bifrost_reg_ctrl next_ctrl = DecodeRegCtrl(next_regs);
- if (next_ctrl.fma_write_unit != REG_WRITE_NONE) {
- printf("{R%d, T0}, ", GetRegToWrite(next_ctrl.fma_write_unit, next_regs));
- } else {
- printf("T0, ");
- }
-
- switch (info.src_type) {
- case FMA_ONE_SRC:
- dump_src(FMA.src0, regs, consts, true);
- break;
- case FMA_TWO_SRC:
- dump_src(FMA.src0, regs, consts, true);
- printf(", ");
- dump_src(FMA.op & 0x7, regs, consts, true);
- break;
- case FMA_FADD:
- case FMA_FMINMAX:
- if (FMA.op & 0x10)
- printf("-");
- if (FMA.op & 0x200)
- printf("abs(");
- dump_src(FMA.src0, regs, consts, true);
- dump_fma_expand_src0((FMA.op >> 6) & 0x7);
- if (FMA.op & 0x200)
- printf(")");
- printf(", ");
- if (FMA.op & 0x20)
- printf("-");
- if (FMA.op & 0x8)
- printf("abs(");
- dump_src(FMA.op & 0x7, regs, consts, true);
- dump_fma_expand_src1((FMA.op >> 6) & 0x7);
- if (FMA.op & 0x8)
- printf(")");
- break;
- case FMA_FADD16:
- case FMA_FMINMAX16: {
- bool abs1 = FMA.op & 0x8;
- bool abs2 = (FMA.op & 0x7) < FMA.src0;
- if (FMA.op & 0x10)
- printf("-");
- if (abs1 || abs2)
- printf("abs(");
- dump_src(FMA.src0, regs, consts, true);
- dump_16swizzle((FMA.op >> 6) & 0x3);
- if (abs1 || abs2)
- printf(")");
- printf(", ");
- if (FMA.op & 0x20)
- printf("-");
- if (abs1 && abs2)
- printf("abs(");
- dump_src(FMA.op & 0x7, regs, consts, true);
- dump_16swizzle((FMA.op >> 8) & 0x3);
- if (abs1 && abs2)
- printf(")");
- break;
- }
- case FMA_FCMP:
- if (FMA.op & 0x200)
- printf("abs(");
- dump_src(FMA.src0, regs, consts, true);
- dump_fma_expand_src0((FMA.op >> 6) & 0x7);
- if (FMA.op & 0x200)
- printf(")");
- printf(", ");
- if (FMA.op & 0x20)
- printf("-");
- if (FMA.op & 0x8)
- printf("abs(");
- dump_src(FMA.op & 0x7, regs, consts, true);
- dump_fma_expand_src1((FMA.op >> 6) & 0x7);
- if (FMA.op & 0x8)
- printf(")");
- break;
- case FMA_FCMP16:
- dump_src(FMA.src0, regs, consts, true);
- // Note: this is partly a guess; I haven't seen the blob set this to
- // anything other than the identity, but it matches FMA_TWO_SRCFmod16
- dump_16swizzle((FMA.op >> 6) & 0x3);
- printf(", ");
- dump_src(FMA.op & 0x7, regs, consts, true);
- dump_16swizzle((FMA.op >> 8) & 0x3);
- break;
- case FMA_SHIFT_ADD64:
- dump_src(FMA.src0, regs, consts, true);
- printf(", ");
- dump_src(FMA.op & 0x7, regs, consts, true);
- printf(", ");
- printf("shift:%u", (FMA.op >> 3) & 0x7);
- break;
- case FMA_THREE_SRC:
- dump_src(FMA.src0, regs, consts, true);
- printf(", ");
- dump_src(FMA.op & 0x7, regs, consts, true);
- printf(", ");
- dump_src((FMA.op >> 3) & 0x7, regs, consts, true);
- break;
- case FMA_FMA:
- if (FMA.op & (1 << 14))
- printf("-");
- if (FMA.op & (1 << 9))
- printf("abs(");
- dump_src(FMA.src0, regs, consts, true);
- dump_fma_expand_src0((FMA.op >> 6) & 0x7);
- if (FMA.op & (1 << 9))
- printf(")");
- printf(", ");
- if (FMA.op & (1 << 16))
- printf("abs(");
- dump_src(FMA.op & 0x7, regs, consts, true);
- dump_fma_expand_src1((FMA.op >> 6) & 0x7);
- if (FMA.op & (1 << 16))
- printf(")");
- printf(", ");
- if (FMA.op & (1 << 15))
- printf("-");
- if (FMA.op & (1 << 17))
- printf("abs(");
- dump_src((FMA.op >> 3) & 0x7, regs, consts, true);
- if (FMA.op & (1 << 17))
- printf(")");
- break;
- case FMA_FMA16:
- if (FMA.op & (1 << 14))
- printf("-");
- dump_src(FMA.src0, regs, consts, true);
- dump_16swizzle((FMA.op >> 6) & 0x3);
- printf(", ");
- dump_src(FMA.op & 0x7, regs, consts, true);
- dump_16swizzle((FMA.op >> 8) & 0x3);
- printf(", ");
- if (FMA.op & (1 << 15))
- printf("-");
- dump_src((FMA.op >> 3) & 0x7, regs, consts, true);
- dump_16swizzle((FMA.op >> 16) & 0x3);
- break;
- case FMA_FOUR_SRC:
- dump_src(FMA.src0, regs, consts, true);
- printf(", ");
- dump_src(FMA.op & 0x7, regs, consts, true);
- printf(", ");
- dump_src((FMA.op >> 3) & 0x7, regs, consts, true);
- printf(", ");
- dump_src((FMA.op >> 6) & 0x7, regs, consts, true);
- break;
- case FMA_FMA_MSCALE:
- if (FMA.op & (1 << 12))
- printf("abs(");
- dump_src(FMA.src0, regs, consts, true);
- if (FMA.op & (1 << 12))
- printf(")");
- printf(", ");
- if (FMA.op & (1 << 13))
- printf("-");
- dump_src(FMA.op & 0x7, regs, consts, true);
- printf(", ");
- if (FMA.op & (1 << 14))
- printf("-");
- dump_src((FMA.op >> 3) & 0x7, regs, consts, true);
- printf(", ");
- dump_src((FMA.op >> 6) & 0x7, regs, consts, true);
- break;
- }
- printf("\n");
-}
-
-static const struct add_op_info add_op_infos[] = {
- { 0x00000, "MAX.f32", ADD_FMINMAX },
- { 0x02000, "MIN.f32", ADD_FMINMAX },
- { 0x04000, "ADD.f32", ADD_FADD },
- { 0x06000, "FCMP.GL", ADD_FCMP },
- { 0x07000, "FCMP.D3D", ADD_FCMP },
- { 0x07856, "F16_TO_I16", ADD_ONE_SRC },
- { 0x07857, "F16_TO_U16", ADD_ONE_SRC },
- { 0x078c0, "I16_TO_F16.XX", ADD_ONE_SRC },
- { 0x078c1, "U16_TO_F16.XX", ADD_ONE_SRC },
- { 0x078c8, "I16_TO_F16.YX", ADD_ONE_SRC },
- { 0x078c9, "U16_TO_F16.YX", ADD_ONE_SRC },
- { 0x078d0, "I16_TO_F16.XY", ADD_ONE_SRC },
- { 0x078d1, "U16_TO_F16.XY", ADD_ONE_SRC },
- { 0x078d8, "I16_TO_F16.YY", ADD_ONE_SRC },
- { 0x078d9, "U16_TO_F16.YY", ADD_ONE_SRC },
- { 0x07936, "F32_TO_I32", ADD_ONE_SRC },
- { 0x07937, "F32_TO_U32", ADD_ONE_SRC },
- { 0x07978, "I32_TO_F32", ADD_ONE_SRC },
- { 0x07979, "U32_TO_F32", ADD_ONE_SRC },
- { 0x07998, "I16_TO_I32.X", ADD_ONE_SRC },
- { 0x07999, "U16_TO_U32.X", ADD_ONE_SRC },
- { 0x0799a, "I16_TO_I32.Y", ADD_ONE_SRC },
- { 0x0799b, "U16_TO_U32.Y", ADD_ONE_SRC },
- { 0x0799c, "I16_TO_F32.X", ADD_ONE_SRC },
- { 0x0799d, "U16_TO_F32.X", ADD_ONE_SRC },
- { 0x0799e, "I16_TO_F32.Y", ADD_ONE_SRC },
- { 0x0799f, "U16_TO_F32.Y", ADD_ONE_SRC },
- // take the low 16 bits, and expand it to a 32-bit float
- { 0x079a2, "F16_TO_F32.X", ADD_ONE_SRC },
- // take the high 16 bits, ...
- { 0x079a3, "F16_TO_F32.Y", ADD_ONE_SRC },
- { 0x07b2b, "SWZ.YX.v2i16", ADD_ONE_SRC },
- { 0x07b2c, "NOP", ADD_ONE_SRC },
- { 0x07b29, "SWZ.XX.v2i16", ADD_ONE_SRC },
- // Logically, this should be SWZ.XY, but that's equivalent to a move, and
- // this seems to be the canonical way the blob generates a MOV.
- { 0x07b2d, "MOV", ADD_ONE_SRC },
- { 0x07b2f, "SWZ.YY.v2i16", ADD_ONE_SRC },
- // Given a floating point number m * 2^e, returns m * 2^{-1}.
- { 0x07b65, "FRCP_FREXPM", ADD_ONE_SRC },
- { 0x07b75, "FSQRT_FREXPM", ADD_ONE_SRC },
- { 0x07b8d, "FRCP_FREXPE", ADD_ONE_SRC },
- { 0x07ba5, "FSQRT_FREXPE", ADD_ONE_SRC },
- { 0x07bad, "FRSQ_FREXPE", ADD_ONE_SRC },
- // From the ARM patent US20160364209A1:
- // "Decompose v (the input) into numbers x1 and s such that v = x1 * 2^s,
- // and x1 is a floating point value in a predetermined range where the
- // value 1 is within the range and not at one extremity of the range (e.g.
- // choose a range where 1 is towards middle of range)."
- //
- // This computes s.
- { 0x07bc5, "FLOG_FREXPE", ADD_ONE_SRC },
- { 0x07d45, "CEIL", ADD_ONE_SRC },
- { 0x07d85, "FLOOR", ADD_ONE_SRC },
- { 0x07f18, "LSHIFT_ADD_HIGH32.i32", ADD_TWO_SRC },
- { 0x08000, "LD_ATTR.f16", ADD_LOAD_ATTR, true },
- { 0x08100, "LD_ATTR.v2f16", ADD_LOAD_ATTR, true },
- { 0x08200, "LD_ATTR.v3f16", ADD_LOAD_ATTR, true },
- { 0x08300, "LD_ATTR.v4f16", ADD_LOAD_ATTR, true },
- { 0x08400, "LD_ATTR.f32", ADD_LOAD_ATTR, true },
- { 0x08500, "LD_ATTR.v3f32", ADD_LOAD_ATTR, true },
- { 0x08600, "LD_ATTR.v3f32", ADD_LOAD_ATTR, true },
- { 0x08700, "LD_ATTR.v4f32", ADD_LOAD_ATTR, true },
- { 0x08800, "LD_ATTR.i32", ADD_LOAD_ATTR, true },
- { 0x08900, "LD_ATTR.v3i32", ADD_LOAD_ATTR, true },
- { 0x08a00, "LD_ATTR.v3i32", ADD_LOAD_ATTR, true },
- { 0x08b00, "LD_ATTR.v4i32", ADD_LOAD_ATTR, true },
- { 0x08c00, "LD_ATTR.u32", ADD_LOAD_ATTR, true },
- { 0x08d00, "LD_ATTR.v3u32", ADD_LOAD_ATTR, true },
- { 0x08e00, "LD_ATTR.v3u32", ADD_LOAD_ATTR, true },
- { 0x08f00, "LD_ATTR.v4u32", ADD_LOAD_ATTR, true },
- { 0x0a000, "LD_VAR.32", ADD_VARYING_INTERP, true },
- { 0x0b000, "TEX", ADD_TEX_COMPACT, true },
- { 0x0c188, "LOAD.i32", ADD_TWO_SRC, true },
- { 0x0c1a0, "LD_UBO.i32", ADD_TWO_SRC, true },
- { 0x0c1b8, "LD_SCRATCH.v2i32", ADD_TWO_SRC, true },
- { 0x0c1c8, "LOAD.v2i32", ADD_TWO_SRC, true },
- { 0x0c1e0, "LD_UBO.v2i32", ADD_TWO_SRC, true },
- { 0x0c1f8, "LD_SCRATCH.v2i32", ADD_TWO_SRC, true },
- { 0x0c208, "LOAD.v4i32", ADD_TWO_SRC, true },
- // src0 = offset, src1 = binding
- { 0x0c220, "LD_UBO.v4i32", ADD_TWO_SRC, true },
- { 0x0c238, "LD_SCRATCH.v4i32", ADD_TWO_SRC, true },
- { 0x0c248, "STORE.v4i32", ADD_TWO_SRC, true },
- { 0x0c278, "ST_SCRATCH.v4i32", ADD_TWO_SRC, true },
- { 0x0c588, "STORE.i32", ADD_TWO_SRC, true },
- { 0x0c5b8, "ST_SCRATCH.i32", ADD_TWO_SRC, true },
- { 0x0c5c8, "STORE.v2i32", ADD_TWO_SRC, true },
- { 0x0c5f8, "ST_SCRATCH.v2i32", ADD_TWO_SRC, true },
- { 0x0c648, "LOAD.u16", ADD_TWO_SRC, true }, // zero-extends
- { 0x0ca88, "LOAD.v3i32", ADD_TWO_SRC, true },
- { 0x0caa0, "LD_UBO.v3i32", ADD_TWO_SRC, true },
- { 0x0cab8, "LD_SCRATCH.v3i32", ADD_TWO_SRC, true },
- { 0x0cb88, "STORE.v3i32", ADD_TWO_SRC, true },
- { 0x0cbb8, "ST_SCRATCH.v3i32", ADD_TWO_SRC, true },
- // *_FAST does not exist on G71 (added to G51, G72, and everything after)
- { 0x0cc00, "FRCP_FAST.f32", ADD_ONE_SRC },
- { 0x0cc20, "FRSQ_FAST.f32", ADD_ONE_SRC },
- // Given a floating point number m * 2^e, produces a table-based
- // approximation of 2/m using the top 17 bits. Includes special cases for
- // infinity, NaN, and zero, and copies the sign bit.
- { 0x0ce00, "FRCP_TABLE", ADD_ONE_SRC },
- // Exists on G71
- { 0x0ce10, "FRCP_FAST.f16.X", ADD_ONE_SRC },
- // A similar table for inverse square root, using the high 17 bits of the
- // mantissa as well as the low bit of the exponent.
- { 0x0ce20, "FRSQ_TABLE", ADD_ONE_SRC },
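- // (Presumably these table results are then refined in the FMA unit with
- // Newton-Raphson steps, e.g. r' = r * (2 - x * r) for the reciprocal; the
- // exact sequences the blob emits are not documented here.)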
- { 0x0ce30, "FRCP_FAST.f16.Y", ADD_ONE_SRC },
- { 0x0ce50, "FRSQ_FAST.f16.X", ADD_ONE_SRC },
- // Used in the argument reduction for log. Given a floating-point number
- // m * 2^e, uses the top 4 bits of m to produce an approximation to 1/m
- // with the exponent forced to 0 and only the top 5 bits are nonzero. 0,
- // infinity, and NaN all return 1.0.
- // See the ARM patent for more information.
- { 0x0ce60, "FRCP_APPROX", ADD_ONE_SRC },
- { 0x0ce70, "FRSQ_FAST.f16.Y", ADD_ONE_SRC },
- { 0x0cf40, "ATAN_ASSIST", ADD_TWO_SRC },
- { 0x0cf48, "ATAN_TABLE", ADD_TWO_SRC },
- { 0x0cf50, "SIN_TABLE", ADD_ONE_SRC },
- { 0x0cf51, "COS_TABLE", ADD_ONE_SRC },
- { 0x0cf58, "EXP_TABLE", ADD_ONE_SRC },
- { 0x0cf60, "FLOG2_TABLE", ADD_ONE_SRC },
- { 0x0cf64, "FLOGE_TABLE", ADD_ONE_SRC },
- { 0x0d000, "BRANCH", ADD_BRANCH },
- // For each bit i, return src2[i] ? src0[i] : src1[i]. In other words, this
- // is the same as (src2 & src0) | (~src2 & src1).
- { 0x0e8c0, "MUX", ADD_THREE_SRC },
- { 0x0e9b0, "ATAN_LDEXP.Y.f32", ADD_TWO_SRC },
- { 0x0e9b8, "ATAN_LDEXP.X.f32", ADD_TWO_SRC },
- { 0x0ea60, "SEL.XX.i16", ADD_TWO_SRC },
- { 0x0ea70, "SEL.XY.i16", ADD_TWO_SRC },
- { 0x0ea68, "SEL.YX.i16", ADD_TWO_SRC },
- { 0x0ea78, "SEL.YY.i16", ADD_TWO_SRC },
- { 0x0ec00, "F32_TO_F16", ADD_TWO_SRC },
- { 0x0f640, "ICMP.GL.GT", ADD_TWO_SRC }, // src0 > src1 ? 1 : 0
- { 0x0f648, "ICMP.GL.GE", ADD_TWO_SRC },
- { 0x0f650, "UCMP.GL.GT", ADD_TWO_SRC },
- { 0x0f658, "UCMP.GL.GE", ADD_TWO_SRC },
- { 0x0f660, "ICMP.GL.EQ", ADD_TWO_SRC },
- { 0x0f6c0, "ICMP.D3D.GT", ADD_TWO_SRC }, // src0 > src1 ? ~0 : 0
- { 0x0f6c8, "ICMP.D3D.GE", ADD_TWO_SRC },
- { 0x0f6d0, "UCMP.D3D.GT", ADD_TWO_SRC },
- { 0x0f6d8, "UCMP.D3D.GE", ADD_TWO_SRC },
- { 0x0f6e0, "ICMP.D3D.EQ", ADD_TWO_SRC },
- { 0x10000, "MAX.v2f16", ADD_FMINMAX16 },
- { 0x11000, "ADD_MSCALE.f32", ADD_FADDMscale },
- { 0x12000, "MIN.v2f16", ADD_FMINMAX16 },
- { 0x14000, "ADD.v2f16", ADD_FADD16 },
- { 0x17000, "FCMP.D3D", ADD_FCMP16 },
- { 0x178c0, "ADD.i32", ADD_TWO_SRC },
- { 0x17900, "ADD.v2i16", ADD_TWO_SRC },
- { 0x17ac0, "SUB.i32", ADD_TWO_SRC },
- { 0x17c10, "ADDC.i32", ADD_TWO_SRC }, // adds src0 to the bottom bit of src1
- { 0x17d80, "ADD.i32.i16.X", ADD_TWO_SRC },
- { 0x17d90, "ADD.i32.u16.X", ADD_TWO_SRC },
- { 0x17dc0, "ADD.i32.i16.Y", ADD_TWO_SRC },
- { 0x17dd0, "ADD.i32.u16.Y", ADD_TWO_SRC },
- // Compute varying address and datatype (for storing in the vertex shader),
- // and store the vec3 result in the data register. The result is passed as
- // the 3 normal arguments to ST_VAR.
- { 0x18000, "LD_VAR_ADDR.f16", ADD_VARYING_ADDRESS, true },
- { 0x18100, "LD_VAR_ADDR.f32", ADD_VARYING_ADDRESS, true },
- { 0x18200, "LD_VAR_ADDR.i32", ADD_VARYING_ADDRESS, true },
- { 0x18300, "LD_VAR_ADDR.u32", ADD_VARYING_ADDRESS, true },
- // Implements alpha-to-coverage, as well as possibly the late depth and
- // stencil tests. The first source is the existing sample mask in R60
- // (possibly modified by gl_SampleMask), and the second source is the alpha
- // value. The sample mask is written right away based on the
- // alpha-to-coverage result using the normal register write mechanism,
- // since that doesn't need to read from any memory, and then written again
- // later based on the result of the stencil and depth tests using the
- // special register.
- { 0x191e8, "ATEST.f32", ADD_TWO_SRC, true },
- { 0x191f0, "ATEST.X.f16", ADD_TWO_SRC, true },
- { 0x191f8, "ATEST.Y.f16", ADD_TWO_SRC, true },
- // store a varying given the address and datatype from LD_VAR_ADDR
- { 0x19300, "ST_VAR.v1", ADD_THREE_SRC, true },
- { 0x19340, "ST_VAR.v2", ADD_THREE_SRC, true },
- { 0x19380, "ST_VAR.v3", ADD_THREE_SRC, true },
- { 0x193c0, "ST_VAR.v4", ADD_THREE_SRC, true },
- // This takes the sample coverage mask (computed by ATEST above) as a
- // regular argument, in addition to the vec4 color in the special register.
- { 0x1952c, "BLEND", ADD_BLENDING, true },
- { 0x1a000, "LD_VAR.16", ADD_VARYING_INTERP, true },
- { 0x1ae60, "TEX", ADD_TEX, true },
- { 0x1c000, "RSHIFT_NAND.i32", ADD_THREE_SRC },
- { 0x1c300, "RSHIFT_OR.i32", ADD_THREE_SRC },
- { 0x1c400, "RSHIFT_AND.i32", ADD_THREE_SRC },
- { 0x1c700, "RSHIFT_NOR.i32", ADD_THREE_SRC },
- { 0x1c800, "LSHIFT_NAND.i32", ADD_THREE_SRC },
- { 0x1cb00, "LSHIFT_OR.i32", ADD_THREE_SRC },
- { 0x1cc00, "LSHIFT_AND.i32", ADD_THREE_SRC },
- { 0x1cf00, "LSHIFT_NOR.i32", ADD_THREE_SRC },
- { 0x1d000, "RSHIFT_XOR.i32", ADD_THREE_SRC },
- { 0x1d100, "RSHIFT_XNOR.i32", ADD_THREE_SRC },
- { 0x1d200, "LSHIFT_XOR.i32", ADD_THREE_SRC },
- { 0x1d300, "LSHIFT_XNOR.i32", ADD_THREE_SRC },
- { 0x1d400, "LSHIFT_ADD.i32", ADD_THREE_SRC },
- { 0x1d500, "LSHIFT_SUB.i32", ADD_THREE_SRC },
- { 0x1d500, "LSHIFT_RSUB.i32", ADD_THREE_SRC },
- { 0x1d700, "RSHIFT_ADD.i32", ADD_THREE_SRC },
- { 0x1d800, "RSHIFT_SUB.i32", ADD_THREE_SRC },
- { 0x1d900, "RSHIFT_RSUB.i32", ADD_THREE_SRC },
- { 0x1da00, "ARSHIFT_ADD.i32", ADD_THREE_SRC },
- { 0x1db00, "ARSHIFT_SUB.i32", ADD_THREE_SRC },
- { 0x1dc00, "ARSHIFT_RSUB.i32", ADD_THREE_SRC },
- { 0x1dd18, "OR.i32", ADD_TWO_SRC },
- { 0x1dd20, "AND.i32", ADD_TWO_SRC },
- { 0x1dd60, "LSHIFT.i32", ADD_TWO_SRC },
- { 0x1dd50, "XOR.i32", ADD_TWO_SRC },
- { 0x1dd80, "RSHIFT.i32", ADD_TWO_SRC },
- { 0x1dda0, "ARSHIFT.i32", ADD_TWO_SRC },
-};
-
-static struct add_op_info find_add_op_info(unsigned op)
-{
- for (unsigned i = 0; i < ARRAY_SIZE(add_op_infos); i++) {
- unsigned opCmp = ~0;
- switch (add_op_infos[i].src_type) {
- case ADD_ONE_SRC:
- case ADD_BLENDING:
- opCmp = op;
- break;
- case ADD_TWO_SRC:
- opCmp = op & ~0x7;
- break;
- case ADD_THREE_SRC:
- opCmp = op & ~0x3f;
- break;
- case ADD_TEX:
- opCmp = op & ~0xf;
- break;
- case ADD_FADD:
- case ADD_FMINMAX:
- case ADD_FADD16:
- opCmp = op & ~0x1fff;
- break;
- case ADD_FMINMAX16:
- case ADD_FADDMscale:
- opCmp = op & ~0xfff;
- break;
- case ADD_FCMP:
- case ADD_FCMP16:
- opCmp = op & ~0x7ff;
- break;
- case ADD_TEX_COMPACT:
- opCmp = op & ~0x3ff;
- break;
- case ADD_VARYING_INTERP:
- opCmp = op & ~0x7ff;
- break;
- case ADD_VARYING_ADDRESS:
- opCmp = op & ~0xff;
- break;
- case ADD_LOAD_ATTR:
- opCmp = op & ~0x7f;
- break;
- case ADD_BRANCH:
- opCmp = op & ~0xfff;
- break;
- default:
- opCmp = ~0;
- break;
- }
- if (add_op_infos[i].op == opCmp)
- return add_op_infos[i];
- }
-
- struct add_op_info info;
- snprintf(info.name, sizeof(info.name), "op%04x", op);
- info.op = op;
- info.src_type = ADD_TWO_SRC;
- info.has_data_reg = true;
- return info;
-}
-
-static void dump_add(uint64_t word, struct bifrost_regs regs, struct bifrost_regs next_regs, uint64_t *consts,
- unsigned data_reg, unsigned offset, bool verbose)
-{
- if (verbose) {
- printf("# ADD: %016" PRIx64 "\n", word);
- }
- struct bifrost_add_inst ADD;
- memcpy((char *) &ADD, (char *) &word, sizeof(ADD));
- struct add_op_info info = find_add_op_info(ADD.op);
-
- printf("%s", info.name);
-
- // float16 doesn't seem to support output modifiers
- if (info.src_type == ADD_FADD || info.src_type == ADD_FMINMAX) {
- // output modifiers
- dump_output_mod(bits(ADD.op, 8, 10));
- if (info.src_type == ADD_FADD)
- dump_round_mode(bits(ADD.op, 10, 12));
- else
- dump_minmax_mode(bits(ADD.op, 10, 12));
- } else if (info.src_type == ADD_FCMP || info.src_type == ADD_FCMP16) {
- dump_fcmp(bits(ADD.op, 3, 6));
- if (info.src_type == ADD_FCMP)
- printf(".f32");
- else
- printf(".v2f16");
- } else if (info.src_type == ADD_FADDMscale) {
- switch ((ADD.op >> 6) & 0x7) {
- case 0: break;
- // causes GPU hangs on G71
- case 1: printf(".invalid"); break;
- // Same as usual outmod value.
- case 2: printf(".clamp_0_1"); break;
- // If src0 is infinite or NaN, flush it to zero so that the other
- // source is passed through unmodified.
- case 3: printf(".flush_src0_inf_nan"); break;
- // Vice versa.
- case 4: printf(".flush_src1_inf_nan"); break;
- // Every other case seems to behave the same as the above?
- default: printf(".unk%d", (ADD.op >> 6) & 0x7); break;
- }
- } else if (info.src_type == ADD_VARYING_INTERP) {
- if (ADD.op & 0x200)
- printf(".reuse");
- if (ADD.op & 0x400)
- printf(".flat");
- switch ((ADD.op >> 7) & 0x3) {
- case 0: printf(".per_frag"); break;
- case 1: printf(".centroid"); break;
- case 2: break;
- case 3: printf(".explicit"); break;
- }
- printf(".v%d", ((ADD.op >> 5) & 0x3) + 1);
- } else if (info.src_type == ADD_BRANCH) {
- enum branch_code branchCode = (enum branch_code) ((ADD.op >> 6) & 0x3f);
- if (branchCode == BR_ALWAYS) {
- // unconditional branch
- } else {
- enum branch_cond cond = (enum branch_cond) ((ADD.op >> 6) & 0x7);
- enum branch_bit_size size = (enum branch_bit_size) ((ADD.op >> 9) & 0x7);
- bool portSwapped = (ADD.op & 0x7) < ADD.src0;
- // See the comment in branch_bit_size
- if (size == BR_SIZE_16YX0)
- portSwapped = true;
- if (size == BR_SIZE_16YX1)
- portSwapped = false;
- // These sizes are only for floating point comparisons, so the
- // non-floating-point comparisons are reused to encode the flipped
- // versions.
- if (size == BR_SIZE_32_AND_16X || size == BR_SIZE_32_AND_16Y)
- portSwapped = false;
- // There's only one argument, so we reuse the extra argument to
- // encode this.
- if (size == BR_SIZE_ZERO)
- portSwapped = !(ADD.op & 1);
-
- switch (cond) {
- case BR_COND_LT:
- if (portSwapped)
- printf(".LT.u");
- else
- printf(".LT.i");
- break;
- case BR_COND_LE:
- if (size == BR_SIZE_32_AND_16X || size == BR_SIZE_32_AND_16Y) {
- printf(".UNE.f");
- } else {
- if (portSwapped)
- printf(".LE.u");
- else
- printf(".LE.i");
- }
- break;
- case BR_COND_GT:
- if (portSwapped)
- printf(".GT.u");
- else
- printf(".GT.i");
- break;
- case BR_COND_GE:
- if (portSwapped)
- printf(".GE.u");
- else
- printf(".GE.i");
- break;
- case BR_COND_EQ:
- if (portSwapped)
- printf(".NE.i");
- else
- printf(".EQ.i");
- break;
- case BR_COND_OEQ:
- if (portSwapped)
- printf(".UNE.f");
- else
- printf(".OEQ.f");
- break;
- case BR_COND_OGT:
- if (portSwapped)
- printf(".OGT.unk.f");
- else
- printf(".OGT.f");
- break;
- case BR_COND_OLT:
- if (portSwapped)
- printf(".OLT.unk.f");
- else
- printf(".OLT.f");
- break;
- }
- switch (size) {
- case BR_SIZE_32:
- case BR_SIZE_32_AND_16X:
- case BR_SIZE_32_AND_16Y:
- printf("32");
- break;
- case BR_SIZE_16XX:
- case BR_SIZE_16YY:
- case BR_SIZE_16YX0:
- case BR_SIZE_16YX1:
- printf("16");
- break;
- case BR_SIZE_ZERO: {
- unsigned ctrl = (ADD.op >> 1) & 0x3;
- if (ctrl == 0)
- printf("32.Z");
- else
- printf("16.Z");
- break;
- }
- }
- }
- }
- printf(" ");
-
- struct bifrost_reg_ctrl next_ctrl = DecodeRegCtrl(next_regs);
- if (next_ctrl.add_write_unit != REG_WRITE_NONE) {
- printf("{R%d, T1}, ", GetRegToWrite(next_ctrl.add_write_unit, next_regs));
- } else {
- printf("T1, ");
- }
-
- switch (info.src_type) {
- case ADD_BLENDING:
- // Note: in this case, regs.uniform_const == location | 0x8
- // This probably means we can't load uniforms or immediates in the
- // same instruction. This re-uses the encoding that normally means
- // "disabled", where the low 4 bits are ignored. Perhaps the extra
- // 0x8 or'd in indicates this is happening.
- printf("location:%d, ", regs.uniform_const & 0x7);
- // fallthrough
- case ADD_ONE_SRC:
- dump_src(ADD.src0, regs, consts, false);
- break;
- case ADD_TEX:
- case ADD_TEX_COMPACT: {
- int tex_index;
- int sampler_index;
- bool dualTex = false;
- if (info.src_type == ADD_TEX_COMPACT) {
- tex_index = (ADD.op >> 3) & 0x7;
- sampler_index = (ADD.op >> 7) & 0x7;
- bool unknown = (ADD.op & 0x40);
- // TODO: figure out if the unknown bit is ever 0
- if (!unknown)
- printf("unknown ");
- } else {
- uint64_t constVal = get_const(consts, regs);
- uint32_t controlBits = (ADD.op & 0x8) ? (constVal >> 32) : constVal;
- struct bifrost_tex_ctrl ctrl;
- memcpy((char *) &ctrl, (char *) &controlBits, sizeof(ctrl));
-
- // TODO: figure out what actually triggers dual-tex
- if (ctrl.result_type == 9) {
- struct bifrost_dual_tex_ctrl dualCtrl;
- memcpy((char *) &dualCtrl, (char *) &controlBits, sizeof(ctrl));
- printf("(dualtex) tex0:%d samp0:%d tex1:%d samp1:%d ",
- dualCtrl.tex_index0, dualCtrl.sampler_index0,
- dualCtrl.tex_index1, dualCtrl.sampler_index1);
- if (dualCtrl.unk0 != 3)
- printf("unk:%d ", dualCtrl.unk0);
- dualTex = true;
- } else {
- if (ctrl.no_merge_index) {
- tex_index = ctrl.tex_index;
- sampler_index = ctrl.sampler_index;
- } else {
- tex_index = sampler_index = ctrl.tex_index;
- unsigned unk = ctrl.sampler_index >> 2;
- if (unk != 3)
- printf("unk:%d ", unk);
- if (ctrl.sampler_index & 1)
- tex_index = -1;
- if (ctrl.sampler_index & 2)
- sampler_index = -1;
- }
-
- if (ctrl.unk0 != 3)
- printf("unk0:%d ", ctrl.unk0);
- if (ctrl.unk1)
- printf("unk1 ");
- if (ctrl.unk2 != 0xf)
- printf("unk2:%x ", ctrl.unk2);
-
- switch (ctrl.result_type) {
- case 0x4:
- printf("f32 "); break;
- case 0xe:
- printf("i32 "); break;
- case 0xf:
- printf("u32 "); break;
- default:
- printf("unktype(%x) ", ctrl.result_type);
- }
-
- switch (ctrl.tex_type) {
- case 0:
- printf("cube "); break;
- case 1:
- printf("buffer "); break;
- case 2:
- printf("2D "); break;
- case 3:
- printf("3D "); break;
- }
-
- if (ctrl.is_shadow)
- printf("shadow ");
- if (ctrl.is_array)
- printf("array ");
-
- if (!ctrl.filter) {
- if (ctrl.calc_gradients) {
- int comp = (controlBits >> 20) & 0x3;
- printf("txg comp:%d ", comp);
- } else {
- printf("txf ");
- }
- } else {
- if (!ctrl.not_supply_lod) {
- if (ctrl.compute_lod)
- printf("lod_bias ");
- else
- printf("lod ");
- }
-
- if (!ctrl.calc_gradients)
- printf("grad ");
- }
-
- if (ctrl.texel_offset)
- printf("offset ");
- }
- }
-
- if (!dualTex) {
- if (tex_index == -1)
- printf("tex:indirect ");
- else
- printf("tex:%d ", tex_index);
-
- if (sampler_index == -1)
- printf("samp:indirect ");
- else
- printf("samp:%d ", sampler_index);
- }
- break;
- }
- case ADD_VARYING_INTERP: {
- unsigned addr = ADD.op & 0x1f;
- if (addr < 0b10100) {
- // direct addr
- printf("%d", addr);
- } else if (addr < 0b11000) {
- if (addr == 22)
- printf("fragw");
- else if (addr == 23)
- printf("fragz");
- else
- printf("unk%d", addr);
- } else {
- dump_src(ADD.op & 0x7, regs, consts, false);
- }
- printf(", ");
- dump_src(ADD.src0, regs, consts, false);
- break;
- }
- case ADD_VARYING_ADDRESS: {
- dump_src(ADD.src0, regs, consts, false);
- printf(", ");
- dump_src(ADD.op & 0x7, regs, consts, false);
- printf(", ");
- unsigned location = (ADD.op >> 3) & 0x1f;
- if (location < 16) {
- printf("location:%d", location);
- } else if (location == 20) {
- printf("location:%u", (uint32_t) get_const(consts, regs));
- } else if (location == 21) {
- printf("location:%u", (uint32_t) (get_const(consts, regs) >> 32));
- } else {
- printf("location:%d(unk)", location);
- }
- break;
- }
- case ADD_LOAD_ATTR:
- printf("location:%d, ", (ADD.op >> 3) & 0xf);
- case ADD_TWO_SRC:
- dump_src(ADD.src0, regs, consts, false);
- printf(", ");
- dump_src(ADD.op & 0x7, regs, consts, false);
- break;
- case ADD_THREE_SRC:
- dump_src(ADD.src0, regs, consts, false);
- printf(", ");
- dump_src(ADD.op & 0x7, regs, consts, false);
- printf(", ");
- dump_src((ADD.op >> 3) & 0x7, regs, consts, false);
- break;
- case ADD_FADD:
- case ADD_FMINMAX:
- if (ADD.op & 0x10)
- printf("-");
- if (ADD.op & 0x1000)
- printf("abs(");
- dump_src(ADD.src0, regs, consts, false);
- switch ((ADD.op >> 6) & 0x3) {
- case 3:
- printf(".x");
- break;
- default:
- break;
- }
- if (ADD.op & 0x1000)
- printf(")");
- printf(", ");
- if (ADD.op & 0x20)
- printf("-");
- if (ADD.op & 0x8)
- printf("abs(");
- dump_src(ADD.op & 0x7, regs, consts, false);
- switch ((ADD.op >> 6) & 0x3) {
- case 1:
- case 3:
- printf(".x");
- break;
- case 2:
- printf(".y");
- break;
- case 0:
- break;
- default:
- printf(".unk");
- break;
- }
- if (ADD.op & 0x8)
- printf(")");
- break;
- case ADD_FADD16:
- if (ADD.op & 0x10)
- printf("-");
- if (ADD.op & 0x1000)
- printf("abs(");
- dump_src(ADD.src0, regs, consts, false);
- if (ADD.op & 0x1000)
- printf(")");
- dump_16swizzle((ADD.op >> 6) & 0x3);
- printf(", ");
- if (ADD.op & 0x20)
- printf("-");
- if (ADD.op & 0x8)
- printf("abs(");
- dump_src(ADD.op & 0x7, regs, consts, false);
- dump_16swizzle((ADD.op >> 8) & 0x3);
- if (ADD.op & 0x8)
- printf(")");
- break;
- case ADD_FMINMAX16: {
- bool abs1 = ADD.op & 0x8;
- bool abs2 = (ADD.op & 0x7) < ADD.src0;
- if (ADD.op & 0x10)
- printf("-");
- if (abs1 || abs2)
- printf("abs(");
- dump_src(ADD.src0, regs, consts, false);
- dump_16swizzle((ADD.op >> 6) & 0x3);
- if (abs1 || abs2)
- printf(")");
- printf(", ");
- if (ADD.op & 0x20)
- printf("-");
- if (abs1 && abs2)
- printf("abs(");
- dump_src(ADD.op & 0x7, regs, consts, false);
- dump_16swizzle((ADD.op >> 8) & 0x3);
- if (abs1 && abs2)
- printf(")");
- break;
- }
- case ADD_FADDMscale: {
- if (ADD.op & 0x400)
- printf("-");
- if (ADD.op & 0x200)
- printf("abs(");
- dump_src(ADD.src0, regs, consts, false);
- if (ADD.op & 0x200)
- printf(")");
-
- printf(", ");
-
- if (ADD.op & 0x800)
- printf("-");
- dump_src(ADD.op & 0x7, regs, consts, false);
-
- printf(", ");
-
- dump_src((ADD.op >> 3) & 0x7, regs, consts, false);
- break;
- }
- case ADD_FCMP:
- if (ADD.op & 0x400) {
- printf("-");
- }
- if (ADD.op & 0x100) {
- printf("abs(");
- }
- dump_src(ADD.src0, regs, consts, false);
- switch ((ADD.op >> 6) & 0x3) {
- case 3:
- printf(".x");
- break;
- default:
- break;
- }
- if (ADD.op & 0x100) {
- printf(")");
- }
- printf(", ");
- if (ADD.op & 0x200) {
- printf("abs(");
- }
- dump_src(ADD.op & 0x7, regs, consts, false);
- switch ((ADD.op >> 6) & 0x3) {
- case 1:
- case 3:
- printf(".x");
- break;
- case 2:
- printf(".y");
- break;
- case 0:
- break;
- default:
- printf(".unk");
- break;
- }
- if (ADD.op & 0x200) {
- printf(")");
- }
- break;
- case ADD_FCMP16:
- dump_src(ADD.src0, regs, consts, false);
- dump_16swizzle((ADD.op >> 6) & 0x3);
- printf(", ");
- dump_src(ADD.op & 0x7, regs, consts, false);
- dump_16swizzle((ADD.op >> 8) & 0x3);
- break;
- case ADD_BRANCH: {
- enum branch_code code = (enum branch_code) ((ADD.op >> 6) & 0x3f);
- enum branch_bit_size size = (enum branch_bit_size) ((ADD.op >> 9) & 0x7);
- if (code != BR_ALWAYS) {
- dump_src(ADD.src0, regs, consts, false);
- switch (size) {
- case BR_SIZE_16XX:
- printf(".x");
- break;
- case BR_SIZE_16YY:
- case BR_SIZE_16YX0:
- case BR_SIZE_16YX1:
- printf(".y");
- break;
- case BR_SIZE_ZERO: {
- unsigned ctrl = (ADD.op >> 1) & 0x3;
- switch (ctrl) {
- case 1:
- printf(".y");
- break;
- case 2:
- printf(".x");
- break;
- default:
- break;
- }
- }
- default:
- break;
- }
- printf(", ");
- }
- if (code != BR_ALWAYS && size != BR_SIZE_ZERO) {
- dump_src(ADD.op & 0x7, regs, consts, false);
- switch (size) {
- case BR_SIZE_16XX:
- case BR_SIZE_16YX0:
- case BR_SIZE_16YX1:
- case BR_SIZE_32_AND_16X:
- printf(".x");
- break;
- case BR_SIZE_16YY:
- case BR_SIZE_32_AND_16Y:
- printf(".y");
- break;
- default:
- break;
- }
- printf(", ");
- }
- // I haven't had the chance to test if this actually specifies the
- // branch offset, since I couldn't get it to produce values other
- // than 5 (uniform/const high), but these three bits are always
- // consistent across branch instructions, so it makes sense...
- int offsetSrc = (ADD.op >> 3) & 0x7;
- if (offsetSrc == 4 || offsetSrc == 5) {
- // If the offset is known/constant, we can decode it
- uint32_t raw_offset;
- if (offsetSrc == 4)
- raw_offset = get_const(consts, regs);
- else
- raw_offset = get_const(consts, regs) >> 32;
- // The high 4 bits are flags, while the rest is the
- // twos-complement offset in bytes (here we convert to
- // clauses).
- int32_t branch_offset = ((int32_t) raw_offset << 4) >> 8;
-
- // If high4 is the high 4 bits of the last 64-bit constant,
- // this is calculated as (high4 + 4) & 0xf, or 0 if the branch
- // offset itself is the last constant. Not sure if this is
- // actually used, or just garbage in unused bits, but in any
- // case, we can just ignore it here since it's redundant. Note
- // that if there is any padding, this will be 4 since the
- // padding counts as the last constant.
- unsigned flags = raw_offset >> 28;
- (void) flags;
-
- // Note: the offset is in bytes, relative to the beginning of the
- // current clause, so a zero offset would be a loop back to the
- // same clause (annoyingly different from Midgard).
- printf("clause_%d", offset + branch_offset);
- } else {
- dump_src(offsetSrc, regs, consts, false);
- }
- }
- }
- if (info.has_data_reg) {
- printf(", R%d", data_reg);
- }
- printf("\n");
-}
-
-void dump_instr(const struct bifrost_alu_inst *instr, struct bifrost_regs next_regs, uint64_t *consts,
- unsigned data_reg, unsigned offset, bool verbose)
-{
- struct bifrost_regs regs;
- memcpy((char *) &regs, (char *) &instr->reg_bits, sizeof(regs));
-
- if (verbose) {
- printf("# regs: %016" PRIx64 "\n", instr->reg_bits);
- dump_regs(regs);
- }
- dump_fma(instr->fma_bits, regs, next_regs, consts, verbose);
- dump_add(instr->add_bits, regs, next_regs, consts, data_reg, offset, verbose);
-}
-
-bool dump_clause(uint32_t *words, unsigned *size, unsigned offset, bool verbose) {
- // State for a decoded clause
- struct bifrost_alu_inst instrs[8] = {};
- uint64_t consts[6] = {};
- unsigned num_instrs = 0;
- unsigned num_consts = 0;
- uint64_t header_bits = 0;
- bool stopbit = false;
-
- unsigned i;
- for (i = 0; ; i++, words += 4) {
- if (verbose) {
- printf("# ");
- for (int j = 0; j < 4; j++)
- printf("%08x ", words[3 - j]); // low bit on the right
- printf("\n");
- }
- unsigned tag = bits(words[0], 0, 8);
-
- // speculatively decode some things that are common between many formats, so we can share some code
- struct bifrost_alu_inst main_instr = {};
- // 20 bits
- main_instr.add_bits = bits(words[2], 2, 32 - 13);
- // 23 bits
- main_instr.fma_bits = bits(words[1], 11, 32) | bits(words[2], 0, 2) << (32 - 11);
- // 35 bits
- main_instr.reg_bits = ((uint64_t) bits(words[1], 0, 11)) << 24 | (uint64_t) bits(words[0], 8, 32);
-
- uint64_t const0 = bits(words[0], 8, 32) << 4 | (uint64_t) words[1] << 28 | (uint64_t) bits(words[2], 0, 4) << 60;
- uint64_t const1 = bits(words[2], 4, 32) << 4 | (uint64_t) words[3] << 32;
-
- bool stop = tag & 0x40;
-
- if (verbose) {
- printf("# tag: 0x%02x\n", tag);
- }
- if (tag & 0x80) {
- unsigned idx = stop ? 5 : 2;
- main_instr.add_bits |= ((tag >> 3) & 0x7) << 17;
- instrs[idx + 1] = main_instr;
- instrs[idx].add_bits = bits(words[3], 0, 17) | ((tag & 0x7) << 17);
- instrs[idx].fma_bits |= bits(words[2], 19, 32) << 10;
- consts[0] = bits(words[3], 17, 32) << 4;
- } else {
- bool done = false;
- switch ((tag >> 3) & 0x7) {
- case 0x0:
- switch (tag & 0x7) {
- case 0x3:
- main_instr.add_bits |= bits(words[3], 29, 32) << 17;
- instrs[1] = main_instr;
- num_instrs = 2;
- done = stop;
- break;
- case 0x4:
- instrs[2].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17;
- instrs[2].fma_bits |= bits(words[2], 19, 32) << 10;
- consts[0] = const0;
- num_instrs = 3;
- num_consts = 1;
- done = stop;
- break;
- case 0x1:
- case 0x5:
- instrs[2].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17;
- instrs[2].fma_bits |= bits(words[2], 19, 32) << 10;
- main_instr.add_bits |= bits(words[3], 26, 29) << 17;
- instrs[3] = main_instr;
- if ((tag & 0x7) == 0x5) {
- num_instrs = 4;
- done = stop;
- }
- break;
- case 0x6:
- instrs[5].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17;
- instrs[5].fma_bits |= bits(words[2], 19, 32) << 10;
- consts[0] = const0;
- num_instrs = 6;
- num_consts = 1;
- done = stop;
- break;
- case 0x7:
- instrs[5].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17;
- instrs[5].fma_bits |= bits(words[2], 19, 32) << 10;
- main_instr.add_bits |= bits(words[3], 26, 29) << 17;
- instrs[6] = main_instr;
- num_instrs = 7;
- done = stop;
- break;
- default:
- printf("unknown tag bits 0x%02x\n", tag);
- }
- break;
- case 0x2:
- case 0x3: {
- unsigned idx = ((tag >> 3) & 0x7) == 2 ? 4 : 7;
- main_instr.add_bits |= (tag & 0x7) << 17;
- instrs[idx] = main_instr;
- consts[0] |= (bits(words[2], 19, 32) | ((uint64_t) words[3] << 13)) << 19;
- num_consts = 1;
- num_instrs = idx + 1;
- done = stop;
- break;
- }
- case 0x4: {
- unsigned idx = stop ? 4 : 1;
- main_instr.add_bits |= (tag & 0x7) << 17;
- instrs[idx] = main_instr;
- instrs[idx + 1].fma_bits |= bits(words[3], 22, 32);
- instrs[idx + 1].reg_bits = bits(words[2], 19, 32) | ((uint64_t) bits(words[3], 0, 22) << (32 - 19));
- break;
- }
- case 0x1:
- // only constants can come after this
- num_instrs = 1;
- done = stop;
- // fallthrough
- case 0x5:
- header_bits = bits(words[2], 19, 32) | ((uint64_t) words[3] << (32 - 19));
- main_instr.add_bits |= (tag & 0x7) << 17;
- instrs[0] = main_instr;
- break;
- case 0x6:
- case 0x7: {
- unsigned pos = tag & 0xf;
- // note that `pos' encodes both the total number of
- // instructions and the position in the constant stream,
- // presumably because decoded constants and instructions
- // share a buffer in the decoder, but we only care about
- // the position in the constant stream; the total number of
- // instructions is redundant.
- unsigned const_idx = 7;
- switch (pos) {
- case 0:
- case 1:
- case 2:
- case 6:
- const_idx = 0;
- break;
- case 3:
- case 4:
- case 7:
- case 9:
- const_idx = 1;
- break;
- case 5:
- case 0xa:
- const_idx = 2;
- break;
- case 8:
- case 0xb:
- case 0xc:
- const_idx = 3;
- break;
- case 0xd:
- const_idx = 4;
- break;
- default:
- printf("# unknown pos 0x%x\n", pos);
- }
- if (num_consts < const_idx + 2)
- num_consts = const_idx + 2;
- consts[const_idx] = const0;
- consts[const_idx + 1] = const1;
- done = stop;
- break;
- }
- default:
- break;
- }
-
- if (done)
- break;
- }
- }
-
- *size = i + 1;
-
- if (verbose) {
- printf("# header: %012" PRIx64 "\n", header_bits);
- }
-
- struct bifrost_header header;
- memcpy((char *) &header, (char *) &header_bits, sizeof(struct bifrost_header));
- dump_header(header, verbose);
- if (!header.no_end_of_shader)
- stopbit = true;
-
- printf("{\n");
- for (i = 0; i < num_instrs; i++) {
- struct bifrost_regs next_regs;
- if (i + 1 == num_instrs) {
- memcpy((char *) &next_regs, (char *) &instrs[0].reg_bits,
- sizeof(next_regs));
- } else {
- memcpy((char *) &next_regs, (char *) &instrs[i + 1].reg_bits,
- sizeof(next_regs));
- }
-
- dump_instr(&instrs[i], next_regs, consts, header.datareg, offset, verbose);
- }
- printf("}\n");
-
- if (verbose) {
- for (unsigned i = 0; i < num_consts; i++) {
- printf("# const%d: %08" PRIx64 "\n", 2 * i, consts[i] & 0xffffffff);
- printf("# const%d: %08" PRIx64 "\n", 2 * i + 1, consts[i] >> 32);
- }
- }
- return stopbit;
-}
-
-void disassemble_bifrost(uint8_t *code, size_t size, bool verbose)
-{
- uint32_t *words = (uint32_t *) code;
- uint32_t *words_end = words + (size / 4);
- // used for displaying branch targets
- unsigned offset = 0;
- while (words != words_end)
- {
- // we don't know what the program-end bit is quite yet, so for now just
- // assume that an all-0 quadword is padding
- uint32_t zero[4] = {};
- if (memcmp(words, zero, 4 * sizeof(uint32_t)) == 0)
- break;
- printf("clause_%d:\n", offset);
- unsigned size;
- if (dump_clause(words, &size, offset, verbose) == true) {
- break;
- }
- words += size * 4;
- offset += size;
- }
-}
-
+++ /dev/null
-/*
- * Copyright (C) 2019 Connor Abbott <cwabbott0@gmail.com>
- * Copyright (C) 2019 Lyude Paul <thatslyude@gmail.com>
- * Copyright (C) 2019 Ryan Houdek <Sonicadvance1@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <stdbool.h>
-#include <stddef.h>
-#include <stdint.h>
-void disassemble_bifrost(uint8_t *code, size_t size, bool verbose);
+++ /dev/null
-/*
- * © Copyright 2017-2018 Alyssa Rosenzweig
- * © Copyright 2017-2018 Connor Abbott
- * © Copyright 2017-2018 Lyude Paul
- * © Copyright 2019 Collabora, Ltd.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#ifndef __PANFROST_JOB_H__
-#define __PANFROST_JOB_H__
-
-#include <stdint.h>
-#include <panfrost-misc.h>
-
-#define MALI_SHORT_PTR_BITS (sizeof(uintptr_t)*8)
-
-#define MALI_FBD_HIERARCHY_WEIGHTS 8
-
-#define MALI_PAYLOAD_SIZE 256
-
-typedef u32 mali_jd_core_req;
-
-enum mali_job_type {
- JOB_NOT_STARTED = 0,
- JOB_TYPE_NULL = 1,
- JOB_TYPE_SET_VALUE = 2,
- JOB_TYPE_CACHE_FLUSH = 3,
- JOB_TYPE_COMPUTE = 4,
- JOB_TYPE_VERTEX = 5,
- JOB_TYPE_GEOMETRY = 6,
- JOB_TYPE_TILER = 7,
- JOB_TYPE_FUSED = 8,
- JOB_TYPE_FRAGMENT = 9,
-};
-
-enum mali_draw_mode {
- MALI_DRAW_NONE = 0x0,
- MALI_POINTS = 0x1,
- MALI_LINES = 0x2,
- MALI_LINE_STRIP = 0x4,
- MALI_LINE_LOOP = 0x6,
- MALI_TRIANGLES = 0x8,
- MALI_TRIANGLE_STRIP = 0xA,
- MALI_TRIANGLE_FAN = 0xC,
- MALI_POLYGON = 0xD,
- MALI_QUADS = 0xE,
- MALI_QUAD_STRIP = 0xF,
-
- /* All other modes invalid */
-};
-
-/* Applies to tiler_gl_enables */
-
-
-#define MALI_OCCLUSION_QUERY (1 << 3)
-#define MALI_OCCLUSION_PRECISE (1 << 4)
-
-/* Set for a glFrontFace(GL_CCW) in a Y=0=TOP coordinate system (like Gallium).
- * In OpenGL, this would correspond to glFrontFace(GL_CW). Mesa and the blob
- * disagree about how to do viewport flipping, so the blob actually sets this
- * for GL_CW but then has a negative viewport stride */
-#define MALI_FRONT_CCW_TOP (1 << 5)
-
-#define MALI_CULL_FACE_FRONT (1 << 6)
-#define MALI_CULL_FACE_BACK (1 << 7)
-
-/* TODO: Might this actually be a finer bitfield? */
-#define MALI_DEPTH_STENCIL_ENABLE 0x6400
-
-#define DS_ENABLE(field) \
- (field == MALI_DEPTH_STENCIL_ENABLE) \
- ? "MALI_DEPTH_STENCIL_ENABLE" \
- : (field == 0) ? "0" \
- : "0 /* XXX: Unknown, check hexdump */"
-
-/* Used in stencil and depth tests */
-
-enum mali_func {
- MALI_FUNC_NEVER = 0,
- MALI_FUNC_LESS = 1,
- MALI_FUNC_EQUAL = 2,
- MALI_FUNC_LEQUAL = 3,
- MALI_FUNC_GREATER = 4,
- MALI_FUNC_NOTEQUAL = 5,
- MALI_FUNC_GEQUAL = 6,
- MALI_FUNC_ALWAYS = 7
-};
-
-/* Same as OpenGL, but mixed up. Why? Because forget me, that's why! */
-
-enum mali_alt_func {
- MALI_ALT_FUNC_NEVER = 0,
- MALI_ALT_FUNC_GREATER = 1,
- MALI_ALT_FUNC_EQUAL = 2,
- MALI_ALT_FUNC_GEQUAL = 3,
- MALI_ALT_FUNC_LESS = 4,
- MALI_ALT_FUNC_NOTEQUAL = 5,
- MALI_ALT_FUNC_LEQUAL = 6,
- MALI_ALT_FUNC_ALWAYS = 7
-};
-
-/* Flags apply to unknown2_3? */
-
-#define MALI_HAS_MSAA (1 << 0)
-#define MALI_CAN_DISCARD (1 << 5)
-
-/* Applies on SFBD systems, specifying that programmable blending is in use */
-#define MALI_HAS_BLEND_SHADER (1 << 6)
-
-/* func is mali_func */
-#define MALI_DEPTH_FUNC(func) (func << 8)
-#define MALI_GET_DEPTH_FUNC(flags) ((flags >> 8) & 0x7)
-#define MALI_DEPTH_FUNC_MASK MALI_DEPTH_FUNC(0x7)
-
-#define MALI_DEPTH_TEST (1 << 11)
-
-/* Next flags to unknown2_4 */
-#define MALI_STENCIL_TEST (1 << 0)
-
-/* What?! */
-#define MALI_SAMPLE_ALPHA_TO_COVERAGE_NO_BLEND_SHADER (1 << 1)
-
-#define MALI_NO_DITHER (1 << 9)
-#define MALI_DEPTH_RANGE_A (1 << 12)
-#define MALI_DEPTH_RANGE_B (1 << 13)
-#define MALI_NO_MSAA (1 << 14)
-
-/* Stencil test state is all encoded in a single u32, just with a lot of
- * enums... */
-
-enum mali_stencil_op {
- MALI_STENCIL_KEEP = 0,
- MALI_STENCIL_REPLACE = 1,
- MALI_STENCIL_ZERO = 2,
- MALI_STENCIL_INVERT = 3,
- MALI_STENCIL_INCR_WRAP = 4,
- MALI_STENCIL_DECR_WRAP = 5,
- MALI_STENCIL_INCR = 6,
- MALI_STENCIL_DECR = 7
-};
-
-struct mali_stencil_test {
- unsigned ref : 8;
- unsigned mask : 8;
- enum mali_func func : 3;
- enum mali_stencil_op sfail : 3;
- enum mali_stencil_op dpfail : 3;
- enum mali_stencil_op dppass : 3;
- unsigned zero : 4;
-} __attribute__((packed));
-
-#define MALI_MASK_R (1 << 0)
-#define MALI_MASK_G (1 << 1)
-#define MALI_MASK_B (1 << 2)
-#define MALI_MASK_A (1 << 3)
-
-enum mali_nondominant_mode {
- MALI_BLEND_NON_MIRROR = 0,
- MALI_BLEND_NON_ZERO = 1
-};
-
-enum mali_dominant_blend {
- MALI_BLEND_DOM_SOURCE = 0,
- MALI_BLEND_DOM_DESTINATION = 1
-};
-
-enum mali_dominant_factor {
- MALI_DOMINANT_UNK0 = 0,
- MALI_DOMINANT_ZERO = 1,
- MALI_DOMINANT_SRC_COLOR = 2,
- MALI_DOMINANT_DST_COLOR = 3,
- MALI_DOMINANT_UNK4 = 4,
- MALI_DOMINANT_SRC_ALPHA = 5,
- MALI_DOMINANT_DST_ALPHA = 6,
- MALI_DOMINANT_CONSTANT = 7,
-};
-
-enum mali_blend_modifier {
- MALI_BLEND_MOD_UNK0 = 0,
- MALI_BLEND_MOD_NORMAL = 1,
- MALI_BLEND_MOD_SOURCE_ONE = 2,
- MALI_BLEND_MOD_DEST_ONE = 3,
-};
-
-struct mali_blend_mode {
- enum mali_blend_modifier clip_modifier : 2;
- unsigned unused_0 : 1;
- unsigned negate_source : 1;
-
- enum mali_dominant_blend dominant : 1;
-
- enum mali_nondominant_mode nondominant_mode : 1;
-
- unsigned unused_1 : 1;
-
- unsigned negate_dest : 1;
-
- enum mali_dominant_factor dominant_factor : 3;
- unsigned complement_dominant : 1;
-} __attribute__((packed));
-
-struct mali_blend_equation {
- /* Of type mali_blend_mode */
- unsigned rgb_mode : 12;
- unsigned alpha_mode : 12;
-
- unsigned zero1 : 4;
-
- /* Corresponds to MALI_MASK_* above and glColorMask arguments */
-
- unsigned color_mask : 4;
-} __attribute__((packed));
-
-/* Used with channel swizzling */
-enum mali_channel {
- MALI_CHANNEL_RED = 0,
- MALI_CHANNEL_GREEN = 1,
- MALI_CHANNEL_BLUE = 2,
- MALI_CHANNEL_ALPHA = 3,
- MALI_CHANNEL_ZERO = 4,
- MALI_CHANNEL_ONE = 5,
- MALI_CHANNEL_RESERVED_0 = 6,
- MALI_CHANNEL_RESERVED_1 = 7,
-};
-
-struct mali_channel_swizzle {
- enum mali_channel r : 3;
- enum mali_channel g : 3;
- enum mali_channel b : 3;
- enum mali_channel a : 3;
-} __attribute__((packed));
-
-/* Compressed per-pixel formats. Each of these formats expands to one to four
- * floating-point or integer numbers, as defined by the OpenGL specification.
- * There are various places in OpenGL where the user can specify a compressed
- * format in memory, which all use the same 8-bit enum in the various
- * descriptors, although different hardware units support different formats.
- */
-
-/* The top 3 bits specify how the bits of each component are interpreted. */
-
-/* e.g. R11F_G11F_B10F */
-#define MALI_FORMAT_SPECIAL (2 << 5)
-
-/* signed normalized, e.g. RGBA8_SNORM */
-#define MALI_FORMAT_SNORM (3 << 5)
-
-/* e.g. RGBA8UI */
-#define MALI_FORMAT_UINT (4 << 5)
-
-/* e.g. RGBA8 and RGBA32F */
-#define MALI_FORMAT_UNORM (5 << 5)
-
-/* e.g. RGBA8I and RGBA16F */
-#define MALI_FORMAT_SINT (6 << 5)
-
-/* These formats seem to largely duplicate the others. They're used at least
- * for Bifrost framebuffer output.
- */
-#define MALI_FORMAT_SPECIAL2 (7 << 5)
-
-/* If the high 3 bits are 3 to 6 these two bits say how many components
- * there are.
- */
-#define MALI_NR_CHANNELS(n) ((n - 1) << 3)
-
-/* If the high 3 bits are 3 to 6, then the low 3 bits say how big each
- * component is, except the special MALI_CHANNEL_FLOAT which overrides what the
- * bits mean.
- */
-
-#define MALI_CHANNEL_4 2
-
-#define MALI_CHANNEL_8 3
-
-#define MALI_CHANNEL_16 4
-
-#define MALI_CHANNEL_32 5
-
-/* For MALI_FORMAT_SINT it means a half-float (e.g. RG16F). For
- * MALI_FORMAT_UNORM, it means a 32-bit float.
- */
-#define MALI_CHANNEL_FLOAT 7
-
-enum mali_format {
- MALI_RGB565 = MALI_FORMAT_SPECIAL | 0x0,
- MALI_RGB5_A1_UNORM = MALI_FORMAT_SPECIAL | 0x2,
- MALI_RGB10_A2_UNORM = MALI_FORMAT_SPECIAL | 0x3,
- MALI_RGB10_A2_SNORM = MALI_FORMAT_SPECIAL | 0x5,
- MALI_RGB10_A2UI = MALI_FORMAT_SPECIAL | 0x7,
- MALI_RGB10_A2I = MALI_FORMAT_SPECIAL | 0x9,
-
- /* YUV formats */
- MALI_NV12 = MALI_FORMAT_SPECIAL | 0xc,
-
- MALI_Z32_UNORM = MALI_FORMAT_SPECIAL | 0xD,
- MALI_R32_FIXED = MALI_FORMAT_SPECIAL | 0x11,
- MALI_RG32_FIXED = MALI_FORMAT_SPECIAL | 0x12,
- MALI_RGB32_FIXED = MALI_FORMAT_SPECIAL | 0x13,
- MALI_RGBA32_FIXED = MALI_FORMAT_SPECIAL | 0x14,
- MALI_R11F_G11F_B10F = MALI_FORMAT_SPECIAL | 0x19,
- MALI_R9F_G9F_B9F_E5F = MALI_FORMAT_SPECIAL | 0x1b,
- /* Only used for varyings, to indicate the transformed gl_Position */
- MALI_VARYING_POS = MALI_FORMAT_SPECIAL | 0x1e,
- /* Only used for varyings, to indicate that the write should be
- * discarded.
- */
- MALI_VARYING_DISCARD = MALI_FORMAT_SPECIAL | 0x1f,
-
- MALI_R8_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8,
- MALI_R16_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16,
- MALI_R32_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32,
- MALI_RG8_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8,
- MALI_RG16_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16,
- MALI_RG32_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32,
- MALI_RGB8_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8,
- MALI_RGB16_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16,
- MALI_RGB32_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32,
- MALI_RGBA8_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8,
- MALI_RGBA16_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16,
- MALI_RGBA32_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32,
-
- MALI_R8UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8,
- MALI_R16UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16,
- MALI_R32UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32,
- MALI_RG8UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8,
- MALI_RG16UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16,
- MALI_RG32UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32,
- MALI_RGB8UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8,
- MALI_RGB16UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16,
- MALI_RGB32UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32,
- MALI_RGBA8UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8,
- MALI_RGBA16UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16,
- MALI_RGBA32UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32,
-
- MALI_R8_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8,
- MALI_R16_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16,
- MALI_R32_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32,
- MALI_R32F = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_FLOAT,
- MALI_RG8_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8,
- MALI_RG16_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16,
- MALI_RG32_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32,
- MALI_RG32F = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_FLOAT,
- MALI_RGB8_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8,
- MALI_RGB16_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16,
- MALI_RGB32_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32,
- MALI_RGB32F = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_FLOAT,
- MALI_RGBA4_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_4,
- MALI_RGBA8_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8,
- MALI_RGBA16_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16,
- MALI_RGBA32_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32,
- MALI_RGBA32F = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_FLOAT,
-
- MALI_R8I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8,
- MALI_R16I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16,
- MALI_R32I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32,
- MALI_R16F = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_FLOAT,
- MALI_RG8I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8,
- MALI_RG16I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16,
- MALI_RG32I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32,
- MALI_RG16F = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_FLOAT,
- MALI_RGB8I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8,
- MALI_RGB16I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16,
- MALI_RGB32I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32,
- MALI_RGB16F = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_FLOAT,
- MALI_RGBA8I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8,
- MALI_RGBA16I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16,
- MALI_RGBA32I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32,
- MALI_RGBA16F = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_FLOAT,
-
- MALI_RGBA4 = MALI_FORMAT_SPECIAL2 | 0x8,
- MALI_RGBA8_2 = MALI_FORMAT_SPECIAL2 | 0xd,
- MALI_RGB10_A2_2 = MALI_FORMAT_SPECIAL2 | 0xe,
-};
-
-
-/* Alpha coverage is encoded as 4 bits (from a clampf), with inversion
- * literally performing a bitwise invert. This macro produces slightly wrong
- * results and I'm not sure why; some rounding issue I suppose... */
-
-#define MALI_ALPHA_COVERAGE(clampf) ((uint16_t) (int) (clampf * 15.0f))
-#define MALI_GET_ALPHA_COVERAGE(nibble) ((float) nibble / 15.0f)
-
-/* Applies to midgard1.flags */
-
-/* Should the hardware perform early-Z testing? Normally should be set
- * for performance reasons. Clear if you use: discard,
- * alpha-to-coverage... It's also possible this disables
- * forward-pixel kill; we're not quite sure which bit is which yet.
- * TODO: How does this interact with blending? */
-
-#define MALI_EARLY_Z (1 << 6)
-
-/* Should the hardware calculate derivatives (via helper invocations)? Set in a
- * fragment shader that uses texturing or derivative functions */
-
-#define MALI_HELPER_INVOCATIONS (1 << 7)
-
-/* Flags denoting the fragment shader's use of tilebuffer readback. If the
- * shader might read any part of the tilebuffer, set MALI_READS_TILEBUFFER. If
- * it might read depth/stencil in particular, also set MALI_READS_ZS */
-
-#define MALI_READS_ZS (1 << 8)
-#define MALI_READS_TILEBUFFER (1 << 12)
-
-/* The raw Midgard blend payload can either be an equation or a shader
- * address, depending on the context */
-
-union midgard_blend {
- mali_ptr shader;
-
- struct {
- struct mali_blend_equation equation;
- float constant;
- };
-};
-
-/* On MRT Midgard systems (using an MFBD), each render target gets its own
- * blend descriptor */
-
-#define MALI_BLEND_SRGB (0x400)
-
-struct midgard_blend_rt {
- /* Flags base value of 0x200 to enable the render target.
- * OR with 0x1 for blending (anything other than REPLACE).
- * OR with 0x2 for programmable blending with 0-2 registers
- * OR with 0x3 for programmable blending with 2+ registers
- * OR with MALI_BLEND_SRGB for implicit sRGB
- */
-
- u64 flags;
- union midgard_blend blend;
-} __attribute__((packed));
-
-/* On Bifrost systems (all MRT), each render target gets one of these
- * descriptors */
-
-struct bifrost_blend_rt {
- /* This is likely an analogue of the flags on
- * midgard_blend_rt */
-
- u16 flags; // = 0x200
-
- /* Single-channel blend constants are encoded in a sort of
- * fixed-point. Basically, the float is mapped to a byte, becoming
- * a high byte, and then the lower-byte is added for precision.
- * For the original float f:
- *
- * f = (constant_hi / 255) + (constant_lo / 65535)
- *
- * constant_hi = int(f * 255)
- * constant_lo = 65535*f - (65535/255) * constant_hi
- *
- * (An encode/decode sketch follows this struct.)
- */
-
- u16 constant;
-
- struct mali_blend_equation equation;
- /*
- * - 0x19 normally
- * - 0x3 when this slot is unused (everything else is 0 except the index)
- * - 0x11 when this is the fourth slot (and it's used)
- * - 0 when there is a blend shader
- */
- u16 unk2;
- /* increments from 0 to 3 */
- u16 index;
-
- union {
- struct {
- /* So far, I've only seen:
- * - R001 for 1-component formats
- * - RG01 for 2-component formats
- * - RGB1 for 3-component formats
- * - RGBA for 4-component formats
- */
- u32 swizzle : 12;
- enum mali_format format : 8;
-
- /* Type of the shader output variable. Note, this can
- * be different from the format.
- *
- * 0: f16 (mediump float)
- * 1: f32 (highp float)
- * 2: i32 (highp int)
- * 3: u32 (highp uint)
- * 4: i16 (mediump int)
- * 5: u16 (mediump uint)
- */
- u32 shader_type : 3;
- u32 zero : 9;
- };
-
- /* Only the low 32 bits of the blend shader are stored, the
- * high 32 bits are implicitly the same as the original shader.
- * According to the kernel driver, the program counter for
- * shaders is actually only 24 bits, so shaders cannot cross
- * the 2^24-byte boundary, and neither can the blend shader.
- * The blob handles this by allocating a 2^24 byte pool for
- * shaders, and making sure that any blend shaders are stored
- * in the same pool as the original shader. The kernel will
- * make sure this allocation is aligned to 2^24 bytes.
- */
- u32 shader;
- };
-} __attribute__((packed));
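-
-/* A minimal sketch of the blend-constant encoding described in
- * bifrost_blend_rt above. The helper names are illustrative (not from the
- * driver), and placing the coarse byte in the high half of the u16 is an
- * assumption based on that comment. */
-
-static inline u16
-encode_blend_constant(float f)
-{
- /* High byte: f mapped onto [0, 255] */
- unsigned hi = (unsigned) (f * 255.0f);
- /* Low byte: leftover precision out of 65535 (65535 / 255 == 257) */
- unsigned lo = (unsigned) (f * 65535.0f - hi * 257.0f);
- if (lo > 255)
- lo = 255; /* keep the correction within the low byte */
- return (u16) ((hi << 8) | lo);
-}
-
-static inline float
-decode_blend_constant(u16 constant)
-{
- return ((constant >> 8) / 255.0f) + ((constant & 0xff) / 65535.0f);
-}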
-
-/* Descriptor for the shader. Following this is at least one, up to four blend
- * descriptors for each active render target */
-
-struct mali_shader_meta {
- mali_ptr shader;
- u16 texture_count;
- u16 sampler_count;
- u16 attribute_count;
- u16 varying_count;
-
- union {
- struct {
- u32 uniform_buffer_count : 4;
- u32 unk1 : 28; // = 0x800000 for vertex, 0x958020 for tiler
- } bifrost1;
- struct {
- unsigned uniform_buffer_count : 4;
- unsigned flags : 12;
-
- /* Whole number of uniform registers used, times two;
- * whole number of work registers used (no scale).
- */
- unsigned work_count : 5;
- unsigned uniform_count : 5;
- unsigned unknown2 : 6;
- } midgard1;
- };
-
- /* On bifrost: Exactly the same as glPolygonOffset() for both.
- * On midgard: Depth factor is exactly as passed to glPolygonOffset.
- * Depth units is equal to the units value passed to glPolygonOffset + 1.0f
- * (use MALI_NEGATIVE)
- */
- float depth_units;
- float depth_factor;
-
- u32 unknown2_2;
-
- u16 alpha_coverage;
- u16 unknown2_3;
-
- u8 stencil_mask_front;
- u8 stencil_mask_back;
- u16 unknown2_4;
-
- struct mali_stencil_test stencil_front;
- struct mali_stencil_test stencil_back;
-
- union {
- struct {
- u32 unk3 : 7;
- /* On Bifrost, some system values are preloaded in
- * registers R55-R62 by the thread dispatcher prior to
- * the start of shader execution. This is a bitfield
- * with one entry for each register saying which
- * registers need to be preloaded. Right now, the known
- * values are:
- *
- * Vertex/compute:
- * - R55 : gl_LocalInvocationID.xy
- * - R56 : gl_LocalInvocationID.z + unknown in high 16 bits
- * - R57 : gl_WorkGroupID.x
- * - R58 : gl_WorkGroupID.y
- * - R59 : gl_WorkGroupID.z
- * - R60 : gl_GlobalInvocationID.x
- * - R61 : gl_GlobalInvocationID.y/gl_VertexID (without base)
- * - R62 : gl_GlobalInvocationID.z/gl_InstanceID (without base)
- *
- * Fragment:
- * - R55 : unknown, never seen (but the bit for this is
- * always set?)
- * - R56 : unknown (bit always unset)
- * - R57 : gl_PrimitiveID
- * - R58 : gl_FrontFacing in low bit, potentially other stuff
- * - R59 : u16 fragment coordinates (used to compute
- * gl_FragCoord.xy, together with sample positions)
- * - R60 : gl_SampleMask (used in epilog, so pretty
- * much always used, but the bit is always 0 -- is
- * this just always pushed?)
- * - R61 : gl_SampleMaskIn and gl_SampleID, used by
- * varying interpolation.
- * - R62 : unknown (bit always unset).
- */
- u32 preload_regs : 8;
- /* In units of 8 bytes or 64 bits, since the
- * uniform/const port loads 64 bits at a time.
- */
- u32 uniform_count : 7;
- u32 unk4 : 10; // = 2
- } bifrost2;
- struct {
- u32 unknown2_7;
- } midgard2;
- };
-
- /* zero on bifrost */
- u32 unknown2_8;
-
- /* Blending information for the older non-MRT Midgard HW. Check for
- * MALI_HAS_BLEND_SHADER to decide how to interpret.
- */
-
- union midgard_blend blend;
-} __attribute__((packed));
-
-/* This only concerns hardware jobs */
-
-/* Possible values for job_descriptor_size */
-
-#define MALI_JOB_32 0
-#define MALI_JOB_64 1
-
-struct mali_job_descriptor_header {
- u32 exception_status;
- u32 first_incomplete_task;
- u64 fault_pointer;
- u8 job_descriptor_size : 1;
- enum mali_job_type job_type : 7;
- u8 job_barrier : 1;
- u8 unknown_flags : 7;
- u16 job_index;
- u16 job_dependency_index_1;
- u16 job_dependency_index_2;
-
- union {
- u64 next_job_64;
- u32 next_job_32;
- };
-} __attribute__((packed));
-
-struct mali_payload_set_value {
- u64 out;
- u64 unknown;
-} __attribute__((packed));
-
-/* Special attributes have a fixed index */
-#define MALI_SPECIAL_ATTRIBUTE_BASE 16
-#define MALI_VERTEX_ID (MALI_SPECIAL_ATTRIBUTE_BASE + 0)
-#define MALI_INSTANCE_ID (MALI_SPECIAL_ATTRIBUTE_BASE + 1)
-
-/*
- * Mali Attributes
- *
- * This structure lets the attribute unit compute the address of an attribute
- * given the vertex and instance ID. Unfortunately, the way this works is
- * rather complicated when instancing is enabled.
- *
- * To explain this, first we need to explain how compute and vertex threads are
- * dispatched. This is a guess (although a pretty firm guess!) since the
- * details are mostly hidden from the driver, except for attribute instancing.
- * When a quad is dispatched, it receives a single, linear index. However, we
- * need to translate that index into a (vertex id, instance id) pair, or a
- * (local id x, local id y, local id z) triple for compute shaders (although
- * vertex shaders and compute shaders are handled almost identically).
- * Focusing on vertex shaders, one option would be to do:
- *
- * vertex_id = linear_id % num_vertices
- * instance_id = linear_id / num_vertices
- *
- * but this involves a costly division and modulus by an arbitrary number.
- * Instead, we could pad num_vertices. We dispatch padded_num_vertices *
- * num_instances threads instead of num_vertices * num_instances, which results
- * in some "extra" threads with vertex_id >= num_vertices, which we have to
- * discard. The more we pad num_vertices, the more "wasted" threads we
- * dispatch, but the division is potentially easier.
- *
- * One straightforward choice is to pad num_vertices to the next power of two,
- * which means that the division and modulus are just simple bit shifts and
- * masking. But the actual algorithm is a bit more complicated. The thread
- * dispatcher has special support for dividing by 3, 5, 7, and 9, in addition
- * to dividing by a power of two. This is possibly using the technique
- * described in patent US20170010862A1. As a result, padded_num_vertices can be
- * 1, 3, 5, 7, or 9 times a power of two. This results in less wasted threads,
- * since we need less padding.
- *
- * padded_num_vertices is picked by the hardware. The driver just specifies the
- * actual number of vertices. At least for Mali G71, the first few cases are
- * given by:
- *
- * num_vertices | padded_num_vertices
- * 3 | 4
- * 4-7 | 8
- * 8-11 | 12 (3 * 4)
- * 12-15 | 16
- * 16-19 | 20 (5 * 4)
- *
- * Note that padded_num_vertices is a multiple of four (presumably because
- * threads are dispatched in groups of 4). Also, padded_num_vertices is always
- * at least one more than num_vertices, which seems like a quirk of the
- * hardware. For larger num_vertices, the hardware uses the following
- * algorithm: using the binary representation of num_vertices, we look at the
- * most significant set bit as well as the following 3 bits. Let n be the
- * number of bits after those 4 bits. Then we set padded_num_vertices according
- * to the following table:
- *
- * high bits | padded_num_vertices
- * 1000 | 9 * 2^n
- * 1001 | 5 * 2^(n+1)
- * 101x | 3 * 2^(n+2)
- * 110x | 7 * 2^(n+1)
- * 111x | 2^(n+4)
- *
- * For example, if num_vertices = 70 is passed to glDraw(), its binary
- * representation is 1000110, so n = 3 and the high bits are 1000, and
- * therefore padded_num_vertices = 9 * 2^3 = 72.
- *
- * The attribute unit works in terms of the original linear_id. if
- * num_instances = 1, then they are the same, and everything is simple.
- * However, with instancing things get more complicated. There are four
- * possible modes, two of them we can group together:
- *
- * 1. Use the linear_id directly. Only used when there is no instancing.
- *
- * 2. Use the linear_id modulo a constant. This is used for per-vertex
- * attributes with instancing enabled by making the constant equal to
- * padded_num_vertices. Because the modulus is always padded_num_vertices, this
- * mode only supports a modulus that is a power of 2 times 1, 3, 5, 7, or 9.
- * The shift field specifies the power of two, while the extra_flags field
- * specifies the odd number. If shift = n and extra_flags = m, then the modulus
- * is (2m + 1) * 2^n. As an example, if num_vertices = 70, then as computed
- * above, padded_num_vertices = 9 * 2^3, so we should set extra_flags = 4 and
- * shift = 3. Note that we must exactly follow the hardware algorithm used to
- * get padded_num_vertices in order to correctly implement per-vertex
- * attributes.
- *
- * 3. Divide the linear_id by a constant. In order to correctly implement
- * instance divisors, we have to divide linear_id by padded_num_vertices times
- * the user-specified divisor. So first we compute padded_num_vertices, again
- * following the exact same algorithm that the hardware uses, then multiply it
- * by the GL-level divisor to get the hardware-level divisor. This case is
- * further divided into two more cases. If the hardware-level divisor is a
- * power of two, then we just need to shift. The shift amount is specified by
- * the shift field, so that the hardware-level divisor is just 2^shift.
- *
- * If it isn't a power of two, then we have to divide by an arbitrary integer.
- * For that, we use the well-known technique of multiplying by an approximation
- * of the inverse. The driver must compute the magic multiplier and shift
- * amount, and then the hardware does the multiplication and shift. The
- * hardware and driver also use the "round-down" optimization as described in
- * http://ridiculousfish.com/files/faster_unsigned_division_by_constants.pdf.
- * The hardware further assumes the multiplier is between 2^31 and 2^32, so the
- * high bit is implicitly set to 1 even though it is set to 0 by the driver --
- * presumably this simplifies the hardware multiplier a little. The hardware
- * first multiplies linear_id by the multiplier and takes the high 32 bits,
- * then applies the round-down correction if extra_flags = 1, then finally
- * shifts right by the shift field.
- *
- * There are some differences between ridiculousfish's algorithm and the Mali
- * hardware algorithm, which means that the reference code from ridiculousfish
- * doesn't always produce the right constants. Mali does not use the pre-shift
- * optimization, since that would make a hardware implementation slower (it
- * would have to always do the pre-shift, multiply, and post-shift operations).
- * It also forces the multiplier to be at least 2^31, which means that the
- * exponent is entirely fixed, so there is no trial-and-error. Altogether,
- * given the divisor d, the algorithm the driver must follow is:
- *
- * 1. Set shift = floor(log2(d)).
- * 2. Compute m = ceil(2^(shift + 32) / d) and e = 2^(shift + 32) % d.
- * 3. If e <= 2^shift, then we need to use the round-down algorithm. Set
- * magic_divisor = m - 1 and extra_flags = 1.
- * 4. Otherwise, set magic_divisor = m and extra_flags = 0.
- */
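-
-/* A minimal sketch of the two computations described above. These helpers
- * are illustrative only: the small-count branch is read off the
- * first-few-cases table, the large-count rule follows the high-bits table,
- * and the NPOT divisor setup follows steps 1-4. Counts and divisors are
- * assumed nonzero and small enough for the results to fit in 32 bits. */
-
-static inline uint32_t
-pad_vertex_count(uint32_t num_vertices)
-{
- /* First few cases: round up to a multiple of four, at least one more
- * than num_vertices */
- if (num_vertices < 20)
- return (num_vertices + 4) & ~3u;
-
- /* Larger counts: keep the top four significant bits, pad per the table */
- unsigned msb = 31 - __builtin_clz(num_vertices);
- unsigned n = msb - 3; /* bits below the top four */
- unsigned high = num_vertices >> n; /* 0b1000 .. 0b1111 */
-
- switch (high) {
- case 0x8: return 9u << n;
- case 0x9: return 5u << (n + 1);
- case 0xa: case 0xb: return 3u << (n + 2);
- case 0xc: case 0xd: return 7u << (n + 1);
- default: return 1u << (n + 4); /* 111x */
- }
-}
-
-/* Steps 1-4 above: derive shift, magic_divisor and extra_flags for the
- * NPOT-divide case from the hardware-level divisor d. */
-static inline void
-npot_divide_magic(uint32_t d, uint32_t *shift, uint32_t *magic_divisor,
- uint32_t *extra_flags)
-{
- uint32_t s = 31 - __builtin_clz(d); /* 1. shift = floor(log2(d)) */
- uint64_t num = 1ull << (s + 32);
- uint64_t m = (num + d - 1) / d; /* 2. m = ceil(2^(shift + 32) / d) */
- uint64_t e = num % d; /* e = 2^(shift + 32) % d */
-
- if (e <= (1ull << s)) { /* 3. round-down variant */
- *magic_divisor = (uint32_t) (m - 1);
- *extra_flags = 1;
- } else { /* 4. */
- *magic_divisor = (uint32_t) m;
- *extra_flags = 0;
- }
- *shift = s;
-}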
-
-enum mali_attr_mode {
- MALI_ATTR_UNUSED = 0,
- MALI_ATTR_LINEAR = 1,
- MALI_ATTR_POT_DIVIDE = 2,
- MALI_ATTR_MODULO = 3,
- MALI_ATTR_NPOT_DIVIDE = 4,
-};
-
-/* This magic "pseudo-address" is used as `elements` to implement
- * gl_PointCoord. When read from a fragment shader, it generates a point
- * coordinate per the OpenGL ES 2.0 specification. Flipped coordinate spaces
- * require an affine transformation in the shader. */
-
-#define MALI_VARYING_POINT_COORD (0x60)
-
-union mali_attr {
- /* This is used for actual attributes. */
- struct {
- /* The bottom 3 bits are the mode */
- mali_ptr elements : 64 - 8;
- u32 shift : 5;
- u32 extra_flags : 3;
- u32 stride;
- u32 size;
- };
- /* The entry after an NPOT_DIVIDE entry has this format. It stores
- * extra information that wouldn't fit in a normal entry.
- */
- struct {
- u32 unk; /* = 0x20 */
- u32 magic_divisor;
- u32 zero;
- /* This is the original, GL-level divisor. */
- u32 divisor;
- };
-} __attribute__((packed));
-
-struct mali_attr_meta {
- /* Vertex buffer index */
- u8 index;
-
- unsigned unknown1 : 2;
- unsigned swizzle : 12;
- enum mali_format format : 8;
-
- /* Always observed to be zero at the moment */
- unsigned unknown3 : 2;
-
- /* When packing multiple attributes in a buffer, offset addresses by
- * this value. Obscurely, this is signed. */
- int32_t src_offset;
-} __attribute__((packed));
-
-enum mali_fbd_type {
- MALI_SFBD = 0,
- MALI_MFBD = 1,
-};
-
-#define FBD_TYPE (1)
-#define FBD_MASK (~0x3f)
-
-struct mali_uniform_buffer_meta {
- /* This is actually the size minus 1 (MALI_POSITIVE), in units of 16
- * bytes. This gives a maximum of 2^14 bytes, which just so happens to
- * be the GL minimum-maximum for GL_MAX_UNIFORM_BLOCK_SIZE. (A packing
- * sketch follows this struct.)
- */
- u64 size : 10;
-
- /* This is missing the bottom 2 bits and top 8 bits. The top 8 bits
- * should be 0 for userspace pointers, according to
- * https://lwn.net/Articles/718895/. By reusing these bits, we can make
- * each entry in the table only 64 bits.
- */
- mali_ptr ptr : 64 - 10;
-};
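-
-/* A minimal sketch of filling one entry per the encoding above: the size is
- * stored minus one in 16-byte units (i.e. MALI_POSITIVE of the 16-byte
- * count) and the pointer is stored with its bottom two bits dropped. The
- * helper name is illustrative; size_bytes is assumed to be a multiple of 16. */
-static inline struct mali_uniform_buffer_meta
-make_ubo_entry(mali_ptr gpu_va, unsigned size_bytes)
-{
- struct mali_uniform_buffer_meta ubo = {
- .size = (size_bytes / 16) - 1,
- .ptr = gpu_va >> 2,
- };
- return ubo;
-}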
-
-/* On Bifrost, these fields are the same between the vertex and tiler payloads.
- * They also seem to be the same between Bifrost and Midgard. They're shared in
- * fused payloads.
- */
-
-/* Applies to unknown_draw */
-
-#define MALI_DRAW_INDEXED_UINT8 (0x10)
-#define MALI_DRAW_INDEXED_UINT16 (0x20)
-#define MALI_DRAW_INDEXED_UINT32 (0x30)
-#define MALI_DRAW_VARYING_SIZE (0x100)
-#define MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX (0x10000)
-
-struct mali_vertex_tiler_prefix {
- /* This is a dynamic bitfield containing the following things in this order:
- *
- * - gl_WorkGroupSize.x
- * - gl_WorkGroupSize.y
- * - gl_WorkGroupSize.z
- * - gl_NumWorkGroups.x
- * - gl_NumWorkGroups.y
- * - gl_NumWorkGroups.z
- *
- * The number of bits allocated for each number is based on the *_shift
- * fields below. For example, workgroups_y_shift gives the bit that
- * gl_NumWorkGroups.y starts at, and workgroups_z_shift gives the bit
- * that gl_NumWorkGroups.z starts at (and therefore one after the bit
- * that gl_NumWorkGroups.y ends at). The actual value for each gl_*
- * value is one more than the stored value, since if any of the values
- * are zero, then there would be no invocations (and hence no job). If
- * there were 0 bits allocated to a given field, then it must be zero,
- * and hence the real value is one.
- *
- * Vertex jobs reuse the same job dispatch mechanism as compute jobs,
- * effectively doing glDispatchCompute(1, vertex_count, instance_count)
- * where vertex_count is the number of vertices. (A packing sketch for
- * this bitfield follows the struct.)
- */
- u32 invocation_count;
-
- u32 size_y_shift : 5;
- u32 size_z_shift : 5;
- u32 workgroups_x_shift : 6;
- u32 workgroups_y_shift : 6;
- u32 workgroups_z_shift : 6;
- /* This is max(workgroups_x_shift, 2) in all the cases I've seen. */
- u32 workgroups_x_shift_2 : 4;
-
- u32 draw_mode : 4;
- u32 unknown_draw : 22;
-
- /* This is the same as workgroups_x_shift_2 in compute shaders, but
- * always 5 for vertex jobs and 6 for tiler jobs. I suspect this has
- * something to do with how many quads get put in the same execution
- * engine, which is a balance (you don't want to starve the engine, but
- * you also want to distribute work evenly).
- */
- u32 workgroups_x_shift_3 : 6;
-
-
- /* Negative of draw_start for TILER jobs from what I've seen */
- int32_t negative_start;
- u32 zero1;
-
- /* Like many other strictly nonzero quantities, index_count is
- * subtracted by one. For an indexed cube, this is equal to 35 = 6
- * faces * 2 triangles per face * 3 vertices per triangle - 1. That is,
- * for an indexed draw, index_count is the number of actual vertices
- * rendered whereas invocation_count is the number of unique vertices
- * rendered (the number of times the vertex shader must be invoked).
- * For non-indexed draws, this is just equal to invocation_count. */
-
- u32 index_count;
-
- /* No hidden structure; literally just a pointer to an array of uint
- * indices (width depends on flags). Thanks, guys, for not making my
- * life insane for once! NULL for non-indexed draws. */
-
- uintptr_t indices;
-} __attribute__((packed));
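-
-/* A minimal sketch of packing the dynamic bitfield described inside
- * mali_vertex_tiler_prefix above, assuming the *_shift fields have already
- * been chosen wide enough for each value. The helper name is illustrative. */
-static inline void
-pack_invocation_count(struct mali_vertex_tiler_prefix *prefix,
- u32 size_x, u32 size_y, u32 size_z,
- u32 groups_x, u32 groups_y, u32 groups_z)
-{
- /* Each quantity is stored off-by-one; gl_WorkGroupSize.x starts at bit 0
- * and every other field starts at the bit named by its *_shift field. */
- prefix->invocation_count =
- (size_x - 1) |
- ((size_y - 1) << prefix->size_y_shift) |
- ((size_z - 1) << prefix->size_z_shift) |
- ((groups_x - 1) << prefix->workgroups_x_shift) |
- ((groups_y - 1) << prefix->workgroups_y_shift) |
- ((groups_z - 1) << prefix->workgroups_z_shift);
-}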
-
-/* Point size / line width can either be specified as a 32-bit float (for
- * constant size) or as a [machine word size]-bit GPU pointer (for varying size). If a pointer
- * is selected, by setting the appropriate MALI_DRAW_VARYING_SIZE bit in the tiler
- * payload, the contents of varying_pointer will be interpreted as an array of
- * fp16 sizes, one for each vertex. gl_PointSize is therefore implemented by
- * creating a special MALI_R16F varying writing to varying_pointer. */
-
-union midgard_primitive_size {
- float constant;
- uintptr_t pointer;
-};
-
-struct bifrost_vertex_only {
- u32 unk2; /* =0x2 */
-
- u32 zero0;
-
- u64 zero1;
-} __attribute__((packed));
-
-struct bifrost_tiler_heap_meta {
- u32 zero;
- u32 heap_size;
- /* note: these are just guesses! */
- mali_ptr tiler_heap_start;
- mali_ptr tiler_heap_free;
- mali_ptr tiler_heap_end;
-
- /* hierarchy weights? but they're still 0 after the job has run... */
- u32 zeros[12];
-} __attribute__((packed));
-
-struct bifrost_tiler_meta {
- u64 zero0;
- u16 hierarchy_mask;
- u16 flags;
- u16 width;
- u16 height;
- u64 zero1;
- mali_ptr tiler_heap_meta;
- /* TODO what is this used for? */
- u64 zeros[20];
-} __attribute__((packed));
-
-struct bifrost_tiler_only {
- /* 0x20 */
- union midgard_primitive_size primitive_size;
-
- mali_ptr tiler_meta;
-
- u64 zero1, zero2, zero3, zero4, zero5, zero6;
-
- u32 gl_enables;
- u32 zero7;
- u64 zero8;
-} __attribute__((packed));
-
-struct bifrost_scratchpad {
- u32 zero;
- u32 flags; // = 0x1f
- /* This is a pointer to a CPU-inaccessible buffer, 16 pages, allocated
- * during startup. It seems to serve the same purpose as the
- * gpu_scratchpad in the SFBD for Midgard, although it's slightly
- * larger.
- */
- mali_ptr gpu_scratchpad;
-} __attribute__((packed));
-
-struct mali_vertex_tiler_postfix {
- /* Zero for vertex jobs. Pointer to the position (gl_Position) varying
- * output from the vertex shader for tiler jobs.
- */
-
- uintptr_t position_varying;
-
- /* An array of mali_uniform_buffer_meta's. The size is given by the
- * shader_meta.
- */
- uintptr_t uniform_buffers;
-
- /* This is a pointer to an array of pointers to the texture
- * descriptors, number of pointers bounded by number of textures. The
- * indirection is needed to accommodate varying numbers and sizes of
- * texture descriptors */
- uintptr_t texture_trampoline;
-
- /* For OpenGL, from what I've seen, this is intimately connected to
- * texture_meta. cwabbott says this is not the case under Vulkan, hence
- * why this field is separate (Midgard is Vulkan capable). Pointer to
- * array of sampler descriptors (which are uniform in size) */
- uintptr_t sampler_descriptor;
-
- uintptr_t uniforms;
- u8 flags : 4;
- uintptr_t _shader_upper : MALI_SHORT_PTR_BITS - 4; /* struct shader_meta */
- uintptr_t attributes; /* struct attribute_buffer[] */
- uintptr_t attribute_meta; /* attribute_meta[] */
- uintptr_t varyings; /* struct attr */
- uintptr_t varying_meta; /* pointer */
- uintptr_t viewport;
- uintptr_t occlusion_counter; /* A single bit as far as I can tell */
-
- /* Note: on Bifrost, this isn't actually the FBD. It points to
- * bifrost_scratchpad instead. However, it does point to the same thing
- * in vertex and tiler jobs.
- */
- mali_ptr framebuffer;
-} __attribute__((packed));
-
-struct midgard_payload_vertex_tiler {
-#ifndef __LP64__
- union midgard_primitive_size primitive_size;
-#endif
-
- struct mali_vertex_tiler_prefix prefix;
-
-#ifndef __LP64__
- u32 zero3;
-#endif
-
- u16 gl_enables; // 0x5
-
- /* Both zero for non-instanced draws. For instanced draws, a
- * decomposition of padded_num_vertices. See the comments about the
- * corresponding fields in mali_attr for context. */
-
- unsigned instance_shift : 5;
- unsigned instance_odd : 3;
-
- u8 zero4;
-
- /* Offset for first vertex in buffer */
- u32 draw_start;
-
- uintptr_t zero5;
-
- struct mali_vertex_tiler_postfix postfix;
-
-#ifdef __LP64__
- union midgard_primitive_size primitive_size;
-#endif
-} __attribute__((packed));
-
-struct bifrost_payload_vertex {
- struct mali_vertex_tiler_prefix prefix;
- struct bifrost_vertex_only vertex;
- struct mali_vertex_tiler_postfix postfix;
-} __attribute__((packed));
-
-struct bifrost_payload_tiler {
- struct mali_vertex_tiler_prefix prefix;
- struct bifrost_tiler_only tiler;
- struct mali_vertex_tiler_postfix postfix;
-} __attribute__((packed));
-
-struct bifrost_payload_fused {
- struct mali_vertex_tiler_prefix prefix;
- struct bifrost_tiler_only tiler;
- struct mali_vertex_tiler_postfix tiler_postfix;
- u64 padding; /* zero */
- struct bifrost_vertex_only vertex;
- struct mali_vertex_tiler_postfix vertex_postfix;
-} __attribute__((packed));
-
-/* Purposeful off-by-one in width, height fields. For example, a (64, 64)
- * texture is stored as (63, 63) in these fields. This adjusts for that.
- * There's an identical pattern in the framebuffer descriptor. Even vertex
- * count fields work this way, hence the generic name -- integral fields that
- * are strictly positive generally need this adjustment. */
-
-#define MALI_POSITIVE(dim) (dim - 1)
-
-/* Opposite of MALI_POSITIVE, found in the depth_units field */
-
-#define MALI_NEGATIVE(dim) (dim + 1)
-
-/* Used with wrapping. Incomplete (this is a 4-bit field...) */
-
-enum mali_wrap_mode {
- MALI_WRAP_REPEAT = 0x8,
- MALI_WRAP_CLAMP_TO_EDGE = 0x9,
- MALI_WRAP_CLAMP_TO_BORDER = 0xB,
- MALI_WRAP_MIRRORED_REPEAT = 0xC
-};
-
-/* Shared across both command stream and Midgard, and even with Bifrost */
-
-enum mali_texture_type {
- MALI_TEX_CUBE = 0x0,
- MALI_TEX_1D = 0x1,
- MALI_TEX_2D = 0x2,
- MALI_TEX_3D = 0x3
-};
-
-/* 8192x8192 */
-#define MAX_MIP_LEVELS (13)
-
-/* Cubemap bloats everything up */
-#define MAX_CUBE_FACES (6)
-
-/* For each pointer, there is an address and optionally also a stride */
-#define MAX_ELEMENTS (2)
-
-/* Corresponds to the type passed to glTexImage2D and so forth */
-
-/* Flags for usage2 */
-#define MALI_TEX_MANUAL_STRIDE (0x20)
-
-struct mali_texture_format {
- unsigned swizzle : 12;
- enum mali_format format : 8;
-
- unsigned srgb : 1;
- unsigned unknown1 : 1;
-
- enum mali_texture_type type : 2;
-
- unsigned usage2 : 8;
-} __attribute__((packed));
-
-struct mali_texture_descriptor {
- uint16_t width;
- uint16_t height;
- uint16_t depth;
- uint16_t array_size;
-
- struct mali_texture_format format;
-
- uint16_t unknown3;
-
- /* One for non-mipmapped, zero for mipmapped */
- uint8_t unknown3A;
-
- /* Zero for non-mipmapped, (number of levels - 1) for mipmapped */
- uint8_t nr_mipmap_levels;
-
- /* Swizzling is a single 32-bit word, broken up here for convenience.
- * Here, swizzling refers to the ES 3.0 texture parameters for channel
- * level swizzling, not the internal pixel-level swizzling which is
- * below OpenGL's reach */
-
- unsigned swizzle : 12;
- unsigned swizzle_zero : 20;
-
- uint32_t unknown5;
- uint32_t unknown6;
- uint32_t unknown7;
-
- mali_ptr payload[MAX_MIP_LEVELS * MAX_CUBE_FACES * MAX_ELEMENTS];
-} __attribute__((packed));
-
-/* Used as part of filter_mode */
-
-#define MALI_LINEAR 0
-#define MALI_NEAREST 1
-#define MALI_MIP_LINEAR (0x18)
-
-/* Used to construct low bits of filter_mode */
-
-#define MALI_TEX_MAG(mode) (((mode) & 1) << 0)
-#define MALI_TEX_MIN(mode) (((mode) & 1) << 1)
-
-#define MALI_TEX_MAG_MASK (1)
-#define MALI_TEX_MIN_MASK (2)
-
-#define MALI_FILTER_NAME(filter) (filter ? "MALI_NEAREST" : "MALI_LINEAR")
-
-/* Used for lod encoding. Thanks @urjaman for pointing out these routines can
- * be cleaned up a lot. */
-
-#define DECODE_FIXED_16(x) ((float) (x / 256.0))
-
-static inline uint16_t
-FIXED_16(float x)
-{
- /* Clamp inputs, accounting for float error */
- float max_lod = (32.0 - (1.0 / 512.0));
-
- x = ((x > max_lod) ? max_lod : ((x < 0.0) ? 0.0 : x));
-
- return (int) (x * 256.0);
-}
-
-struct mali_sampler_descriptor {
- uint32_t filter_mode;
-
- /* Fixed point. The upper 8 bits are before the decimal point, capped
- * to [0, 31]. The lower 8 bits are after the decimal point: int(round(x *
- * 256)) */
-
- uint16_t min_lod;
- uint16_t max_lod;
-
- /* All one word in reality, but packed a bit */
-
- enum mali_wrap_mode wrap_s : 4;
- enum mali_wrap_mode wrap_t : 4;
- enum mali_wrap_mode wrap_r : 4;
- enum mali_alt_func compare_func : 3;
-
- /* No effect on 2D textures. For cubemaps, set for ES3 and clear for
- * ES2, controlling seamless cubemapping */
- unsigned seamless_cube_map : 1;
-
- unsigned zero : 16;
-
- uint32_t zero2;
- float border_color[4];
-} __attribute__((packed));
-
-/* viewport0/viewport1 form the arguments to glViewport. viewport1 is
- * modified by MALI_POSITIVE; viewport0 is as-is.
- */
-
-struct mali_viewport {
- /* XY clipping planes */
- float clip_minx;
- float clip_miny;
- float clip_maxx;
- float clip_maxy;
-
- /* Depth clipping planes */
- float clip_minz;
- float clip_maxz;
-
- u16 viewport0[2];
- u16 viewport1[2];
-} __attribute__((packed));
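-
-/* Illustrative sketch, and an assumption rather than something this file
- * states: for glViewport(x, y, w, h), viewport0 would plausibly hold the
- * lower corner and viewport1 the upper corner run through MALI_POSITIVE
- * (used elsewhere in this header for off-by-one fields):
- *
- * vp.viewport0[0] = x;
- * vp.viewport0[1] = y;
- * vp.viewport1[0] = MALI_POSITIVE(x + w);
- * vp.viewport1[1] = MALI_POSITIVE(y + h);
- */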
-
-/* From presentations, 16x16 tiles externally. Use shift for fast computation
- * of tile numbers. */
-
-#define MALI_TILE_SHIFT 4
-#define MALI_TILE_LENGTH (1 << MALI_TILE_SHIFT)
-
-/* Tile coordinates are stored as a compact u32, as only 12 bits are needed for
- * each component. Notice that this provides a theoretical upper bound of (1 <<
- * 12) = 4096 tiles in each direction, addressing a maximum framebuffer of size
- * 65536x65536. Multiplying that together, times another four given that Mali
- * framebuffers are 32-bit ARGB8888, means that this upper bound would take 16
- * gigabytes of RAM just to store the uncompressed framebuffer itself, let
- * alone rendering in real-time to such a buffer.
- *
- * Nice job, guys.*/
-
-/* From mali_kbase_10969_workaround.c */
-#define MALI_X_COORD_MASK 0x00000FFF
-#define MALI_Y_COORD_MASK 0x0FFF0000
-
-/* Extract parts of a tile coordinate */
-
-#define MALI_TILE_COORD_X(coord) ((coord) & MALI_X_COORD_MASK)
-#define MALI_TILE_COORD_Y(coord) (((coord) & MALI_Y_COORD_MASK) >> 16)
-#define MALI_TILE_COORD_FLAGS(coord) ((coord) & ~(MALI_X_COORD_MASK | MALI_Y_COORD_MASK))
-
-/* No known flags yet, but just in case...? */
-
-#define MALI_TILE_NO_FLAG (0)
-
-/* Helpers to generate tile coordinates based on the boundary coordinates in
- * screen space. So, with the bounds (0, 0) to (128, 128) for the screen, these
- * functions would convert it to the bounding tiles (0, 0) to (7, 7).
- * Intentional "off-by-one"; finding the tile number is a form of fencepost
- * problem. */
-
-#define MALI_MAKE_TILE_COORDS(X, Y) ((X) | ((Y) << 16))
-#define MALI_BOUND_TO_TILE(B, bias) ((B - bias) >> MALI_TILE_SHIFT)
-#define MALI_COORDINATE_TO_TILE(W, H, bias) MALI_MAKE_TILE_COORDS(MALI_BOUND_TO_TILE(W, bias), MALI_BOUND_TO_TILE(H, bias))
-#define MALI_COORDINATE_TO_TILE_MIN(W, H) MALI_COORDINATE_TO_TILE(W, H, 0)
-#define MALI_COORDINATE_TO_TILE_MAX(W, H) MALI_COORDINATE_TO_TILE(W, H, 1)
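-
-/* Worked example: for the (0, 0) to (128, 128) bounds mentioned above,
- * MALI_COORDINATE_TO_TILE_MIN(0, 0) packs tile (0, 0), and
- * MALI_COORDINATE_TO_TILE_MAX(128, 128) packs ((128 - 1) >> 4) = 7 on each
- * axis, i.e. tile (7, 7). MALI_TILE_COORD_X/Y recover the components from
- * the packed u32. */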
-
-struct mali_payload_fragment {
- u32 min_tile_coord;
- u32 max_tile_coord;
- mali_ptr framebuffer;
-} __attribute__((packed));
-
-/* Single Framebuffer Descriptor */
-
-/* Flags apply to format. With just MSAA_A and MSAA_B, the framebuffer is
- * configured for 4x. With MSAA_8, it is configured for 8x. */
-
-#define MALI_FRAMEBUFFER_MSAA_8 (1 << 3)
-#define MALI_FRAMEBUFFER_MSAA_A (1 << 4)
-#define MALI_FRAMEBUFFER_MSAA_B (1 << 23)
-
-/* Fast/slow based on whether all three buffers are cleared at once */
-
-#define MALI_CLEAR_FAST (1 << 18)
-#define MALI_CLEAR_SLOW (1 << 28)
-#define MALI_CLEAR_SLOW_STENCIL (1 << 31)
-
-/* Configures hierarchical tiling on Midgard for both SFBD/MFBD (embedded
- * within the larger framebuffer descriptor). Analogous to
- * bifrost_tiler_heap_meta and bifrost_tiler_meta*/
-
-struct midgard_tiler_descriptor {
- /* Size of the entire polygon list; see pan_tiler.c for the
- * computation. It's based on hierarchical tiling */
-
- u32 polygon_list_size;
-
- /* Name known from the replay workaround in the kernel. What exactly is
- * flagged here is less known. We do know that (tiler_hierarchy_mask & 0x1ff)
- * specifies a mask of hierarchy weights, which explains some of the
- * performance mysteries around setting it. We also see the bottom bit
- * of tiler_flags set in the kernel, but no comment why. */
-
- u16 hierarchy_mask;
- u16 flags;
-
- /* See mali_tiler.c for an explanation */
- mali_ptr polygon_list;
- mali_ptr polygon_list_body;
-
- /* Names based on the symmetry we see with replay jobs, which name these
- * explicitly */
-
- mali_ptr heap_start; /* tiler heap_free_address */
- mali_ptr heap_end;
-
- /* Hierarchy weights. We know these are weights based on the kernel,
- * but I've never seen them be anything other than zero */
- u32 weights[8];
-};
-
-struct mali_single_framebuffer {
- u32 unknown1;
- u32 unknown2;
- u64 unknown_address_0;
- u64 zero1;
- u64 zero0;
-
- /* Exact format is ironically not known, since EGL is finicky with the
- * blob. MSAA, colourspace, etc are configured here. */
-
- u32 format;
-
- u32 clear_flags;
- u32 zero2;
-
- /* Purposeful off-by-one in these fields should be accounted for by the
- * MALI_DIMENSION macro */
-
- u16 width;
- u16 height;
-
- u32 zero3[8];
-
- /* By default, the framebuffer is upside down from OpenGL's
- * perspective. Set framebuffer to the end and negate the stride to
- * flip in the Y direction */
-
- mali_ptr framebuffer;
- int32_t stride;
-
- u32 zero4;
-
- /* Depth and stencil buffers are interleaved, it appears, as they are
- * set to the same address in captures. Both fields set to zero if the
- * buffer is not being cleared. Depending on GL_ENABLE magic, you might
- * get a zero enable despite the buffer being present; in that case the
- * buffer is still treated as disabled. */
-
- mali_ptr depth_buffer; // not SAME_VA
- u64 depth_buffer_enable;
-
- mali_ptr stencil_buffer; // not SAME_VA
- u64 stencil_buffer_enable;
-
- u32 clear_color_1; // RGBA8888 from glClear, actually used by hardware
- u32 clear_color_2; // always equal, but unclear function?
- u32 clear_color_3; // always equal, but unclear function?
- u32 clear_color_4; // always equal, but unclear function?
-
- /* Set to zero if not cleared */
-
- float clear_depth_1; // float32, ditto
- float clear_depth_2; // float32, ditto
- float clear_depth_3; // float32, ditto
- float clear_depth_4; // float32, ditto
-
- u32 clear_stencil; // Exactly as it appears in OpenGL
-
- u32 zero6[7];
-
- struct midgard_tiler_descriptor tiler;
-
- /* More below this, maybe */
-} __attribute__((packed));
-
-/* On Midgard, this "framebuffer descriptor" is used for the framebuffer field
- * of compute jobs. Superficially resembles a single framebuffer descriptor */
-
-struct mali_compute_fbd {
- u32 unknown1[16];
-} __attribute__((packed));
-
-/* Format bits for the render target flags */
-
-#define MALI_MFBD_FORMAT_MSAA (1 << 1)
-#define MALI_MFBD_FORMAT_SRGB (1 << 2)
-
-enum mali_mfbd_block_format {
- MALI_MFBD_BLOCK_TILED = 0x0,
- MALI_MFBD_BLOCK_UNKNOWN = 0x1,
- MALI_MFBD_BLOCK_LINEAR = 0x2,
- MALI_MFBD_BLOCK_AFBC = 0x3,
-};
-
-struct mali_rt_format {
- unsigned unk1 : 32;
- unsigned unk2 : 3;
-
- unsigned nr_channels : 2; /* MALI_POSITIVE */
-
- unsigned unk3 : 5;
- enum mali_mfbd_block_format block : 2;
- unsigned flags : 4;
-
- unsigned swizzle : 12;
-
- unsigned unk4 : 4;
-} __attribute__((packed));
-
-struct bifrost_render_target {
- struct mali_rt_format format;
-
- u64 zero1;
-
- union {
- struct {
- /* Stuff related to ARM Framebuffer Compression. When AFBC is enabled,
- * there is an extra metadata buffer that contains 16 bytes per tile.
- * The framebuffer needs to be the same size as before, since we don't
- * know ahead of time how much space it will take up. The
- * framebuffer_stride is set to 0, since the data isn't stored linearly
- * anymore.
- */
-
- mali_ptr metadata;
- u32 stride; // stride in units of tiles
- u32 unk; // = 0x20000
- } afbc;
-
- struct {
- /* Heck if I know */
- u64 unk;
- mali_ptr pointer;
- } chunknown;
- };
-
- mali_ptr framebuffer;
-
- u32 zero2 : 4;
- u32 framebuffer_stride : 28; // in units of bytes
- u32 zero3;
-
- u32 clear_color_1; // RGBA8888 from glClear, actually used by hardware
- u32 clear_color_2; // always equal, but unclear function?
- u32 clear_color_3; // always equal, but unclear function?
- u32 clear_color_4; // always equal, but unclear function?
-} __attribute__((packed));
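-
-/* Rough sizing sketch, not from the original header: the AFBC comment above
- * gives 16 bytes of metadata per tile. Assuming those tiles are the same
- * 16x16 pixel tiles as MALI_TILE_LENGTH (an assumption), a metadata
- * allocation would look like:
- *
- * unsigned tiles_x = (width + MALI_TILE_LENGTH - 1) / MALI_TILE_LENGTH;
- * unsigned tiles_y = (height + MALI_TILE_LENGTH - 1) / MALI_TILE_LENGTH;
- * size_t metadata_size = (size_t) tiles_x * tiles_y * 16;
- *
- * with afbc.stride itself expressed in tiles, per the field comment. */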
-
-/* An optional part of bifrost_framebuffer. It comes between the main structure
- * and the array of render targets. It must be included if any of these are
- * enabled:
- *
- * - Transaction Elimination
- * - Depth/stencil
- * - TODO: Anything else?
- */
-
-/* Flags field: note, these are guesses */
-
-#define MALI_EXTRA_PRESENT (0x400)
-#define MALI_EXTRA_AFBC (0x20)
-#define MALI_EXTRA_AFBC_ZS (0x10)
-#define MALI_EXTRA_ZS (0x4)
-
-struct bifrost_fb_extra {
- mali_ptr checksum;
- /* Each tile has an 8 byte checksum, so the stride is "width in tiles * 8" */
- u32 checksum_stride;
-
- u32 flags;
-
- union {
- /* Note: AFBC is only allowed for 24/8 combined depth/stencil. */
- struct {
- mali_ptr depth_stencil_afbc_metadata;
- u32 depth_stencil_afbc_stride; // in units of tiles
- u32 zero1;
-
- mali_ptr depth_stencil;
-
- u64 padding;
- } ds_afbc;
-
- struct {
- /* Depth becomes depth/stencil in case of combined D/S */
- mali_ptr depth;
- u32 depth_stride_zero : 4;
- u32 depth_stride : 28;
- u32 zero1;
-
- mali_ptr stencil;
- u32 stencil_stride_zero : 4;
- u32 stencil_stride : 28;
- u32 zero2;
- } ds_linear;
- };
-
-
- u64 zero3, zero4;
-} __attribute__((packed));
-
-/* Flags for mfbd_flags */
-
-/* Enables writing depth results back to main memory (rather than keeping them
- * on-chip in the tile buffer and then discarding) */
-
-#define MALI_MFBD_DEPTH_WRITE (1 << 10)
-
-/* The MFBD contains the extra bifrost_fb_extra section */
-
-#define MALI_MFBD_EXTRA (1 << 13)
-
-struct bifrost_framebuffer {
- u32 unk0; // = 0x10
-
- u32 unknown2; // = 0x1f, same as SFBD
- mali_ptr scratchpad;
-
- /* 0x10 */
- mali_ptr sample_locations;
- mali_ptr unknown1;
- /* 0x20 */
- u16 width1, height1;
- u32 zero3;
- u16 width2, height2;
- u32 unk1 : 19; // = 0x01000
- u32 rt_count_1 : 2; // off-by-one (use MALI_POSITIVE)
- u32 unk2 : 3; // = 0
- u32 rt_count_2 : 3; // no off-by-one
- u32 zero4 : 5;
- /* 0x30 */
- u32 clear_stencil : 8;
- u32 mfbd_flags : 24; // = 0x100
- float clear_depth;
-
- struct midgard_tiler_descriptor tiler;
-
- /* optional: struct bifrost_fb_extra extra */
- /* struct bifrost_render_target rts[] */
-} __attribute__((packed));
-
-#endif /* __PANFROST_JOB_H__ */
+++ /dev/null
-/*
- * © Copyright 2017-2018 The Panfrost Community
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#ifndef __PANFROST_MISC_H__
-#define __PANFROST_MISC_H__
-
-#include <inttypes.h>
-
-typedef uint8_t u8;
-typedef uint16_t u16;
-typedef uint32_t u32;
-typedef uint64_t u64;
-
-typedef int8_t s8;
-typedef int16_t s16;
-typedef int32_t s32;
-typedef int64_t s64;
-
-typedef uint64_t mali_ptr;
-
-#define MALI_PTR_FMT "0x%" PRIx64
-
-/* FIXME: put this somewhere more fitting */
-#define MALI_MEM_MAP_TRACKING_HANDLE (3ull << 12)
-
-#endif
'pan_resource.c',
'pan_resource.h',
- 'midgard/midgard_compile.c',
- 'midgard/mir.c',
- 'midgard/midgard_print.c',
- 'midgard/midgard_schedule.c',
- 'midgard/midgard_emit.c',
- 'midgard/midgard_ra.c',
- 'midgard/midgard_ra_pipeline.c',
- 'midgard/midgard_liveness.c',
- 'midgard/midgard_ops.c',
- 'midgard/cppwrap.cpp',
- 'midgard/disassemble.c',
-
'nir/nir_undef_to_zero.c',
'nir/nir_lower_blend.c',
'nir/nir_lower_framebuffer.c',
- 'bifrost/disassemble.c',
-
- 'pandecode/common.c',
- 'pandecode/decode.c',
-
'pan_context.c',
'pan_afbc.c',
'pan_blit.c',
'pan_blending.c',
'pan_blend_shaders.c',
'pan_blend_cso.c',
- 'pan_pretty_print.c',
'pan_fragment.c',
'pan_invocation.c',
'pan_instancing.c',
inc_include,
inc_src,
inc_panfrost,
- include_directories('include'),
- include_directories('midgard'),
- include_directories('bifrost'),
]
compile_args_panfrost = [
'-Wno-pointer-arith'
]
-midgard_nir_algebraic_c = custom_target(
- 'midgard_nir_algebraic.c',
- input : 'midgard/midgard_nir_algebraic.py',
- output : 'midgard_nir_algebraic.c',
- command : [
- prog_python, '@INPUT@',
- '-p', join_paths(meson.source_root(), 'src/compiler/nir/'),
- ],
- capture : true,
- depend_files : nir_algebraic_py,
-)
-
libpanfrost = static_library(
'panfrost',
- [files_panfrost, midgard_nir_algebraic_c],
+ files_panfrost,
dependencies: [
dep_thread,
dep_libdrm,
driver_panfrost = declare_dependency(
compile_args : compile_args_panfrost,
- link_with : [libpanfrost, libpanfrostwinsys, libpanfrost_shared],
-)
-
-files_bifrost = files(
- 'bifrost/disassemble.c',
- 'bifrost/cmdline.c',
-)
-
-bifrost_compiler = executable(
- 'bifrost_compiler',
- [files_bifrost],
- include_directories : [
- inc_common,
- inc_src,
- inc_include,
- inc_gallium,
- inc_gallium_aux,
- include_directories('bifrost')
- ],
- dependencies : [
- dep_thread,
- idep_nir
- ],
- link_with : [
- libgallium,
- libglsl_standalone,
- libmesa_util
- ],
- build_by_default : true
-)
-
-files_pandecode = files(
- 'pandecode/cmdline.c',
- 'pandecode/common.c',
- 'pandecode/decode.c',
-
- 'pan_pretty_print.c',
-
- 'midgard/disassemble.c',
- 'midgard/midgard_ops.c',
- 'bifrost/disassemble.c',
-)
-
-pandecode = executable(
- 'pandecode',
- files_pandecode,
- include_directories : panfrost_includes,
- dependencies : [
- dep_thread,
- ],
- link_with : [
- libmesa_util
- ],
- build_by_default : true
+ link_with : [libpanfrost, libpanfrostwinsys, libpanfrost_shared, libpanfrost_midgard, libpanfrost_bifrost, libpanfrost_decode],
)
+++ /dev/null
-/*
- * Copyright (C) 2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef _MDG_COMPILER_H
-#define _MDG_COMPILER_H
-
-#include "midgard.h"
-#include "helpers.h"
-#include "midgard_compile.h"
-
-#include "util/hash_table.h"
-#include "util/u_dynarray.h"
-#include "util/set.h"
-#include "util/list.h"
-
-#include "main/mtypes.h"
-#include "compiler/nir_types.h"
-#include "compiler/nir/nir.h"
-
-/* Forward declare */
-struct midgard_block;
-
-/* Target types. Defaults to TARGET_GOTO (the type corresponding directly to
- * the hardware), hence why that must be zero. TARGET_DISCARD signals this
- * instruction is actually a discard op. */
-
-#define TARGET_GOTO 0
-#define TARGET_BREAK 1
-#define TARGET_CONTINUE 2
-#define TARGET_DISCARD 3
-
-typedef struct midgard_branch {
- /* If conditional, the condition is specified in r31.w */
- bool conditional;
-
- /* For conditionals, if this is true, we branch on FALSE. If false, we branch on TRUE. */
- bool invert_conditional;
-
- /* Branch targets: the start of a block, the start of a loop (continue), the end of a loop (break). Value is one of TARGET_ */
- unsigned target_type;
-
- /* The actual target */
- union {
- int target_block;
- int target_break;
- int target_continue;
- };
-} midgard_branch;
-
-/* Instruction arguments represented as block-local SSA indices, rather than
- * registers. Negative values mean unused. */
-
-typedef struct {
- int src0;
- int src1;
- int dest;
-
- /* src1 is -not- SSA but instead a 16-bit inline constant to be smudged
- * in. Only valid for ALU ops. */
- bool inline_constant;
-} ssa_args;
-
-/* Generic in-memory data type representing a single logical instruction, rather
- * than a single instruction group. This is the preferred form for code gen.
- * Multiple midgard_instructions will later be combined during scheduling,
- * though this is not represented in this structure. Its format bridges
- * the low-level binary representation with the higher level semantic meaning.
- *
- * Notably, it allows registers to be specified as block local SSA, for code
- * emitted before the register allocation pass.
- */
-
-typedef struct midgard_instruction {
- /* Must be first for casting */
- struct list_head link;
-
- unsigned type; /* ALU, load/store, texture */
-
- /* If the register allocator has not run yet... */
- ssa_args ssa_args;
-
- /* Special fields for an ALU instruction */
- midgard_reg_info registers;
-
- /* I.e. (1 << alu_bit) */
- int unit;
-
- /* When emitting bundle, should this instruction have a break forced
- * before it? Used for r31 writes which are valid only within a single
- * bundle and *need* to happen as early as possible... this is a hack,
- * TODO remove when we have a scheduler */
- bool precede_break;
-
- bool has_constants;
- float constants[4];
- uint16_t inline_constant;
- bool has_blend_constant;
-
- bool compact_branch;
- bool writeout;
- bool prepacked_branch;
-
- /* Masks in a saneish format. One bit per channel, not packed fancy.
- * Use this instead of the op specific ones, and switch over at emit
- * time */
- uint16_t mask;
-
- union {
- midgard_load_store_word load_store;
- midgard_vector_alu alu;
- midgard_texture_word texture;
- midgard_branch_extended branch_extended;
- uint16_t br_compact;
-
- /* General branch, rather than packed br_compact. Higher level
- * than the other components */
- midgard_branch branch;
- };
-} midgard_instruction;
-
-typedef struct midgard_block {
- /* Link to next block. Must be first for mir_get_block */
- struct list_head link;
-
- /* List of midgard_instructions emitted for the current block */
- struct list_head instructions;
-
- bool is_scheduled;
-
- /* List of midgard_bundles emitted (after the scheduler has run) */
- struct util_dynarray bundles;
-
- /* Number of quadwords _actually_ emitted, as determined after scheduling */
- unsigned quadword_count;
-
- /* Successors: always one forward (the block after us), maybe
- * one backwards (for a backward branch). No need for a second
- * forward, since graph traversal would get there eventually
- * anyway */
- struct midgard_block *successors[2];
- unsigned nr_successors;
-
- /* The successors pointers form a graph, and in the case of
- * complex control flow, this graph can have cycles. To aid
- * traversal during liveness analysis, we have a "visited"
- * boolean for passes to use as they see fit, provided they
- * clean it up afterwards */
- bool visited;
-} midgard_block;
-
-typedef struct midgard_bundle {
- /* Tag for the overall bundle */
- int tag;
-
- /* Instructions contained by the bundle */
- int instruction_count;
- midgard_instruction *instructions[5];
-
- /* Bundle-wide ALU configuration */
- int padding;
- int control;
- bool has_embedded_constants;
- float constants[4];
- bool has_blend_constant;
-} midgard_bundle;
-
-typedef struct compiler_context {
- nir_shader *nir;
- gl_shader_stage stage;
-
- /* Is internally a blend shader? Depends on stage == FRAGMENT */
- bool is_blend;
-
- /* Tracking for blend constant patching */
- int blend_constant_offset;
-
- /* Current NIR function */
- nir_function *func;
-
- /* Unordered list of midgard_blocks */
- int block_count;
- struct list_head blocks;
-
- midgard_block *initial_block;
- midgard_block *previous_source_block;
- midgard_block *final_block;
-
- /* List of midgard_instructions emitted for the current block */
- midgard_block *current_block;
-
- /* The current "depth" of the loop, for disambiguating breaks/continues
- * when using nested loops */
- int current_loop_depth;
-
- /* Total number of loops for shader-db */
- unsigned loop_count;
-
- /* Constants which have been loaded, for later inlining */
- struct hash_table_u64 *ssa_constants;
-
- /* SSA values / registers which have been aliased. Naively, these
- * demand a fmov output; instead, we alias them in a later pass to
- * avoid the wasted op.
- *
- * A note on encoding: to avoid dynamic memory management here, rather
- * than mapping to a pointer, we map to the source index; the key
- * itself is just the destination index. */
-
- struct hash_table_u64 *ssa_to_alias;
- struct set *leftover_ssa_to_alias;
-
- /* Actual SSA-to-register for RA */
- struct hash_table_u64 *ssa_to_register;
-
- /* Mapping of hashes computed from NIR indices to the sequential temp indices ultimately used in MIR */
- struct hash_table_u64 *hash_to_temp;
- int temp_count;
- int max_hash;
-
- /* Just the count of the max register used. Higher count => higher
- * register pressure */
- int work_registers;
-
- /* Used for cont/last hinting. Increase when a tex op is added.
- * Decrease when a tex op is removed. */
- int texture_op_count;
-
- /* Mapping of texture register -> SSA index for unaliasing */
- int texture_index[2];
-
- /* If any path hits a discard instruction */
- bool can_discard;
-
- /* The number of uniforms allowable for the fast path */
- int uniform_cutoff;
-
- /* Count of instructions emitted from NIR overall, across all blocks */
- int instruction_count;
-
- /* Alpha ref value passed in */
- float alpha_ref;
-
- /* The index corresponding to the fragment output */
- unsigned fragment_output;
-
- /* The mapping of sysvals to uniforms, the count, and the off-by-one inverse */
- unsigned sysvals[MAX_SYSVAL_COUNT];
- unsigned sysval_count;
- struct hash_table_u64 *sysval_to_id;
-} compiler_context;
-
-/* Helpers for manipulating the above structures (forming the driver IR) */
-
-/* Append instruction to end of current block */
-
-static inline midgard_instruction *
-mir_upload_ins(struct midgard_instruction ins)
-{
- midgard_instruction *heap = malloc(sizeof(ins));
- memcpy(heap, &ins, sizeof(ins));
- return heap;
-}
-
-static inline void
-emit_mir_instruction(struct compiler_context *ctx, struct midgard_instruction ins)
-{
- list_addtail(&(mir_upload_ins(ins))->link, &ctx->current_block->instructions);
-}
-
-static inline void
-mir_insert_instruction_before(struct midgard_instruction *tag, struct midgard_instruction ins)
-{
- list_addtail(&(mir_upload_ins(ins))->link, &tag->link);
-}
-
-static inline void
-mir_remove_instruction(struct midgard_instruction *ins)
-{
- list_del(&ins->link);
-}
-
-static inline midgard_instruction*
-mir_prev_op(struct midgard_instruction *ins)
-{
- return list_last_entry(&(ins->link), midgard_instruction, link);
-}
-
-static inline midgard_instruction*
-mir_next_op(struct midgard_instruction *ins)
-{
- return list_first_entry(&(ins->link), midgard_instruction, link);
-}
-
-#define mir_foreach_block(ctx, v) \
- list_for_each_entry(struct midgard_block, v, &ctx->blocks, link)
-
-#define mir_foreach_block_from(ctx, from, v) \
- list_for_each_entry_from(struct midgard_block, v, from, &ctx->blocks, link)
-
-#define mir_foreach_instr(ctx, v) \
- list_for_each_entry(struct midgard_instruction, v, &ctx->current_block->instructions, link)
-
-#define mir_foreach_instr_safe(ctx, v) \
- list_for_each_entry_safe(struct midgard_instruction, v, &ctx->current_block->instructions, link)
-
-#define mir_foreach_instr_in_block(block, v) \
- list_for_each_entry(struct midgard_instruction, v, &block->instructions, link)
-
-#define mir_foreach_instr_in_block_safe(block, v) \
- list_for_each_entry_safe(struct midgard_instruction, v, &block->instructions, link)
-
-#define mir_foreach_instr_in_block_safe_rev(block, v) \
- list_for_each_entry_safe_rev(struct midgard_instruction, v, &block->instructions, link)
-
-#define mir_foreach_instr_in_block_from(block, v, from) \
- list_for_each_entry_from(struct midgard_instruction, v, from, &block->instructions, link)
-
-#define mir_foreach_instr_in_block_from_rev(block, v, from) \
- list_for_each_entry_from_rev(struct midgard_instruction, v, from, &block->instructions, link)
-
-#define mir_foreach_bundle_in_block(block, v) \
- util_dynarray_foreach(&block->bundles, midgard_bundle, v)
-
-#define mir_foreach_instr_global(ctx, v) \
- mir_foreach_block(ctx, v_block) \
- mir_foreach_instr_in_block(v_block, v)
-
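-/* Usage sketch, not part of the original header: a trivial whole-shader walk
- * built from the iterators above, here just counting MIR instructions across
- * every block. The helper name is made up. */
-
-static inline unsigned
-mir_count_instructions_sketch(compiler_context *ctx)
-{
- unsigned count = 0;
-
- mir_foreach_block(ctx, block) {
- mir_foreach_instr_in_block(block, ins) {
- ++count;
- }
- }
-
- return count;
-}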
-
-static inline midgard_instruction *
-mir_last_in_block(struct midgard_block *block)
-{
- return list_last_entry(&block->instructions, struct midgard_instruction, link);
-}
-
-static inline midgard_block *
-mir_get_block(compiler_context *ctx, int idx)
-{
- struct list_head *lst = &ctx->blocks;
-
- /* Step forward idx + 1 times: once past the list head, then idx more
- * times to reach the idx-th block */
-
- while ((idx--) + 1)
- lst = lst->next;
-
- return (struct midgard_block *) lst;
-}
-
-static inline bool
-mir_is_alu_bundle(midgard_bundle *bundle)
-{
- return IS_ALU(bundle->tag);
-}
-
-/* MIR manipulation */
-
-void mir_rewrite_index(compiler_context *ctx, unsigned old, unsigned new);
-void mir_rewrite_index_src(compiler_context *ctx, unsigned old, unsigned new);
-void mir_rewrite_index_dst(compiler_context *ctx, unsigned old, unsigned new);
-
-/* MIR printing */
-
-void mir_print_instruction(midgard_instruction *ins);
-void mir_print_bundle(midgard_bundle *ctx);
-void mir_print_block(midgard_block *block);
-void mir_print_shader(compiler_context *ctx);
-
-/* MIR goodies */
-
-static const midgard_vector_alu_src blank_alu_src = {
- .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
-};
-
-static const midgard_vector_alu_src blank_alu_src_xxxx = {
- .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_X, COMPONENT_X, COMPONENT_X),
-};
-
-static const midgard_scalar_alu_src blank_scalar_alu_src = {
- .full = true
-};
-
-/* Used for encoding the unused source of 1-op instructions */
-static const midgard_vector_alu_src zero_alu_src = { 0 };
-
-/* 'Intrinsic' move for aliasing */
-
-static inline midgard_instruction
-v_mov(unsigned src, midgard_vector_alu_src mod, unsigned dest)
-{
- midgard_instruction ins = {
- .type = TAG_ALU_4,
- .mask = 0xF,
- .ssa_args = {
- .src0 = SSA_UNUSED_1,
- .src1 = src,
- .dest = dest,
- },
- .alu = {
- .op = midgard_alu_op_imov,
- .reg_mode = midgard_reg_mode_32,
- .dest_override = midgard_dest_override_none,
- .outmod = midgard_outmod_int_wrap,
- .src1 = vector_alu_srco_unsigned(zero_alu_src),
- .src2 = vector_alu_srco_unsigned(mod)
- },
- };
-
- return ins;
-}
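-
-/* Usage sketch, not part of the original header: emitting one of these
- * aliasing moves into the current block, copying block-local SSA index
- * `src` into `dest` with an identity swizzle. The helper name and indices
- * are hypothetical. */
-
-static inline void
-mir_emit_mov_sketch(compiler_context *ctx, unsigned src, unsigned dest)
-{
- midgard_instruction ins = v_mov(src, blank_alu_src, dest);
- emit_mir_instruction(ctx, ins);
-}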
-
-/* Scheduling */
-
-void schedule_program(compiler_context *ctx);
-
-/* Register allocation */
-
-struct ra_graph;
-
-struct ra_graph* allocate_registers(compiler_context *ctx);
-void install_registers(compiler_context *ctx, struct ra_graph *g);
-bool mir_is_live_after(compiler_context *ctx, midgard_block *block, midgard_instruction *start, int src);
-bool mir_has_multiple_writes(compiler_context *ctx, int src);
-
-void mir_create_pipeline_registers(compiler_context *ctx);
-
-/* Final emission */
-
-void emit_binary_bundle(
- compiler_context *ctx,
- midgard_bundle *bundle,
- struct util_dynarray *emission,
- int next_tag);
-
-/* NIR stuff */
-
-bool
-nir_undef_to_zero(nir_shader *shader);
-
-#endif
+++ /dev/null
-struct exec_list;
-
-bool do_mat_op_to_vec(struct exec_list *instructions);
-
-extern "C" {
- bool c_do_mat_op_to_vec(struct exec_list *instructions) {
- return do_mat_op_to_vec(instructions);
- }
-};
+++ /dev/null
-/* Author(s):
- * Connor Abbott
- * Alyssa Rosenzweig
- *
- * Copyright (c) 2013 Connor Abbott (connor@abbott.cx)
- * Copyright (c) 2018 Alyssa Rosenzweig (alyssa@rosenzweig.io)
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#include <stdio.h>
-#include <stdint.h>
-#include <assert.h>
-#include <inttypes.h>
-#include <ctype.h>
-#include <string.h>
-#include "midgard.h"
-#include "midgard-parse.h"
-#include "midgard_ops.h"
-#include "disassemble.h"
-#include "helpers.h"
-#include "util/half_float.h"
-#include "util/u_math.h"
-
-#define DEFINE_CASE(define, str) case define: { printf(str); break; }
-
-static bool is_instruction_int = false;
-
-/* Prints a short form of the tag for branching, the minimum needed to be
- * legible and unambiguous */
-
-static void
-print_tag_short(unsigned tag)
-{
- switch (midgard_word_types[tag]) {
- case midgard_word_type_texture:
- printf("tex/%X", tag);
- break;
-
- case midgard_word_type_load_store:
- printf("ldst");
- break;
-
- case midgard_word_type_alu:
- printf("alu%d/%X", midgard_word_size[tag], tag);
- break;
-
- default:
- printf("%s%X", (tag > 0) ? "" : "unk", tag);
- break;
- }
-}
-
-static void
-print_alu_opcode(midgard_alu_op op)
-{
- bool int_op = false;
-
- if (alu_opcode_props[op].name) {
- printf("%s", alu_opcode_props[op].name);
-
- int_op = midgard_is_integer_op(op);
- } else
- printf("alu_op_%02X", op);
-
- /* For constant analysis */
- is_instruction_int = int_op;
-}
-
-static void
-print_ld_st_opcode(midgard_load_store_op op)
-{
- if (load_store_opcode_names[op])
- printf("%s", load_store_opcode_names[op]);
- else
- printf("ldst_op_%02X", op);
-}
-
-static bool is_embedded_constant_half = false;
-static bool is_embedded_constant_int = false;
-
-static char
-prefix_for_bits(unsigned bits)
-{
- switch (bits) {
- case 8:
- return 'q';
- case 16:
- return 'h';
- case 64:
- return 'd';
- default:
- return 0;
- }
-}
-
-static void
-print_reg(unsigned reg, unsigned bits)
-{
- /* Perform basic static analysis for expanding constants correctly */
-
- if (reg == 26) {
- is_embedded_constant_int = is_instruction_int;
- is_embedded_constant_half = (bits < 32);
- }
-
- char prefix = prefix_for_bits(bits);
-
- if (prefix)
- putchar(prefix);
-
- printf("r%u", reg);
-}
-
-static char *outmod_names_float[4] = {
- "",
- ".pos",
- ".unk2",
- ".sat"
-};
-
-static char *outmod_names_int[4] = {
- ".isat",
- ".usat",
- "",
- ".hi"
-};
-
-static char *srcmod_names_int[4] = {
- "sext(",
- "zext(",
- "",
- "("
-};
-
-static void
-print_outmod(unsigned outmod, bool is_int)
-{
- printf("%s", is_int ? outmod_names_int[outmod] :
- outmod_names_float[outmod]);
-}
-
-static void
-print_quad_word(uint32_t *words, unsigned tabs)
-{
- unsigned i;
-
- for (i = 0; i < 4; i++)
- printf("0x%08X%s ", words[i], i == 3 ? "" : ",");
-
- printf("\n");
-}
-
-static const char components[16] = "xyzwefghijklmnop";
-
-/* Helper to print 4 chars of a swizzle */
-static void
-print_swizzle_helper(unsigned swizzle, bool upper)
-{
- for (unsigned i = 0; i < 4; ++i) {
- unsigned c = (swizzle >> (i * 2)) & 3;
- c += upper*4;
- printf("%c", components[c]);
- }
-}
-
-/* Helper to print 8 chars of a swizzle, duplicating over */
-static void
-print_swizzle_helper_8(unsigned swizzle, bool upper)
-{
- for (unsigned i = 0; i < 4; ++i) {
- unsigned c = (swizzle >> (i * 2)) & 3;
- c *= 2;
- c += upper*8;
- printf("%c%c", components[c], components[c+1]);
- }
-}
-
-static void
-print_swizzle_vec16(unsigned swizzle, bool rep_high, bool rep_low,
- midgard_dest_override override)
-{
- printf(".");
-
- if (override == midgard_dest_override_upper) {
- if (rep_high)
- printf(" /* rep_high */ ");
- if (rep_low)
- printf(" /* rep_low */ ");
-
- if (!rep_high && rep_low)
- print_swizzle_helper_8(swizzle, true);
- else
- print_swizzle_helper_8(swizzle, false);
- } else {
- print_swizzle_helper_8(swizzle, rep_high & 1);
- print_swizzle_helper_8(swizzle, !rep_low & 1);
- }
-}
-
-static void
-print_swizzle_vec8(unsigned swizzle, bool rep_high, bool rep_low)
-{
- printf(".");
-
- print_swizzle_helper(swizzle, rep_high & 1);
- print_swizzle_helper(swizzle, !rep_low & 1);
-}
-
-static void
-print_swizzle_vec4(unsigned swizzle, bool rep_high, bool rep_low)
-{
- if (rep_high)
- printf(" /* rep_high */ ");
- if (rep_low)
- printf(" /* rep_low */ ");
-
- if (swizzle == 0xE4) return; /* xyzw */
-
- printf(".");
- print_swizzle_helper(swizzle, 0);
-}
-static void
-print_swizzle_vec2(unsigned swizzle, bool rep_high, bool rep_low)
-{
- if (rep_high)
- printf(" /* rep_high */ ");
- if (rep_low)
- printf(" /* rep_low */ ");
-
- if (swizzle == 0xE4) return; /* XY */
-
- printf(".");
-
- for (unsigned i = 0; i < 4; i += 2) {
- unsigned a = (swizzle >> (i * 2)) & 3;
- unsigned b = (swizzle >> ((i+1) * 2)) & 3;
-
- /* Normally we're adjacent, but if there's an issue, don't make
- * it ambiguous */
-
- if (a & 0x1)
- printf("[%c%c]", components[a], components[b]);
- else if (a == b)
- printf("%c", components[a >> 1]);
- else if (b == (a + 1))
- printf("%c", "XY"[a >> 1]);
- else
- printf("[%c%c]", components[a], components[b]);
- }
-}
-
-static int
-bits_for_mode(midgard_reg_mode mode)
-{
- switch (mode) {
- case midgard_reg_mode_8:
- return 8;
- case midgard_reg_mode_16:
- return 16;
- case midgard_reg_mode_32:
- return 32;
- case midgard_reg_mode_64:
- return 64;
- default:
- return 0;
- }
-}
-
-static int
-bits_for_mode_halved(midgard_reg_mode mode, bool half)
-{
- unsigned bits = bits_for_mode(mode);
-
- if (half)
- bits >>= 1;
-
- return bits;
-}
-
-static void
-print_vector_src(unsigned src_binary,
- midgard_reg_mode mode, unsigned reg,
- midgard_dest_override override, bool is_int)
-{
- midgard_vector_alu_src *src = (midgard_vector_alu_src *)&src_binary;
-
- /* Modifiers change meaning depending on the op's context */
-
- midgard_int_mod int_mod = src->mod;
-
- if (is_int) {
- printf("%s", srcmod_names_int[int_mod]);
- } else {
- if (src->mod & MIDGARD_FLOAT_MOD_NEG)
- printf("-");
-
- if (src->mod & MIDGARD_FLOAT_MOD_ABS)
- printf("abs(");
- }
-
- //register
- unsigned bits = bits_for_mode_halved(mode, src->half);
- print_reg(reg, bits);
-
- //swizzle
- if (bits == 16)
- print_swizzle_vec8(src->swizzle, src->rep_high, src->rep_low);
- else if (bits == 8)
- print_swizzle_vec16(src->swizzle, src->rep_high, src->rep_low, override);
- else if (bits == 32)
- print_swizzle_vec4(src->swizzle, src->rep_high, src->rep_low);
- else if (bits == 64)
- print_swizzle_vec2(src->swizzle, src->rep_high, src->rep_low);
-
- /* Since we wrapped with a function-looking thing */
-
- if (is_int && int_mod == midgard_int_shift)
- printf(") << %d", bits);
- else if ((is_int && (int_mod != midgard_int_normal))
- || (!is_int && src->mod & MIDGARD_FLOAT_MOD_ABS))
- printf(")");
-}
-
-static uint16_t
-decode_vector_imm(unsigned src2_reg, unsigned imm)
-{
- uint16_t ret;
- ret = src2_reg << 11;
- ret |= (imm & 0x7) << 8;
- ret |= (imm >> 3) & 0xFF;
- return ret;
-}
-
-static void
-print_immediate(uint16_t imm)
-{
- if (is_instruction_int)
- printf("#%d", imm);
- else
- printf("#%g", _mesa_half_to_float(imm));
-}
-
-static unsigned
-print_dest(unsigned reg, midgard_reg_mode mode, midgard_dest_override override)
-{
- /* Depending on the mode and override, we determine the type of
- * destination addressed. Absent an override, we address just the
- * type of the operation itself */
-
- unsigned bits = bits_for_mode(mode);
-
- if (override != midgard_dest_override_none)
- bits /= 2;
-
- print_reg(reg, bits);
-
- return bits;
-}
-
-static void
-print_mask_vec16(uint8_t mask, midgard_dest_override override)
-{
- printf(".");
-
- if (override == midgard_dest_override_none) {
- for (unsigned i = 0; i < 8; i++) {
- if (mask & (1 << i))
- printf("%c%c",
- components[i*2 + 0],
- components[i*2 + 1]);
- }
- } else {
- bool upper = (override == midgard_dest_override_upper);
-
- for (unsigned i = 0; i < 8; i++) {
- if (mask & (1 << i))
- printf("%c", components[i + (upper ? 8 : 0)]);
- }
- }
-}
-
-/* For 16-bit+ masks, we read off from the 8-bit mask field. For 16-bit (vec8),
- * it's just one bit per channel, easy peasy. For 32-bit (vec4), it's one bit
- * per channel with one duplicate bit in the middle. For 64-bit (vec2), it's
- * one-bit per channel with _3_ duplicate bits in the middle. Basically, just
- * subdividing the 128-bit word in 16-bit increments. For 64-bit, we uppercase
- * the mask to make it obvious what happened */
-
-static void
-print_mask(uint8_t mask, unsigned bits, midgard_dest_override override)
-{
- if (bits == 8) {
- print_mask_vec16(mask, override);
- return;
- }
-
- /* Skip 'complete' masks */
-
- if (bits >= 32 && mask == 0xFF) return;
-
- if (bits == 16) {
- if (mask == 0x0F)
- return;
- else if (mask == 0xF0) {
- printf("'");
- return;
- }
- }
-
- printf(".");
-
- unsigned skip = (bits / 16);
- bool uppercase = bits > 32;
- bool tripped = false;
-
- for (unsigned i = 0; i < 8; i += skip) {
- bool a = (mask & (1 << i)) != 0;
-
- for (unsigned j = 1; j < skip; ++j) {
- bool dupe = (mask & (1 << (i + j))) != 0;
- tripped |= (dupe != a);
- }
-
- if (a) {
- char c = components[i / skip];
-
- if (uppercase)
- c = toupper(c);
-
- printf("%c", c);
- }
- }
-
- if (tripped)
- printf(" /* %X */", mask);
-}
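-
-/* Worked examples: in 32-bit mode (skip = 2), mask 0x0F covers bit pairs
- * {0,1} and {2,3}, so it prints ".xy"; 0xFF is a complete mask and prints
- * nothing. If the bits within a pair ever disagree, the raw mask is also
- * dumped in a comment. */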
-
-/* Prints the 4-bit masks found in texture and load/store ops, as opposed to
- * the 8-bit masks found in (vector) ALU ops */
-
-static void
-print_mask_4(unsigned mask)
-{
- if (mask == 0xF) return;
-
- printf(".");
-
- for (unsigned i = 0; i < 4; ++i) {
- bool a = (mask & (1 << i)) != 0;
- if (a)
- printf("%c", components[i]);
- }
-}
-
-static void
-print_vector_field(const char *name, uint16_t *words, uint16_t reg_word,
- unsigned tabs)
-{
- midgard_reg_info *reg_info = (midgard_reg_info *)&reg_word;
- midgard_vector_alu *alu_field = (midgard_vector_alu *) words;
- midgard_reg_mode mode = alu_field->reg_mode;
- unsigned override = alu_field->dest_override;
-
- /* For now, prefix instruction names with their unit, until we
- * understand how this works on a deeper level */
- printf("%s.", name);
-
- print_alu_opcode(alu_field->op);
-
- /* Postfix with the size to disambiguate if necessary */
- char postfix = prefix_for_bits(bits_for_mode(mode));
- bool size_ambiguous = override != midgard_dest_override_none;
-
- if (size_ambiguous)
- printf("%c", postfix ? postfix : 'r');
-
- /* Print the outmod, if there is one */
- print_outmod(alu_field->outmod,
- midgard_is_integer_out_op(alu_field->op));
-
- printf(" ");
-
- /* Mask denoting status of 8-lanes */
- uint8_t mask = alu_field->mask;
-
- /* First, print the destination */
- unsigned dest_size =
- print_dest(reg_info->out_reg, mode, alu_field->dest_override);
-
- /* Apply the destination override to the mask */
-
- if (mode == midgard_reg_mode_32 || mode == midgard_reg_mode_64) {
- if (override == midgard_dest_override_lower)
- mask &= 0x0F;
- else if (override == midgard_dest_override_upper)
- mask &= 0xF0;
- } else if (mode == midgard_reg_mode_16
- && override == midgard_dest_override_lower) {
- /* stub */
- }
-
- if (override != midgard_dest_override_none) {
- bool modeable = (mode != midgard_reg_mode_8);
- bool known = override != 0x3; /* Unused value */
-
- if (!(modeable && known))
- printf("/* do%d */ ", override);
- }
-
- print_mask(mask, dest_size, override);
-
- printf(", ");
-
- bool is_int = midgard_is_integer_op(alu_field->op);
- print_vector_src(alu_field->src1, mode, reg_info->src1_reg, override, is_int);
-
- printf(", ");
-
- if (reg_info->src2_imm) {
- uint16_t imm = decode_vector_imm(reg_info->src2_reg, alu_field->src2 >> 2);
- print_immediate(imm);
- } else {
- print_vector_src(alu_field->src2, mode,
- reg_info->src2_reg, override, is_int);
- }
-
- printf("\n");
-}
-
-static void
-print_scalar_src(unsigned src_binary, unsigned reg)
-{
- midgard_scalar_alu_src *src = (midgard_scalar_alu_src *)&src_binary;
-
- if (src->negate)
- printf("-");
-
- if (src->abs)
- printf("abs(");
-
- print_reg(reg, src->full ? 32 : 16);
-
- unsigned c = src->component;
-
- if (src->full) {
- assert((c & 1) == 0);
- c >>= 1;
- }
-
- printf(".%c", components[c]);
-
- if (src->abs)
- printf(")");
-
-}
-
-static uint16_t
-decode_scalar_imm(unsigned src2_reg, unsigned imm)
-{
- uint16_t ret;
- ret = src2_reg << 11;
- ret |= (imm & 3) << 9;
- ret |= (imm & 4) << 6;
- ret |= (imm & 0x38) << 2;
- ret |= imm >> 6;
- return ret;
-}
-
-static void
-print_scalar_field(const char *name, uint16_t *words, uint16_t reg_word,
- unsigned tabs)
-{
- midgard_reg_info *reg_info = (midgard_reg_info *)&reg_word;
- midgard_scalar_alu *alu_field = (midgard_scalar_alu *) words;
-
- if (alu_field->unknown)
- printf("scalar ALU unknown bit set\n");
-
- printf("%s.", name);
- print_alu_opcode(alu_field->op);
- print_outmod(alu_field->outmod,
- midgard_is_integer_out_op(alu_field->op));
- printf(" ");
-
- bool full = alu_field->output_full;
- print_reg(reg_info->out_reg, full ? 32 : 16);
- unsigned c = alu_field->output_component;
-
- if (full) {
- assert((c & 1) == 0);
- c >>= 1;
- }
-
- printf(".%c, ", components[c]);
-
- print_scalar_src(alu_field->src1, reg_info->src1_reg);
-
- printf(", ");
-
- if (reg_info->src2_imm) {
- uint16_t imm = decode_scalar_imm(reg_info->src2_reg,
- alu_field->src2);
- print_immediate(imm);
- } else
- print_scalar_src(alu_field->src2, reg_info->src2_reg);
-
- printf("\n");
-}
-
-static void
-print_branch_op(int op)
-{
- switch (op) {
- case midgard_jmp_writeout_op_branch_uncond:
- printf("uncond.");
- break;
-
- case midgard_jmp_writeout_op_branch_cond:
- printf("cond.");
- break;
-
- case midgard_jmp_writeout_op_writeout:
- printf("write.");
- break;
-
- case midgard_jmp_writeout_op_tilebuffer_pending:
- printf("tilebuffer.");
- break;
-
- case midgard_jmp_writeout_op_discard:
- printf("discard.");
- break;
-
- default:
- printf("unk%d.", op);
- break;
- }
-}
-
-static void
-print_branch_cond(int cond)
-{
- switch (cond) {
- case midgard_condition_write0:
- printf("write0");
- break;
-
- case midgard_condition_false:
- printf("false");
- break;
-
- case midgard_condition_true:
- printf("true");
- break;
-
- case midgard_condition_always:
- printf("always");
- break;
-
- default:
- printf("unk%X", cond);
- break;
- }
-}
-
-static void
-print_compact_branch_writeout_field(uint16_t word)
-{
- midgard_jmp_writeout_op op = word & 0x7;
-
- switch (op) {
- case midgard_jmp_writeout_op_branch_uncond: {
- midgard_branch_uncond br_uncond;
- memcpy((char *) &br_uncond, (char *) &word, sizeof(br_uncond));
- printf("br.uncond ");
-
- if (br_uncond.unknown != 1)
- printf("unknown:%d, ", br_uncond.unknown);
-
- if (br_uncond.offset >= 0)
- printf("+");
-
- printf("%d -> ", br_uncond.offset);
- print_tag_short(br_uncond.dest_tag);
- printf("\n");
-
- break;
- }
-
- case midgard_jmp_writeout_op_branch_cond:
- case midgard_jmp_writeout_op_writeout:
- case midgard_jmp_writeout_op_discard:
- default: {
- midgard_branch_cond br_cond;
- memcpy((char *) &br_cond, (char *) &word, sizeof(br_cond));
-
- printf("br.");
-
- print_branch_op(br_cond.op);
- print_branch_cond(br_cond.cond);
-
- printf(" ");
-
- if (br_cond.offset >= 0)
- printf("+");
-
- printf("%d -> ", br_cond.offset);
- print_tag_short(br_cond.dest_tag);
- printf("\n");
-
- break;
- }
- }
-}
-
-static void
-print_extended_branch_writeout_field(uint8_t *words)
-{
- midgard_branch_extended br;
- memcpy((char *) &br, (char *) words, sizeof(br));
-
- printf("brx.");
-
- print_branch_op(br.op);
-
- /* Condition repeated 8 times in all known cases. Check this. */
-
- unsigned cond = br.cond & 0x3;
-
- for (unsigned i = 0; i < 16; i += 2) {
- assert(((br.cond >> i) & 0x3) == cond);
- }
-
- print_branch_cond(cond);
-
- if (br.unknown)
- printf(".unknown%d", br.unknown);
-
- printf(" ");
-
- if (br.offset >= 0)
- printf("+");
-
- printf("%d -> ", br.offset);
- print_tag_short(br.dest_tag);
- printf("\n");
-}
-
-static unsigned
-num_alu_fields_enabled(uint32_t control_word)
-{
- unsigned ret = 0;
-
- if ((control_word >> 17) & 1)
- ret++;
-
- if ((control_word >> 19) & 1)
- ret++;
-
- if ((control_word >> 21) & 1)
- ret++;
-
- if ((control_word >> 23) & 1)
- ret++;
-
- if ((control_word >> 25) & 1)
- ret++;
-
- return ret;
-}
-
-static float
-float_bitcast(uint32_t integer)
-{
- union {
- uint32_t i;
- float f;
- } v;
-
- v.i = integer;
- return v.f;
-}
-
-static void
-print_alu_word(uint32_t *words, unsigned num_quad_words,
- unsigned tabs)
-{
- uint32_t control_word = words[0];
- uint16_t *beginning_ptr = (uint16_t *)(words + 1);
- unsigned num_fields = num_alu_fields_enabled(control_word);
- uint16_t *word_ptr = beginning_ptr + num_fields;
- unsigned num_words = 2 + num_fields;
-
- if ((control_word >> 16) & 1)
- printf("unknown bit 16 enabled\n");
-
- if ((control_word >> 17) & 1) {
- print_vector_field("vmul", word_ptr, *beginning_ptr, tabs);
- beginning_ptr += 1;
- word_ptr += 3;
- num_words += 3;
- }
-
- if ((control_word >> 18) & 1)
- printf("unknown bit 18 enabled\n");
-
- if ((control_word >> 19) & 1) {
- print_scalar_field("sadd", word_ptr, *beginning_ptr, tabs);
- beginning_ptr += 1;
- word_ptr += 2;
- num_words += 2;
- }
-
- if ((control_word >> 20) & 1)
- printf("unknown bit 20 enabled\n");
-
- if ((control_word >> 21) & 1) {
- print_vector_field("vadd", word_ptr, *beginning_ptr, tabs);
- beginning_ptr += 1;
- word_ptr += 3;
- num_words += 3;
- }
-
- if ((control_word >> 22) & 1)
- printf("unknown bit 22 enabled\n");
-
- if ((control_word >> 23) & 1) {
- print_scalar_field("smul", word_ptr, *beginning_ptr, tabs);
- beginning_ptr += 1;
- word_ptr += 2;
- num_words += 2;
- }
-
- if ((control_word >> 24) & 1)
- printf("unknown bit 24 enabled\n");
-
- if ((control_word >> 25) & 1) {
- print_vector_field("lut", word_ptr, *beginning_ptr, tabs);
- beginning_ptr += 1;
- word_ptr += 3;
- num_words += 3;
- }
-
- if ((control_word >> 26) & 1) {
- print_compact_branch_writeout_field(*word_ptr);
- word_ptr += 1;
- num_words += 1;
- }
-
- if ((control_word >> 27) & 1) {
- print_extended_branch_writeout_field((uint8_t *) word_ptr);
- word_ptr += 3;
- num_words += 3;
- }
-
- if (num_quad_words > (num_words + 7) / 8) {
- assert(num_quad_words == (num_words + 15) / 8);
- //Assume that the extra quadword is constants
- void *consts = words + (4 * num_quad_words - 4);
-
- if (is_embedded_constant_int) {
- if (is_embedded_constant_half) {
- int16_t *sconsts = (int16_t *) consts;
- printf("sconstants %d, %d, %d, %d\n",
- sconsts[0],
- sconsts[1],
- sconsts[2],
- sconsts[3]);
- } else {
- int32_t *iconsts = (int32_t *) consts;
- printf("iconstants %d, %d, %d, %d\n",
- iconsts[0],
- iconsts[1],
- iconsts[2],
- iconsts[3]);
- }
- } else {
- if (is_embedded_constant_half) {
- uint16_t *hconsts = (uint16_t *) consts;
- printf("hconstants %g, %g, %g, %g\n",
- _mesa_half_to_float(hconsts[0]),
- _mesa_half_to_float(hconsts[1]),
- _mesa_half_to_float(hconsts[2]),
- _mesa_half_to_float(hconsts[3]));
- } else {
- uint32_t *fconsts = (uint32_t *) consts;
- printf("fconstants %g, %g, %g, %g\n",
- float_bitcast(fconsts[0]),
- float_bitcast(fconsts[1]),
- float_bitcast(fconsts[2]),
- float_bitcast(fconsts[3]));
- }
-
- }
- }
-}
-
-static void
-print_varying_parameters(midgard_load_store_word *word)
-{
- midgard_varying_parameter param;
- unsigned v = word->varying_parameters;
- memcpy(&param, &v, sizeof(param));
-
- if (param.is_varying) {
- /* If a varying, there are qualifiers */
- if (param.flat)
- printf(".flat");
-
- if (param.interpolation != midgard_interp_default) {
- if (param.interpolation == midgard_interp_centroid)
- printf(".centroid");
- else
- printf(".interp%d", param.interpolation);
- }
-
- if (param.modifier != midgard_varying_mod_none) {
- if (param.modifier == midgard_varying_mod_perspective_w)
- printf(".perspectivew");
- else if (param.modifier == midgard_varying_mod_perspective_z)
- printf(".perspectivez");
- else
- printf(".mod%d", param.modifier);
- }
- } else if (param.flat || param.interpolation || param.modifier) {
- printf(" /* is_varying not set but varying metadata attached */");
- }
-
- if (param.zero0 || param.zero1 || param.zero2)
- printf(" /* zero tripped, %d %d %d */ ", param.zero0, param.zero1, param.zero2);
-}
-
-static bool
-is_op_varying(unsigned op)
-{
- switch (op) {
- case midgard_op_st_vary_16:
- case midgard_op_st_vary_32:
- case midgard_op_ld_vary_16:
- case midgard_op_ld_vary_32:
- return true;
- }
-
- return false;
-}
-
-static void
-print_load_store_instr(uint64_t data,
- unsigned tabs)
-{
- midgard_load_store_word *word = (midgard_load_store_word *) &data;
-
- print_ld_st_opcode(word->op);
-
- if (is_op_varying(word->op))
- print_varying_parameters(word);
-
- printf(" r%d", word->reg);
- print_mask_4(word->mask);
-
- int address = word->address;
-
- if (word->op == midgard_op_ld_uniform_32) {
- /* Uniforms use their own addressing scheme */
-
- int lo = word->varying_parameters >> 7;
- int hi = word->address;
-
- /* TODO: Combine fields logically */
- address = (hi << 3) | lo;
- }
-
- printf(", %d", address);
-
- print_swizzle_vec4(word->swizzle, false, false);
-
- printf(", 0x%X /* %X */\n", word->unknown, word->varying_parameters);
-}
-
-static void
-print_load_store_word(uint32_t *word, unsigned tabs)
-{
- midgard_load_store *load_store = (midgard_load_store *) word;
-
- if (load_store->word1 != 3) {
- print_load_store_instr(load_store->word1, tabs);
- }
-
- if (load_store->word2 != 3) {
- print_load_store_instr(load_store->word2, tabs);
- }
-}
-
-static void
-print_texture_reg(bool full, bool select, bool upper)
-{
- if (full)
- printf("r%d", REG_TEX_BASE + select);
- else
- printf("hr%d", (REG_TEX_BASE + select) * 2 + upper);
-
- if (full && upper)
- printf("// error: out full / upper mutually exclusive\n");
-
-}
-
-static void
-print_texture_reg_triple(unsigned triple)
-{
- bool full = triple & 1;
- bool select = triple & 2;
- bool upper = triple & 4;
-
- print_texture_reg(full, select, upper);
-}
-
-static void
-print_texture_format(int format)
-{
- /* Act like a modifier */
- printf(".");
-
- switch (format) {
- DEFINE_CASE(MALI_TEX_1D, "1d");
- DEFINE_CASE(MALI_TEX_2D, "2d");
- DEFINE_CASE(MALI_TEX_3D, "3d");
- DEFINE_CASE(MALI_TEX_CUBE, "cube");
-
- default:
- unreachable("Bad format");
- }
-}
-
-static void
-print_texture_op(unsigned op, bool gather)
-{
- /* Act like a bare name, like ESSL functions */
-
- if (gather) {
- printf("textureGather");
-
- unsigned component = op >> 4;
- unsigned bottom = op & 0xF;
-
- if (bottom != 0x2)
- printf("_unk%d", bottom);
-
- printf(".%c", components[component]);
- return;
- }
-
- switch (op) {
- DEFINE_CASE(TEXTURE_OP_NORMAL, "texture");
- DEFINE_CASE(TEXTURE_OP_LOD, "textureLod");
- DEFINE_CASE(TEXTURE_OP_TEXEL_FETCH, "texelFetch");
-
- default:
- printf("tex_%d", op);
- break;
- }
-}
-
-static bool
-texture_op_takes_bias(unsigned op)
-{
- return op == TEXTURE_OP_NORMAL;
-}
-
-static char
-sampler_type_name(enum mali_sampler_type t)
-{
- switch (t) {
- case MALI_SAMPLER_FLOAT:
- return 'f';
- case MALI_SAMPLER_UNSIGNED:
- return 'u';
- case MALI_SAMPLER_SIGNED:
- return 'i';
- default:
- return '?';
- }
-
-}
-
-#undef DEFINE_CASE
-
-static void
-print_texture_word(uint32_t *word, unsigned tabs)
-{
- midgard_texture_word *texture = (midgard_texture_word *) word;
-
- /* Broad category of texture operation in question */
- print_texture_op(texture->op, texture->is_gather);
-
- /* Specific format in question */
- print_texture_format(texture->format);
-
- assert(texture->zero == 0);
-
- /* Instruction "modifiers" parallel the ALU instructions. */
-
- if (texture->shadow)
- printf(".shadow");
-
- if (texture->cont)
- printf(".cont");
-
- if (texture->last)
- printf(".last");
-
- printf(" ");
-
- print_texture_reg(texture->out_full, texture->out_reg_select, texture->out_upper);
- print_mask_4(texture->mask);
- printf(", ");
-
- printf("texture%d, ", texture->texture_handle);
-
- /* Print the type, GL style */
- printf("%c", sampler_type_name(texture->sampler_type));
- printf("sampler%d", texture->sampler_handle);
- print_swizzle_vec4(texture->swizzle, false, false);
- printf(", ");
-
- print_texture_reg(texture->in_reg_full, texture->in_reg_select, texture->in_reg_upper);
- print_swizzle_vec4(texture->in_reg_swizzle, false, false);
-
- /* There is *always* an offset attached. Of
- * course, that offset is just immediate #0 for a
- * GLES call that doesn't take an offset. If there
- * is a non-negative non-zero offset, this is
- * specified in immediate offset mode, with the
- * values in the offset_* fields as immediates. If
- * this is a negative offset, we instead switch to
- * a register offset mode, where the offset_*
- * fields become register triplets */
-
- if (texture->offset_register) {
- printf(" + ");
- print_texture_reg_triple(texture->offset_x);
-
- /* The fewer questions you ask, the better. */
-
- unsigned swizzle_lo, swizzle_hi;
- unsigned orig_y = texture->offset_y;
- unsigned orig_z = texture->offset_z;
-
- memcpy(&swizzle_lo, &orig_y, sizeof(unsigned));
- memcpy(&swizzle_hi, &orig_z, sizeof(unsigned));
-
- /* Duplicate hi swizzle over */
- assert(swizzle_hi < 4);
- swizzle_hi = (swizzle_hi << 2) | swizzle_hi;
-
- unsigned swiz = (swizzle_lo << 4) | swizzle_hi;
- unsigned reversed = util_bitreverse(swiz) >> 24;
- print_swizzle_vec4(reversed, false, false);
-
- printf(", ");
- } else if (texture->offset_x || texture->offset_y || texture->offset_z) {
- /* Only select ops allow negative immediate offsets, verify */
-
- bool neg_x = texture->offset_x < 0;
- bool neg_y = texture->offset_y < 0;
- bool neg_z = texture->offset_z < 0;
- bool any_neg = neg_x || neg_y || neg_z;
-
- if (any_neg && texture->op != TEXTURE_OP_TEXEL_FETCH)
- printf("/* invalid negative */ ");
-
- /* Regardless, just print the immediate offset */
-
- printf(" + <%d, %d, %d>, ",
- texture->offset_x,
- texture->offset_y,
- texture->offset_z);
- } else {
- printf(", ");
- }
-
- char lod_operand = texture_op_takes_bias(texture->op) ? '+' : '=';
-
- if (texture->lod_register) {
- midgard_tex_register_select sel;
- uint8_t raw = texture->bias;
- memcpy(&sel, &raw, sizeof(raw));
-
- unsigned c = (sel.component_hi << 1) | sel.component_lo;
-
- printf("lod %c ", lod_operand);
- print_texture_reg(sel.full, sel.select, sel.upper);
- printf(".%c, ", components[c]);
-
- if (!sel.component_hi)
- printf(" /* gradient? */");
-
- if (texture->bias_int)
- printf(" /* bias_int = 0x%X */", texture->bias_int);
-
- if (sel.zero)
- printf(" /* sel.zero = 0x%X */", sel.zero);
- } else if (texture->op == TEXTURE_OP_TEXEL_FETCH) {
- /* For texel fetch, the int LOD is in the fractional place and
- * there is no fraction / possibility of bias. We *always* have
- * an explicit LOD, even if it's zero. */
-
- if (texture->bias_int)
- printf(" /* bias_int = 0x%X */ ", texture->bias_int);
-
- printf("lod = %d, ", texture->bias);
- } else if (texture->bias || texture->bias_int) {
- signed bias_int = texture->bias_int;
- float bias_frac = texture->bias / 256.0f;
- float bias = bias_int + bias_frac;
-
- bool is_bias = texture_op_takes_bias(texture->op);
- char sign = (bias >= 0.0) ? '+' : '-';
- char operand = is_bias ? sign : '=';
-
- printf("lod %c %f, ", operand, fabsf(bias));
- }
-
- printf("\n");
-
- /* While not zero in general, for these simple instructions the
- * following unknowns are zero, so we don't include them */
-
- if (texture->unknown2 ||
- texture->unknown4 ||
- texture->unknownA ||
- texture->unknown8) {
- printf("// unknown2 = 0x%x\n", texture->unknown2);
- printf("// unknown4 = 0x%x\n", texture->unknown4);
- printf("// unknownA = 0x%x\n", texture->unknownA);
- printf("// unknown8 = 0x%x\n", texture->unknown8);
- }
-}
-
-void
-disassemble_midgard(uint8_t *code, size_t size)
-{
- uint32_t *words = (uint32_t *) code;
- unsigned num_words = size / 4;
- int tabs = 0;
-
- bool prefetch_flag = false;
-
- unsigned i = 0;
-
- while (i < num_words) {
- unsigned tag = words[i] & 0xF;
- unsigned num_quad_words = midgard_word_size[tag];
-
- switch (midgard_word_types[tag]) {
- case midgard_word_type_texture:
- print_texture_word(&words[i], tabs);
- break;
-
- case midgard_word_type_load_store:
- print_load_store_word(&words[i], tabs);
- break;
-
- case midgard_word_type_alu:
- print_alu_word(&words[i], num_quad_words, tabs);
-
- if (prefetch_flag)
- return;
-
- /* Reset word static analysis state */
- is_embedded_constant_half = false;
- is_embedded_constant_int = false;
-
- break;
-
- default:
- printf("Unknown word type %u:\n", words[i] & 0xF);
- num_quad_words = 1;
- print_quad_word(&words[i], tabs);
- printf("\n");
- break;
- }
-
- printf("\n");
-
- unsigned next = (words[i] & 0xF0) >> 4;
-
- i += 4 * num_quad_words;
-
- /* Break based on instruction prefetch flag */
-
- if (i < num_words && next == 1) {
- prefetch_flag = true;
-
- if (midgard_word_types[words[i] & 0xF] != midgard_word_type_alu)
- return;
- }
- }
-
- return;
-}
+++ /dev/null
-#include <stddef.h>
-#include <stdint.h>
-void disassemble_midgard(uint8_t *code, size_t size);
+++ /dev/null
-/* Copyright (c) 2018-2019 Alyssa Rosenzweig (alyssa@rosenzweig.io)
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#ifndef __MDG_HELPERS_H
-#define __MDG_HELPERS_H
-
-#include "util/macros.h"
-#include <string.h>
-
-#define OP_IS_STORE_VARY(op) (\
- op == midgard_op_st_vary_16 || \
- op == midgard_op_st_vary_32 \
- )
-
-#define OP_IS_STORE(op) (\
- OP_IS_STORE_VARY(op) || \
- op == midgard_op_st_cubemap_coords \
- )
-
-#define OP_IS_MOVE(op) ( \
- op == midgard_alu_op_fmov || \
- op == midgard_alu_op_imov \
- )
-
-/* ALU control words are single-bit fields with a lot of space */
-
-#define ALU_ENAB_VEC_MUL (1 << 17)
-#define ALU_ENAB_SCAL_ADD (1 << 19)
-#define ALU_ENAB_VEC_ADD (1 << 21)
-#define ALU_ENAB_SCAL_MUL (1 << 23)
-#define ALU_ENAB_VEC_LUT (1 << 25)
-#define ALU_ENAB_BR_COMPACT (1 << 26)
-#define ALU_ENAB_BRANCH (1 << 27)
-
-/* Other opcode properties that don't conflict with the ALU_ENABs, non-ISA */
-
-/* Denotes an opcode that takes a vector input with a fixed-number of
- * channels, but outputs to only a single output channel, like dot products.
- * For these, to determine the effective mask, this quirk can be set. We have
- * an intentional off-by-one (a la MALI_POSITIVE), since 0-channel makes no
- * sense but we need to fit 4 channels in 2 bits. Similarly, 1-channel doesn't
- * make sense (since then why are we quirked?), so that corresponds to "no
- * count set" */
-
-#define OP_CHANNEL_COUNT(c) ((c - 1) << 0)
-#define GET_CHANNEL_COUNT(c) ((c & (0x3 << 0)) ? ((c & (0x3 << 0)) + 1) : 0)
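-/* For example, OP_CHANNEL_COUNT(3) stores 2 in the low two bits of the props
- * word and GET_CHANNEL_COUNT() recovers 3, while a props word without the
- * quirk (low bits clear) reads back as 0, i.e. no forced channel count */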
-
-/* For instructions that take a single argument, normally the first argument
- * slot is used for the argument and the second slot is a dummy #0 constant.
- * However, there are exceptions: instructions like fmov store their argument
- * in the _second_ slot and store a dummy r24 in the first slot, designated by
- * QUIRK_FLIPPED_R24 */
-
-#define QUIRK_FLIPPED_R24 (1 << 2)
-
-/* Is the op commutative? */
-#define OP_COMMUTES (1 << 3)
-
-/* Does the op convert types between int- and float- space (i2f/f2u/etc) */
-#define OP_TYPE_CONVERT (1 << 4)
-
-/* Vector-independent shorthands for the above; these numbers are arbitrary and
- * not from the ISA. Convert to the above with unit_enum_to_midgard */
-
-#define UNIT_MUL 0
-#define UNIT_ADD 1
-#define UNIT_LUT 2
-
-/* 4-bit type tags */
-
-#define TAG_TEXTURE_4_VTX 0x2
-#define TAG_TEXTURE_4 0x3
-#define TAG_LOAD_STORE_4 0x5
-#define TAG_ALU_4 0x8
-#define TAG_ALU_8 0x9
-#define TAG_ALU_12 0xA
-#define TAG_ALU_16 0xB
-
-static inline int
-quadword_size(int tag)
-{
- switch (tag) {
- case TAG_ALU_4:
- case TAG_LOAD_STORE_4:
- case TAG_TEXTURE_4:
- case TAG_TEXTURE_4_VTX:
- return 1;
- case TAG_ALU_8:
- return 2;
- case TAG_ALU_12:
- return 3;
- case TAG_ALU_16:
- return 4;
- default:
- unreachable("Unknown tag");
- }
-}
-
-#define IS_ALU(tag) (tag == TAG_ALU_4 || tag == TAG_ALU_8 || \
- tag == TAG_ALU_12 || tag == TAG_ALU_16)
-
-/* Special register aliases */
-
-#define MAX_WORK_REGISTERS 16
-
-/* Uniforms begin at (REGISTER_UNIFORMS - uniform_count) */
-#define REGISTER_UNIFORMS 24
-
-#define REGISTER_UNUSED 24
-#define REGISTER_CONSTANT 26
-#define REGISTER_VARYING_BASE 26
-#define REGISTER_OFFSET 27
-#define REGISTER_TEXTURE_BASE 28
-#define REGISTER_SELECT 31
-
-/* SSA helper aliases to mimic the registers. UNUSED_0 encoded as an inline
- * constant. UNUSED_1 encoded as REGISTER_UNUSED */
-
-#define SSA_UNUSED_0 0
-#define SSA_UNUSED_1 -2
-
-#define SSA_FIXED_SHIFT 24
-#define SSA_FIXED_REGISTER(reg) ((1 + reg) << SSA_FIXED_SHIFT)
-#define SSA_REG_FROM_FIXED(reg) ((reg >> SSA_FIXED_SHIFT) - 1)
-#define SSA_FIXED_MINIMUM SSA_FIXED_REGISTER(0)
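-/* For example, SSA_FIXED_REGISTER(2) gives 0x03000000 and
- * SSA_REG_FROM_FIXED(0x03000000) gives back 2; any index of at least
- * SSA_FIXED_MINIMUM can presumably be treated as a fixed register rather
- * than a true SSA value */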
-
-/* Swizzle support */
-
-#define SWIZZLE(A, B, C, D) ((D << 6) | (C << 4) | (B << 2) | (A << 0))
-#define SWIZZLE_FROM_ARRAY(r) SWIZZLE(r[0], r[1], r[2], r[3])
-#define COMPONENT_X 0x0
-#define COMPONENT_Y 0x1
-#define COMPONENT_Z 0x2
-#define COMPONENT_W 0x3
-
-#define SWIZZLE_XXXX SWIZZLE(COMPONENT_X, COMPONENT_X, COMPONENT_X, COMPONENT_X)
-#define SWIZZLE_XYXX SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_X, COMPONENT_X)
-#define SWIZZLE_XYZX SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_X)
-#define SWIZZLE_XYZW SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W)
-#define SWIZZLE_XYXZ SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_X, COMPONENT_Z)
-#define SWIZZLE_XYZZ SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_Z)
-#define SWIZZLE_WWWW SWIZZLE(COMPONENT_W, COMPONENT_W, COMPONENT_W, COMPONENT_W)
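-/* Component A lands in the low two bits, so for instance SWIZZLE_XYZW (the
- * identity) packs to 0xE4, SWIZZLE_XXXX to 0x00 and SWIZZLE_WWWW to 0xFF */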
-
-static inline unsigned
-swizzle_of(unsigned comp)
-{
- switch (comp) {
- case 1:
- return SWIZZLE_XXXX;
- case 2:
- return SWIZZLE_XYXX;
- case 3:
- return SWIZZLE_XYZX;
- case 4:
- return SWIZZLE_XYZW;
- default:
- unreachable("Invalid component count");
- }
-}
-
-static inline unsigned
-mask_of(unsigned nr_comp)
-{
- return (1 << nr_comp) - 1;
-}
-
-
-/* See ISA notes */
-
-#define LDST_NOP (3)
-
-/* There are five ALU units: VMUL, VADD, SMUL, SADD, LUT. A given opcode is
- * implemented on some subset of these units (or occasionally all of them).
- * This table encodes a bit mask of valid units for each opcode, so the
- * scheduler can figure out where to plonk the instruction. */
-
-/* Shorthands for each unit */
-#define UNIT_VMUL ALU_ENAB_VEC_MUL
-#define UNIT_SADD ALU_ENAB_SCAL_ADD
-#define UNIT_VADD ALU_ENAB_VEC_ADD
-#define UNIT_SMUL ALU_ENAB_SCAL_MUL
-#define UNIT_VLUT ALU_ENAB_VEC_LUT
-
-/* Shorthands for usual combinations of units */
-
-#define UNITS_MUL (UNIT_VMUL | UNIT_SMUL)
-#define UNITS_ADD (UNIT_VADD | UNIT_SADD)
-#define UNITS_MOST (UNITS_MUL | UNITS_ADD)
-#define UNITS_ALL (UNITS_MOST | UNIT_VLUT)
-#define UNITS_SCALAR (UNIT_SADD | UNIT_SMUL)
-#define UNITS_VECTOR (UNIT_VMUL | UNIT_VADD)
-#define UNITS_ANY_VECTOR (UNITS_VECTOR | UNIT_VLUT)
-
-struct mir_op_props {
- const char *name;
- unsigned props;
-};
-
-/* This file is common, so the tables themselves are not defined here.
- * #include "midgard_ops.h" if you need them, or edit midgard_ops.c directly */
-
-/* Duplicate bits to convert a 4-bit writemask to duplicated 8-bit format,
- * which is used for 32-bit vector units */
-
-static inline unsigned
-expand_writemask_32(unsigned mask)
-{
- unsigned o = 0;
-
- for (int i = 0; i < 4; ++i)
- if (mask & (1 << i))
- o |= (3 << (2 * i));
-
- return o;
-}
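-
-/* For example, expand_writemask_32(0x5) -- x and z enabled -- returns 0x33,
- * and a full 0xF mask expands to 0xFF */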
-
-/* Coerce structs to integer */
-
-static inline unsigned
-vector_alu_srco_unsigned(midgard_vector_alu_src src)
-{
- unsigned u;
- memcpy(&u, &src, sizeof(src));
- return u;
-}
-
-static inline midgard_vector_alu_src
-vector_alu_from_unsigned(unsigned u)
-{
- midgard_vector_alu_src s;
- memcpy(&s, &u, sizeof(s));
- return s;
-}
-
-/* Composes two swizzles */
-static inline unsigned
-pan_compose_swizzle(unsigned left, unsigned right)
-{
- unsigned out = 0;
-
- for (unsigned c = 0; c < 4; ++c) {
- unsigned s = (left >> (2*c)) & 0x3;
- unsigned q = (right >> (2*s)) & 0x3;
-
- out |= (q << (2*c));
- }
-
- return out;
-}
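-
-/* The loop above computes out[c] = right[left[c]], so SWIZZLE_XYZW is an
- * identity on either side: pan_compose_swizzle(s, SWIZZLE_XYZW) and
- * pan_compose_swizzle(SWIZZLE_XYZW, s) both return s */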
-
-/* Applies a swizzle to an ALU source */
-
-static inline unsigned
-vector_alu_apply_swizzle(unsigned src, unsigned swizzle)
-{
- midgard_vector_alu_src s =
- vector_alu_from_unsigned(src);
-
- s.swizzle = pan_compose_swizzle(s.swizzle, swizzle);
-
- return vector_alu_srco_unsigned(s);
-}
-
-#endif
+++ /dev/null
-/* Author(s):
- * Connor Abbott
- * Alyssa Rosenzweig
- *
- * Copyright (c) 2013 Connor Abbott (connor@abbott.cx)
- * Copyright (c) 2018 Alyssa Rosenzweig (alyssa@rosenzweig.io)
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#ifndef __midgard_parse_h__
-#define __midgard_parse_h__
-
-/* Additional metadata for parsing Midgard binaries, not needed for compilation */
-
-static midgard_word_type midgard_word_types[16] = {
- midgard_word_type_unknown, /* 0x0 */
- midgard_word_type_unknown, /* 0x1 */
- midgard_word_type_texture, /* 0x2 */
- midgard_word_type_texture, /* 0x3 */
- midgard_word_type_unknown, /* 0x4 */
- midgard_word_type_load_store, /* 0x5 */
- midgard_word_type_unknown, /* 0x6 */
- midgard_word_type_unknown, /* 0x7 */
- midgard_word_type_alu, /* 0x8 */
- midgard_word_type_alu, /* 0x9 */
- midgard_word_type_alu, /* 0xA */
- midgard_word_type_alu, /* 0xB */
- midgard_word_type_alu, /* 0xC */
- midgard_word_type_alu, /* 0xD */
- midgard_word_type_alu, /* 0xE */
- midgard_word_type_alu, /* 0xF */
-};
-
-static unsigned midgard_word_size[16] = {
- 0, /* 0x0 */
- 0, /* 0x1 */
- 1, /* 0x2 */
- 1, /* 0x3 */
- 0, /* 0x4 */
- 1, /* 0x5 */
- 0, /* 0x6 */
- 0, /* 0x7 */
- 1, /* 0x8 */
- 2, /* 0x9 */
- 3, /* 0xA */
- 4, /* 0xB */
- 1, /* 0xC */
- 2, /* 0xD */
- 3, /* 0xE */
- 4, /* 0xF */
-};
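-
-/* Sizes are in 128-bit quadwords: an ALU_8 word (tag 0x9) spans 2 quadwords,
- * i.e. 32 bytes, matching quadword_size() in helpers.h and the disassembler
- * loop, which advances four 32-bit words per quadword */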
-
-#endif
+++ /dev/null
-/* Author(s):
- * Connor Abbott
- * Alyssa Rosenzweig
- *
- * Copyright (c) 2013 Connor Abbott (connor@abbott.cx)
- * Copyright (c) 2018 Alyssa Rosenzweig (alyssa@rosenzweig.io)
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#ifndef __midgard_h__
-#define __midgard_h__
-
-#include <stdint.h>
-#include <stdbool.h>
-#include "panfrost-job.h"
-
-#define MIDGARD_DBG_MSGS 0x0001
-#define MIDGARD_DBG_SHADERS 0x0002
-#define MIDGARD_DBG_SHADERDB 0x0004
-
-extern int midgard_debug;
-
-typedef enum {
- midgard_word_type_alu,
- midgard_word_type_load_store,
- midgard_word_type_texture,
- midgard_word_type_unknown
-} midgard_word_type;
-
-typedef enum {
- midgard_alu_vmul,
- midgard_alu_sadd,
- midgard_alu_smul,
- midgard_alu_vadd,
- midgard_alu_lut
-} midgard_alu;
-
-/*
- * ALU words
- */
-
-typedef enum {
- midgard_alu_op_fadd = 0x10,
- midgard_alu_op_fmul = 0x14,
-
- midgard_alu_op_fmin = 0x28,
- midgard_alu_op_fmax = 0x2C,
-
- midgard_alu_op_fmov = 0x30, /* fmov_rte */
- midgard_alu_op_fmov_rtz = 0x31,
- midgard_alu_op_fmov_rtn = 0x32,
- midgard_alu_op_fmov_rtp = 0x33,
- midgard_alu_op_froundeven = 0x34,
- midgard_alu_op_ftrunc = 0x35,
- midgard_alu_op_ffloor = 0x36,
- midgard_alu_op_fceil = 0x37,
- midgard_alu_op_ffma = 0x38,
- midgard_alu_op_fdot3 = 0x3C,
- midgard_alu_op_fdot3r = 0x3D,
- midgard_alu_op_fdot4 = 0x3E,
- midgard_alu_op_freduce = 0x3F,
-
- midgard_alu_op_iadd = 0x40,
- midgard_alu_op_ishladd = 0x41,
- midgard_alu_op_isub = 0x46,
- midgard_alu_op_iaddsat = 0x48,
- midgard_alu_op_uaddsat = 0x49,
- midgard_alu_op_isubsat = 0x4E,
- midgard_alu_op_usubsat = 0x4F,
-
- midgard_alu_op_imul = 0x58,
-
- midgard_alu_op_imin = 0x60,
- midgard_alu_op_umin = 0x61,
- midgard_alu_op_imax = 0x62,
- midgard_alu_op_umax = 0x63,
- midgard_alu_op_ihadd = 0x64,
- midgard_alu_op_uhadd = 0x65,
- midgard_alu_op_irhadd = 0x66,
- midgard_alu_op_urhadd = 0x67,
- midgard_alu_op_iasr = 0x68,
- midgard_alu_op_ilsr = 0x69,
- midgard_alu_op_ishl = 0x6E,
-
- midgard_alu_op_iand = 0x70,
- midgard_alu_op_ior = 0x71,
- midgard_alu_op_inand = 0x72, /* ~(a & b), for inot let a = b */
- midgard_alu_op_inor = 0x73, /* ~(a | b) */
- midgard_alu_op_iandnot = 0x74, /* (a & ~b), used for not/b2f */
- midgard_alu_op_iornot = 0x75, /* (a | ~b) */
- midgard_alu_op_ixor = 0x76,
-        midgard_alu_op_inxor         = 0x77, /* ~(a ^ b) */
- midgard_alu_op_iclz = 0x78, /* Number of zeroes on left */
- midgard_alu_op_ibitcount8 = 0x7A, /* Counts bits in 8-bit increments */
- midgard_alu_op_imov = 0x7B,
- midgard_alu_op_iabsdiff = 0x7C,
- midgard_alu_op_uabsdiff = 0x7D,
- midgard_alu_op_ichoose = 0x7E, /* vector, component number - dupe for shuffle() */
-
- midgard_alu_op_feq = 0x80,
- midgard_alu_op_fne = 0x81,
- midgard_alu_op_flt = 0x82,
- midgard_alu_op_fle = 0x83,
- midgard_alu_op_fball_eq = 0x88,
- midgard_alu_op_bball_eq = 0x89,
- midgard_alu_op_fball_lt = 0x8A, /* all(lessThan(.., ..)) */
- midgard_alu_op_fball_lte = 0x8B, /* all(lessThanEqual(.., ..)) */
-
- midgard_alu_op_bbany_neq = 0x90, /* used for bvec4(1) */
- midgard_alu_op_fbany_neq = 0x91, /* bvec4(0) also */
- midgard_alu_op_fbany_lt = 0x92, /* any(lessThan(.., ..)) */
- midgard_alu_op_fbany_lte = 0x93, /* any(lessThanEqual(.., ..)) */
-
- midgard_alu_op_f2i_rte = 0x98,
- midgard_alu_op_f2i_rtz = 0x99,
- midgard_alu_op_f2i_rtn = 0x9A,
- midgard_alu_op_f2i_rtp = 0x9B,
- midgard_alu_op_f2u_rte = 0x9C,
- midgard_alu_op_f2u_rtz = 0x9D,
- midgard_alu_op_f2u_rtn = 0x9E,
- midgard_alu_op_f2u_rtp = 0x9F,
-
- midgard_alu_op_ieq = 0xA0,
- midgard_alu_op_ine = 0xA1,
- midgard_alu_op_ult = 0xA2,
- midgard_alu_op_ule = 0xA3,
- midgard_alu_op_ilt = 0xA4,
- midgard_alu_op_ile = 0xA5,
- midgard_alu_op_iball_eq = 0xA8,
- midgard_alu_op_iball_neq = 0xA9,
- midgard_alu_op_uball_lt = 0xAA,
- midgard_alu_op_uball_lte = 0xAB,
- midgard_alu_op_iball_lt = 0xAC,
- midgard_alu_op_iball_lte = 0xAD,
-
- midgard_alu_op_ibany_eq = 0xB0,
- midgard_alu_op_ibany_neq = 0xB1,
- midgard_alu_op_ubany_lt = 0xB2,
- midgard_alu_op_ubany_lte = 0xB3,
- midgard_alu_op_ibany_lt = 0xB4, /* any(lessThan(.., ..)) */
- midgard_alu_op_ibany_lte = 0xB5, /* any(lessThanEqual(.., ..)) */
- midgard_alu_op_i2f_rte = 0xB8,
- midgard_alu_op_i2f_rtz = 0xB9,
- midgard_alu_op_i2f_rtn = 0xBA,
- midgard_alu_op_i2f_rtp = 0xBB,
- midgard_alu_op_u2f_rte = 0xBC,
- midgard_alu_op_u2f_rtz = 0xBD,
- midgard_alu_op_u2f_rtn = 0xBE,
- midgard_alu_op_u2f_rtp = 0xBF,
-
- midgard_alu_op_icsel_v = 0xC0, /* condition code r31 */
- midgard_alu_op_icsel = 0xC1, /* condition code r31.w */
- midgard_alu_op_fcsel_v = 0xC4,
- midgard_alu_op_fcsel = 0xC5,
- midgard_alu_op_fround = 0xC6,
-
- midgard_alu_op_fatan_pt2 = 0xE8,
- midgard_alu_op_fpow_pt1 = 0xEC,
- midgard_alu_op_fpown_pt1 = 0xED,
- midgard_alu_op_fpowr_pt1 = 0xEE,
-
- midgard_alu_op_frcp = 0xF0,
- midgard_alu_op_frsqrt = 0xF2,
- midgard_alu_op_fsqrt = 0xF3,
- midgard_alu_op_fexp2 = 0xF4,
- midgard_alu_op_flog2 = 0xF5,
- midgard_alu_op_fsin = 0xF6,
- midgard_alu_op_fcos = 0xF7,
- midgard_alu_op_fatan2_pt1 = 0xF9,
-} midgard_alu_op;
-
-typedef enum {
- midgard_outmod_none = 0,
- midgard_outmod_pos = 1,
- /* 0x2 unknown */
- midgard_outmod_sat = 3
-} midgard_outmod_float;
-
-typedef enum {
- midgard_outmod_int_saturate = 0,
- midgard_outmod_uint_saturate = 1,
- midgard_outmod_int_wrap = 2,
- midgard_outmod_int_high = 3, /* Overflowed portion */
-} midgard_outmod_int;
-
-typedef enum {
- midgard_reg_mode_8 = 0,
- midgard_reg_mode_16 = 1,
- midgard_reg_mode_32 = 2,
- midgard_reg_mode_64 = 3
-} midgard_reg_mode;
-
-typedef enum {
- midgard_dest_override_lower = 0,
- midgard_dest_override_upper = 1,
- midgard_dest_override_none = 2
-} midgard_dest_override;
-
-typedef enum {
- midgard_int_sign_extend = 0,
- midgard_int_zero_extend = 1,
- midgard_int_normal = 2,
- midgard_int_shift = 3
-} midgard_int_mod;
-
-#define MIDGARD_FLOAT_MOD_ABS (1 << 0)
-#define MIDGARD_FLOAT_MOD_NEG (1 << 1)
-
-typedef struct
-__attribute__((__packed__))
-{
-        /* Either a midgard_int_mod or a mask of MIDGARD_FLOAT_MOD_* flags,
-         * depending on the type of op */
- unsigned mod : 2;
-
- /* replicate lower half if dest = half, or low/high half selection if
- * dest = full
- */
- bool rep_low : 1;
- bool rep_high : 1; /* unused if dest = full */
- bool half : 1; /* only matters if dest = full */
- unsigned swizzle : 8;
-}
-midgard_vector_alu_src;
-
-typedef struct
-__attribute__((__packed__))
-{
- midgard_alu_op op : 8;
- midgard_reg_mode reg_mode : 2;
- unsigned src1 : 13;
- unsigned src2 : 13;
- midgard_dest_override dest_override : 2;
- midgard_outmod_float outmod : 2;
- unsigned mask : 8;
-}
-midgard_vector_alu;
-
-typedef struct
-__attribute__((__packed__))
-{
- bool abs : 1;
- bool negate : 1;
- bool full : 1; /* 0 = half, 1 = full */
- unsigned component : 3;
-}
-midgard_scalar_alu_src;
-
-typedef struct
-__attribute__((__packed__))
-{
- midgard_alu_op op : 8;
- unsigned src1 : 6;
- unsigned src2 : 11;
- unsigned unknown : 1;
- unsigned outmod : 2;
- bool output_full : 1;
- unsigned output_component : 3;
-}
-midgard_scalar_alu;
-
-typedef struct
-__attribute__((__packed__))
-{
- unsigned src1_reg : 5;
- unsigned src2_reg : 5;
- unsigned out_reg : 5;
- bool src2_imm : 1;
-}
-midgard_reg_info;
-
-/* In addition to conditional branches and jumps (unconditional branches),
- * Midgard implements a bit of fixed function functionality used in fragment
- * shaders via specially crafted branches. These have special branch opcodes,
- * which perform a fixed-function operation and/or use the results of a
- * fixed-function operation as the branch condition. */
-
-typedef enum {
- /* Regular branches */
- midgard_jmp_writeout_op_branch_uncond = 1,
- midgard_jmp_writeout_op_branch_cond = 2,
-
- /* In a fragment shader, execute a discard_if instruction, with the
-         * corresponding condition code. This terminates the shader, so the
-         * branch target is generally set past the end of the shader */
- midgard_jmp_writeout_op_discard = 4,
-
- /* Branch if the tilebuffer is not yet ready. At the beginning of a
- * fragment shader that reads from the tile buffer, for instance via
- * ARM_shader_framebuffer_fetch or EXT_pixel_local_storage, this branch
- * operation should be used as a loop. An instruction like
- * "br.tilebuffer.always -1" does the trick, corresponding to
-         * "while (!is_tilebuffer_ready)" */
- midgard_jmp_writeout_op_tilebuffer_pending = 6,
-
- /* In a fragment shader, try to write out the value pushed to r0 to the
- * tilebuffer, subject to unknown state in r1.z and r1.w. If this
- * succeeds, the shader terminates. If it fails, it branches to the
- * specified branch target. Generally, this should be used in a loop to
- * itself, acting as "do { write(r0); } while(!write_successful);" */
- midgard_jmp_writeout_op_writeout = 7,
-} midgard_jmp_writeout_op;
-
-typedef enum {
- midgard_condition_write0 = 0,
-
- /* These condition codes denote a conditional branch on FALSE and on
- * TRUE respectively */
- midgard_condition_false = 1,
- midgard_condition_true = 2,
-
- /* This condition code always branches. For a pure branch, the
- * unconditional branch coding should be used instead, but for
- * fixed-function branch opcodes, this is still useful */
- midgard_condition_always = 3,
-} midgard_condition;
-
-typedef struct
-__attribute__((__packed__))
-{
- midgard_jmp_writeout_op op : 3; /* == branch_uncond */
- unsigned dest_tag : 4; /* tag of branch destination */
- unsigned unknown : 2;
- int offset : 7;
-}
-midgard_branch_uncond;
-
-typedef struct
-__attribute__((__packed__))
-{
- midgard_jmp_writeout_op op : 3; /* == branch_cond */
- unsigned dest_tag : 4; /* tag of branch destination */
- int offset : 7;
- midgard_condition cond : 2;
-}
-midgard_branch_cond;
-
-typedef struct
-__attribute__((__packed__))
-{
- midgard_jmp_writeout_op op : 3; /* == branch_cond */
- unsigned dest_tag : 4; /* tag of branch destination */
- unsigned unknown : 2;
- signed offset : 23;
- unsigned cond : 16;
-}
-midgard_branch_extended;
-
-typedef struct
-__attribute__((__packed__))
-{
- midgard_jmp_writeout_op op : 3; /* == writeout */
- unsigned unknown : 13;
-}
-midgard_writeout;
-
-/*
- * Load/store words
- */
-
-typedef enum {
- midgard_op_ld_st_noop = 0x03,
-
- /* Unclear why this is on the L/S unit, but (with an address of 0,
- * appropriate swizzle, magic constant 0x24, and xy mask?) moves fp32 cube
- * map coordinates in r27 to its cube map texture coordinate
-         * destination (e.g. r29). Magic 0x4 is for loading from fp16 instead */
-
- midgard_op_st_cubemap_coords = 0x0E,
-
- /* Used in OpenCL. Probably can ld other things as well */
- midgard_op_ld_global_id = 0x10,
-
- /* The L/S unit can do perspective division a clock faster than the ALU
- * if you're lucky. Put the vec4 in r27, and call with 0x24 as the
- * unknown state; the output will be <x/w, y/w, z/w, 1>. Replace w with
- * z for the z version */
- midgard_op_ldst_perspective_division_z = 0x12,
- midgard_op_ldst_perspective_division_w = 0x13,
-
- /* val in r27.y, address embedded, outputs result to argument. Invert val for sub. Let val = +-1 for inc/dec. */
- midgard_op_atomic_add = 0x40,
- midgard_op_atomic_and = 0x44,
- midgard_op_atomic_or = 0x48,
- midgard_op_atomic_xor = 0x4C,
-
- midgard_op_atomic_imin = 0x50,
- midgard_op_atomic_umin = 0x54,
- midgard_op_atomic_imax = 0x58,
- midgard_op_atomic_umax = 0x5C,
-
- midgard_op_atomic_xchg = 0x60,
-
-        /* Used for compute shaders' __global arguments and __local variables
-         * (or for register spilling) */
-
- midgard_op_ld_char = 0x81,
- midgard_op_ld_char2 = 0x84,
- midgard_op_ld_short = 0x85,
- midgard_op_ld_char4 = 0x88, /* short2, int, float */
- midgard_op_ld_short4 = 0x8C, /* int2, float2, long */
- midgard_op_ld_int4 = 0x90, /* float4, long2 */
-
- midgard_op_ld_attr_32 = 0x94,
- midgard_op_ld_attr_16 = 0x95,
- midgard_op_ld_attr_32u = 0x96,
- midgard_op_ld_attr_32i = 0x97,
- midgard_op_ld_vary_32 = 0x98,
- midgard_op_ld_vary_16 = 0x99,
- midgard_op_ld_vary_32u = 0x9A,
- midgard_op_ld_vary_32i = 0x9B,
- midgard_op_ld_color_buffer_16 = 0x9D,
-
- midgard_op_ld_uniform_16 = 0xAC,
- midgard_op_ld_uniform_32i = 0xA8,
-
- midgard_op_ld_uniform_32 = 0xB0,
- midgard_op_ld_color_buffer_8 = 0xBA,
-
- midgard_op_st_char = 0xC0,
- midgard_op_st_char2 = 0xC4, /* short */
- midgard_op_st_char4 = 0xC8, /* short2, int, float */
- midgard_op_st_short4 = 0xCC, /* int2, float2, long */
- midgard_op_st_int4 = 0xD0, /* float4, long2 */
-
- midgard_op_st_vary_32 = 0xD4,
- midgard_op_st_vary_16 = 0xD5,
- midgard_op_st_vary_32u = 0xD6,
- midgard_op_st_vary_32i = 0xD7,
-
- /* Value to st in r27, location r26.w as short2 */
- midgard_op_st_image_f = 0xD8,
- midgard_op_st_image_ui = 0xDA,
- midgard_op_st_image_i = 0xDB,
-} midgard_load_store_op;
-
-typedef enum {
- midgard_interp_centroid = 1,
- midgard_interp_default = 2
-} midgard_interpolation;
-
-typedef enum {
- midgard_varying_mod_none = 0,
-
- /* Other values unknown */
-
- /* Take the would-be result and divide all components by its z/w
- * (perspective division baked in with the load) */
- midgard_varying_mod_perspective_z = 2,
- midgard_varying_mod_perspective_w = 3,
-} midgard_varying_modifier;
-
-typedef struct
-__attribute__((__packed__))
-{
- unsigned zero0 : 1; /* Always zero */
-
- midgard_varying_modifier modifier : 2;
-
- unsigned zero1: 1; /* Always zero */
-
- /* Varying qualifiers, zero if not a varying */
- unsigned flat : 1;
- unsigned is_varying : 1; /* Always one for varying, but maybe something else? */
- midgard_interpolation interpolation : 2;
-
- unsigned zero2 : 2; /* Always zero */
-}
-midgard_varying_parameter;
-
-typedef struct
-__attribute__((__packed__))
-{
- midgard_load_store_op op : 8;
- unsigned reg : 5;
- unsigned mask : 4;
- unsigned swizzle : 8;
- unsigned unknown : 16;
-
- unsigned varying_parameters : 10;
-
- unsigned address : 9;
-}
-midgard_load_store_word;
-
-typedef struct
-__attribute__((__packed__))
-{
- unsigned type : 4;
- unsigned next_type : 4;
- uint64_t word1 : 60;
- uint64_t word2 : 60;
-}
-midgard_load_store;
-
-/* 8-bit register selector used in texture ops to select a bias/LOD/gradient
- * register, shoved into the `bias` field */
-
-typedef struct
-__attribute__((__packed__))
-{
- /* Combines with component_hi to form 2-bit component select out of
- * xyzw, as the component for bias/LOD and the starting component of a
- * gradient vector */
-
- unsigned component_lo : 1;
-
- /* Register select between r28/r29 */
- unsigned select : 1;
-
- /* For a half-register, selects the upper half */
- unsigned upper : 1;
-
- /* Specifies a full-register, clear for a half-register. Mutually
- * exclusive with upper. */
- unsigned full : 1;
-
-        /* Upper bit of the 2-bit component select (component_lo is the lower
-         * bit). Always seen to be set for LOD/bias and clear for processed
-         * gradients, but I'm not sure if that's a hardware requirement. */
- unsigned component_hi : 1;
-
- /* Padding to make this 8-bit */
- unsigned zero : 3;
-}
-midgard_tex_register_select;
-
-/* Texture pipeline results are in r28-r29 */
-#define REG_TEX_BASE 28
-
-/* Texture opcodes... maybe? */
-#define TEXTURE_OP_NORMAL 0x11 /* texture */
-#define TEXTURE_OP_LOD 0x12 /* textureLod */
-#define TEXTURE_OP_TEXEL_FETCH 0x14 /* texelFetch */
-
-enum mali_sampler_type {
- MALI_SAMPLER_UNK = 0x0,
- MALI_SAMPLER_FLOAT = 0x1, /* sampler */
- MALI_SAMPLER_UNSIGNED = 0x2, /* usampler */
- MALI_SAMPLER_SIGNED = 0x3, /* isampler */
-};
-
-typedef struct
-__attribute__((__packed__))
-{
- unsigned type : 4;
- unsigned next_type : 4;
-
- unsigned op : 6;
- unsigned shadow : 1;
- unsigned is_gather : 1;
-
- /* A little obscure, but last is set for the last texture operation in
- * a shader. cont appears to just be last's opposite (?). Yeah, I know,
- * kind of funky.. BiOpen thinks it could do with memory hinting, or
- * tile locking? */
-
- unsigned cont : 1;
- unsigned last : 1;
-
- enum mali_texture_type format : 2;
- unsigned zero : 2;
-
- /* Is a register used to specify the
-         * LOD/bias/gradient? If set, use the `bias` field as
- * a register index. If clear, use the `bias` field
- * as an immediate. */
- unsigned lod_register : 1;
-
- /* Is a register used to specify an offset? If set, use the
-         * offset_* fields to encode this, duplicated for each of the
-         * components. If clear, there is implicitly always an immediate
-         * offset specified in the offset_* fields */
- unsigned offset_register : 1;
-
- unsigned in_reg_full : 1;
- unsigned in_reg_select : 1;
- unsigned in_reg_upper : 1;
- unsigned in_reg_swizzle : 8;
-
- unsigned unknown8 : 2;
-
- unsigned out_full : 1;
-
- enum mali_sampler_type sampler_type : 2;
-
- unsigned out_reg_select : 1;
- unsigned out_upper : 1;
-
- unsigned mask : 4;
-
- unsigned unknown2 : 2;
-
- unsigned swizzle : 8;
- unsigned unknown4 : 8;
-
- unsigned unknownA : 4;
-
-        /* In immediate mode, each offset field is an immediate in the range [0, 7].
- *
- * In register mode, offset_x becomes a register full / select / upper
- * triplet and a vec3 swizzle is splattered across offset_y/offset_z in
- * a genuinely bizarre way.
- *
- * For texel fetches in immediate mode, the range is the full [-8, 7],
- * but for normal texturing the top bit must be zero and a register
- * used instead. It's not clear where this limitation is from. */
-
- signed offset_x : 4;
- signed offset_y : 4;
- signed offset_z : 4;
-
- /* In immediate bias mode, for a normal texture op, this is
- * texture bias, computed as int(2^8 * frac(biasf)), with
-         * bias_int = floor(biasf). For a textureLod, it's that, but
- * s/bias/lod. For a texel fetch, this is the LOD as-is.
- *
- * In register mode, this is a midgard_tex_register_select
- * structure and bias_int is zero */
-
- unsigned bias : 8;
- signed bias_int : 8;
-
- unsigned texture_handle : 16;
- unsigned sampler_handle : 16;
-}
-midgard_texture_word;
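-
-/* Worked example of the immediate bias encoding described above: a bias of
- * 2.25 on a normal texture op should encode as bias_int = 2 and
- * bias = 0x40 (0.25 * 256); the disassembler reconstructs it as
- * 2 + 64 / 256.0 = 2.25 */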
-
-#endif
+++ /dev/null
-/*
- * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-#include <fcntl.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <err.h>
-
-#include "main/mtypes.h"
-#include "compiler/glsl/glsl_to_nir.h"
-#include "compiler/nir_types.h"
-#include "main/imports.h"
-#include "compiler/nir/nir_builder.h"
-#include "util/half_float.h"
-#include "util/u_math.h"
-#include "util/u_debug.h"
-#include "util/u_dynarray.h"
-#include "util/list.h"
-#include "main/mtypes.h"
-
-#include "midgard.h"
-#include "midgard_nir.h"
-#include "midgard_compile.h"
-#include "midgard_ops.h"
-#include "helpers.h"
-#include "compiler.h"
-
-#include "disassemble.h"
-
-static const struct debug_named_value debug_options[] = {
- {"msgs", MIDGARD_DBG_MSGS, "Print debug messages"},
- {"shaders", MIDGARD_DBG_SHADERS, "Dump shaders in NIR and MIR"},
- {"shaderdb", MIDGARD_DBG_SHADERDB, "Prints shader-db statistics"},
- DEBUG_NAMED_VALUE_END
-};
-
-DEBUG_GET_ONCE_FLAGS_OPTION(midgard_debug, "MIDGARD_MESA_DEBUG", debug_options, 0)
-
-unsigned SHADER_DB_COUNT = 0;
-
-int midgard_debug = 0;
-
-#define DBG(fmt, ...) \
- do { if (midgard_debug & MIDGARD_DBG_MSGS) \
- fprintf(stderr, "%s:%d: "fmt, \
- __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
-
-static bool
-midgard_is_branch_unit(unsigned unit)
-{
- return (unit == ALU_ENAB_BRANCH) || (unit == ALU_ENAB_BR_COMPACT);
-}
-
-static void
-midgard_block_add_successor(midgard_block *block, midgard_block *successor)
-{
- block->successors[block->nr_successors++] = successor;
- assert(block->nr_successors <= ARRAY_SIZE(block->successors));
-}
-
-/* Helpers to generate midgard_instructions using macro magic, since every
- * driver seems to do it that way */
-
-#define EMIT(op, ...) emit_mir_instruction(ctx, v_##op(__VA_ARGS__));
-
-#define M_LOAD_STORE(name, rname, uname) \
- static midgard_instruction m_##name(unsigned ssa, unsigned address) { \
- midgard_instruction i = { \
- .type = TAG_LOAD_STORE_4, \
- .mask = 0xF, \
- .ssa_args = { \
- .rname = ssa, \
- .uname = -1, \
- .src1 = -1 \
- }, \
- .load_store = { \
- .op = midgard_op_##name, \
- .swizzle = SWIZZLE_XYZW, \
- .address = address \
- } \
- }; \
- \
- return i; \
- }
-
-#define M_LOAD(name) M_LOAD_STORE(name, dest, src0)
-#define M_STORE(name) M_LOAD_STORE(name, src0, dest)
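-/* For instance, M_LOAD(ld_attr_32) below defines
- *
- *     static midgard_instruction m_ld_attr_32(unsigned ssa, unsigned address)
- *
- * returning a TAG_LOAD_STORE_4 instruction with .ssa_args.dest = ssa and both
- * sources unused (-1); M_STORE flips the roles so ssa lands in src0 instead */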
-
-/* Inputs a NIR ALU source, with modifiers attached if necessary, and outputs
- * the corresponding Midgard source */
-
-static midgard_vector_alu_src
-vector_alu_modifiers(nir_alu_src *src, bool is_int, unsigned broadcast_count,
- bool half, bool sext)
-{
- if (!src) return blank_alu_src;
-
- /* Figure out how many components there are so we can adjust the
- * swizzle. Specifically we want to broadcast the last channel so
- * things like ball2/3 work
- */
-
- if (broadcast_count) {
- uint8_t last_component = src->swizzle[broadcast_count - 1];
-
- for (unsigned c = broadcast_count; c < NIR_MAX_VEC_COMPONENTS; ++c) {
- src->swizzle[c] = last_component;
- }
- }
-
- midgard_vector_alu_src alu_src = {
- .rep_low = 0,
- .rep_high = 0,
- .half = half,
- .swizzle = SWIZZLE_FROM_ARRAY(src->swizzle)
- };
-
- if (is_int) {
- alu_src.mod = midgard_int_normal;
-
- /* Sign/zero-extend if needed */
-
- if (half) {
- alu_src.mod = sext ?
- midgard_int_sign_extend
- : midgard_int_zero_extend;
- }
-
- /* These should have been lowered away */
- assert(!(src->abs || src->negate));
- } else {
- alu_src.mod = (src->abs << 0) | (src->negate << 1);
- }
-
- return alu_src;
-}
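-
-/* As a concrete example of the broadcast above: a ball2-style op
- * (broadcast_count == 2) whose NIR source reads .xy is packed with the
- * Midgard swizzle .xyyy, since channels past the last counted one are forced
- * to its value */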
-
-/* load/store instructions have both 32-bit and 16-bit variants, depending on
- * whether we are using vectors composed of highp or mediump. At the moment, we
- * don't support half-floats -- this requires changes in other parts of the
- * compiler -- therefore the 16-bit versions are commented out. */
-
-//M_LOAD(ld_attr_16);
-M_LOAD(ld_attr_32);
-//M_LOAD(ld_vary_16);
-M_LOAD(ld_vary_32);
-//M_LOAD(ld_uniform_16);
-M_LOAD(ld_uniform_32);
-M_LOAD(ld_color_buffer_8);
-//M_STORE(st_vary_16);
-M_STORE(st_vary_32);
-M_STORE(st_cubemap_coords);
-
-static midgard_instruction
-v_alu_br_compact_cond(midgard_jmp_writeout_op op, unsigned tag, signed offset, unsigned cond)
-{
- midgard_branch_cond branch = {
- .op = op,
- .dest_tag = tag,
- .offset = offset,
- .cond = cond
- };
-
- uint16_t compact;
- memcpy(&compact, &branch, sizeof(branch));
-
- midgard_instruction ins = {
- .type = TAG_ALU_4,
- .unit = ALU_ENAB_BR_COMPACT,
- .prepacked_branch = true,
- .compact_branch = true,
- .br_compact = compact
- };
-
- if (op == midgard_jmp_writeout_op_writeout)
- ins.writeout = true;
-
- return ins;
-}
-
-static midgard_instruction
-v_branch(bool conditional, bool invert)
-{
- midgard_instruction ins = {
- .type = TAG_ALU_4,
- .unit = ALU_ENAB_BRANCH,
- .compact_branch = true,
- .branch = {
- .conditional = conditional,
- .invert_conditional = invert
- }
- };
-
- return ins;
-}
-
-static midgard_branch_extended
-midgard_create_branch_extended( midgard_condition cond,
- midgard_jmp_writeout_op op,
- unsigned dest_tag,
- signed quadword_offset)
-{
- /* For unclear reasons, the condition code is repeated 8 times */
- uint16_t duplicated_cond =
- (cond << 14) |
- (cond << 12) |
- (cond << 10) |
- (cond << 8) |
- (cond << 6) |
- (cond << 4) |
- (cond << 2) |
- (cond << 0);
-
- midgard_branch_extended branch = {
- .op = op,
- .dest_tag = dest_tag,
- .offset = quadword_offset,
- .cond = duplicated_cond
- };
-
- return branch;
-}
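-
-/* For example, midgard_condition_true (0x2) duplicated like this gives
- * cond = 0xAAAA, and midgard_condition_always (0x3) gives 0xFFFF */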
-
-static void
-attach_constants(compiler_context *ctx, midgard_instruction *ins, void *constants, int name)
-{
- ins->has_constants = true;
- memcpy(&ins->constants, constants, 16);
-}
-
-static int
-glsl_type_size(const struct glsl_type *type, bool bindless)
-{
- return glsl_count_attribute_slots(type, false);
-}
-
-/* Lower fdot2 to a vector multiplication followed by channel addition */
-static void
-midgard_nir_lower_fdot2_body(nir_builder *b, nir_alu_instr *alu)
-{
- if (alu->op != nir_op_fdot2)
- return;
-
- b->cursor = nir_before_instr(&alu->instr);
-
- nir_ssa_def *src0 = nir_ssa_for_alu_src(b, alu, 0);
- nir_ssa_def *src1 = nir_ssa_for_alu_src(b, alu, 1);
-
- nir_ssa_def *product = nir_fmul(b, src0, src1);
-
- nir_ssa_def *sum = nir_fadd(b,
- nir_channel(b, product, 0),
- nir_channel(b, product, 1));
-
- /* Replace the fdot2 with this sum */
- nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(sum));
-}
-
-static int
-midgard_nir_sysval_for_intrinsic(nir_intrinsic_instr *instr)
-{
- switch (instr->intrinsic) {
- case nir_intrinsic_load_viewport_scale:
- return PAN_SYSVAL_VIEWPORT_SCALE;
- case nir_intrinsic_load_viewport_offset:
- return PAN_SYSVAL_VIEWPORT_OFFSET;
- default:
- return -1;
- }
-}
-
-static unsigned
-nir_dest_index(compiler_context *ctx, nir_dest *dst)
-{
- if (dst->is_ssa)
- return dst->ssa.index;
- else {
- assert(!dst->reg.indirect);
- return ctx->func->impl->ssa_alloc + dst->reg.reg->index;
- }
-}
-
-static int sysval_for_instr(compiler_context *ctx, nir_instr *instr,
- unsigned *dest)
-{
- nir_intrinsic_instr *intr;
- nir_dest *dst = NULL;
- nir_tex_instr *tex;
- int sysval = -1;
-
- switch (instr->type) {
- case nir_instr_type_intrinsic:
- intr = nir_instr_as_intrinsic(instr);
- sysval = midgard_nir_sysval_for_intrinsic(intr);
- dst = &intr->dest;
- break;
- case nir_instr_type_tex:
- tex = nir_instr_as_tex(instr);
- if (tex->op != nir_texop_txs)
- break;
-
- sysval = PAN_SYSVAL(TEXTURE_SIZE,
- PAN_TXS_SYSVAL_ID(tex->texture_index,
- nir_tex_instr_dest_size(tex) -
- (tex->is_array ? 1 : 0),
- tex->is_array));
- dst = &tex->dest;
- break;
- default:
- break;
- }
-
- if (dest && dst)
- *dest = nir_dest_index(ctx, dst);
-
- return sysval;
-}
-
-static void
-midgard_nir_assign_sysval_body(compiler_context *ctx, nir_instr *instr)
-{
- int sysval;
-
- sysval = sysval_for_instr(ctx, instr, NULL);
- if (sysval < 0)
- return;
-
- /* We have a sysval load; check if it's already been assigned */
-
- if (_mesa_hash_table_u64_search(ctx->sysval_to_id, sysval))
- return;
-
- /* It hasn't -- so assign it now! */
-
- unsigned id = ctx->sysval_count++;
- _mesa_hash_table_u64_insert(ctx->sysval_to_id, sysval, (void *) ((uintptr_t) id + 1));
- ctx->sysvals[id] = sysval;
-}
-
-static void
-midgard_nir_assign_sysvals(compiler_context *ctx, nir_shader *shader)
-{
- ctx->sysval_count = 0;
-
- nir_foreach_function(function, shader) {
- if (!function->impl) continue;
-
- nir_foreach_block(block, function->impl) {
- nir_foreach_instr_safe(instr, block) {
- midgard_nir_assign_sysval_body(ctx, instr);
- }
- }
- }
-}
-
-static bool
-midgard_nir_lower_fdot2(nir_shader *shader)
-{
- bool progress = false;
-
- nir_foreach_function(function, shader) {
- if (!function->impl) continue;
-
- nir_builder _b;
- nir_builder *b = &_b;
- nir_builder_init(b, function->impl);
-
- nir_foreach_block(block, function->impl) {
- nir_foreach_instr_safe(instr, block) {
- if (instr->type != nir_instr_type_alu) continue;
-
- nir_alu_instr *alu = nir_instr_as_alu(instr);
- midgard_nir_lower_fdot2_body(b, alu);
-
- progress |= true;
- }
- }
-
- nir_metadata_preserve(function->impl, nir_metadata_block_index | nir_metadata_dominance);
-
- }
-
- return progress;
-}
-
-/* Flushes undefined values to zero */
-
-static void
-optimise_nir(nir_shader *nir)
-{
- bool progress;
- unsigned lower_flrp =
- (nir->options->lower_flrp16 ? 16 : 0) |
- (nir->options->lower_flrp32 ? 32 : 0) |
- (nir->options->lower_flrp64 ? 64 : 0);
-
- NIR_PASS(progress, nir, nir_lower_regs_to_ssa);
- NIR_PASS(progress, nir, midgard_nir_lower_fdot2);
- NIR_PASS(progress, nir, nir_lower_idiv);
-
- nir_lower_tex_options lower_tex_1st_pass_options = {
- .lower_rect = true,
- .lower_txp = ~0
- };
-
- nir_lower_tex_options lower_tex_2nd_pass_options = {
- .lower_txs_lod = true,
- };
-
- NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_1st_pass_options);
- NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_2nd_pass_options);
-
- do {
- progress = false;
-
- NIR_PASS(progress, nir, nir_lower_var_copies);
- NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
-
- NIR_PASS(progress, nir, nir_copy_prop);
- NIR_PASS(progress, nir, nir_opt_dce);
- NIR_PASS(progress, nir, nir_opt_dead_cf);
- NIR_PASS(progress, nir, nir_opt_cse);
- NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
- NIR_PASS(progress, nir, nir_opt_algebraic);
- NIR_PASS(progress, nir, nir_opt_constant_folding);
-
- if (lower_flrp != 0) {
- bool lower_flrp_progress = false;
- NIR_PASS(lower_flrp_progress,
- nir,
- nir_lower_flrp,
- lower_flrp,
- false /* always_precise */,
- nir->options->lower_ffma);
- if (lower_flrp_progress) {
- NIR_PASS(progress, nir,
- nir_opt_constant_folding);
- progress = true;
- }
-
- /* Nothing should rematerialize any flrps, so we only
- * need to do this lowering once.
- */
- lower_flrp = 0;
- }
-
- NIR_PASS(progress, nir, nir_opt_undef);
- NIR_PASS(progress, nir, nir_undef_to_zero);
-
- NIR_PASS(progress, nir, nir_opt_loop_unroll,
- nir_var_shader_in |
- nir_var_shader_out |
- nir_var_function_temp);
-
- NIR_PASS(progress, nir, nir_opt_vectorize);
- } while (progress);
-
- /* Must be run at the end to prevent creation of fsin/fcos ops */
- NIR_PASS(progress, nir, midgard_nir_scale_trig);
-
- do {
- progress = false;
-
- NIR_PASS(progress, nir, nir_opt_dce);
- NIR_PASS(progress, nir, nir_opt_algebraic);
- NIR_PASS(progress, nir, nir_opt_constant_folding);
- NIR_PASS(progress, nir, nir_copy_prop);
- } while (progress);
-
- NIR_PASS(progress, nir, nir_opt_algebraic_late);
-
- /* We implement booleans as 32-bit 0/~0 */
- NIR_PASS(progress, nir, nir_lower_bool_to_int32);
-
-        /* Now that booleans are lowered, we can run our late opts */
- NIR_PASS(progress, nir, midgard_nir_lower_algebraic_late);
-
- /* Lower mods for float ops only. Integer ops don't support modifiers
- * (saturate doesn't make sense on integers, neg/abs require dedicated
- * instructions) */
-
- NIR_PASS(progress, nir, nir_lower_to_source_mods, nir_lower_float_source_mods);
- NIR_PASS(progress, nir, nir_copy_prop);
- NIR_PASS(progress, nir, nir_opt_dce);
-
- /* Take us out of SSA */
- NIR_PASS(progress, nir, nir_lower_locals_to_regs);
- NIR_PASS(progress, nir, nir_convert_from_ssa, true);
-
- /* We are a vector architecture; write combine where possible */
- NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest);
- NIR_PASS(progress, nir, nir_lower_vec_to_movs);
-
- NIR_PASS(progress, nir, nir_opt_dce);
-}
-
-/* Front-half of aliasing the SSA slots, merely by inserting the flag in the
- * appropriate hash table. Intentional off-by-one to avoid confusing NULL with
- * r0. See the comments in compiler_context */
-
-static void
-alias_ssa(compiler_context *ctx, int dest, int src)
-{
- _mesa_hash_table_u64_insert(ctx->ssa_to_alias, dest + 1, (void *) ((uintptr_t) src + 1));
- _mesa_set_add(ctx->leftover_ssa_to_alias, (void *) (uintptr_t) (dest + 1));
-}
-
-/* ...or undo it, after which the original index will be used (dummy move should be emitted alongside this) */
-
-static void
-unalias_ssa(compiler_context *ctx, int dest)
-{
- _mesa_hash_table_u64_remove(ctx->ssa_to_alias, dest + 1);
- /* TODO: Remove from leftover or no? */
-}
-
-/* Do not actually emit a load; instead, cache the constant for inlining */
-
-static void
-emit_load_const(compiler_context *ctx, nir_load_const_instr *instr)
-{
- nir_ssa_def def = instr->def;
-
- float *v = rzalloc_array(NULL, float, 4);
- nir_const_load_to_arr(v, instr, f32);
- _mesa_hash_table_u64_insert(ctx->ssa_constants, def.index + 1, v);
-}
-
-static unsigned
-nir_src_index(compiler_context *ctx, nir_src *src)
-{
- if (src->is_ssa)
- return src->ssa->index;
- else {
- assert(!src->reg.indirect);
- return ctx->func->impl->ssa_alloc + src->reg.reg->index;
- }
-}
-
-static unsigned
-nir_alu_src_index(compiler_context *ctx, nir_alu_src *src)
-{
- return nir_src_index(ctx, &src->src);
-}
-
-static bool
-nir_is_non_scalar_swizzle(nir_alu_src *src, unsigned nr_components)
-{
- unsigned comp = src->swizzle[0];
-
- for (unsigned c = 1; c < nr_components; ++c) {
- if (src->swizzle[c] != comp)
- return true;
- }
-
- return false;
-}
-
-/* Midgard puts scalar conditionals in r31.w; move an arbitrary source (the
- * output of a conditional test) into that register */
-
-static void
-emit_condition(compiler_context *ctx, nir_src *src, bool for_branch, unsigned component)
-{
- int condition = nir_src_index(ctx, src);
-
- /* Source to swizzle the desired component into w */
-
- const midgard_vector_alu_src alu_src = {
- .swizzle = SWIZZLE(component, component, component, component),
- };
-
- /* There is no boolean move instruction. Instead, we simulate a move by
- * ANDing the condition with itself to get it into r31.w */
-
- midgard_instruction ins = {
- .type = TAG_ALU_4,
-
- /* We need to set the conditional as close as possible */
- .precede_break = true,
- .unit = for_branch ? UNIT_SMUL : UNIT_SADD,
- .mask = 1 << COMPONENT_W,
-
- .ssa_args = {
- .src0 = condition,
- .src1 = condition,
- .dest = SSA_FIXED_REGISTER(31),
- },
-
- .alu = {
- .op = midgard_alu_op_iand,
- .outmod = midgard_outmod_int_wrap,
- .reg_mode = midgard_reg_mode_32,
- .dest_override = midgard_dest_override_none,
- .src1 = vector_alu_srco_unsigned(alu_src),
- .src2 = vector_alu_srco_unsigned(alu_src)
- },
- };
-
- emit_mir_instruction(ctx, ins);
-}
-
-/* Or, for mixed conditions (with csel_v), here's a vector version using all of
- * r31 instead */
-
-static void
-emit_condition_mixed(compiler_context *ctx, nir_alu_src *src, unsigned nr_comp)
-{
- int condition = nir_src_index(ctx, &src->src);
-
- /* Source to swizzle the desired component into w */
-
- const midgard_vector_alu_src alu_src = {
- .swizzle = SWIZZLE_FROM_ARRAY(src->swizzle),
- };
-
- /* There is no boolean move instruction. Instead, we simulate a move by
- * ANDing the condition with itself to get it into r31.w */
-
- midgard_instruction ins = {
- .type = TAG_ALU_4,
- .precede_break = true,
- .mask = mask_of(nr_comp),
- .ssa_args = {
- .src0 = condition,
- .src1 = condition,
- .dest = SSA_FIXED_REGISTER(31),
- },
- .alu = {
- .op = midgard_alu_op_iand,
- .outmod = midgard_outmod_int_wrap,
- .reg_mode = midgard_reg_mode_32,
- .dest_override = midgard_dest_override_none,
- .src1 = vector_alu_srco_unsigned(alu_src),
- .src2 = vector_alu_srco_unsigned(alu_src)
- },
- };
-
- emit_mir_instruction(ctx, ins);
-}
-
-
-
-/* Likewise, indirect offsets are put in r27.w. TODO: Allow componentwise
- * pinning to eliminate this move in all known cases */
-
-static void
-emit_indirect_offset(compiler_context *ctx, nir_src *src)
-{
- int offset = nir_src_index(ctx, src);
-
- midgard_instruction ins = {
- .type = TAG_ALU_4,
- .mask = 1 << COMPONENT_W,
- .ssa_args = {
- .src0 = SSA_UNUSED_1,
- .src1 = offset,
- .dest = SSA_FIXED_REGISTER(REGISTER_OFFSET),
- },
- .alu = {
- .op = midgard_alu_op_imov,
- .outmod = midgard_outmod_int_wrap,
- .reg_mode = midgard_reg_mode_32,
- .dest_override = midgard_dest_override_none,
- .src1 = vector_alu_srco_unsigned(zero_alu_src),
- .src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx)
- },
- };
-
- emit_mir_instruction(ctx, ins);
-}
-
-#define ALU_CASE(nir, _op) \
- case nir_op_##nir: \
- op = midgard_alu_op_##_op; \
- assert(src_bitsize == dst_bitsize); \
- break;
-
-#define ALU_CASE_BCAST(nir, _op, count) \
- case nir_op_##nir: \
- op = midgard_alu_op_##_op; \
- broadcast_swizzle = count; \
- assert(src_bitsize == dst_bitsize); \
- break;
-static bool
-nir_is_fzero_constant(nir_src src)
-{
- if (!nir_src_is_const(src))
- return false;
-
- for (unsigned c = 0; c < nir_src_num_components(src); ++c) {
- if (nir_src_comp_as_float(src, c) != 0.0)
- return false;
- }
-
- return true;
-}
-
-/* Analyze the sizes of the inputs to determine which reg mode. Ops needed
- * special treatment override this anyway. */
-
-static midgard_reg_mode
-reg_mode_for_nir(nir_alu_instr *instr)
-{
- unsigned src_bitsize = nir_src_bit_size(instr->src[0].src);
-
- switch (src_bitsize) {
- case 8:
- return midgard_reg_mode_8;
- case 16:
- return midgard_reg_mode_16;
- case 32:
- return midgard_reg_mode_32;
- case 64:
- return midgard_reg_mode_64;
- default:
- unreachable("Invalid bit size");
- }
-}
-
-static void
-emit_alu(compiler_context *ctx, nir_alu_instr *instr)
-{
- bool is_ssa = instr->dest.dest.is_ssa;
-
- unsigned dest = nir_dest_index(ctx, &instr->dest.dest);
- unsigned nr_components = nir_dest_num_components(instr->dest.dest);
- unsigned nr_inputs = nir_op_infos[instr->op].num_inputs;
-
-        /* Most Midgard ALU ops have a 1:1 correspondence to NIR ops; these are
- * supported. A few do not and are commented for now. Also, there are a
- * number of NIR ops which Midgard does not support and need to be
- * lowered, also TODO. This switch block emits the opcode and calling
- * convention of the Midgard instruction; actual packing is done in
- * emit_alu below */
-
- unsigned op;
-
- /* Number of components valid to check for the instruction (the rest
- * will be forced to the last), or 0 to use as-is. Relevant as
- * ball-type instructions have a channel count in NIR but are all vec4
- * in Midgard */
-
- unsigned broadcast_swizzle = 0;
-
- /* What register mode should we operate in? */
- midgard_reg_mode reg_mode =
- reg_mode_for_nir(instr);
-
- /* Do we need a destination override? Used for inline
- * type conversion */
-
- midgard_dest_override dest_override =
- midgard_dest_override_none;
-
- /* Should we use a smaller respective source and sign-extend? */
-
- bool half_1 = false, sext_1 = false;
- bool half_2 = false, sext_2 = false;
-
- unsigned src_bitsize = nir_src_bit_size(instr->src[0].src);
- unsigned dst_bitsize = nir_dest_bit_size(instr->dest.dest);
-
- switch (instr->op) {
- ALU_CASE(fadd, fadd);
- ALU_CASE(fmul, fmul);
- ALU_CASE(fmin, fmin);
- ALU_CASE(fmax, fmax);
- ALU_CASE(imin, imin);
- ALU_CASE(imax, imax);
- ALU_CASE(umin, umin);
- ALU_CASE(umax, umax);
- ALU_CASE(ffloor, ffloor);
- ALU_CASE(fround_even, froundeven);
- ALU_CASE(ftrunc, ftrunc);
- ALU_CASE(fceil, fceil);
- ALU_CASE(fdot3, fdot3);
- ALU_CASE(fdot4, fdot4);
- ALU_CASE(iadd, iadd);
- ALU_CASE(isub, isub);
- ALU_CASE(imul, imul);
-
- /* Zero shoved as second-arg */
- ALU_CASE(iabs, iabsdiff);
-
- ALU_CASE(mov, imov);
-
- ALU_CASE(feq32, feq);
- ALU_CASE(fne32, fne);
- ALU_CASE(flt32, flt);
- ALU_CASE(ieq32, ieq);
- ALU_CASE(ine32, ine);
- ALU_CASE(ilt32, ilt);
- ALU_CASE(ult32, ult);
-
- /* We don't have a native b2f32 instruction. Instead, like many
- * GPUs, we exploit booleans as 0/~0 for false/true, and
- * correspondingly AND
- * by 1.0 to do the type conversion. For the moment, prime us
- * to emit:
- *
- * iand [whatever], #0
- *
- * At the end of emit_alu (as MIR), we'll fix-up the constant
- */
-
- ALU_CASE(b2f32, iand);
- ALU_CASE(b2i32, iand);
-
- /* Likewise, we don't have a dedicated f2b32 instruction, but
- * we can do a "not equal to 0.0" test. */
-
- ALU_CASE(f2b32, fne);
- ALU_CASE(i2b32, ine);
-
- ALU_CASE(frcp, frcp);
- ALU_CASE(frsq, frsqrt);
- ALU_CASE(fsqrt, fsqrt);
- ALU_CASE(fexp2, fexp2);
- ALU_CASE(flog2, flog2);
-
- ALU_CASE(f2i32, f2i_rtz);
- ALU_CASE(f2u32, f2u_rtz);
- ALU_CASE(i2f32, i2f_rtz);
- ALU_CASE(u2f32, u2f_rtz);
-
- ALU_CASE(f2i16, f2i_rtz);
- ALU_CASE(f2u16, f2u_rtz);
- ALU_CASE(i2f16, i2f_rtz);
- ALU_CASE(u2f16, u2f_rtz);
-
- ALU_CASE(fsin, fsin);
- ALU_CASE(fcos, fcos);
-
- /* Second op implicit #0 */
- ALU_CASE(inot, inor);
- ALU_CASE(iand, iand);
- ALU_CASE(ior, ior);
- ALU_CASE(ixor, ixor);
- ALU_CASE(ishl, ishl);
- ALU_CASE(ishr, iasr);
- ALU_CASE(ushr, ilsr);
-
- ALU_CASE_BCAST(b32all_fequal2, fball_eq, 2);
- ALU_CASE_BCAST(b32all_fequal3, fball_eq, 3);
- ALU_CASE(b32all_fequal4, fball_eq);
-
- ALU_CASE_BCAST(b32any_fnequal2, fbany_neq, 2);
- ALU_CASE_BCAST(b32any_fnequal3, fbany_neq, 3);
- ALU_CASE(b32any_fnequal4, fbany_neq);
-
- ALU_CASE_BCAST(b32all_iequal2, iball_eq, 2);
- ALU_CASE_BCAST(b32all_iequal3, iball_eq, 3);
- ALU_CASE(b32all_iequal4, iball_eq);
-
- ALU_CASE_BCAST(b32any_inequal2, ibany_neq, 2);
- ALU_CASE_BCAST(b32any_inequal3, ibany_neq, 3);
- ALU_CASE(b32any_inequal4, ibany_neq);
-
- /* Source mods will be shoved in later */
- ALU_CASE(fabs, fmov);
- ALU_CASE(fneg, fmov);
- ALU_CASE(fsat, fmov);
-
- /* For size conversion, we use a move. Ideally though we would squash
- * these ops together; maybe that has to happen after in NIR as part of
- * propagation...? An earlier algebraic pass ensured we step down by
- * only / exactly one size. If stepping down, we use a dest override to
- * reduce the size; if stepping up, we use a larger-sized move with a
- * half source and a sign/zero-extension modifier */
-
- case nir_op_i2i8:
- case nir_op_i2i16:
- case nir_op_i2i32:
-                /* If we end up upscaling, we'll need a sign-extend on the
-                 * operand (the second argument); note the fallthrough */
-
- sext_2 = true;
- case nir_op_u2u8:
- case nir_op_u2u16:
- case nir_op_u2u32: {
- op = midgard_alu_op_imov;
-
- if (dst_bitsize == (src_bitsize * 2)) {
- /* Converting up */
- half_2 = true;
-
- /* Use a greater register mode */
- reg_mode++;
- } else if (src_bitsize == (dst_bitsize * 2)) {
- /* Converting down */
- dest_override = midgard_dest_override_lower;
- }
-
- break;
- }
-
- case nir_op_f2f16: {
- assert(src_bitsize == 32);
-
- op = midgard_alu_op_fmov;
- dest_override = midgard_dest_override_lower;
- break;
- }
-
- case nir_op_f2f32: {
- assert(src_bitsize == 16);
-
- op = midgard_alu_op_fmov;
- half_2 = true;
- reg_mode++;
- break;
- }
-
-
- /* For greater-or-equal, we lower to less-or-equal and flip the
- * arguments */
-
- case nir_op_fge:
- case nir_op_fge32:
- case nir_op_ige32:
- case nir_op_uge32: {
- op =
- instr->op == nir_op_fge ? midgard_alu_op_fle :
- instr->op == nir_op_fge32 ? midgard_alu_op_fle :
- instr->op == nir_op_ige32 ? midgard_alu_op_ile :
- instr->op == nir_op_uge32 ? midgard_alu_op_ule :
- 0;
-
- /* Swap via temporary */
- nir_alu_src temp = instr->src[1];
- instr->src[1] = instr->src[0];
- instr->src[0] = temp;
-
- break;
- }
-
- case nir_op_b32csel: {
- /* Midgard features both fcsel and icsel, depending on
- * the type of the arguments/output. However, as long
- * as we're careful we can _always_ use icsel and
- * _never_ need fcsel, since the latter does additional
- * floating-point-specific processing whereas the
- * former just moves bits on the wire. It's not obvious
- * why these are separate opcodes, save for the ability
- * to do things like sat/pos/abs/neg for free */
-
- bool mixed = nir_is_non_scalar_swizzle(&instr->src[0], nr_components);
- op = mixed ? midgard_alu_op_icsel_v : midgard_alu_op_icsel;
-
- /* csel works as a two-arg in Midgard, since the condition is hardcoded in r31.w */
- nr_inputs = 2;
-
- /* Emit the condition into r31 */
-
- if (mixed)
- emit_condition_mixed(ctx, &instr->src[0], nr_components);
- else
- emit_condition(ctx, &instr->src[0].src, false, instr->src[0].swizzle[0]);
-
- /* The condition is the first argument; move the other
- * arguments up one to be a binary instruction for
- * Midgard */
-
- memmove(instr->src, instr->src + 1, 2 * sizeof(nir_alu_src));
- break;
- }
-
- default:
- DBG("Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
- assert(0);
- return;
- }
-
- /* Midgard can perform certain modifiers on output of an ALU op */
- unsigned outmod;
-
- if (midgard_is_integer_out_op(op)) {
- outmod = midgard_outmod_int_wrap;
- } else {
- bool sat = instr->dest.saturate || instr->op == nir_op_fsat;
- outmod = sat ? midgard_outmod_sat : midgard_outmod_none;
- }
-
- /* fmax(a, 0.0) can turn into a .pos modifier as an optimization */
-
- if (instr->op == nir_op_fmax) {
- if (nir_is_fzero_constant(instr->src[0].src)) {
- op = midgard_alu_op_fmov;
- nr_inputs = 1;
- outmod = midgard_outmod_pos;
- instr->src[0] = instr->src[1];
- } else if (nir_is_fzero_constant(instr->src[1].src)) {
- op = midgard_alu_op_fmov;
- nr_inputs = 1;
- outmod = midgard_outmod_pos;
- }
- }
-
- /* Fetch unit, quirks, etc information */
- unsigned opcode_props = alu_opcode_props[op].props;
- bool quirk_flipped_r24 = opcode_props & QUIRK_FLIPPED_R24;
-
- /* src0 will always exist afaik, but src1 will not for 1-argument
- * instructions. The latter can only be fetched if the instruction
- * needs it, or else we may segfault. */
-
- unsigned src0 = nir_alu_src_index(ctx, &instr->src[0]);
- unsigned src1 = nr_inputs == 2 ? nir_alu_src_index(ctx, &instr->src[1]) : SSA_UNUSED_0;
-
- /* Rather than use the instruction generation helpers, we do it
- * ourselves here to avoid the mess */
-
- midgard_instruction ins = {
- .type = TAG_ALU_4,
- .ssa_args = {
- .src0 = quirk_flipped_r24 ? SSA_UNUSED_1 : src0,
- .src1 = quirk_flipped_r24 ? src0 : src1,
- .dest = dest,
- }
- };
-
- nir_alu_src *nirmods[2] = { NULL };
-
- if (nr_inputs == 2) {
- nirmods[0] = &instr->src[0];
- nirmods[1] = &instr->src[1];
- } else if (nr_inputs == 1) {
- nirmods[quirk_flipped_r24] = &instr->src[0];
- } else {
- assert(0);
- }
-
- /* These were lowered to a move, so apply the corresponding mod */
-
- if (instr->op == nir_op_fneg || instr->op == nir_op_fabs) {
- nir_alu_src *s = nirmods[quirk_flipped_r24];
-
- if (instr->op == nir_op_fneg)
- s->negate = !s->negate;
-
- if (instr->op == nir_op_fabs)
- s->abs = !s->abs;
- }
-
- bool is_int = midgard_is_integer_op(op);
-
- ins.mask = mask_of(nr_components);
-
- midgard_vector_alu alu = {
- .op = op,
- .reg_mode = reg_mode,
- .dest_override = dest_override,
- .outmod = outmod,
-
- .src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[0], is_int, broadcast_swizzle, half_1, sext_1)),
- .src2 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[1], is_int, broadcast_swizzle, half_2, sext_2)),
- };
-
- /* Apply writemask if non-SSA, keeping in mind that we can't write to components that don't exist */
-
- if (!is_ssa)
- ins.mask &= instr->dest.write_mask;
-
- ins.alu = alu;
-
- /* Late fixup for emulated instructions */
-
- if (instr->op == nir_op_b2f32 || instr->op == nir_op_b2i32) {
- /* Presently, our second argument is an inline #0 constant.
- * Switch over to an embedded 1.0 constant (that can't fit
- * inline, since we're 32-bit, not 16-bit like the inline
- * constants) */
-
- ins.ssa_args.inline_constant = false;
- ins.ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
- ins.has_constants = true;
-
- if (instr->op == nir_op_b2f32) {
- ins.constants[0] = 1.0f;
- } else {
- /* Type pun it into place */
- uint32_t one = 0x1;
- memcpy(&ins.constants[0], &one, sizeof(uint32_t));
- }
-
- ins.alu.src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx);
- } else if (nr_inputs == 1 && !quirk_flipped_r24) {
- /* Lots of instructions need a 0 plonked in */
- ins.ssa_args.inline_constant = false;
- ins.ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
- ins.has_constants = true;
- ins.constants[0] = 0.0f;
- ins.alu.src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx);
- } else if (instr->op == nir_op_inot) {
- /* ~b = ~(b & b), so duplicate the source */
- ins.ssa_args.src1 = ins.ssa_args.src0;
- ins.alu.src2 = ins.alu.src1;
- }
-
- if ((opcode_props & UNITS_ALL) == UNIT_VLUT) {
- /* To avoid duplicating the lookup tables (probably), true LUT
- * instructions can only operate as if they were scalars. Lower
- * them here by changing the component. */
-
- uint8_t original_swizzle[4];
- memcpy(original_swizzle, nirmods[0]->swizzle, sizeof(nirmods[0]->swizzle));
- unsigned orig_mask = ins.mask;
-
- for (int i = 0; i < nr_components; ++i) {
- /* Mask the associated component, dropping the
- * instruction if needed */
-
- ins.mask = 1 << i;
- ins.mask &= orig_mask;
-
- if (!ins.mask)
- continue;
-
- for (int j = 0; j < 4; ++j)
- nirmods[0]->swizzle[j] = original_swizzle[i]; /* Pull from the correct component */
-
- ins.alu.src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[0], is_int, broadcast_swizzle, half_1, false));
- emit_mir_instruction(ctx, ins);
- }
- } else {
- emit_mir_instruction(ctx, ins);
- }
-}
-
-#undef ALU_CASE
-
-/* Uniforms and UBOs use a shared code path, as uniforms are just (slightly
- * optimized) versions of UBO #0 */
-
-static void
-emit_ubo_read(
- compiler_context *ctx,
- unsigned dest,
- unsigned offset,
- nir_src *indirect_offset,
- unsigned index)
-{
- /* TODO: half-floats */
-
- if (!indirect_offset && offset < ctx->uniform_cutoff && index == 0) {
- /* Fast path: For the first 16 uniforms, direct accesses are
- * 0-cycle, since they're just a register fetch in the usual
- * case. So, we alias the registers while we're still in
- * SSA-space */
-
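-                /* uniform 0 lives in r23, uniform 1 in r22, and so on down
-                 * the register file */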
- int reg_slot = 23 - offset;
- alias_ssa(ctx, dest, SSA_FIXED_REGISTER(reg_slot));
- } else {
- /* Otherwise, read from the 'special' UBO to access
- * higher-indexed uniforms, at a performance cost. More
- * generally, we're emitting a UBO read instruction. */
-
- midgard_instruction ins = m_ld_uniform_32(dest, offset);
-
- /* TODO: Don't split */
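-                /* e.g. an offset of 19 is stored as address 2 with 3 packed
-                 * into the low bits of varying_parameters */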
- ins.load_store.varying_parameters = (offset & 7) << 7;
- ins.load_store.address = offset >> 3;
-
- if (indirect_offset) {
- emit_indirect_offset(ctx, indirect_offset);
- ins.load_store.unknown = 0x8700 | index; /* xxx: what is this? */
- } else {
- ins.load_store.unknown = 0x1E00 | index; /* xxx: what is this? */
- }
-
- /* TODO respect index */
-
- emit_mir_instruction(ctx, ins);
- }
-}
-
-static void
-emit_varying_read(
- compiler_context *ctx,
- unsigned dest, unsigned offset,
- unsigned nr_comp, unsigned component,
- nir_src *indirect_offset, nir_alu_type type)
-{
- /* XXX: Half-floats? */
- /* TODO: swizzle, mask */
-
- midgard_instruction ins = m_ld_vary_32(dest, offset);
- ins.mask = mask_of(nr_comp);
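-        /* The swizzle packs two bits per lane, so shifting right by
-         * 2 * component effectively starts the read at the requested component */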
- ins.load_store.swizzle = SWIZZLE_XYZW >> (2 * component);
-
- midgard_varying_parameter p = {
- .is_varying = 1,
- .interpolation = midgard_interp_default,
- .flat = /*var->data.interpolation == INTERP_MODE_FLAT*/ 0
- };
-
- unsigned u;
- memcpy(&u, &p, sizeof(p));
- ins.load_store.varying_parameters = u;
-
- if (indirect_offset) {
- /* We need to add in the dynamic index, moved to r27.w */
- emit_indirect_offset(ctx, indirect_offset);
- ins.load_store.unknown = 0x79e; /* xxx: what is this? */
- } else {
- /* Just a direct load */
- ins.load_store.unknown = 0x1e9e; /* xxx: what is this? */
- }
-
- /* Use the type appropriate load */
- switch (type) {
- case nir_type_uint:
- case nir_type_bool:
- ins.load_store.op = midgard_op_ld_vary_32u;
- break;
- case nir_type_int:
- ins.load_store.op = midgard_op_ld_vary_32i;
- break;
- case nir_type_float:
- ins.load_store.op = midgard_op_ld_vary_32;
- break;
- default:
- unreachable("Attempted to load unknown type");
- break;
- }
-
- emit_mir_instruction(ctx, ins);
-}
-
-static void
-emit_sysval_read(compiler_context *ctx, nir_instr *instr)
-{
- unsigned dest;
- /* Figure out which uniform this is */
- int sysval = sysval_for_instr(ctx, instr, &dest);
- void *val = _mesa_hash_table_u64_search(ctx->sysval_to_id, sysval);
-
- /* Sysvals are prefix uniforms */
- unsigned uniform = ((uintptr_t) val) - 1;
-
- /* Emit the read itself -- this is never indirect */
- emit_ubo_read(ctx, dest, uniform, NULL, 0);
-}
-
-static void
-emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
-{
- unsigned offset = 0, reg;
-
- switch (instr->intrinsic) {
- case nir_intrinsic_discard_if:
- emit_condition(ctx, &instr->src[0], true, COMPONENT_X);
-
- /* fallthrough */
-
- case nir_intrinsic_discard: {
- bool conditional = instr->intrinsic == nir_intrinsic_discard_if;
- struct midgard_instruction discard = v_branch(conditional, false);
- discard.branch.target_type = TARGET_DISCARD;
- emit_mir_instruction(ctx, discard);
-
- ctx->can_discard = true;
- break;
- }
-
- case nir_intrinsic_load_uniform:
- case nir_intrinsic_load_ubo:
- case nir_intrinsic_load_input: {
- bool is_uniform = instr->intrinsic == nir_intrinsic_load_uniform;
- bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo;
-
- /* Get the base type of the intrinsic */
- /* TODO: Infer type? Does it matter? */
- nir_alu_type t =
- is_ubo ? nir_type_uint : nir_intrinsic_type(instr);
- t = nir_alu_type_get_base_type(t);
-
- if (!is_ubo) {
- offset = nir_intrinsic_base(instr);
- }
-
- unsigned nr_comp = nir_intrinsic_dest_components(instr);
-
- nir_src *src_offset = nir_get_io_offset_src(instr);
-
- bool direct = nir_src_is_const(*src_offset);
-
- if (direct)
- offset += nir_src_as_uint(*src_offset);
-
- /* We may need to apply a fractional offset */
- int component = instr->intrinsic == nir_intrinsic_load_input ?
- nir_intrinsic_component(instr) : 0;
- reg = nir_dest_index(ctx, &instr->dest);
-
- if (is_uniform && !ctx->is_blend) {
- emit_ubo_read(ctx, reg, ctx->sysval_count + offset, !direct ? &instr->src[0] : NULL, 0);
- } else if (is_ubo) {
- nir_src index = instr->src[0];
-
- /* We don't yet support indirect UBOs. For indirect
- * block numbers (if that's possible), we don't know
- * enough about the hardware yet. For indirect sources,
-                         * we know what we need, but still have to add some NIR
- * support for lowering correctly with respect to
- * 128-bit reads */
-
- assert(nir_src_is_const(index));
- assert(nir_src_is_const(*src_offset));
-
- /* TODO: Alignment */
- assert((offset & 0xF) == 0);
-
- uint32_t uindex = nir_src_as_uint(index) + 1;
- emit_ubo_read(ctx, reg, offset / 16, NULL, uindex);
- } else if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->is_blend) {
- emit_varying_read(ctx, reg, offset, nr_comp, component, !direct ? &instr->src[0] : NULL, t);
- } else if (ctx->is_blend) {
- /* For blend shaders, load the input color, which is
- * preloaded to r0 */
-
- midgard_instruction move = v_mov(reg, blank_alu_src, SSA_FIXED_REGISTER(0));
- emit_mir_instruction(ctx, move);
- } else if (ctx->stage == MESA_SHADER_VERTEX) {
- midgard_instruction ins = m_ld_attr_32(reg, offset);
- ins.load_store.unknown = 0x1E1E; /* XXX: What is this? */
- ins.mask = mask_of(nr_comp);
-
- /* Use the type appropriate load */
- switch (t) {
- case nir_type_uint:
- case nir_type_bool:
- ins.load_store.op = midgard_op_ld_attr_32u;
- break;
- case nir_type_int:
- ins.load_store.op = midgard_op_ld_attr_32i;
- break;
- case nir_type_float:
- ins.load_store.op = midgard_op_ld_attr_32;
- break;
- default:
- unreachable("Attempted to load unknown type");
- break;
- }
-
- emit_mir_instruction(ctx, ins);
- } else {
- DBG("Unknown load\n");
- assert(0);
- }
-
- break;
- }
-
- /* Reads 128-bit value raw off the tilebuffer during blending, tasty */
-
- case nir_intrinsic_load_raw_output_pan:
- reg = nir_dest_index(ctx, &instr->dest);
- assert(ctx->is_blend);
-
- midgard_instruction ins = m_ld_color_buffer_8(reg, 0);
- emit_mir_instruction(ctx, ins);
- break;
-
- case nir_intrinsic_load_blend_const_color_rgba: {
- assert(ctx->is_blend);
- reg = nir_dest_index(ctx, &instr->dest);
-
- /* Blend constants are embedded directly in the shader and
- * patched in, so we use some magic routing */
-
- midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, reg);
- ins.has_constants = true;
- ins.has_blend_constant = true;
- emit_mir_instruction(ctx, ins);
- break;
- }
-
- case nir_intrinsic_store_output:
- assert(nir_src_is_const(instr->src[1]) && "no indirect outputs");
-
- offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[1]);
-
- reg = nir_src_index(ctx, &instr->src[0]);
-
- if (ctx->stage == MESA_SHADER_FRAGMENT) {
- /* gl_FragColor is not emitted with load/store
- * instructions. Instead, it gets plonked into
- * r0 at the end of the shader and we do the
- * framebuffer writeout dance. TODO: Defer
- * writes */
-
- midgard_instruction move = v_mov(reg, blank_alu_src, SSA_FIXED_REGISTER(0));
- emit_mir_instruction(ctx, move);
-
- /* Save the index we're writing to for later reference
- * in the epilogue */
-
- ctx->fragment_output = reg;
- } else if (ctx->stage == MESA_SHADER_VERTEX) {
- /* Varyings are written into one of two special
-                 * varying registers, r26 or r27. The register itself is
- * selected as the register in the st_vary instruction,
- * minus the base of 26. E.g. write into r27 and then
- * call st_vary(1) */
-
- midgard_instruction ins = v_mov(reg, blank_alu_src, SSA_FIXED_REGISTER(26));
- emit_mir_instruction(ctx, ins);
-
- /* We should have been vectorized, though we don't
- * currently check that st_vary is emitted only once
- * per slot (this is relevant, since there's not a mask
- * parameter available on the store [set to 0 by the
- * blob]). We do respect the component by adjusting the
- * swizzle. */
-
- unsigned component = nir_intrinsic_component(instr);
-
- midgard_instruction st = m_st_vary_32(SSA_FIXED_REGISTER(0), offset);
- st.load_store.unknown = 0x1E9E; /* XXX: What is this? */
- st.load_store.swizzle = SWIZZLE_XYZW << (2*component);
- emit_mir_instruction(ctx, st);
- } else {
- DBG("Unknown store\n");
- assert(0);
- }
-
- break;
-
- /* Special case of store_output for lowered blend shaders */
- case nir_intrinsic_store_raw_output_pan:
- assert (ctx->stage == MESA_SHADER_FRAGMENT);
- reg = nir_src_index(ctx, &instr->src[0]);
-
- midgard_instruction move = v_mov(reg, blank_alu_src, SSA_FIXED_REGISTER(0));
- emit_mir_instruction(ctx, move);
- ctx->fragment_output = reg;
-
- break;
-
- case nir_intrinsic_load_alpha_ref_float:
- assert(instr->dest.is_ssa);
-
- float ref_value = ctx->alpha_ref;
-
- float *v = ralloc_array(NULL, float, 4);
- memcpy(v, &ref_value, sizeof(float));
- _mesa_hash_table_u64_insert(ctx->ssa_constants, instr->dest.ssa.index + 1, v);
- break;
-
- case nir_intrinsic_load_viewport_scale:
- case nir_intrinsic_load_viewport_offset:
- emit_sysval_read(ctx, &instr->instr);
- break;
-
- default:
- printf ("Unhandled intrinsic\n");
- assert(0);
- break;
- }
-}
-
-static unsigned
-midgard_tex_format(enum glsl_sampler_dim dim)
-{
- switch (dim) {
- case GLSL_SAMPLER_DIM_1D:
- case GLSL_SAMPLER_DIM_BUF:
- return MALI_TEX_1D;
-
- case GLSL_SAMPLER_DIM_2D:
- case GLSL_SAMPLER_DIM_EXTERNAL:
- return MALI_TEX_2D;
-
- case GLSL_SAMPLER_DIM_3D:
- return MALI_TEX_3D;
-
- case GLSL_SAMPLER_DIM_CUBE:
- return MALI_TEX_CUBE;
-
- default:
- DBG("Unknown sampler dim type\n");
- assert(0);
- return 0;
- }
-}
-
-/* Tries to attach an explicit LOD / bias as a constant. Returns whether this
- * was successful */
-
-static bool
-pan_attach_constant_bias(
- compiler_context *ctx,
- nir_src lod,
- midgard_texture_word *word)
-{
- /* To attach as constant, it has to *be* constant */
-
- if (!nir_src_is_const(lod))
- return false;
-
- float f = nir_src_as_float(lod);
-
- /* Break into fixed-point */
- signed lod_int = f;
- float lod_frac = f - lod_int;
-
- /* Carry over negative fractions */
- if (lod_frac < 0.0) {
- lod_int--;
- lod_frac += 1.0;
- }
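-        /* e.g. an LOD of -0.5 becomes integer part -1 with fraction 0.5 */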
-
- /* Encode */
- word->bias = float_to_ubyte(lod_frac);
- word->bias_int = lod_int;
-
- return true;
-}
-
-static enum mali_sampler_type
-midgard_sampler_type(nir_alu_type t) {
- switch (nir_alu_type_get_base_type(t))
- {
- case nir_type_float:
- return MALI_SAMPLER_FLOAT;
- case nir_type_int:
- return MALI_SAMPLER_SIGNED;
- case nir_type_uint:
- return MALI_SAMPLER_UNSIGNED;
- default:
- unreachable("Unknown sampler type");
- }
-}
-
-static void
-emit_texop_native(compiler_context *ctx, nir_tex_instr *instr,
- unsigned midgard_texop)
-{
- /* TODO */
- //assert (!instr->sampler);
- //assert (!instr->texture_array_size);
-
- /* Allocate registers via a round robin scheme to alternate between the two registers */
- int reg = ctx->texture_op_count & 1;
- int in_reg = reg, out_reg = reg;
-
- /* Make room for the reg */
-
- if (ctx->texture_index[reg] > -1)
- unalias_ssa(ctx, ctx->texture_index[reg]);
-
- int texture_index = instr->texture_index;
- int sampler_index = texture_index;
-
- /* No helper to build texture words -- we do it all here */
- midgard_instruction ins = {
- .type = TAG_TEXTURE_4,
- .mask = 0xF,
- .texture = {
- .op = midgard_texop,
- .format = midgard_tex_format(instr->sampler_dim),
- .texture_handle = texture_index,
- .sampler_handle = sampler_index,
-
- /* TODO: Regalloc it in */
- .swizzle = SWIZZLE_XYZW,
-
- /* TODO: half */
- .in_reg_full = 1,
- .out_full = 1,
-
- .sampler_type = midgard_sampler_type(instr->dest_type),
- }
- };
-
- for (unsigned i = 0; i < instr->num_srcs; ++i) {
- int reg = SSA_FIXED_REGISTER(REGISTER_TEXTURE_BASE + in_reg);
- int index = nir_src_index(ctx, &instr->src[i].src);
- int nr_comp = nir_src_num_components(instr->src[i].src);
- midgard_vector_alu_src alu_src = blank_alu_src;
-
- switch (instr->src[i].src_type) {
- case nir_tex_src_coord: {
- if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
- /* texelFetch is undefined on samplerCube */
- assert(midgard_texop != TEXTURE_OP_TEXEL_FETCH);
-
- /* For cubemaps, we need to load coords into
- * special r27, and then use a special ld/st op
- * to select the face and copy the xy into the
- * texture register */
-
- alu_src.swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_X);
-
- midgard_instruction move = v_mov(index, alu_src, SSA_FIXED_REGISTER(27));
- emit_mir_instruction(ctx, move);
-
- midgard_instruction st = m_st_cubemap_coords(reg, 0);
- st.load_store.unknown = 0x24; /* XXX: What is this? */
- st.mask = 0x3; /* xy */
- st.load_store.swizzle = alu_src.swizzle;
- emit_mir_instruction(ctx, st);
-
- ins.texture.in_reg_swizzle = swizzle_of(2);
- } else {
- ins.texture.in_reg_swizzle = alu_src.swizzle = swizzle_of(nr_comp);
-
- midgard_instruction mov = v_mov(index, alu_src, reg);
- mov.mask = mask_of(nr_comp);
- emit_mir_instruction(ctx, mov);
-
- if (midgard_texop == TEXTURE_OP_TEXEL_FETCH) {
- /* Texel fetch opcodes care about the
- * values of z and w, so we actually
- * need to spill into a second register
- * for a texel fetch with register bias
- * (for non-2D). TODO: Implement that
- */
-
- assert(instr->sampler_dim == GLSL_SAMPLER_DIM_2D);
-
- midgard_instruction zero = v_mov(index, alu_src, reg);
- zero.ssa_args.inline_constant = true;
- zero.ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
- zero.has_constants = true;
- zero.mask = ~mov.mask;
- emit_mir_instruction(ctx, zero);
-
- ins.texture.in_reg_swizzle = SWIZZLE_XYZZ;
- } else {
- /* Non-texel fetch doesn't need that
- * nonsense. However we do use the Z
- * for array indexing */
- bool is_3d = instr->sampler_dim == GLSL_SAMPLER_DIM_3D;
- ins.texture.in_reg_swizzle = is_3d ? SWIZZLE_XYZZ : SWIZZLE_XYXZ;
- }
- }
-
- break;
- }
-
- case nir_tex_src_bias:
- case nir_tex_src_lod: {
- /* Try as a constant if we can */
-
- bool is_txf = midgard_texop == TEXTURE_OP_TEXEL_FETCH;
- if (!is_txf && pan_attach_constant_bias(ctx, instr->src[i].src, &ins.texture))
- break;
-
- /* Otherwise we use a register. To keep RA simple, we
- * put the bias/LOD into the w component of the input
- * source, which is otherwise in xy */
-
- alu_src.swizzle = SWIZZLE_XXXX;
-
- midgard_instruction mov = v_mov(index, alu_src, reg);
- mov.mask = 1 << COMPONENT_W;
- emit_mir_instruction(ctx, mov);
-
- ins.texture.lod_register = true;
-
- midgard_tex_register_select sel = {
- .select = in_reg,
- .full = 1,
-
- /* w */
- .component_lo = 1,
- .component_hi = 1
- };
-
- uint8_t packed;
- memcpy(&packed, &sel, sizeof(packed));
- ins.texture.bias = packed;
-
- break;
-                }
-
- default:
- unreachable("Unknown texture source type\n");
- }
- }
-
- /* Set registers to read and write from the same place */
- ins.texture.in_reg_select = in_reg;
- ins.texture.out_reg_select = out_reg;
-
- emit_mir_instruction(ctx, ins);
-
- int o_reg = REGISTER_TEXTURE_BASE + out_reg, o_index = nir_dest_index(ctx, &instr->dest);
- midgard_instruction ins2 = v_mov(SSA_FIXED_REGISTER(o_reg), blank_alu_src, o_index);
- emit_mir_instruction(ctx, ins2);
-
- /* Used for .cont and .last hinting */
- ctx->texture_op_count++;
-}
-
-static void
-emit_tex(compiler_context *ctx, nir_tex_instr *instr)
-{
- /* Fixup op, since only textureLod is permitted in VS but NIR can give
- * generic tex in some cases (which confuses the hardware) */
-
- bool is_vertex = ctx->stage == MESA_SHADER_VERTEX;
-
- if (is_vertex && instr->op == nir_texop_tex)
- instr->op = nir_texop_txl;
-
- switch (instr->op) {
- case nir_texop_tex:
- case nir_texop_txb:
- emit_texop_native(ctx, instr, TEXTURE_OP_NORMAL);
- break;
- case nir_texop_txl:
- emit_texop_native(ctx, instr, TEXTURE_OP_LOD);
- break;
- case nir_texop_txf:
- emit_texop_native(ctx, instr, TEXTURE_OP_TEXEL_FETCH);
- break;
- case nir_texop_txs:
- emit_sysval_read(ctx, &instr->instr);
- break;
- default:
-                unreachable("Unhandled texture op");
- }
-}
-
-static void
-emit_jump(compiler_context *ctx, nir_jump_instr *instr)
-{
- switch (instr->type) {
- case nir_jump_break: {
- /* Emit a branch out of the loop */
- struct midgard_instruction br = v_branch(false, false);
- br.branch.target_type = TARGET_BREAK;
- br.branch.target_break = ctx->current_loop_depth;
- emit_mir_instruction(ctx, br);
-
- DBG("break..\n");
- break;
- }
-
- default:
- DBG("Unknown jump type %d\n", instr->type);
- break;
- }
-}
-
-static void
-emit_instr(compiler_context *ctx, struct nir_instr *instr)
-{
- switch (instr->type) {
- case nir_instr_type_load_const:
- emit_load_const(ctx, nir_instr_as_load_const(instr));
- break;
-
- case nir_instr_type_intrinsic:
- emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
- break;
-
- case nir_instr_type_alu:
- emit_alu(ctx, nir_instr_as_alu(instr));
- break;
-
- case nir_instr_type_tex:
- emit_tex(ctx, nir_instr_as_tex(instr));
- break;
-
- case nir_instr_type_jump:
- emit_jump(ctx, nir_instr_as_jump(instr));
- break;
-
- case nir_instr_type_ssa_undef:
- /* Spurious */
- break;
-
- default:
- DBG("Unhandled instruction type\n");
- break;
- }
-}
-
-
-/* ALU instructions can inline or embed constants, which decreases register
- * pressure and saves space. */
-
-#define CONDITIONAL_ATTACH(src) { \
- void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->ssa_args.src + 1); \
-\
- if (entry) { \
- attach_constants(ctx, alu, entry, alu->ssa_args.src + 1); \
- alu->ssa_args.src = SSA_FIXED_REGISTER(REGISTER_CONSTANT); \
- } \
-}
-
-static void
-inline_alu_constants(compiler_context *ctx)
-{
- mir_foreach_instr(ctx, alu) {
- /* Other instructions cannot inline constants */
- if (alu->type != TAG_ALU_4) continue;
-
- /* If there is already a constant here, we can do nothing */
- if (alu->has_constants) continue;
-
- /* It makes no sense to inline constants on a branch */
- if (alu->compact_branch || alu->prepacked_branch) continue;
-
- CONDITIONAL_ATTACH(src0);
-
- if (!alu->has_constants) {
- CONDITIONAL_ATTACH(src1)
- } else if (!alu->inline_constant) {
- /* Corner case: _two_ vec4 constants, for instance with a
- * csel. For this case, we can only use a constant
- * register for one, we'll have to emit a move for the
- * other. Note, if both arguments are constants, then
- * necessarily neither argument depends on the value of
- * any particular register. As the destination register
- * will be wiped, that means we can spill the constant
- * to the destination register.
- */
-
- void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->ssa_args.src1 + 1);
- unsigned scratch = alu->ssa_args.dest;
-
- if (entry) {
- midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, scratch);
- attach_constants(ctx, &ins, entry, alu->ssa_args.src1 + 1);
-
- /* Force a break XXX Defer r31 writes */
- ins.unit = UNIT_VLUT;
-
- /* Set the source */
- alu->ssa_args.src1 = scratch;
-
- /* Inject us -before- the last instruction which set r31 */
- mir_insert_instruction_before(mir_prev_op(alu), ins);
- }
- }
- }
-}
-
-/* Midgard supports two types of constants, embedded constants (128-bit) and
- * inline constants (16-bit). Sometimes, especially with scalar ops, embedded
- * constants can be demoted to inline constants, for space savings and
- * sometimes a performance boost */
-
-static void
-embedded_to_inline_constant(compiler_context *ctx)
-{
- mir_foreach_instr(ctx, ins) {
- if (!ins->has_constants) continue;
-
- if (ins->ssa_args.inline_constant) continue;
-
- /* Blend constants must not be inlined by definition */
- if (ins->has_blend_constant) continue;
-
- /* We can inline 32-bit (sometimes) or 16-bit (usually) */
- bool is_16 = ins->alu.reg_mode == midgard_reg_mode_16;
- bool is_32 = ins->alu.reg_mode == midgard_reg_mode_32;
-
- if (!(is_16 || is_32))
- continue;
-
- /* src1 cannot be an inline constant due to encoding
- * restrictions. So, if possible we try to flip the arguments
- * in that case */
-
- int op = ins->alu.op;
-
- if (ins->ssa_args.src0 == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) {
- switch (op) {
- /* These ops require an operational change to flip
- * their arguments TODO */
- case midgard_alu_op_flt:
- case midgard_alu_op_fle:
- case midgard_alu_op_ilt:
- case midgard_alu_op_ile:
- case midgard_alu_op_fcsel:
- case midgard_alu_op_icsel:
- DBG("Missed non-commutative flip (%s)\n", alu_opcode_props[op].name);
- default:
- break;
- }
-
- if (alu_opcode_props[op].props & OP_COMMUTES) {
- /* Flip the SSA numbers */
- ins->ssa_args.src0 = ins->ssa_args.src1;
- ins->ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
-
- /* And flip the modifiers */
-
- unsigned src_temp;
-
- src_temp = ins->alu.src2;
- ins->alu.src2 = ins->alu.src1;
- ins->alu.src1 = src_temp;
- }
- }
-
- if (ins->ssa_args.src1 == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) {
- /* Extract the source information */
-
- midgard_vector_alu_src *src;
- int q = ins->alu.src2;
- midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;
- src = m;
-
- /* Component is from the swizzle, e.g. r26.w -> w component. TODO: What if x is masked out? */
- int component = src->swizzle & 3;
-
- /* Scale constant appropriately, if we can legally */
- uint16_t scaled_constant = 0;
-
- if (midgard_is_integer_op(op) || is_16) {
- unsigned int *iconstants = (unsigned int *) ins->constants;
- scaled_constant = (uint16_t) iconstants[component];
-
- /* Constant overflow after resize */
- if (scaled_constant != iconstants[component])
- continue;
- } else {
- float original = (float) ins->constants[component];
- scaled_constant = _mesa_float_to_half(original);
-
- /* Check for loss of precision. If this is
- * mediump, we don't care, but for a highp
- * shader, we need to pay attention. NIR
- * doesn't yet tell us which mode we're in!
- * Practically this prevents most constants
- * from being inlined, sadly. */
-
- float fp32 = _mesa_half_to_float(scaled_constant);
-
- if (fp32 != original)
- continue;
- }
-
- /* We don't know how to handle these with a constant */
-
- if (src->mod || src->half || src->rep_low || src->rep_high) {
- DBG("Bailing inline constant...\n");
- continue;
- }
-
- /* Make sure that the constant is not itself a
- * vector by checking if all accessed values
- * (by the swizzle) are the same. */
-
- uint32_t *cons = (uint32_t *) ins->constants;
- uint32_t value = cons[component];
-
- bool is_vector = false;
- unsigned mask = effective_writemask(&ins->alu, ins->mask);
-
- for (int c = 1; c < 4; ++c) {
- /* We only care if this component is actually used */
- if (!(mask & (1 << c)))
- continue;
-
- uint32_t test = cons[(src->swizzle >> (2 * c)) & 3];
-
- if (test != value) {
- is_vector = true;
- break;
- }
- }
-
- if (is_vector)
- continue;
-
- /* Get rid of the embedded constant */
- ins->has_constants = false;
- ins->ssa_args.src1 = SSA_UNUSED_0;
- ins->ssa_args.inline_constant = true;
- ins->inline_constant = scaled_constant;
- }
- }
-}
-
-/* Map normal SSA sources to other SSA sources / fixed registers (like
- * uniforms) */
-
-static void
-map_ssa_to_alias(compiler_context *ctx, int *ref)
-{
- /* Sign is used quite deliberately for unused */
- if (*ref < 0)
- return;
-
- unsigned int alias = (uintptr_t) _mesa_hash_table_u64_search(ctx->ssa_to_alias, *ref + 1);
-
- if (alias) {
-                /* Remove entry in leftovers to avoid a redundant fmov */
-
- struct set_entry *leftover = _mesa_set_search(ctx->leftover_ssa_to_alias, ((void *) (uintptr_t) (*ref + 1)));
-
- if (leftover)
- _mesa_set_remove(ctx->leftover_ssa_to_alias, leftover);
-
- /* Assign the alias map */
- *ref = alias - 1;
- return;
- }
-}
-
-/* Basic dead code elimination on the MIR itself, which cleans up e.g. the
- * texture pipeline */
-
-static bool
-midgard_opt_dead_code_eliminate(compiler_context *ctx, midgard_block *block)
-{
- bool progress = false;
-
- mir_foreach_instr_in_block_safe(block, ins) {
- if (ins->type != TAG_ALU_4) continue;
- if (ins->compact_branch) continue;
-
- if (ins->ssa_args.dest >= SSA_FIXED_MINIMUM) continue;
- if (mir_is_live_after(ctx, block, ins, ins->ssa_args.dest)) continue;
-
- mir_remove_instruction(ins);
- progress = true;
- }
-
- return progress;
-}
-
-/* Dead code elimination for branches at the end of a block - only one branch
- * per block is legal semantically */
-
-static void
-midgard_opt_cull_dead_branch(compiler_context *ctx, midgard_block *block)
-{
- bool branched = false;
-
- mir_foreach_instr_in_block_safe(block, ins) {
- if (!midgard_is_branch_unit(ins->unit)) continue;
-
- /* We ignore prepacked branches since the fragment epilogue is
- * just generally special */
- if (ins->prepacked_branch) continue;
-
- /* Discards are similarly special and may not correspond to the
- * end of a block */
-
- if (ins->branch.target_type == TARGET_DISCARD) continue;
-
- if (branched) {
- /* We already branched, so this is dead */
- mir_remove_instruction(ins);
- }
-
- branched = true;
- }
-}
-
-static bool
-mir_nontrivial_mod(midgard_vector_alu_src src, bool is_int, unsigned mask)
-{
- /* abs or neg */
- if (!is_int && src.mod) return true;
-
-        /* Of the integer mods, only a shift is nontrivial in isolation */
- if (is_int && src.mod == midgard_int_shift) return true;
-
- /* size-conversion */
- if (src.half) return true;
-
- /* swizzle */
- for (unsigned c = 0; c < 4; ++c) {
- if (!(mask & (1 << c))) continue;
- if (((src.swizzle >> (2*c)) & 3) != c) return true;
- }
-
- return false;
-}
-
-static bool
-mir_nontrivial_source2_mod(midgard_instruction *ins)
-{
- bool is_int = midgard_is_integer_op(ins->alu.op);
-
- midgard_vector_alu_src src2 =
- vector_alu_from_unsigned(ins->alu.src2);
-
- return mir_nontrivial_mod(src2, is_int, ins->mask);
-}
-
-static bool
-mir_nontrivial_outmod(midgard_instruction *ins)
-{
- bool is_int = midgard_is_integer_op(ins->alu.op);
- unsigned mod = ins->alu.outmod;
-
- /* Type conversion is a sort of outmod */
- if (ins->alu.dest_override != midgard_dest_override_none)
- return true;
-
- if (is_int)
- return mod != midgard_outmod_int_wrap;
- else
- return mod != midgard_outmod_none;
-}
-
-static bool
-midgard_opt_copy_prop(compiler_context *ctx, midgard_block *block)
-{
- bool progress = false;
-
- mir_foreach_instr_in_block_safe(block, ins) {
- if (ins->type != TAG_ALU_4) continue;
- if (!OP_IS_MOVE(ins->alu.op)) continue;
-
- unsigned from = ins->ssa_args.src1;
- unsigned to = ins->ssa_args.dest;
-
- /* We only work on pure SSA */
-
- if (to >= SSA_FIXED_MINIMUM) continue;
- if (from >= SSA_FIXED_MINIMUM) continue;
- if (to >= ctx->func->impl->ssa_alloc) continue;
- if (from >= ctx->func->impl->ssa_alloc) continue;
-
- /* Constant propagation is not handled here, either */
- if (ins->ssa_args.inline_constant) continue;
- if (ins->has_constants) continue;
-
- if (mir_nontrivial_source2_mod(ins)) continue;
- if (mir_nontrivial_outmod(ins)) continue;
-
- /* We're clear -- rewrite */
- mir_rewrite_index_src(ctx, to, from);
- mir_remove_instruction(ins);
- progress |= true;
- }
-
- return progress;
-}
-
-/* fmov.pos is an idiom for fpos. Propagate the .pos up to the source, so that
- * the move can be propagated away entirely */
-
-static bool
-mir_compose_float_outmod(midgard_outmod_float *outmod, midgard_outmod_float comp)
-{
- /* Nothing to do */
- if (comp == midgard_outmod_none)
- return true;
-
- if (*outmod == midgard_outmod_none) {
- *outmod = comp;
- return true;
- }
-
- /* TODO: Compose rules */
- return false;
-}
-
-static bool
-midgard_opt_pos_propagate(compiler_context *ctx, midgard_block *block)
-{
- bool progress = false;
-
- mir_foreach_instr_in_block_safe(block, ins) {
- if (ins->type != TAG_ALU_4) continue;
- if (ins->alu.op != midgard_alu_op_fmov) continue;
- if (ins->alu.outmod != midgard_outmod_pos) continue;
-
- /* TODO: Registers? */
- unsigned src = ins->ssa_args.src1;
- if (src >= ctx->func->impl->ssa_alloc) continue;
- assert(!mir_has_multiple_writes(ctx, src));
-
- /* There might be a source modifier, too */
- if (mir_nontrivial_source2_mod(ins)) continue;
-
- /* Backpropagate the modifier */
- mir_foreach_instr_in_block_from_rev(block, v, mir_prev_op(ins)) {
- if (v->type != TAG_ALU_4) continue;
- if (v->ssa_args.dest != src) continue;
-
- /* Can we even take a float outmod? */
- if (midgard_is_integer_out_op(v->alu.op)) continue;
-
- midgard_outmod_float temp = v->alu.outmod;
- progress |= mir_compose_float_outmod(&temp, ins->alu.outmod);
-
- /* Throw in the towel.. */
- if (!progress) break;
-
- /* Otherwise, transfer the modifier */
- v->alu.outmod = temp;
- ins->alu.outmod = midgard_outmod_none;
-
- break;
- }
- }
-
- return progress;
-}
-
-/* The following passes reorder MIR instructions to enable better scheduling */
-
-static void
-midgard_pair_load_store(compiler_context *ctx, midgard_block *block)
-{
- mir_foreach_instr_in_block_safe(block, ins) {
- if (ins->type != TAG_LOAD_STORE_4) continue;
-
- /* We've found a load/store op. Check if next is also load/store. */
- midgard_instruction *next_op = mir_next_op(ins);
- if (&next_op->link != &block->instructions) {
- if (next_op->type == TAG_LOAD_STORE_4) {
- /* If so, we're done since we're a pair */
- ins = mir_next_op(ins);
- continue;
- }
-
- /* Maximum search distance to pair, to avoid register pressure disasters */
- int search_distance = 8;
-
- /* Otherwise, we have an orphaned load/store -- search for another load */
- mir_foreach_instr_in_block_from(block, c, mir_next_op(ins)) {
- /* Terminate search if necessary */
- if (!(search_distance--)) break;
-
- if (c->type != TAG_LOAD_STORE_4) continue;
-
- /* Stores cannot be reordered, since they have
- * dependencies. For the same reason, indirect
- * loads cannot be reordered as their index is
- * loaded in r27.w */
-
- if (OP_IS_STORE(c->load_store.op)) continue;
-
- /* It appears the 0x800 bit is set whenever a
- * load is direct, unset when it is indirect.
- * Skip indirect loads. */
-
- if (!(c->load_store.unknown & 0x800)) continue;
-
- /* We found one! Move it up to pair and remove it from the old location */
-
- mir_insert_instruction_before(ins, *c);
- mir_remove_instruction(c);
-
- break;
- }
- }
- }
-}
-
-/* If there are leftovers after the below pass, emit actual fmov
- * instructions for the slow-but-correct path */
-
-static void
-emit_leftover_move(compiler_context *ctx)
-{
- set_foreach(ctx->leftover_ssa_to_alias, leftover) {
- int base = ((uintptr_t) leftover->key) - 1;
- int mapped = base;
-
- map_ssa_to_alias(ctx, &mapped);
- EMIT(mov, mapped, blank_alu_src, base);
- }
-}
-
-static void
-actualise_ssa_to_alias(compiler_context *ctx)
-{
- mir_foreach_instr(ctx, ins) {
- map_ssa_to_alias(ctx, &ins->ssa_args.src0);
- map_ssa_to_alias(ctx, &ins->ssa_args.src1);
- }
-
- emit_leftover_move(ctx);
-}
-
-static void
-emit_fragment_epilogue(compiler_context *ctx)
-{
- /* Special case: writing out constants requires us to include the move
- * explicitly now, so shove it into r0 */
-
- void *constant_value = _mesa_hash_table_u64_search(ctx->ssa_constants, ctx->fragment_output + 1);
-
- if (constant_value) {
- midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, SSA_FIXED_REGISTER(0));
- attach_constants(ctx, &ins, constant_value, ctx->fragment_output + 1);
- emit_mir_instruction(ctx, ins);
- }
-
- /* Perform the actual fragment writeout. We have two writeout/branch
- * instructions, forming a loop until writeout is successful as per the
- * docs. TODO: gl_FragDepth */
-
- EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, 0, midgard_condition_always);
- EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, -1, midgard_condition_always);
-}
-
-static midgard_block *
-emit_block(compiler_context *ctx, nir_block *block)
-{
-        midgard_block *this_block = calloc(1, sizeof(midgard_block));
- list_addtail(&this_block->link, &ctx->blocks);
-
- this_block->is_scheduled = false;
- ++ctx->block_count;
-
- ctx->texture_index[0] = -1;
- ctx->texture_index[1] = -1;
-
- /* Add us as a successor to the block we are following */
- if (ctx->current_block)
- midgard_block_add_successor(ctx->current_block, this_block);
-
- /* Set up current block */
- list_inithead(&this_block->instructions);
- ctx->current_block = this_block;
-
- nir_foreach_instr(instr, block) {
- emit_instr(ctx, instr);
- ++ctx->instruction_count;
- }
-
- inline_alu_constants(ctx);
- embedded_to_inline_constant(ctx);
-
-        /* Perform the heavy lifting for aliasing */
- actualise_ssa_to_alias(ctx);
-
- midgard_pair_load_store(ctx, this_block);
-
- /* Append fragment shader epilogue (value writeout) */
- if (ctx->stage == MESA_SHADER_FRAGMENT) {
- if (block == nir_impl_last_block(ctx->func->impl)) {
- emit_fragment_epilogue(ctx);
- }
- }
-
- if (block == nir_start_block(ctx->func->impl))
- ctx->initial_block = this_block;
-
- if (block == nir_impl_last_block(ctx->func->impl))
- ctx->final_block = this_block;
-
- /* Allow the next control flow to access us retroactively, for
- * branching etc */
- ctx->current_block = this_block;
-
- /* Document the fallthrough chain */
- ctx->previous_source_block = this_block;
-
- return this_block;
-}
-
-static midgard_block *emit_cf_list(struct compiler_context *ctx, struct exec_list *list);
-
-static void
-emit_if(struct compiler_context *ctx, nir_if *nif)
-{
- /* Conditional branches expect the condition in r31.w; emit a move for
- * that in the _previous_ block (which is the current block). */
- emit_condition(ctx, &nif->condition, true, COMPONENT_X);
-
- /* Speculatively emit the branch, but we can't fill it in until later */
- EMIT(branch, true, true);
- midgard_instruction *then_branch = mir_last_in_block(ctx->current_block);
-
- /* Emit the two subblocks */
- midgard_block *then_block = emit_cf_list(ctx, &nif->then_list);
-
- /* Emit a jump from the end of the then block to the end of the else */
- EMIT(branch, false, false);
- midgard_instruction *then_exit = mir_last_in_block(ctx->current_block);
-
- /* Emit second block, and check if it's empty */
-
- int else_idx = ctx->block_count;
- int count_in = ctx->instruction_count;
- midgard_block *else_block = emit_cf_list(ctx, &nif->else_list);
- int after_else_idx = ctx->block_count;
-
- /* Now that we have the subblocks emitted, fix up the branches */
-
- assert(then_block);
- assert(else_block);
-
- if (ctx->instruction_count == count_in) {
- /* The else block is empty, so don't emit an exit jump */
- mir_remove_instruction(then_exit);
- then_branch->branch.target_block = after_else_idx;
- } else {
- then_branch->branch.target_block = else_idx;
- then_exit->branch.target_block = after_else_idx;
- }
-}
-
-static void
-emit_loop(struct compiler_context *ctx, nir_loop *nloop)
-{
- /* Remember where we are */
- midgard_block *start_block = ctx->current_block;
-
- /* Allocate a loop number, growing the current inner loop depth */
- int loop_idx = ++ctx->current_loop_depth;
-
- /* Get index from before the body so we can loop back later */
- int start_idx = ctx->block_count;
-
- /* Emit the body itself */
- emit_cf_list(ctx, &nloop->body);
-
- /* Branch back to loop back */
- struct midgard_instruction br_back = v_branch(false, false);
- br_back.branch.target_block = start_idx;
- emit_mir_instruction(ctx, br_back);
-
- /* Mark down that branch in the graph. Note that we're really branching
- * to the block *after* we started in. TODO: Why doesn't the branch
- * itself have an off-by-one then...? */
- midgard_block_add_successor(ctx->current_block, start_block->successors[0]);
-
- /* Find the index of the block about to follow us (note: we don't add
- * one; blocks are 0-indexed so we get a fencepost problem) */
- int break_block_idx = ctx->block_count;
-
- /* Fix up the break statements we emitted to point to the right place,
- * now that we can allocate a block number for them */
-
- list_for_each_entry_from(struct midgard_block, block, start_block, &ctx->blocks, link) {
- mir_foreach_instr_in_block(block, ins) {
- if (ins->type != TAG_ALU_4) continue;
- if (!ins->compact_branch) continue;
- if (ins->prepacked_branch) continue;
-
- /* We found a branch -- check the type to see if we need to do anything */
- if (ins->branch.target_type != TARGET_BREAK) continue;
-
- /* It's a break! Check if it's our break */
- if (ins->branch.target_break != loop_idx) continue;
-
- /* Okay, cool, we're breaking out of this loop.
- * Rewrite from a break to a goto */
-
- ins->branch.target_type = TARGET_GOTO;
- ins->branch.target_block = break_block_idx;
- }
- }
-
- /* Now that we've finished emitting the loop, free up the depth again
- * so we play nice with recursion amid nested loops */
- --ctx->current_loop_depth;
-
- /* Dump loop stats */
- ++ctx->loop_count;
-}
-
-static midgard_block *
-emit_cf_list(struct compiler_context *ctx, struct exec_list *list)
-{
- midgard_block *start_block = NULL;
-
- foreach_list_typed(nir_cf_node, node, node, list) {
- switch (node->type) {
- case nir_cf_node_block: {
- midgard_block *block = emit_block(ctx, nir_cf_node_as_block(node));
-
- if (!start_block)
- start_block = block;
-
- break;
- }
-
- case nir_cf_node_if:
- emit_if(ctx, nir_cf_node_as_if(node));
- break;
-
- case nir_cf_node_loop:
- emit_loop(ctx, nir_cf_node_as_loop(node));
- break;
-
- case nir_cf_node_function:
- assert(0);
- break;
- }
- }
-
- return start_block;
-}
-
-/* Due to lookahead, we need to report the first tag executed in the command
- * stream and in branch targets. An initial block might be empty, so iterate
- * until we find one that 'works' */
-
-static unsigned
-midgard_get_first_tag_from_block(compiler_context *ctx, unsigned block_idx)
-{
- midgard_block *initial_block = mir_get_block(ctx, block_idx);
-
- unsigned first_tag = 0;
-
- do {
- midgard_bundle *initial_bundle = util_dynarray_element(&initial_block->bundles, midgard_bundle, 0);
-
- if (initial_bundle) {
- first_tag = initial_bundle->tag;
- break;
- }
-
- /* Initial block is empty, try the next block */
- initial_block = list_first_entry(&(initial_block->link), midgard_block, link);
- } while(initial_block != NULL);
-
- assert(first_tag);
- return first_tag;
-}
-
-int
-midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_blend)
-{
- struct util_dynarray *compiled = &program->compiled;
-
- midgard_debug = debug_get_option_midgard_debug();
-
- compiler_context ictx = {
- .nir = nir,
- .stage = nir->info.stage,
-
- .is_blend = is_blend,
- .blend_constant_offset = 0,
-
- .alpha_ref = program->alpha_ref
- };
-
- compiler_context *ctx = &ictx;
-
- /* TODO: Decide this at runtime */
- ctx->uniform_cutoff = 8;
-
- /* Initialize at a global (not block) level hash tables */
-
- ctx->ssa_constants = _mesa_hash_table_u64_create(NULL);
- ctx->ssa_to_alias = _mesa_hash_table_u64_create(NULL);
- ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);
- ctx->sysval_to_id = _mesa_hash_table_u64_create(NULL);
- ctx->leftover_ssa_to_alias = _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
-
- /* Record the varying mapping for the command stream's bookkeeping */
-
- struct exec_list *varyings =
- ctx->stage == MESA_SHADER_VERTEX ? &nir->outputs : &nir->inputs;
-
- unsigned max_varying = 0;
- nir_foreach_variable(var, varyings) {
- unsigned loc = var->data.driver_location;
- unsigned sz = glsl_type_size(var->type, FALSE);
-
- for (int c = 0; c < sz; ++c) {
- program->varyings[loc + c] = var->data.location + c;
- max_varying = MAX2(max_varying, loc + c);
- }
- }
-
- /* Lower gl_Position pre-optimisation, but after lowering vars to ssa
- * (so we don't accidentally duplicate the epilogue since mesa/st has
- * messed with our I/O quite a bit already) */
-
- NIR_PASS_V(nir, nir_lower_vars_to_ssa);
-
- if (ctx->stage == MESA_SHADER_VERTEX)
- NIR_PASS_V(nir, nir_lower_viewport_transform);
-
- NIR_PASS_V(nir, nir_lower_var_copies);
- NIR_PASS_V(nir, nir_lower_vars_to_ssa);
- NIR_PASS_V(nir, nir_split_var_copies);
- NIR_PASS_V(nir, nir_lower_var_copies);
- NIR_PASS_V(nir, nir_lower_global_vars_to_local);
- NIR_PASS_V(nir, nir_lower_var_copies);
- NIR_PASS_V(nir, nir_lower_vars_to_ssa);
-
- NIR_PASS_V(nir, nir_lower_io, nir_var_all, glsl_type_size, 0);
-
- /* Optimisation passes */
-
- optimise_nir(nir);
-
- if (midgard_debug & MIDGARD_DBG_SHADERS) {
- nir_print_shader(nir, stdout);
- }
-
- /* Assign sysvals and counts, now that we're sure
- * (post-optimisation) */
-
- midgard_nir_assign_sysvals(ctx, nir);
-
- program->uniform_count = nir->num_uniforms;
- program->sysval_count = ctx->sysval_count;
- memcpy(program->sysvals, ctx->sysvals, sizeof(ctx->sysvals[0]) * ctx->sysval_count);
-
- program->attribute_count = (ctx->stage == MESA_SHADER_VERTEX) ? nir->num_inputs : 0;
- program->varying_count = max_varying + 1; /* Fencepost off-by-one */
-
- nir_foreach_function(func, nir) {
- if (!func->impl)
- continue;
-
- list_inithead(&ctx->blocks);
- ctx->block_count = 0;
- ctx->func = func;
-
- emit_cf_list(ctx, &func->impl->body);
- emit_block(ctx, func->impl->end_block);
-
- break; /* TODO: Multi-function shaders */
- }
-
- util_dynarray_init(compiled, NULL);
-
- /* MIR-level optimizations */
-
- bool progress = false;
-
- do {
- progress = false;
-
- mir_foreach_block(ctx, block) {
- progress |= midgard_opt_pos_propagate(ctx, block);
- progress |= midgard_opt_copy_prop(ctx, block);
- progress |= midgard_opt_dead_code_eliminate(ctx, block);
- }
- } while (progress);
-
- /* Nested control-flow can result in dead branches at the end of the
- * block. This messes with our analysis and is just dead code, so cull
- * them */
- mir_foreach_block(ctx, block) {
- midgard_opt_cull_dead_branch(ctx, block);
- }
-
- /* Schedule! */
- schedule_program(ctx);
-
- /* Now that all the bundles are scheduled and we can calculate block
- * sizes, emit actual branch instructions rather than placeholders */
-
- int br_block_idx = 0;
-
- mir_foreach_block(ctx, block) {
- util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {
- for (int c = 0; c < bundle->instruction_count; ++c) {
- midgard_instruction *ins = bundle->instructions[c];
-
- if (!midgard_is_branch_unit(ins->unit)) continue;
-
- if (ins->prepacked_branch) continue;
-
- /* Parse some basic branch info */
- bool is_compact = ins->unit == ALU_ENAB_BR_COMPACT;
- bool is_conditional = ins->branch.conditional;
- bool is_inverted = ins->branch.invert_conditional;
- bool is_discard = ins->branch.target_type == TARGET_DISCARD;
-
- /* Determine the block we're jumping to */
- int target_number = ins->branch.target_block;
-
- /* Report the destination tag */
- int dest_tag = is_discard ? 0 : midgard_get_first_tag_from_block(ctx, target_number);
-
- /* Count up the number of quadwords we're
- * jumping over = number of quadwords until
- * (br_block_idx, target_number) */
-
- int quadword_offset = 0;
-
- if (is_discard) {
- /* Jump to the end of the shader. We
- * need to include not only the
- * following blocks, but also the
- * contents of our current block (since
- * discard can come in the middle of
- * the block) */
-
- midgard_block *blk = mir_get_block(ctx, br_block_idx + 1);
-
- for (midgard_bundle *bun = bundle + 1; bun < (midgard_bundle *)((char*) block->bundles.data + block->bundles.size); ++bun) {
- quadword_offset += quadword_size(bun->tag);
- }
-
- mir_foreach_block_from(ctx, blk, b) {
- quadword_offset += b->quadword_count;
- }
-
- } else if (target_number > br_block_idx) {
- /* Jump forward */
-
- for (int idx = br_block_idx + 1; idx < target_number; ++idx) {
- midgard_block *blk = mir_get_block(ctx, idx);
- assert(blk);
-
- quadword_offset += blk->quadword_count;
- }
- } else {
- /* Jump backwards */
-
- for (int idx = br_block_idx; idx >= target_number; --idx) {
- midgard_block *blk = mir_get_block(ctx, idx);
- assert(blk);
-
- quadword_offset -= blk->quadword_count;
- }
- }
-
- /* Unconditional extended branches (far jumps)
- * have issues, so we always use a conditional
- * branch, setting the condition to always for
- * unconditional. For compact unconditional
- * branches, cond isn't used so it doesn't
- * matter what we pick. */
-
- midgard_condition cond =
- !is_conditional ? midgard_condition_always :
- is_inverted ? midgard_condition_false :
- midgard_condition_true;
-
- midgard_jmp_writeout_op op =
- is_discard ? midgard_jmp_writeout_op_discard :
- (is_compact && !is_conditional) ? midgard_jmp_writeout_op_branch_uncond :
- midgard_jmp_writeout_op_branch_cond;
-
- if (!is_compact) {
- midgard_branch_extended branch =
- midgard_create_branch_extended(
- cond, op,
- dest_tag,
- quadword_offset);
-
- memcpy(&ins->branch_extended, &branch, sizeof(branch));
- } else if (is_conditional || is_discard) {
- midgard_branch_cond branch = {
- .op = op,
- .dest_tag = dest_tag,
- .offset = quadword_offset,
- .cond = cond
- };
-
- assert(branch.offset == quadword_offset);
-
- memcpy(&ins->br_compact, &branch, sizeof(branch));
- } else {
- assert(op == midgard_jmp_writeout_op_branch_uncond);
-
- midgard_branch_uncond branch = {
- .op = op,
- .dest_tag = dest_tag,
- .offset = quadword_offset,
- .unknown = 1
- };
-
- assert(branch.offset == quadword_offset);
-
- memcpy(&ins->br_compact, &branch, sizeof(branch));
- }
- }
- }
-
- ++br_block_idx;
- }
-
- /* Emit flat binary from the instruction arrays. Iterate each block in
- * sequence. Save instruction boundaries such that lookahead tags can
- * be assigned easily */
-
- /* Cache _all_ bundles in source order for lookahead across failed branches */
-
- int bundle_count = 0;
- mir_foreach_block(ctx, block) {
- bundle_count += block->bundles.size / sizeof(midgard_bundle);
- }
- midgard_bundle **source_order_bundles = malloc(sizeof(midgard_bundle *) * bundle_count);
- int bundle_idx = 0;
- mir_foreach_block(ctx, block) {
- util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {
- source_order_bundles[bundle_idx++] = bundle;
- }
- }
-
- int current_bundle = 0;
-
- /* Midgard prefetches instruction types, so during emission we
-         * need to look ahead. Unless this is the last instruction, in which
-         * case we return 1. Or if this is the second to last and the
- * last is an ALU, then it's also 1... */
-
- mir_foreach_block(ctx, block) {
- mir_foreach_bundle_in_block(block, bundle) {
- int lookahead = 1;
-
- if (current_bundle + 1 < bundle_count) {
- uint8_t next = source_order_bundles[current_bundle + 1]->tag;
-
- if (!(current_bundle + 2 < bundle_count) && IS_ALU(next)) {
- lookahead = 1;
- } else {
- lookahead = next;
- }
- }
-
- emit_binary_bundle(ctx, bundle, compiled, lookahead);
- ++current_bundle;
- }
-
- /* TODO: Free deeper */
- //util_dynarray_fini(&block->instructions);
- }
-
- free(source_order_bundles);
-
- /* Report the very first tag executed */
- program->first_tag = midgard_get_first_tag_from_block(ctx, 0);
-
- /* Deal with off-by-one related to the fencepost problem */
- program->work_register_count = ctx->work_registers + 1;
-
- program->can_discard = ctx->can_discard;
- program->uniform_cutoff = ctx->uniform_cutoff;
-
- program->blend_patch_offset = ctx->blend_constant_offset;
-
- if (midgard_debug & MIDGARD_DBG_SHADERS)
- disassemble_midgard(program->compiled.data, program->compiled.size);
-
- if (midgard_debug & MIDGARD_DBG_SHADERDB) {
- unsigned nr_bundles = 0, nr_ins = 0, nr_quadwords = 0;
-
- /* Count instructions and bundles */
-
- mir_foreach_instr_global(ctx, ins) {
- nr_ins++;
- }
-
- mir_foreach_block(ctx, block) {
- nr_bundles += util_dynarray_num_elements(
- &block->bundles, midgard_bundle);
-
- nr_quadwords += block->quadword_count;
- }
-
-                /* Calculate thread count. Thread count is determined by
-                 * cutoffs on the work register count */
-
- unsigned nr_registers = program->work_register_count;
-
- unsigned nr_threads =
- (nr_registers <= 4) ? 4 :
- (nr_registers <= 8) ? 2 :
- 1;
-
- /* Dump stats */
-
- fprintf(stderr, "shader%d - %s shader: "
- "%u inst, %u bundles, %u quadwords, "
- "%u registers, %u threads, %u loops\n",
- SHADER_DB_COUNT++,
- gl_shader_stage_name(ctx->stage),
- nr_ins, nr_bundles, nr_quadwords,
- nr_registers, nr_threads,
- ctx->loop_count);
- }
-
-
- return 0;
-}
+++ /dev/null
-/*
- * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __MIDGARD_H_
-#define __MIDGARD_H_
-
-#include "compiler/nir/nir.h"
-#include "util/u_dynarray.h"
-
-/* Define the general compiler entry point */
-
-#define MAX_SYSVAL_COUNT 32
-
-/* Allow a 2D space of sysval IDs: the type class sits in the low 16 bits and
- * the per-type ID above it, so nonparametric sysvals compare equal to their
- * class */
-
-#define PAN_SYSVAL(type, no) (((no) << 16) | PAN_SYSVAL_##type)
-#define PAN_SYSVAL_TYPE(sysval) ((sysval) & 0xffff)
-#define PAN_SYSVAL_ID(sysval) ((sysval) >> 16)
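-
-/* For example, PAN_SYSVAL(VIEWPORT_SCALE, 0) == PAN_SYSVAL_VIEWPORT_SCALE,
- * since an ID of zero leaves only the class in the low 16 bits */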
-
-/* Define some common types. We start at one for easy indexing of hash
- * tables internal to the compiler */
-
-enum {
- PAN_SYSVAL_VIEWPORT_SCALE = 1,
- PAN_SYSVAL_VIEWPORT_OFFSET = 2,
- PAN_SYSVAL_TEXTURE_SIZE = 3,
-} pan_sysval;
-
-#define PAN_TXS_SYSVAL_ID(texidx, dim, is_array) \
- ((texidx) | ((dim) << 7) | ((is_array) ? (1 << 9) : 0))
-
-#define PAN_SYSVAL_ID_TO_TXS_TEX_IDX(id) ((id) & 0x7f)
-#define PAN_SYSVAL_ID_TO_TXS_DIM(id) (((id) >> 7) & 0x3)
-#define PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(id) !!((id) & (1 << 9))
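-
-/* Worked example: PAN_TXS_SYSVAL_ID(5, 2, false) encodes to 0x105, from which
- * the macros above recover texture index 5, dimension 2 and is_array false */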
-
-typedef struct {
- int work_register_count;
- int uniform_count;
- int uniform_cutoff;
-
- int attribute_count;
- int varying_count;
-
- /* Prepended before uniforms, mapping to SYSVAL_ names for the
- * sysval */
-
- unsigned sysval_count;
- unsigned sysvals[MAX_SYSVAL_COUNT];
-
- unsigned varyings[32];
-
- /* Boolean properties of the program */
- bool can_discard;
- bool writes_point_size;
-
- int first_tag;
-
- struct util_dynarray compiled;
-
- /* For a blend shader using a constant color -- patch point. If
- * negative, there's no constant. */
-
- int blend_patch_offset;
-
- /* IN: For a fragment shader with a lowered alpha test, the ref value */
- float alpha_ref;
-} midgard_program;
-
-int
-midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_blend);
-
-/* NIR options are shared between the standalone compiler and the online
- * compiler. Defining it here is the simplest, though maybe not the Right
- * solution. */
-
-static const nir_shader_compiler_options midgard_nir_options = {
- .lower_ffma = true,
- .lower_sub = true,
- .lower_scmp = true,
- .lower_flrp32 = true,
- .lower_flrp64 = true,
- .lower_ffract = true,
- .lower_fmod = true,
- .lower_fdiv = true,
- .lower_idiv = true,
- .lower_isign = true,
- .lower_fpow = true,
- .lower_find_lsb = true,
-
- .lower_wpos_pntc = true,
-
- /* TODO: We have native ops to help here, which we'll want to look into
- * eventually */
- .lower_fsign = true,
-
- .vertex_id_zero_based = true,
- .lower_extract_byte = true,
- .lower_extract_word = true,
- .lower_rotate = true,
-
- .lower_doubles_options = nir_lower_dmod,
-
- .vectorize_io = true,
-};
-
-#endif
+++ /dev/null
-/*
- * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "compiler.h"
-#include "midgard_ops.h"
-
-/* Midgard IR only knows vector ALU types, but we sometimes need to actually
- * use scalar ALU instructions, for functional or performance reasons. To do
- * this, we just demote vector ALU payloads to scalar. */
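-/* Demotion keeps the opcode and output modifier, picks the single live
- * component out of the writemask, and re-encodes each source into the
- * narrower scalar source format, as below. */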
-
-static int
-component_from_mask(unsigned mask)
-{
- for (int c = 0; c < 8; ++c) {
- if (mask & (1 << c))
- return c;
- }
-
- assert(0);
- return 0;
-}
-
-static unsigned
-vector_to_scalar_source(unsigned u, bool is_int, bool is_full)
-{
- midgard_vector_alu_src v;
- memcpy(&v, &u, sizeof(v));
-
- /* TODO: Integers */
-
- unsigned component = v.swizzle & 3;
- bool upper = false; /* TODO */
-
- midgard_scalar_alu_src s = { 0 };
-
- if (is_full) {
- /* For a 32-bit op, just check the source half flag */
- s.full = !v.half;
- } else if (!v.half) {
- /* For a 16-bit op that's not subdivided, never full */
- s.full = false;
- } else {
- /* We can't do 8-bit scalar, abort! */
- assert(0);
- }
-
- /* Component indexing takes size into account */
-
- if (s.full)
- s.component = component << 1;
- else
- s.component = component + (upper << 2);
-
- if (is_int) {
- /* TODO */
- } else {
- s.abs = v.mod & MIDGARD_FLOAT_MOD_ABS;
- s.negate = v.mod & MIDGARD_FLOAT_MOD_NEG;
- }
-
- unsigned o;
- memcpy(&o, &s, sizeof(s));
-
- return o & ((1 << 6) - 1);
-}
-
-static midgard_scalar_alu
-vector_to_scalar_alu(midgard_vector_alu v, midgard_instruction *ins)
-{
- bool is_int = midgard_is_integer_op(v.op);
- bool is_full = v.reg_mode == midgard_reg_mode_32;
- bool is_inline_constant = ins->ssa_args.inline_constant;
-
- /* The output component is from the mask */
- midgard_scalar_alu s = {
- .op = v.op,
- .src1 = vector_to_scalar_source(v.src1, is_int, is_full),
- .src2 = !is_inline_constant ? vector_to_scalar_source(v.src2, is_int, is_full) : 0,
- .unknown = 0,
- .outmod = v.outmod,
- .output_full = is_full,
- .output_component = component_from_mask(ins->mask),
- };
-
- /* Full components are physically spaced out */
- if (is_full) {
- assert(s.output_component < 4);
- s.output_component <<= 1;
- }
-
- /* Inline constant is passed along rather than trying to extract it
- * from v */
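- /* The shuffle below rearranges the constant into the scalar encoding:
- * roughly, bits [0:5] move up to the top of the immediate while the
- * higher bits fold in underneath (see the disassembler for the
- * authoritative layout) */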
-
- if (ins->ssa_args.inline_constant) {
- uint16_t imm = 0;
- int lower_11 = ins->inline_constant & ((1 << 12) - 1);
- imm |= (lower_11 >> 9) & 3;
- imm |= (lower_11 >> 6) & 4;
- imm |= (lower_11 >> 2) & 0x38;
- imm |= (lower_11 & 63) << 6;
-
- s.src2 = imm;
- }
-
- return s;
-}
-
-static void
-emit_alu_bundle(compiler_context *ctx,
- midgard_bundle *bundle,
- struct util_dynarray *emission,
- unsigned lookahead)
-{
- /* Emit the control word */
- util_dynarray_append(emission, uint32_t, bundle->control | lookahead);
-
- /* Next up, emit register words */
- for (unsigned i = 0; i < bundle->instruction_count; ++i) {
- midgard_instruction *ins = bundle->instructions[i];
-
- /* Check if this instruction has registers */
- if (ins->compact_branch || ins->prepacked_branch) continue;
-
- /* Otherwise, just emit the registers */
- uint16_t reg_word = 0;
- memcpy(&reg_word, &ins->registers, sizeof(uint16_t));
- util_dynarray_append(emission, uint16_t, reg_word);
- }
-
- /* Now, we emit the body itself */
- for (unsigned i = 0; i < bundle->instruction_count; ++i) {
- midgard_instruction *ins = bundle->instructions[i];
-
- /* Where is this body */
- unsigned size = 0;
- void *source = NULL;
-
- /* In case we demote to a scalar */
- midgard_scalar_alu scalarized;
-
- if (ins->unit & UNITS_ANY_VECTOR) {
- if (ins->alu.reg_mode == midgard_reg_mode_32)
- ins->alu.mask = expand_writemask_32(ins->mask);
- else
- ins->alu.mask = ins->mask;
-
- size = sizeof(midgard_vector_alu);
- source = &ins->alu;
- } else if (ins->unit == ALU_ENAB_BR_COMPACT) {
- size = sizeof(midgard_branch_cond);
- source = &ins->br_compact;
- } else if (ins->compact_branch) { /* misnomer */
- size = sizeof(midgard_branch_extended);
- source = &ins->branch_extended;
- } else {
- size = sizeof(midgard_scalar_alu);
- scalarized = vector_to_scalar_alu(ins->alu, ins);
- source = &scalarized;
- }
-
- memcpy(util_dynarray_grow_bytes(emission, 1, size), source, size);
- }
-
- /* Emit padding (all zero) */
- memset(util_dynarray_grow_bytes(emission, 1, bundle->padding), 0, bundle->padding);
-
- /* Tack on constants */
-
- if (bundle->has_embedded_constants) {
- util_dynarray_append(emission, float, bundle->constants[0]);
- util_dynarray_append(emission, float, bundle->constants[1]);
- util_dynarray_append(emission, float, bundle->constants[2]);
- util_dynarray_append(emission, float, bundle->constants[3]);
- }
-}
-
-/* After everything is scheduled, emit whole bundles at a time */
-
-void
-emit_binary_bundle(compiler_context *ctx,
- midgard_bundle *bundle,
- struct util_dynarray *emission,
- int next_tag)
-{
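- /* The next bundle's tag is packed into the upper nibble of this
- * bundle's first word (hence the shift below), presumably so the
- * hardware knows what kind of word follows */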
- int lookahead = next_tag << 4;
-
- switch (bundle->tag) {
- case TAG_ALU_4:
- case TAG_ALU_8:
- case TAG_ALU_12:
- case TAG_ALU_16:
- emit_alu_bundle(ctx, bundle, emission, lookahead);
- break;
-
- case TAG_LOAD_STORE_4: {
- /* One or two composing instructions */
-
- uint64_t current64, next64 = LDST_NOP;
-
- /* Copy masks */
-
- for (unsigned i = 0; i < bundle->instruction_count; ++i) {
- bundle->instructions[i]->load_store.mask =
- bundle->instructions[i]->mask;
- }
-
- memcpy(&current64, &bundle->instructions[0]->load_store, sizeof(current64));
-
- if (bundle->instruction_count == 2)
- memcpy(&next64, &bundle->instructions[1]->load_store, sizeof(next64));
-
- midgard_load_store instruction = {
- .type = bundle->tag,
- .next_type = next_tag,
- .word1 = current64,
- .word2 = next64
- };
-
- util_dynarray_append(emission, midgard_load_store, instruction);
-
- break;
- }
-
- case TAG_TEXTURE_4:
- case TAG_TEXTURE_4_VTX: {
- /* Texture instructions are easy, since there is no pipelining
- * nor VLIW to worry about. We may need to set .cont/.last
- * flags. */
-
- midgard_instruction *ins = bundle->instructions[0];
-
- ins->texture.type = bundle->tag;
- ins->texture.next_type = next_tag;
- ins->texture.mask = ins->mask;
-
- ctx->texture_op_count--;
-
- if (ins->texture.op == TEXTURE_OP_NORMAL) {
- bool continues = ctx->texture_op_count > 0;
- ins->texture.cont = continues;
- ins->texture.last = !continues;
- } else {
- ins->texture.cont = ins->texture.last = 1;
- }
-
- util_dynarray_append(emission, midgard_texture_word, ins->texture);
- break;
- }
-
- default:
- unreachable("Unknown midgard instruction type\n");
- }
-}
+++ /dev/null
-/*
- * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/* mir_is_live_after performs liveness analysis on the MIR, used primarily
- * as part of register allocation. TODO: Algorithmic improvements for
- * compiler performance (this is the worst algorithm possible -- see
- * backlog with Connor on IRC) */
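-/* The approach is a simple scan: check the remaining instructions of the
- * block, then recurse into each successor, using a visited flag so we don't
- * loop forever on back edges. */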
-
-#include "compiler.h"
-
-static bool
-midgard_is_live_in_instr(midgard_instruction *ins, int src)
-{
- if (ins->compact_branch)
- return false;
-
- if (ins->ssa_args.src0 == src)
- return true;
-
- if (!ins->ssa_args.inline_constant && ins->ssa_args.src1 == src)
- return true;
-
- return false;
-}
-
-/* Determine if a variable is live in the successors of a block */
-static bool
-is_live_after_successors(compiler_context *ctx, midgard_block *bl, int src)
-{
- for (unsigned i = 0; i < bl->nr_successors; ++i) {
- midgard_block *succ = bl->successors[i];
-
- /* If we already visited, the value we're seeking
- * isn't down this path (or we would have
- * short-circuited) */
-
- if (succ->visited) continue;
-
- /* Otherwise (it's visited *now*), check the block */
-
- succ->visited = true;
-
- mir_foreach_instr_in_block(succ, ins) {
- if (midgard_is_live_in_instr(ins, src))
- return true;
- }
-
- /* ...and also, check *its* successors */
- if (is_live_after_successors(ctx, succ, src))
- return true;
-
- }
-
- /* Welp. We're really not live. */
-
- return false;
-}
-
-bool
-mir_is_live_after(compiler_context *ctx, midgard_block *block, midgard_instruction *start, int src)
-{
- /* Check the rest of the block for liveness */
-
- mir_foreach_instr_in_block_from(block, ins, mir_next_op(start)) {
- if (midgard_is_live_in_instr(ins, src))
- return true;
- }
-
- /* Check the rest of the blocks for liveness recursively */
-
- bool succ = is_live_after_successors(ctx, block, src);
-
- mir_foreach_block(ctx, block) {
- block->visited = false;
- }
-
- return succ;
-}
-
-/* Just a quick check -- is it written more than once? (I.e. are we definitely
- * not SSA?) */
-
-bool
-mir_has_multiple_writes(compiler_context *ctx, int dest)
-{
- unsigned write_count = 0;
-
- mir_foreach_instr_global(ctx, ins) {
- if (ins->ssa_args.dest == dest)
- write_count++;
- }
-
- return write_count > 1;
-}
+++ /dev/null
-#include <stdbool.h>
-#include "nir.h"
-
-bool midgard_nir_lower_algebraic_late(nir_shader *shader);
-bool midgard_nir_scale_trig(nir_shader *shader);
+++ /dev/null
-#
-# Copyright (C) 2018 Alyssa Rosenzweig
-#
-# Copyright (C) 2016 Intel Corporation
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice (including the next
-# paragraph) shall be included in all copies or substantial portions of the
-# Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-# IN THE SOFTWARE.
-
-import argparse
-import sys
-import math
-
-a = 'a'
-b = 'b'
-c = 'c'
-
-algebraic_late = [
- # ineg must be lowered late, but only for integers; floats will try to
- # have modifiers attached... hence why this has to be here rather than
- # a more standard lower_negate approach
-
- (('ineg', a), ('isub', 0, a)),
-
- # These two special cases save space (and an op) compared to the
- # actual csel op, and give the scheduler more flexibility
-
- (('b32csel', a, 'b@32', 0), ('iand', a, b)),
- (('b32csel', a, 0, 'b@32'), ('iand', ('inot', a), b)),
-]
-
-
-# Midgard is able to type convert down by only one "step" per instruction; if
-# NIR wants more than one step, we need to break up into multiple instructions
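-# (e.g. a 32-bit to 8-bit integer conversion goes through an intermediate
-# 16-bit step, so i2i8 of a 32-bit value becomes i2i8(i2i16(a)))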
-
-converts = [
- (('i2i8', 'a@32'), ('i2i8', ('i2i16', a))),
- (('u2u8', 'a@32'), ('u2u8', ('u2u16', a))),
-
- (('i2i32', 'a@8'), ('i2i32', ('i2i16', a))),
- (('u2u32', 'a@8'), ('u2u32', ('u2u16', a))),
-
- (('f2i32', 'a@16'), ('f2i32', ('f2f32', a))),
- (('f2u32', 'a@16'), ('f2u32', ('f2f32', a))),
-
- # Totally redundant
- (('~f2f16', ('f2f32', 'a@16')), a),
-
- (('pack_half_2x16_split', 'a@32', 'b@32'), ('ior', ('ishl', ('i2i32', ('f2f16', b)), 16), ('i2i32', ('f2f16', a)))),
-]
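-
-# The pack_half_2x16_split rule above assembles the packed word by hand: each
-# source is narrowed with f2f16 and b is shifted into the upper 16 bits.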
-
-# Midgard scales fsin/fcos arguments by pi.
-# Pass must be run only once, after the main loop
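-# (in other words the hardware evaluates sin(pi * x), so pre-dividing the
-# argument by pi recovers a standard sin(x))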
-
-scale_trig = [
- (('fsin', a), ('fsin', ('fdiv', a, math.pi))),
- (('fcos', a), ('fcos', ('fdiv', a, math.pi))),
-]
-
-def main():
- parser = argparse.ArgumentParser()
- parser.add_argument('-p', '--import-path', required=True)
- args = parser.parse_args()
- sys.path.insert(0, args.import_path)
- run()
-
-
-def run():
- import nir_algebraic # pylint: disable=import-error
-
- print('#include "midgard_nir.h"')
-
- print(nir_algebraic.AlgebraicPass("midgard_nir_lower_algebraic_late",
- algebraic_late + converts).render())
-
- print(nir_algebraic.AlgebraicPass("midgard_nir_scale_trig",
- scale_trig).render())
-
-
-if __name__ == '__main__':
- main()
+++ /dev/null
-/* Copyright (c) 2018-2019 Alyssa Rosenzweig (alyssa@rosenzweig.io)
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#include "midgard.h"
-
-/* Include the definitions of the macros and such */
-
-#define MIDGARD_OPS_TABLE
-#include "helpers.h"
-#undef MIDGARD_OPS_TABLE
-
-/* Table of mapping opcodes to accompanying properties. This is used for both
- * the disassembler and the compiler. It is placed in a .c file like this to
- * avoid duplications in the binary */
-
-struct mir_op_props alu_opcode_props[256] = {
- [midgard_alu_op_fadd] = {"fadd", UNITS_ADD | OP_COMMUTES},
- [midgard_alu_op_fmul] = {"fmul", UNITS_MUL | UNIT_VLUT | OP_COMMUTES},
- [midgard_alu_op_fmin] = {"fmin", UNITS_MUL | UNITS_ADD | OP_COMMUTES},
- [midgard_alu_op_fmax] = {"fmax", UNITS_MUL | UNITS_ADD | OP_COMMUTES},
- [midgard_alu_op_imin] = {"imin", UNITS_MOST | OP_COMMUTES},
- [midgard_alu_op_imax] = {"imax", UNITS_MOST | OP_COMMUTES},
- [midgard_alu_op_umin] = {"umin", UNITS_MOST | OP_COMMUTES},
- [midgard_alu_op_umax] = {"umax", UNITS_MOST | OP_COMMUTES},
- [midgard_alu_op_ihadd] = {"ihadd", UNITS_ADD | OP_COMMUTES},
- [midgard_alu_op_uhadd] = {"uhadd", UNITS_ADD | OP_COMMUTES},
- [midgard_alu_op_irhadd] = {"irhadd", UNITS_ADD | OP_COMMUTES},
- [midgard_alu_op_urhadd] = {"urhadd", UNITS_ADD | OP_COMMUTES},
-
- [midgard_alu_op_fmov] = {"fmov", UNITS_ALL | QUIRK_FLIPPED_R24},
- [midgard_alu_op_fmov_rtz] = {"fmov_rtz", UNITS_ALL | QUIRK_FLIPPED_R24},
- [midgard_alu_op_fmov_rtn] = {"fmov_rtn", UNITS_ALL | QUIRK_FLIPPED_R24},
- [midgard_alu_op_fmov_rtp] = {"fmov_rtp", UNITS_ALL | QUIRK_FLIPPED_R24},
- [midgard_alu_op_fround] = {"fround", UNITS_ADD},
- [midgard_alu_op_froundeven] = {"froundeven", UNITS_ADD},
- [midgard_alu_op_ftrunc] = {"ftrunc", UNITS_ADD},
- [midgard_alu_op_ffloor] = {"ffloor", UNITS_ADD},
- [midgard_alu_op_fceil] = {"fceil", UNITS_ADD},
- [midgard_alu_op_ffma] = {"ffma", UNIT_VLUT},
-
- /* Though they output a scalar, they need to run on a vector unit
- * since they process vectors */
- [midgard_alu_op_fdot3] = {"fdot3", UNIT_VMUL | OP_CHANNEL_COUNT(3) | OP_COMMUTES},
- [midgard_alu_op_fdot3r] = {"fdot3r", UNIT_VMUL | OP_CHANNEL_COUNT(3) | OP_COMMUTES},
- [midgard_alu_op_fdot4] = {"fdot4", UNIT_VMUL | OP_CHANNEL_COUNT(4) | OP_COMMUTES},
-
- /* Incredibly, iadd can run on vmul, etc */
- [midgard_alu_op_iadd] = {"iadd", UNITS_MOST | OP_COMMUTES},
- [midgard_alu_op_iaddsat] = {"iaddsat", UNITS_ADD | OP_COMMUTES},
- [midgard_alu_op_uaddsat] = {"uaddsat", UNITS_ADD | OP_COMMUTES},
- [midgard_alu_op_iabsdiff] = {"iabsdiff", UNITS_ADD},
- [midgard_alu_op_uabsdiff] = {"uabsdiff", UNITS_ADD},
- [midgard_alu_op_ichoose] = {"ichoose", UNITS_ADD},
- [midgard_alu_op_isub] = {"isub", UNITS_MOST},
- [midgard_alu_op_isubsat] = {"isubsat", UNITS_MOST},
- [midgard_alu_op_usubsat] = {"usubsat", UNITS_MOST},
- [midgard_alu_op_imul] = {"imul", UNITS_MUL | OP_COMMUTES},
- [midgard_alu_op_imov] = {"imov", UNITS_MOST | QUIRK_FLIPPED_R24},
-
- /* For vector comparisons, use ball etc */
- [midgard_alu_op_feq] = {"feq", UNITS_MOST | OP_TYPE_CONVERT | OP_COMMUTES},
- [midgard_alu_op_fne] = {"fne", UNITS_MOST | OP_TYPE_CONVERT | OP_COMMUTES},
- [midgard_alu_op_fle] = {"fle", UNITS_MOST | OP_TYPE_CONVERT},
- [midgard_alu_op_flt] = {"flt", UNITS_MOST | OP_TYPE_CONVERT},
- [midgard_alu_op_ieq] = {"ieq", UNITS_MOST | OP_COMMUTES},
- [midgard_alu_op_ine] = {"ine", UNITS_MOST | OP_COMMUTES},
- [midgard_alu_op_ilt] = {"ilt", UNITS_MOST},
- [midgard_alu_op_ile] = {"ile", UNITS_MOST},
- [midgard_alu_op_ult] = {"ult", UNITS_MOST},
- [midgard_alu_op_ule] = {"ule", UNITS_MOST},
-
- [midgard_alu_op_icsel] = {"icsel", UNITS_ADD},
- [midgard_alu_op_icsel_v] = {"icsel_v", UNITS_ADD}, /* Acts as bitselect() */
- [midgard_alu_op_fcsel_v] = {"fcsel_v", UNITS_ADD},
- [midgard_alu_op_fcsel] = {"fcsel", UNITS_ADD | UNIT_SMUL},
-
- [midgard_alu_op_frcp] = {"frcp", UNIT_VLUT},
- [midgard_alu_op_frsqrt] = {"frsqrt", UNIT_VLUT},
- [midgard_alu_op_fsqrt] = {"fsqrt", UNIT_VLUT},
- [midgard_alu_op_fpow_pt1] = {"fpow_pt1", UNIT_VLUT},
- [midgard_alu_op_fpown_pt1] = {"fpown_pt1", UNIT_VLUT},
- [midgard_alu_op_fpowr_pt1] = {"fpowr_pt1", UNIT_VLUT},
- [midgard_alu_op_fexp2] = {"fexp2", UNIT_VLUT},
- [midgard_alu_op_flog2] = {"flog2", UNIT_VLUT},
-
- [midgard_alu_op_f2i_rte] = {"f2i_rte", UNITS_ADD | OP_TYPE_CONVERT},
- [midgard_alu_op_f2i_rtz] = {"f2i_rtz", UNITS_ADD | OP_TYPE_CONVERT},
- [midgard_alu_op_f2i_rtn] = {"f2i_rtn", UNITS_ADD | OP_TYPE_CONVERT},
- [midgard_alu_op_f2i_rtp] = {"f2i_rtp", UNITS_ADD | OP_TYPE_CONVERT},
- [midgard_alu_op_f2u_rte] = {"f2u_rte", UNITS_ADD | OP_TYPE_CONVERT},
- [midgard_alu_op_f2u_rtz] = {"f2u_rtz", UNITS_ADD | OP_TYPE_CONVERT},
- [midgard_alu_op_f2u_rtn] = {"f2u_rtn", UNITS_ADD | OP_TYPE_CONVERT},
- [midgard_alu_op_f2u_rtp] = {"f2u_rtp", UNITS_ADD | OP_TYPE_CONVERT},
- [midgard_alu_op_i2f_rte] = {"i2f", UNITS_ADD | OP_TYPE_CONVERT},
- [midgard_alu_op_i2f_rtz] = {"i2f_rtz", UNITS_ADD | OP_TYPE_CONVERT},
- [midgard_alu_op_i2f_rtn] = {"i2f_rtn", UNITS_ADD | OP_TYPE_CONVERT},
- [midgard_alu_op_i2f_rtp] = {"i2f_rtp", UNITS_ADD | OP_TYPE_CONVERT},
- [midgard_alu_op_u2f_rte] = {"u2f", UNITS_ADD | OP_TYPE_CONVERT},
- [midgard_alu_op_u2f_rtz] = {"u2f_rtz", UNITS_ADD | OP_TYPE_CONVERT},
- [midgard_alu_op_u2f_rtn] = {"u2f_rtn", UNITS_ADD | OP_TYPE_CONVERT},
- [midgard_alu_op_u2f_rtp] = {"u2f_rtp", UNITS_ADD | OP_TYPE_CONVERT},
-
- [midgard_alu_op_fsin] = {"fsin", UNIT_VLUT},
- [midgard_alu_op_fcos] = {"fcos", UNIT_VLUT},
-
- /* XXX: Test case where it's right on smul but not sadd */
- [midgard_alu_op_iand] = {"iand", UNITS_MOST | OP_COMMUTES},
- [midgard_alu_op_iandnot] = {"iandnot", UNITS_MOST},
-
- [midgard_alu_op_ior] = {"ior", UNITS_MOST | OP_COMMUTES},
- [midgard_alu_op_iornot] = {"iornot", UNITS_MOST | OP_COMMUTES},
- [midgard_alu_op_inor] = {"inor", UNITS_MOST | OP_COMMUTES},
- [midgard_alu_op_ixor] = {"ixor", UNITS_MOST | OP_COMMUTES},
- [midgard_alu_op_inxor] = {"inxor", UNITS_MOST | OP_COMMUTES},
- [midgard_alu_op_iclz] = {"iclz", UNITS_ADD},
- [midgard_alu_op_ibitcount8] = {"ibitcount8", UNITS_ADD},
- [midgard_alu_op_inand] = {"inand", UNITS_MOST},
- [midgard_alu_op_ishl] = {"ishl", UNITS_ADD},
- [midgard_alu_op_iasr] = {"iasr", UNITS_ADD},
- [midgard_alu_op_ilsr] = {"ilsr", UNITS_ADD},
-
- [midgard_alu_op_fball_eq] = {"fball_eq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES},
- [midgard_alu_op_fbany_neq] = {"fbany_neq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES},
- [midgard_alu_op_iball_eq] = {"iball_eq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES},
- [midgard_alu_op_iball_neq] = {"iball_neq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES},
- [midgard_alu_op_ibany_eq] = {"ibany_eq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES},
- [midgard_alu_op_ibany_neq] = {"ibany_neq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES},
-
- /* These instructions are not yet emitted by the compiler, so
- * don't speculate about units yet */
- [midgard_alu_op_ishladd] = {"ishladd", 0},
-
- [midgard_alu_op_uball_lt] = {"uball_lt", 0},
- [midgard_alu_op_uball_lte] = {"uball_lte", 0},
- [midgard_alu_op_iball_lt] = {"iball_lt", 0},
- [midgard_alu_op_iball_lte] = {"iball_lte", 0},
- [midgard_alu_op_ubany_lt] = {"ubany_lt", 0},
- [midgard_alu_op_ubany_lte] = {"ubany_lte", 0},
- [midgard_alu_op_ibany_lt] = {"ibany_lt", 0},
- [midgard_alu_op_ibany_lte] = {"ibany_lte", 0},
-
- [midgard_alu_op_freduce] = {"freduce", 0},
- [midgard_alu_op_bball_eq] = {"bball_eq", 0 | OP_CHANNEL_COUNT(4) | OP_COMMUTES},
- [midgard_alu_op_bbany_neq] = {"bbany_neq", 0 | OP_CHANNEL_COUNT(4) | OP_COMMUTES},
- [midgard_alu_op_fatan2_pt1] = {"fatan2_pt1", 0},
- [midgard_alu_op_fatan_pt2] = {"fatan_pt2", 0},
-};
-
-const char *load_store_opcode_names[256] = {
- [midgard_op_st_cubemap_coords] = "st_cubemap_coords",
- [midgard_op_ld_global_id] = "ld_global_id",
- [midgard_op_ldst_perspective_division_z] = "ldst_perspective_division_z",
- [midgard_op_ldst_perspective_division_w] = "ldst_perspective_division_w",
-
- [midgard_op_atomic_add] = "atomic_add",
- [midgard_op_atomic_and] = "atomic_and",
- [midgard_op_atomic_or] = "atomic_or",
- [midgard_op_atomic_xor] = "atomic_xor",
- [midgard_op_atomic_imin] = "atomic_imin",
- [midgard_op_atomic_umin] = "atomic_umin",
- [midgard_op_atomic_imax] = "atomic_imax",
- [midgard_op_atomic_umax] = "atomic_umax",
- [midgard_op_atomic_xchg] = "atomic_xchg",
-
- [midgard_op_ld_char] = "ld_char",
- [midgard_op_ld_char2] = "ld_char2",
- [midgard_op_ld_short] = "ld_short",
- [midgard_op_ld_char4] = "ld_char4",
- [midgard_op_ld_short4] = "ld_short4",
- [midgard_op_ld_int4] = "ld_int4",
-
- [midgard_op_ld_attr_32] = "ld_attr_32",
- [midgard_op_ld_attr_16] = "ld_attr_16",
- [midgard_op_ld_attr_32i] = "ld_attr_32i",
- [midgard_op_ld_attr_32u] = "ld_attr_32u",
-
- [midgard_op_ld_vary_32] = "ld_vary_32",
- [midgard_op_ld_vary_16] = "ld_vary_16",
- [midgard_op_ld_vary_32i] = "ld_vary_32i",
- [midgard_op_ld_vary_32u] = "ld_vary_32u",
-
- [midgard_op_ld_color_buffer_16] = "ld_color_buffer_16",
-
- [midgard_op_ld_uniform_16] = "ld_uniform_16",
- [midgard_op_ld_uniform_32] = "ld_uniform_32",
- [midgard_op_ld_uniform_32i] = "ld_uniform_32i",
- [midgard_op_ld_color_buffer_8] = "ld_color_buffer_8",
-
- [midgard_op_st_char] = "st_char",
- [midgard_op_st_char2] = "st_char2",
- [midgard_op_st_char4] = "st_char4",
- [midgard_op_st_short4] = "st_short4",
- [midgard_op_st_int4] = "st_int4",
-
- [midgard_op_st_vary_32] = "st_vary_32",
- [midgard_op_st_vary_16] = "st_vary_16",
- [midgard_op_st_vary_32i] = "st_vary_32i",
- [midgard_op_st_vary_32u] = "st_vary_32u",
-
- [midgard_op_st_image_f] = "st_image_f",
- [midgard_op_st_image_ui] = "st_image_ui",
- [midgard_op_st_image_i] = "st_image_i",
-};
+++ /dev/null
-/* Copyright (c) 2018-2019 Alyssa Rosenzweig (alyssa@rosenzweig.io)
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#include "helpers.h"
-
-/* Forward declare */
-
-extern struct mir_op_props alu_opcode_props[256];
-extern const char *load_store_opcode_names[256];
-
-/* Is this opcode that of an integer (regardless of signedness)? Instruction
- * names authoritatively determine types */
-
-static inline bool
-midgard_is_integer_op(int op)
-{
- const char *name = alu_opcode_props[op].name;
-
- if (!name)
- return false;
-
- return (name[0] == 'i') || (name[0] == 'u');
-}
-
-/* Does this opcode *write* an integer? Same as is_integer_op, unless it's a
- * conversion between int<->float in which case we do the opposite */
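-/* (f2i, for instance, is float-named but converts, so it writes an integer;
- * i2f is integer-named but converts, so it writes a float) */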
-
-static inline bool
-midgard_is_integer_out_op(int op)
-{
- bool is_int = midgard_is_integer_op(op);
- bool is_conversion = alu_opcode_props[op].props & OP_TYPE_CONVERT;
-
- return is_int ^ is_conversion;
-}
-
-/* Determines effective writemask, taking quirks and expansion into account */
-
-static inline unsigned
-effective_writemask(midgard_vector_alu *alu, unsigned existing_mask)
-{
- /* The channel count is stored off-by-one so it fits in two bits (a
- * count of zero makes no sense) */
-
- unsigned channel_count = GET_CHANNEL_COUNT(alu_opcode_props[alu->op].props);
-
- /* If there is a fixed channel count, construct the appropriate mask */
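- /* (fdot3, for instance, carries OP_CHANNEL_COUNT(3), so it should end
- * up with an effective mask of 0x7 -- xyz -- whatever was written) */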
-
- if (channel_count)
- return (1 << channel_count) - 1;
-
- return existing_mask;
-};
-
-
+++ /dev/null
-/*
- * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "compiler.h"
-#include "helpers.h"
-#include "midgard_ops.h"
-
-/* Pretty printer for Midgard IR, for use debugging compiler-internal
- * passes like register allocation. The output superficially resembles
- * Midgard assembly, with the exception that unit information and such is
- * (normally) omitted, and generic indices are usually used instead of
- * registers */
-
-static void
-mir_print_source(int source)
-{
- if (source >= SSA_FIXED_MINIMUM) {
- /* Specific register */
- int reg = SSA_REG_FROM_FIXED(source);
-
- /* TODO: Moving threshold */
- if (reg > 16 && reg < 24)
- printf("u%d", 23 - reg);
- else
- printf("r%d", reg);
- } else {
- printf("%d", source);
- }
-}
-
-void
-mir_print_instruction(midgard_instruction *ins)
-{
- printf("\t");
-
- switch (ins->type) {
- case TAG_ALU_4: {
- midgard_alu_op op = ins->alu.op;
- const char *name = alu_opcode_props[op].name;
-
- if (ins->unit)
- printf("%d.", ins->unit);
-
- printf("%s", name ? name : "??");
- break;
- }
-
- case TAG_LOAD_STORE_4: {
- midgard_load_store_op op = ins->load_store.op;
- const char *name = load_store_opcode_names[op];
-
- assert(name);
- printf("%s", name);
- break;
- }
-
- case TAG_TEXTURE_4: {
- printf("texture");
- break;
- }
-
- default:
- assert(0);
- }
-
- ssa_args *args = &ins->ssa_args;
-
- printf(" %d, ", args->dest);
-
- mir_print_source(args->src0);
- printf(", ");
-
- if (args->inline_constant)
- printf("#%d", ins->inline_constant);
- else
- mir_print_source(args->src1);
-
- if (ins->has_constants)
- printf(" <%f, %f, %f, %f>", ins->constants[0], ins->constants[1], ins->constants[2], ins->constants[3]);
-
- printf("\n");
-}
-
-/* Dumps MIR for a block or the entire shader, respectively */
-
-void
-mir_print_block(midgard_block *block)
-{
- printf("{\n");
-
- mir_foreach_instr_in_block(block, ins) {
- mir_print_instruction(ins);
- }
-
- printf("}\n");
-}
-
-void
-mir_print_shader(compiler_context *ctx)
-{
- mir_foreach_block(ctx, block) {
- mir_print_block(block);
- }
-}
-
-void
-mir_print_bundle(midgard_bundle *bundle)
-{
- printf("[\n");
-
- for (unsigned i = 0; i < bundle->instruction_count; ++i) {
- midgard_instruction *ins = bundle->instructions[i];
- mir_print_instruction(ins);
- }
-
- printf("]\n");
-}
+++ /dev/null
-/*
- * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
- * Copyright (C) 2019 Collabora, Ltd.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "compiler.h"
-#include "midgard_ops.h"
-#include "util/register_allocate.h"
-#include "util/u_math.h"
-
-/* For work registers, we can subdivide in various ways. So we create
- * classes for the various sizes and conflict accordingly, keeping in
- * mind that physical registers are divided along 128-bit boundaries.
- * The important part is that 128-bit boundaries are not crossed.
- *
- * For each 128-bit register, we can subdivide to 32-bits 10 ways
- *
- * vec4: xyzw
- * vec3: xyz, yzw
- * vec2: xy, yz, zw,
- * vec1: x, y, z, w
- *
- * For each 64-bit register, we can subdivide similarly to 16-bit
- * (TODO: half-float RA, not that we support fp16 yet)
- */
-
-#define WORK_STRIDE 10
-
-/* Prepacked masks/swizzles for virtual register types */
-static unsigned reg_type_to_mask[WORK_STRIDE] = {
- 0xF, /* xyzw */
- 0x7, 0x7 << 1, /* xyz */
- 0x3, 0x3 << 1, 0x3 << 2, /* xy */
- 0x1, 0x1 << 1, 0x1 << 2, 0x1 << 3 /* x */
-};
-
-static unsigned reg_type_to_swizzle[WORK_STRIDE] = {
- SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
-
- SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
- SWIZZLE(COMPONENT_Y, COMPONENT_Z, COMPONENT_W, COMPONENT_W),
-
- SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
- SWIZZLE(COMPONENT_Y, COMPONENT_Z, COMPONENT_Z, COMPONENT_W),
- SWIZZLE(COMPONENT_Z, COMPONENT_W, COMPONENT_Z, COMPONENT_W),
-
- SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
- SWIZZLE(COMPONENT_Y, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
- SWIZZLE(COMPONENT_Z, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
- SWIZZLE(COMPONENT_W, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
-};
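-
-/* The two tables line up: entry i pairs reg_type_to_mask[i] with
- * reg_type_to_swizzle[i]. The "yz" vec2 view, for instance, uses mask
- * 0x3 << 1 together with a swizzle whose first two slots read y and z. */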
-
-struct phys_reg {
- unsigned reg;
- unsigned mask;
- unsigned swizzle;
-};
-
-/* Given the mask/swizzle of both the register and the original source,
- * compose to find the actual mask/swizzle to give the hardware */
-
-static unsigned
-compose_writemask(unsigned mask, struct phys_reg reg)
-{
- /* Note: the reg mask is guaranteed to be contiguous. So we shift
- * into the X place, compose via a simple AND, and shift back */
-
- unsigned shift = __builtin_ctz(reg.mask);
- return ((reg.mask >> shift) & mask) << shift;
-}
-
-static unsigned
-compose_swizzle(unsigned swizzle, unsigned mask,
- struct phys_reg reg, struct phys_reg dst)
-{
- unsigned out = pan_compose_swizzle(swizzle, reg.swizzle);
-
- /* Based on the register mask, we need to shift over. E.g. if we're
- * writing to yz, a base swizzle of xy__ becomes _xy_, saving the
- * original first component (x). But to prevent duplicate shifting
- * (this only applies to ALU -- the mask param is set to xyzw on L/S to
- * prevent changes), we have to account for the shift inherent to the
- * original writemask */
-
- unsigned rep = out & 0x3;
- unsigned shift = __builtin_ctz(dst.mask) - __builtin_ctz(mask);
- unsigned shifted = out << (2*shift);
-
- /* ..but we fill in the gaps so it appears to replicate */
-
- for (unsigned s = 0; s < shift; ++s)
- shifted |= rep << (2*s);
-
- return shifted;
-}
-
-/* When we're 'squeezing down' the values in the IR, we maintain a hash
- * as such */
-
-static unsigned
-find_or_allocate_temp(compiler_context *ctx, unsigned hash)
-{
- if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM))
- return hash;
-
- unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
- ctx->hash_to_temp, hash + 1);
-
- if (temp)
- return temp - 1;
-
- /* If no temp is found, allocate one */
- temp = ctx->temp_count++;
- ctx->max_hash = MAX2(ctx->max_hash, hash);
-
- _mesa_hash_table_u64_insert(ctx->hash_to_temp,
- hash + 1, (void *) ((uintptr_t) temp + 1));
-
- return temp;
-}
-
-/* Callback for register allocation selection, trivial default for now */
-
-static unsigned int
-midgard_ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data)
-{
- /* Choose the first available register to minimise register pressure */
-
- for (int i = 0; i < (16 * WORK_STRIDE); ++i) {
- if (BITSET_TEST(regs, i)) {
- return i;
- }
- }
-
- assert(0);
- return 0;
-}
-
-/* Helper to return the default phys_reg for a given register */
-
-static struct phys_reg
-default_phys_reg(int reg)
-{
- struct phys_reg r = {
- .reg = reg,
- .mask = 0xF, /* xyzw */
- .swizzle = 0xE4 /* xyzw */
- };
-
- return r;
-}
-
-/* Determine which physical register, swizzle, and mask a virtual
- * register corresponds to */
-
-static struct phys_reg
-index_to_reg(compiler_context *ctx, struct ra_graph *g, int reg)
-{
- /* Check for special cases */
- if (reg >= SSA_FIXED_MINIMUM)
- return default_phys_reg(SSA_REG_FROM_FIXED(reg));
- else if ((reg < 0) || !g)
- return default_phys_reg(REGISTER_UNUSED);
-
- /* Special cases aside, we pick the underlying register */
- int virt = ra_get_node_reg(g, reg);
-
- /* Divide out the register and classification */
- int phys = virt / WORK_STRIDE;
- int type = virt % WORK_STRIDE;
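- /* (so, for illustration, a node value of 23 decodes to work register
- * r2 with the "xy" vec2 view, index 3 in the tables above) */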
-
- struct phys_reg r = {
- .reg = phys,
- .mask = reg_type_to_mask[type],
- .swizzle = reg_type_to_swizzle[type]
- };
-
- /* Report that we actually use this register, and return it */
- ctx->work_registers = MAX2(ctx->work_registers, phys);
- return r;
-}
-
-/* This routine performs the actual register allocation. It should be succeeded
- * by install_registers */
-
-struct ra_graph *
-allocate_registers(compiler_context *ctx)
-{
- /* The number of vec4 work registers available depends on when the
- * uniforms start, so compute that first */
-
- int work_count = 16 - MAX2((ctx->uniform_cutoff - 8), 0);
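- /* (i.e. up to 16 vec4 work registers, losing one for every uniform
- * register past the first eight, since uniforms are mapped onto the
- * same register file) */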
-
- int virtual_count = work_count * WORK_STRIDE;
-
- /* First, initialize the RA */
- struct ra_regs *regs = ra_alloc_reg_set(NULL, virtual_count, true);
-
- int work_vec4 = ra_alloc_reg_class(regs);
- int work_vec3 = ra_alloc_reg_class(regs);
- int work_vec2 = ra_alloc_reg_class(regs);
- int work_vec1 = ra_alloc_reg_class(regs);
-
- unsigned classes[4] = {
- work_vec1,
- work_vec2,
- work_vec3,
- work_vec4
- };
-
- /* Add the full set of work registers */
- for (unsigned i = 0; i < work_count; ++i) {
- int base = WORK_STRIDE * i;
-
- /* Build a full set of subdivisions */
- ra_class_add_reg(regs, work_vec4, base);
- ra_class_add_reg(regs, work_vec3, base + 1);
- ra_class_add_reg(regs, work_vec3, base + 2);
- ra_class_add_reg(regs, work_vec2, base + 3);
- ra_class_add_reg(regs, work_vec2, base + 4);
- ra_class_add_reg(regs, work_vec2, base + 5);
- ra_class_add_reg(regs, work_vec1, base + 6);
- ra_class_add_reg(regs, work_vec1, base + 7);
- ra_class_add_reg(regs, work_vec1, base + 8);
- ra_class_add_reg(regs, work_vec1, base + 9);
-
- for (unsigned a = 0; a < 10; ++a) {
- unsigned mask1 = reg_type_to_mask[a];
-
- for (unsigned b = 0; b < 10; ++b) {
- unsigned mask2 = reg_type_to_mask[b];
-
- if (mask1 & mask2)
- ra_add_reg_conflict(regs,
- base + a, base + b);
- }
- }
- }
-
- /* We're done setting up */
- ra_set_finalize(regs, NULL);
-
- /* Transform the MIR into squeezed index form */
- mir_foreach_block(ctx, block) {
- mir_foreach_instr_in_block(block, ins) {
- if (ins->compact_branch) continue;
-
- ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest);
- ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0);
-
- if (!ins->ssa_args.inline_constant)
- ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1);
-
- }
- }
-
- /* No register allocation to do with no SSA */
-
- if (!ctx->temp_count)
- return NULL;
-
- /* Let's actually do register allocation */
- int nodes = ctx->temp_count;
- struct ra_graph *g = ra_alloc_interference_graph(regs, nodes);
-
- /* Determine minimum size needed to hold values, to indirectly
- * determine class */
-
- unsigned *found_class = calloc(sizeof(unsigned), ctx->temp_count);
-
- mir_foreach_block(ctx, block) {
- mir_foreach_instr_in_block(block, ins) {
- if (ins->compact_branch) continue;
- if (ins->ssa_args.dest < 0) continue;
- if (ins->ssa_args.dest >= SSA_FIXED_MINIMUM) continue;
-
- int class = util_logbase2(ins->mask) + 1;
-
- /* Use the largest class if there's ambiguity, this
- * handles partial writes */
-
- int dest = ins->ssa_args.dest;
- found_class[dest] = MAX2(found_class[dest], class);
- }
- }
-
- for (unsigned i = 0; i < ctx->temp_count; ++i) {
- unsigned class = found_class[i];
- if (!class) continue;
- ra_set_node_class(g, i, classes[class - 1]);
- }
-
- /* Determine liveness */
-
- int *live_start = malloc(nodes * sizeof(int));
- int *live_end = malloc(nodes * sizeof(int));
-
- /* Initialize as non-existent */
-
- for (int i = 0; i < nodes; ++i) {
- live_start[i] = live_end[i] = -1;
- }
-
- int d = 0;
-
- mir_foreach_block(ctx, block) {
- mir_foreach_instr_in_block(block, ins) {
- if (ins->compact_branch) continue;
-
- /* Dest is < 0 for st_vary instructions, which break
- * the usual SSA conventions. Liveness analysis doesn't
- * make sense on these instructions, so skip them to
- * avoid memory corruption */
-
- if (ins->ssa_args.dest < 0) continue;
-
- if (ins->ssa_args.dest < SSA_FIXED_MINIMUM) {
- /* If this destination is not yet live, it is
- * now since we just wrote it */
-
- int dest = ins->ssa_args.dest;
-
- if (live_start[dest] == -1)
- live_start[dest] = d;
- }
-
- /* Since we just used a source, the source might be
- * dead now. Scan the rest of the block for
- * invocations, and if there are none, the source dies
- * */
-
- int sources[2] = {
- ins->ssa_args.src0, ins->ssa_args.src1
- };
-
- for (int src = 0; src < 2; ++src) {
- int s = sources[src];
-
- if (s < 0) continue;
-
- if (s >= SSA_FIXED_MINIMUM) continue;
-
- if (!mir_is_live_after(ctx, block, ins, s)) {
- live_end[s] = d;
- }
- }
-
- ++d;
- }
- }
-
- /* If a node still hasn't been killed, kill it now */
-
- for (int i = 0; i < nodes; ++i) {
- /* live_start == -1 most likely indicates a pinned output */
-
- if (live_end[i] == -1)
- live_end[i] = d;
- }
-
- /* Setup interference between nodes that are live at the same time */
-
- for (int i = 0; i < nodes; ++i) {
- for (int j = i + 1; j < nodes; ++j) {
- bool j_overlaps_i = live_start[j] < live_end[i];
- bool i_overlaps_j = live_end[j] < live_start[i];
-
- if (i_overlaps_j || j_overlaps_i)
- ra_add_node_interference(g, i, j);
- }
- }
-
- ra_set_select_reg_callback(g, midgard_ra_select_callback, NULL);
-
- if (!ra_allocate(g)) {
- unreachable("Error allocating registers\n");
- }
-
- /* Cleanup */
- free(live_start);
- free(live_end);
-
- return g;
-}
-
-/* Once registers have been decided via register allocation
- * (allocate_registers), we need to rewrite the MIR to use registers instead of
- * indices */
-
-static void
-install_registers_instr(
- compiler_context *ctx,
- struct ra_graph *g,
- midgard_instruction *ins)
-{
- ssa_args args = ins->ssa_args;
-
- switch (ins->type) {
- case TAG_ALU_4: {
- int adjusted_src = args.inline_constant ? -1 : args.src1;
- struct phys_reg src1 = index_to_reg(ctx, g, args.src0);
- struct phys_reg src2 = index_to_reg(ctx, g, adjusted_src);
- struct phys_reg dest = index_to_reg(ctx, g, args.dest);
-
- unsigned uncomposed_mask = ins->mask;
- ins->mask = compose_writemask(uncomposed_mask, dest);
-
- /* Adjust the dest mask if necessary. Mostly this is a no-op
- * but it matters for dot products */
- dest.mask = effective_writemask(&ins->alu, ins->mask);
-
- midgard_vector_alu_src mod1 =
- vector_alu_from_unsigned(ins->alu.src1);
- mod1.swizzle = compose_swizzle(mod1.swizzle, uncomposed_mask, src1, dest);
- ins->alu.src1 = vector_alu_srco_unsigned(mod1);
-
- ins->registers.src1_reg = src1.reg;
-
- ins->registers.src2_imm = args.inline_constant;
-
- if (args.inline_constant) {
- /* Encode inline 16-bit constant. See disassembler for
- * where the algorithm is from */
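- /* (roughly: bits [11:15] of the constant ride in the src2 register
- * field, while bits [8:10] land at the bottom of the immediate and
- * bits [0:7] sit just above them, the whole field shifted up by two) */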
-
- ins->registers.src2_reg = ins->inline_constant >> 11;
-
- int lower_11 = ins->inline_constant & ((1 << 12) - 1);
- uint16_t imm = ((lower_11 >> 8) & 0x7) |
- ((lower_11 & 0xFF) << 3);
-
- ins->alu.src2 = imm << 2;
- } else {
- midgard_vector_alu_src mod2 =
- vector_alu_from_unsigned(ins->alu.src2);
- mod2.swizzle = compose_swizzle(
- mod2.swizzle, uncomposed_mask, src2, dest);
- ins->alu.src2 = vector_alu_srco_unsigned(mod2);
-
- ins->registers.src2_reg = src2.reg;
- }
-
- ins->registers.out_reg = dest.reg;
- break;
- }
-
- case TAG_LOAD_STORE_4: {
- if (OP_IS_STORE_VARY(ins->load_store.op)) {
- /* TODO: use ssa_args for st_vary */
- ins->load_store.reg = 0;
- } else {
- /* Which physical register we read off depends on
- * whether we are loading or storing -- think about the
- * logical dataflow */
-
- unsigned r = OP_IS_STORE(ins->load_store.op) ?
- args.src0 : args.dest;
- struct phys_reg src = index_to_reg(ctx, g, r);
-
- ins->load_store.reg = src.reg;
-
- ins->load_store.swizzle = compose_swizzle(
- ins->load_store.swizzle, 0xF,
- default_phys_reg(0), src);
-
- ins->mask = compose_writemask(
- ins->mask, src);
- }
-
- break;
- }
-
- default:
- break;
- }
-}
-
-void
-install_registers(compiler_context *ctx, struct ra_graph *g)
-{
- mir_foreach_block(ctx, block) {
- mir_foreach_instr_in_block(block, ins) {
- if (ins->compact_branch) continue;
- install_registers_instr(ctx, g, ins);
- }
- }
-
-}
+++ /dev/null
-/*
- * Copyright (C) 2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
- * Copyright (C) 2019 Collabora, Ltd.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "compiler.h"
-
-/* Creates pipeline registers. This is a prepass run before the main register
- * allocator but after scheduling, once bundles are created. It works by
- * iterating the scheduled IR, checking if a value is ever used after the end
- * of the current bundle. If it is not, it is promoted to a bundle-specific
- * pipeline register.
- *
- * Pipeline registers are only written from the first two stages of the
- * pipeline (vmul/sadd) lasting the duration of the bundle only. There are two
- * 128-bit pipeline registers available (r24/r25). The upshot is that no actual
- * register allocation is needed; we can _always_ promote a value to a pipeline
- * register, liveness permitting. This greatly simplifies the logic of this
- * pass, removing the need for the full-blown RA that work registers require.
- */
-
-static bool
-mir_pipeline_ins(
- compiler_context *ctx,
- midgard_block *block,
- midgard_bundle *bundle, unsigned i,
- unsigned pipeline_count)
-{
- midgard_instruction *ins = bundle->instructions[i];
- unsigned dest = ins->ssa_args.dest;
-
- /* Check to make sure we're legal */
-
- if (ins->compact_branch)
- return false;
-
- /* Don't allow non-SSA. Pipelining registers is theoretically possible,
- * but the analysis is much hairier, so don't bother quite yet */
- if ((dest < 0) || (dest >= ctx->func->impl->ssa_alloc))
- return false;
-
- /* Make sure they're not lying to us. Blend shaders lie. TODO: Fix your
- * bad code Alyssa */
-
- if (mir_has_multiple_writes(ctx, dest))
- return false;
-
- /* We want to know if we live after this bundle, so check if
- * we're live after the last instruction of the bundle */
-
- midgard_instruction *end = bundle->instructions[
- bundle->instruction_count - 1];
-
- if (mir_is_live_after(ctx, block, end, ins->ssa_args.dest))
- return false;
-
- /* We're only live in this bundle -- pipeline! */
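- /* (the first promoted value of a bundle lands in r24 and the second,
- * if any, in r25) */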
-
- mir_rewrite_index(ctx, dest, SSA_FIXED_REGISTER(24 + pipeline_count));
-
- return true;
-}
-
-void
-mir_create_pipeline_registers(compiler_context *ctx)
-{
- mir_foreach_block(ctx, block) {
- mir_foreach_bundle_in_block(block, bundle) {
- if (!mir_is_alu_bundle(bundle)) continue;
- if (bundle->instruction_count < 2) continue;
-
- /* Only first 2 instructions could pipeline */
- bool succ = mir_pipeline_ins(ctx, block, bundle, 0, 0);
- mir_pipeline_ins(ctx, block, bundle, 1, succ);
- }
- }
-}
+++ /dev/null
-/*
- * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "compiler.h"
-#include "midgard_ops.h"
-#include "util/u_memory.h"
-
-/* Create a mask of accessed components from a swizzle to figure out vector
- * dependencies */
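-/* (e.g. the identity xyzw swizzle, 0xE4, touches every component and yields
- * mask 0xF, while a splatted .xxxx swizzle of 0x00 yields just 0x1) */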
-
-static unsigned
-swizzle_to_access_mask(unsigned swizzle)
-{
- unsigned component_mask = 0;
-
- for (int i = 0; i < 4; ++i) {
- unsigned c = (swizzle >> (2 * i)) & 3;
- component_mask |= (1 << c);
- }
-
- return component_mask;
-}
-
-/* Does the mask cover exactly one component (i.e. a scalar)? */
-
-static bool
-is_single_component_mask(unsigned mask)
-{
- int components = 0;
-
- for (int c = 0; c < 8; ++c) {
- if (mask & (1 << c))
- components++;
- }
-
- return components == 1;
-}
-
-/* Checks for an SSA data hazard between two adjacent instructions, keeping in
- * mind that we are a vector architecture and we can write to different
- * components simultaneously */
-
-static bool
-can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second)
-{
- /* Each instruction reads some registers and writes to a register. See
- * where the first writes */
-
- /* Figure out where exactly we wrote to */
- int source = first->ssa_args.dest;
- int source_mask = first->mask;
-
- /* As long as the second doesn't read from the first, we're okay */
- if (second->ssa_args.src0 == source) {
- if (first->type == TAG_ALU_4) {
- /* Figure out which components we just read from */
-
- int q = second->alu.src1;
- midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;
-
- /* Check if there are components in common, and fail if so */
- if (swizzle_to_access_mask(m->swizzle) & source_mask)
- return false;
- } else
- return false;
-
- }
-
- if (second->ssa_args.src1 == source)
- return false;
-
- /* Otherwise, it's safe in that regard. Another data hazard is both
- * writing to the same place, of course */
-
- if (second->ssa_args.dest == source) {
- /* ...but only if the components overlap */
-
- if (second->mask & source_mask)
- return false;
- }
-
- /* ...That's it */
- return true;
-}
-
-static bool
-midgard_has_hazard(
- midgard_instruction **segment, unsigned segment_size,
- midgard_instruction *ains)
-{
- for (int s = 0; s < segment_size; ++s)
- if (!can_run_concurrent_ssa(segment[s], ains))
- return true;
-
- return false;
-
-
-}
-
-/* Schedules, but does not emit, a single basic block. After scheduling, the
- * final tag and size of the block are known, which are necessary for branching
- * */
-
-static midgard_bundle
-schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip)
-{
- int instructions_emitted = 0, packed_idx = 0;
- midgard_bundle bundle = { 0 };
-
- uint8_t tag = ins->type;
-
- /* Default to the instruction's tag */
- bundle.tag = tag;
-
- switch (ins->type) {
- case TAG_ALU_4: {
- uint32_t control = 0;
- size_t bytes_emitted = sizeof(control);
-
- /* TODO: Constant combining */
- int index = 0, last_unit = 0;
-
- /* Previous instructions, for the purpose of parallelism */
- midgard_instruction *segment[4] = {0};
- int segment_size = 0;
-
- instructions_emitted = -1;
- midgard_instruction *pins = ins;
-
- unsigned constant_count = 0;
-
- for (;;) {
- midgard_instruction *ains = pins;
-
- /* Advance instruction pointer */
- if (index) {
- ains = mir_next_op(pins);
- pins = ains;
- }
-
- /* Out-of-work condition */
- if ((struct list_head *) ains == &block->instructions)
- break;
-
- /* Ensure that the chain can continue */
- if (ains->type != TAG_ALU_4) break;
-
- /* If there's already something in the bundle and we
- * have weird scheduler constraints, break now */
- if (ains->precede_break && index) break;
-
- /* According to the presentation "The ARM
- * Mali-T880 Mobile GPU" from HotChips 27,
- * there are two pipeline stages. Branching
- * position determined experimentally. Lines
- * are executed in parallel:
- *
- * [ VMUL ] [ SADD ]
- * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ]
- *
- * Verify that there are no ordering dependencies here.
- *
- * TODO: Allow for parallelism!!!
- */
-
- /* Pick a unit for it if it doesn't force a particular unit */
-
- int unit = ains->unit;
-
- if (!unit) {
- int op = ains->alu.op;
- int units = alu_opcode_props[op].props;
-
- bool scalarable = units & UNITS_SCALAR;
- bool could_scalar = is_single_component_mask(ains->mask);
-
- /* Only 16/32-bit can run on a scalar unit */
- could_scalar &= ains->alu.reg_mode != midgard_reg_mode_8;
- could_scalar &= ains->alu.reg_mode != midgard_reg_mode_64;
- could_scalar &= ains->alu.dest_override == midgard_dest_override_none;
-
- if (ains->alu.reg_mode == midgard_reg_mode_16) {
- /* If we're running in 16-bit mode, we
- * can't have any 8-bit sources on the
- * scalar unit (since the scalar unit
- * doesn't understand 8-bit) */
-
- midgard_vector_alu_src s1 =
- vector_alu_from_unsigned(ains->alu.src1);
-
- could_scalar &= !s1.half;
-
- if (!ains->ssa_args.inline_constant) {
- midgard_vector_alu_src s2 =
- vector_alu_from_unsigned(ains->alu.src2);
-
- could_scalar &= !s2.half;
- }
-
- }
-
- bool scalar = could_scalar && scalarable;
-
- /* TODO: Check ahead-of-time for other scalar
- * hazards that otherwise get aborted out */
-
- if (scalar)
- assert(units & UNITS_SCALAR);
-
- if (!scalar) {
- if (last_unit >= UNIT_VADD) {
- if (units & UNIT_VLUT)
- unit = UNIT_VLUT;
- else
- break;
- } else {
- if ((units & UNIT_VMUL) && last_unit < UNIT_VMUL)
- unit = UNIT_VMUL;
- else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
- unit = UNIT_VADD;
- else if (units & UNIT_VLUT)
- unit = UNIT_VLUT;
- else
- break;
- }
- } else {
- if (last_unit >= UNIT_VADD) {
- if ((units & UNIT_SMUL) && !(control & UNIT_SMUL))
- unit = UNIT_SMUL;
- else if (units & UNIT_VLUT)
- unit = UNIT_VLUT;
- else
- break;
- } else {
- if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains))
- unit = UNIT_SADD;
- else if (units & UNIT_SMUL)
- unit = ((units & UNIT_VMUL) && !(control & UNIT_VMUL)) ? UNIT_VMUL : UNIT_SMUL;
- else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
- unit = UNIT_VADD;
- else
- break;
- }
- }
-
- assert(unit & units);
- }
-
- /* Late unit check, this time for encoding (not parallelism) */
- if (unit <= last_unit) break;
-
- /* Clear the segment */
- if (last_unit < UNIT_VADD && unit >= UNIT_VADD)
- segment_size = 0;
-
- if (midgard_has_hazard(segment, segment_size, ains))
- break;
-
- /* We're good to go -- emit the instruction */
- ains->unit = unit;
-
- segment[segment_size++] = ains;
-
- /* We try to reuse constants if possible, by adjusting
- * the swizzle */
-
- if (ains->has_blend_constant) {
- /* Everything conflicts with the blend constant */
- if (bundle.has_embedded_constants)
- break;
-
- bundle.has_blend_constant = 1;
- bundle.has_embedded_constants = 1;
- } else if (ains->has_constants && ains->alu.reg_mode == midgard_reg_mode_16) {
- /* TODO: DRY with the analysis pass */
-
- if (bundle.has_blend_constant)
- break;
-
- if (constant_count)
- break;
-
- /* TODO: Fix packing XXX */
- uint16_t *bundles = (uint16_t *) bundle.constants;
- uint32_t *constants = (uint32_t *) ains->constants;
-
- /* Copy them wholesale */
- for (unsigned i = 0; i < 4; ++i)
- bundles[i] = constants[i];
-
- bundle.has_embedded_constants = true;
- constant_count = 4;
- } else if (ains->has_constants) {
- /* By definition, blend constants conflict with
- * everything, so if there are already
- * constants we break the bundle *now* */
-
- if (bundle.has_blend_constant)
- break;
-
- /* For anything but blend constants, we can do
- * proper analysis, however */
-
- /* TODO: Mask by which are used */
- uint32_t *constants = (uint32_t *) ains->constants;
- uint32_t *bundles = (uint32_t *) bundle.constants;
-
- uint32_t indices[4] = { 0 };
- bool break_bundle = false;
-
- for (unsigned i = 0; i < 4; ++i) {
- uint32_t cons = constants[i];
- bool constant_found = false;
-
- /* Search for the constant */
- for (unsigned j = 0; j < constant_count; ++j) {
- if (bundles[j] != cons)
- continue;
-
- /* We found it, reuse */
- indices[i] = j;
- constant_found = true;
- break;
- }
-
- if (constant_found)
- continue;
-
- /* We didn't find it, so allocate it */
- unsigned idx = constant_count++;
-
- if (idx >= 4) {
- /* Uh-oh, out of space */
- break_bundle = true;
- break;
- }
-
- /* We have space, copy it in! */
- bundles[idx] = cons;
- indices[i] = idx;
- }
-
- if (break_bundle)
- break;
-
- /* Cool, we have it in. So use indices as a
- * swizzle */
-
- unsigned swizzle = SWIZZLE_FROM_ARRAY(indices);
- unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
-
- if (ains->ssa_args.src0 == r_constant)
- ains->alu.src1 = vector_alu_apply_swizzle(ains->alu.src1, swizzle);
-
- if (ains->ssa_args.src1 == r_constant)
- ains->alu.src2 = vector_alu_apply_swizzle(ains->alu.src2, swizzle);
-
- bundle.has_embedded_constants = true;
- }
-
- if (ains->unit & UNITS_ANY_VECTOR) {
- bytes_emitted += sizeof(midgard_reg_info);
- bytes_emitted += sizeof(midgard_vector_alu);
- } else if (ains->compact_branch) {
- /* All of r0 has to be written out along with
- * the branch writeout */
-
- if (ains->writeout) {
-				/* "Bare" writeout is only safe when all
-				 * components of r0 are written out in the
-				 * final bundle, earlier than VLUT, and any
-				 * register dependencies of r0 come from an
-				 * earlier bundle. We can't verify this
-				 * before RA, so we don't try. */
-
- if (index != 0)
- break;
-
- /* Inject a move */
- midgard_instruction ins = v_mov(0, blank_alu_src, SSA_FIXED_REGISTER(0));
- ins.unit = UNIT_VMUL;
- control |= ins.unit;
-
- /* TODO don't leak */
- midgard_instruction *move =
- mem_dup(&ins, sizeof(midgard_instruction));
- bytes_emitted += sizeof(midgard_reg_info);
- bytes_emitted += sizeof(midgard_vector_alu);
- bundle.instructions[packed_idx++] = move;
- }
-
- if (ains->unit == ALU_ENAB_BRANCH) {
- bytes_emitted += sizeof(midgard_branch_extended);
- } else {
- bytes_emitted += sizeof(ains->br_compact);
- }
- } else {
- bytes_emitted += sizeof(midgard_reg_info);
- bytes_emitted += sizeof(midgard_scalar_alu);
- }
-
- /* Defer marking until after writing to allow for break */
- control |= ains->unit;
- last_unit = ains->unit;
- ++instructions_emitted;
- ++index;
- }
-
- int padding = 0;
-
- /* Pad ALU op to nearest word */
-
- if (bytes_emitted & 15) {
- padding = 16 - (bytes_emitted & 15);
- bytes_emitted += padding;
- }
-
- /* Constants must always be quadwords */
- if (bundle.has_embedded_constants)
- bytes_emitted += 16;
-
- /* Size ALU instruction for tag */
- bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1;
- bundle.padding = padding;
- bundle.control = bundle.tag | control;
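-
-		/* Illustrative note, not taken from hardware documentation:
-		 * with the arithmetic above, a 16-byte bundle keeps the tag
-		 * TAG_ALU_4, a 32-byte bundle becomes TAG_ALU_4 + 1, and so
-		 * on -- i.e. the tag appears to encode the bundle size in
-		 * 16-byte quadwords, including the constant quadword added
-		 * above when embedded constants are present. */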
-
- break;
- }
-
- case TAG_LOAD_STORE_4: {
-		/* A load/store bundle holds two instruction words at
-		 * once. If we only have one instruction queued up, we
-		 * need to NOP pad. Otherwise, we store both in
-		 * succession -- letting them go in parallel -- and skip
-		 * the next instruction, saving space and cycles. The
-		 * usefulness of this optimisation depends greatly on the
-		 * quality of the instruction scheduler.
-		 */
-
- midgard_instruction *next_op = mir_next_op(ins);
-
- if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) {
- /* TODO: Concurrency check */
- instructions_emitted++;
- }
-
- break;
- }
-
- case TAG_TEXTURE_4: {
- /* Which tag we use depends on the shader stage */
- bool in_frag = ctx->stage == MESA_SHADER_FRAGMENT;
- bundle.tag = in_frag ? TAG_TEXTURE_4 : TAG_TEXTURE_4_VTX;
- break;
- }
-
- default:
- unreachable("Unknown tag");
- break;
- }
-
- /* Copy the instructions into the bundle */
- bundle.instruction_count = instructions_emitted + 1 + packed_idx;
-
- midgard_instruction *uins = ins;
- for (; packed_idx < bundle.instruction_count; ++packed_idx) {
- bundle.instructions[packed_idx] = uins;
- uins = mir_next_op(uins);
- }
-
- *skip = instructions_emitted;
-
- return bundle;
-}
-
-/* Schedule a single block by iterating its instructions to create bundles.
- * While we go, tally the bundle sizes to compute the block size. */
-
-static void
-schedule_block(compiler_context *ctx, midgard_block *block)
-{
- util_dynarray_init(&block->bundles, NULL);
-
- block->quadword_count = 0;
-
- mir_foreach_instr_in_block(block, ins) {
- int skip;
- midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip);
- util_dynarray_append(&block->bundles, midgard_bundle, bundle);
-
- if (bundle.has_blend_constant) {
- /* TODO: Multiblock? */
- int quadwords_within_block = block->quadword_count + quadword_size(bundle.tag) - 1;
- ctx->blend_constant_offset = quadwords_within_block * 0x10;
- }
-
- while(skip--)
- ins = mir_next_op(ins);
-
- block->quadword_count += quadword_size(bundle.tag);
- }
-
- block->is_scheduled = true;
-}
-
-void
-schedule_program(compiler_context *ctx)
-{
-	/* Schedule each block first; register allocation then runs on the scheduled program */
-
- mir_foreach_block(ctx, block) {
- schedule_block(ctx, block);
- }
-
-	/* Pipeline register creation is a prepass before RA */
- mir_create_pipeline_registers(ctx);
-
- struct ra_graph *g = allocate_registers(ctx);
- install_registers(ctx, g);
-}
+++ /dev/null
-/*
- * Copyright (C) 2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "compiler.h"
-
-void
-mir_rewrite_index_src(compiler_context *ctx, unsigned old, unsigned new)
-{
- mir_foreach_instr_global(ctx, ins) {
- if (ins->ssa_args.src0 == old)
- ins->ssa_args.src0 = new;
-
- if (ins->ssa_args.src1 == old &&
- !ins->ssa_args.inline_constant)
- ins->ssa_args.src1 = new;
- }
-}
-
-void
-mir_rewrite_index_dst(compiler_context *ctx, unsigned old, unsigned new)
-{
- mir_foreach_instr_global(ctx, ins) {
- if (ins->ssa_args.dest == old)
- ins->ssa_args.dest = new;
- }
-}
-
-void
-mir_rewrite_index(compiler_context *ctx, unsigned old, unsigned new)
-{
- mir_rewrite_index_src(ctx, old, new);
- mir_rewrite_index_dst(ctx, old, new);
-}
+++ /dev/null
-/*
- * © Copyright 2017-2018 The Panfrost Community
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "pan_pretty_print.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-
-/* Some self-contained prettyprinting functions shared between pandecode and
- * the main driver */
-
-#define DEFINE_CASE(name) case MALI_## name: return "MALI_" #name
-char *pandecode_format_name(enum mali_format format)
-{
- static char unk_format_str[5];
-
- switch (format) {
- DEFINE_CASE(RGB565);
- DEFINE_CASE(RGB5_A1_UNORM);
- DEFINE_CASE(RGB10_A2_UNORM);
- DEFINE_CASE(RGB10_A2_SNORM);
- DEFINE_CASE(RGB10_A2UI);
- DEFINE_CASE(RGB10_A2I);
- DEFINE_CASE(NV12);
- DEFINE_CASE(Z32_UNORM);
- DEFINE_CASE(R32_FIXED);
- DEFINE_CASE(RG32_FIXED);
- DEFINE_CASE(RGB32_FIXED);
- DEFINE_CASE(RGBA32_FIXED);
- DEFINE_CASE(R11F_G11F_B10F);
- DEFINE_CASE(R9F_G9F_B9F_E5F);
- DEFINE_CASE(VARYING_POS);
- DEFINE_CASE(VARYING_DISCARD);
-
- DEFINE_CASE(R8_SNORM);
- DEFINE_CASE(R16_SNORM);
- DEFINE_CASE(R32_SNORM);
- DEFINE_CASE(RG8_SNORM);
- DEFINE_CASE(RG16_SNORM);
- DEFINE_CASE(RG32_SNORM);
- DEFINE_CASE(RGB8_SNORM);
- DEFINE_CASE(RGB16_SNORM);
- DEFINE_CASE(RGB32_SNORM);
- DEFINE_CASE(RGBA8_SNORM);
- DEFINE_CASE(RGBA16_SNORM);
- DEFINE_CASE(RGBA32_SNORM);
-
- DEFINE_CASE(R8UI);
- DEFINE_CASE(R16UI);
- DEFINE_CASE(R32UI);
- DEFINE_CASE(RG8UI);
- DEFINE_CASE(RG16UI);
- DEFINE_CASE(RG32UI);
- DEFINE_CASE(RGB8UI);
- DEFINE_CASE(RGB16UI);
- DEFINE_CASE(RGB32UI);
- DEFINE_CASE(RGBA8UI);
- DEFINE_CASE(RGBA16UI);
- DEFINE_CASE(RGBA32UI);
-
- DEFINE_CASE(R8_UNORM);
- DEFINE_CASE(R16_UNORM);
- DEFINE_CASE(R32_UNORM);
- DEFINE_CASE(R32F);
- DEFINE_CASE(RG8_UNORM);
- DEFINE_CASE(RG16_UNORM);
- DEFINE_CASE(RG32_UNORM);
- DEFINE_CASE(RG32F);
- DEFINE_CASE(RGB8_UNORM);
- DEFINE_CASE(RGB16_UNORM);
- DEFINE_CASE(RGB32_UNORM);
- DEFINE_CASE(RGB32F);
- DEFINE_CASE(RGBA4_UNORM);
- DEFINE_CASE(RGBA8_UNORM);
- DEFINE_CASE(RGBA16_UNORM);
- DEFINE_CASE(RGBA32_UNORM);
- DEFINE_CASE(RGBA32F);
-
- DEFINE_CASE(R8I);
- DEFINE_CASE(R16I);
- DEFINE_CASE(R32I);
- DEFINE_CASE(RG8I);
- DEFINE_CASE(R16F);
- DEFINE_CASE(RG16I);
- DEFINE_CASE(RG32I);
- DEFINE_CASE(RG16F);
- DEFINE_CASE(RGB8I);
- DEFINE_CASE(RGB16I);
- DEFINE_CASE(RGB32I);
- DEFINE_CASE(RGB16F);
- DEFINE_CASE(RGBA8I);
- DEFINE_CASE(RGBA16I);
- DEFINE_CASE(RGBA32I);
- DEFINE_CASE(RGBA16F);
-
- DEFINE_CASE(RGBA4);
- DEFINE_CASE(RGBA8_2);
- DEFINE_CASE(RGB10_A2_2);
- default:
- snprintf(unk_format_str, sizeof(unk_format_str), "0x%02x", format);
- return unk_format_str;
- }
-}
-
-#undef DEFINE_CASE
-
-/* Helper to dump fixed-function blend part for debugging */
-
-static const char *
-panfrost_factor_name(enum mali_dominant_factor factor)
-{
- switch (factor) {
- case MALI_DOMINANT_UNK0:
- return "unk0";
-
- case MALI_DOMINANT_ZERO:
- return "zero";
-
- case MALI_DOMINANT_SRC_COLOR:
- return "source color";
-
- case MALI_DOMINANT_DST_COLOR:
- return "dest color";
-
- case MALI_DOMINANT_UNK4:
- return "unk4";
-
- case MALI_DOMINANT_SRC_ALPHA:
- return "source alpha";
-
- case MALI_DOMINANT_DST_ALPHA:
- return "dest alpha";
-
- case MALI_DOMINANT_CONSTANT:
- return "constant";
- }
-
- return "unreachable";
-}
-
-static const char *
-panfrost_modifier_name(enum mali_blend_modifier mod)
-{
- switch (mod) {
- case MALI_BLEND_MOD_UNK0:
- return "unk0";
-
- case MALI_BLEND_MOD_NORMAL:
- return "normal";
-
- case MALI_BLEND_MOD_SOURCE_ONE:
- return "source one";
-
- case MALI_BLEND_MOD_DEST_ONE:
- return "dest one";
- }
-
- return "unreachable";
-}
-
-static void
-panfrost_print_fixed_part(const char *name, unsigned u)
-{
- struct mali_blend_mode part;
- memcpy(&part, &u, sizeof(part));
-
- printf("%s blend mode (%X):\n", name, u);
-
- printf(" %s dominant:\n",
- (part.dominant == MALI_BLEND_DOM_SOURCE) ? "source" : "destination");
-
- printf(" %s\n", panfrost_factor_name(part.dominant_factor));
-
- if (part.complement_dominant)
- printf(" complement\n");
-
-
- printf(" nondominant %s\n",
- (part.nondominant_mode == MALI_BLEND_NON_MIRROR) ? "mirror" : "zero");
-
-
- printf(" mode: %s\n", panfrost_modifier_name(part.clip_modifier));
-
- if (part.negate_source) printf(" negate source\n");
-
- if (part.negate_dest) printf(" negate dest\n");
-
- assert(!(part.unused_0 || part.unused_1));
-}
-
-void
-panfrost_print_blend_equation(struct mali_blend_equation eq)
-{
- printf("\n");
- panfrost_print_fixed_part("RGB", eq.rgb_mode);
- panfrost_print_fixed_part("Alpha", eq.alpha_mode);
-
- assert(!eq.zero1);
-
- printf("Mask: %s%s%s%s\n",
- (eq.color_mask & MALI_MASK_R) ? "R" : "",
- (eq.color_mask & MALI_MASK_G) ? "G" : "",
- (eq.color_mask & MALI_MASK_B) ? "B" : "",
- (eq.color_mask & MALI_MASK_A) ? "A" : "");
-}
+++ /dev/null
-/*
- * © Copyright 2017-2018 The Panfrost Community
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __PAN_PRETTY_H
-#define __PAN_PRETTY_H
-
-#include "panfrost-job.h"
-
-char *pandecode_format_name(enum mali_format format);
-void panfrost_print_blend_equation(struct mali_blend_equation eq);
-
-#endif
+++ /dev/null
-/*
- * Copyright (C) 2019 Alyssa Rosenzweig
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <stdint.h>
-#include <string.h>
-
-#include "decode.h"
-
-/* Parsing */
-
-static FILE *
-pandecode_read_filename(const char *base, const char *name)
-{
- char *fn = NULL;
- asprintf(&fn, "%s/%s", base, name);
-
- FILE *fp = fopen(fn, "rb");
- free(fn);
-
- return fp;
-}
-
-static void
-pandecode_read_memory(const char *base, const char *name, mali_ptr gpu_va)
-{
- FILE *fp = pandecode_read_filename(base, name);
-
- if (!fp) {
- fprintf(stderr, "Warning: missing %s\n", name);
- return;
- }
-
- fseek(fp, 0, SEEK_END);
- long sz = ftell(fp);
- fseek(fp, 0, SEEK_SET);
-
- char *buf = malloc(sz);
- assert(buf);
- fread(buf, 1, sz, fp);
- fclose(fp);
-
- pandecode_inject_mmap(gpu_va, buf, sz, name);
-}
-
-static void
-pandecode_read_mmap(const char *base, const char *line)
-{
- assert(strlen(line) < 500);
-
- mali_ptr addr;
- char name[512];
-
- sscanf(line, "MMAP %" PRIx64 " %s", &addr, name);
- pandecode_read_memory(base, name, addr);
-}
-
-static void
-pandecode_read_job_submit(const char *base, const char *line)
-{
- mali_ptr addr;
- unsigned core_req;
- unsigned is_bifrost;
-
- sscanf(line, "JS %" PRIx64 " %x %x", &addr, &core_req, &is_bifrost);
- pandecode_replay_jc(addr, is_bifrost);
-}
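-
-/* Illustrative only -- the exact dump layout is defined by the tracing side,
- * not by this parser. Given the sscanf formats above, a control.log is a
- * plain-text file whose lines look roughly like (hypothetical values):
- *
- *   MMAP 32100000 memory_32100000.bin
- *   JS 32100400 0 0
- *
- * 'M' lines inject the named memory dump at the given GPU VA, and 'J' lines
- * replay the job chain starting at the given address. */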
-
-
-
-/* Reads the control file, processing as it goes. */
-
-static void
-pandecode_read_control(const char *base)
-{
- FILE *fp = pandecode_read_filename(base, "control.log");
-
- if (!fp) {
- fprintf(stderr, "Invalid directory path\n");
- return;
- }
-
- char *line = NULL;
- size_t len = 0;
-
- while (getline(&line, &len, fp) != -1) {
- switch (line[0]) {
- case 'M':
- pandecode_read_mmap(base, line);
- break;
-
- case 'J':
- pandecode_read_job_submit(base, line);
- break;
-
- default:
- assert(0);
- break;
- }
- }
-}
-
-int
-main(int argc, char **argv)
-{
- if (argc < 2) {
- fprintf(stderr, "Usage: pandecode [directory]\n");
- exit(1);
- }
-
- pandecode_initialize();
- pandecode_read_control(argv[1]);
-}
+++ /dev/null
-/*
- * Copyright (C) 2019 Alyssa Rosenzweig
- * Copyright (C) 2017-2018 Lyude Paul
- * Copyright (C) 2019 Collabora, Ltd.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <stdint.h>
-#include <string.h>
-
-#include "decode.h"
-#include "util/macros.h"
-
-/* Memory handling */
-
-static struct pandecode_mapped_memory mmaps;
-
-struct pandecode_mapped_memory *
-pandecode_find_mapped_gpu_mem_containing(mali_ptr addr)
-{
- list_for_each_entry(struct pandecode_mapped_memory, pos, &mmaps.node, node) {
- if (addr >= pos->gpu_va && addr < pos->gpu_va + pos->length)
- return pos;
- }
-
- return NULL;
-}
-
-void
-pandecode_inject_mmap(mali_ptr gpu_va, void *cpu, unsigned sz, const char *name)
-{
- struct pandecode_mapped_memory *mapped_mem = NULL;
-
- mapped_mem = malloc(sizeof(*mapped_mem));
- list_inithead(&mapped_mem->node);
-
- mapped_mem->gpu_va = gpu_va;
- mapped_mem->length = sz;
- mapped_mem->addr = cpu;
-
- if (!name) {
- /* If we don't have a name, assign one */
-
- snprintf(mapped_mem->name, ARRAY_SIZE(mapped_mem->name) - 1,
- "memory_%" PRIx64, gpu_va);
- } else {
- assert(strlen(name) < ARRAY_SIZE(mapped_mem->name));
-		/* +1 so the NUL terminator is copied too, since the buffer is not zeroed */
-		memcpy(mapped_mem->name, name, strlen(name) + 1);
- }
-
- list_add(&mapped_mem->node, &mmaps.node);
-}
-
-char *
-pointer_as_memory_reference(mali_ptr ptr)
-{
- struct pandecode_mapped_memory *mapped;
- char *out = malloc(128);
-
- /* Try to find the corresponding mapped zone */
-
- mapped = pandecode_find_mapped_gpu_mem_containing(ptr);
-
- if (mapped) {
- snprintf(out, 128, "%s + %d", mapped->name, (int) (ptr - mapped->gpu_va));
- return out;
- }
-
- /* Just use the raw address if other options are exhausted */
-
- snprintf(out, 128, MALI_PTR_FMT, ptr);
- return out;
-
-}
-
-void
-pandecode_initialize(void)
-{
- list_inithead(&mmaps.node);
-
-}
+++ /dev/null
-/*
- * Copyright (C) 2017-2019 Alyssa Rosenzweig
- * Copyright (C) 2017-2019 Connor Abbott
- * Copyright (C) 2019 Collabora, Ltd.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <panfrost-job.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <memory.h>
-#include <stdbool.h>
-#include <stdarg.h>
-#include "decode.h"
-#include "util/u_math.h"
-
-#include "../pan_pretty_print.h"
-#include "../midgard/disassemble.h"
-#include "../bifrost/disassemble.h"
-int pandecode_replay_jc(mali_ptr jc_gpu_va, bool bifrost);
-
-#define MEMORY_PROP(obj, p) {\
- if (obj->p) { \
- char *a = pointer_as_memory_reference(obj->p); \
- pandecode_prop("%s = %s", #p, a); \
- free(a); \
- } \
-}
-
-#define DYN_MEMORY_PROP(obj, no, p) { \
- if (obj->p) \
- pandecode_prop("%s = %s_%d_p", #p, #p, no); \
-}
-
-/* Semantic logging type.
- *
- * Raw: for raw messages to be printed as is.
- * Message: for helpful information to be commented out in replays.
- * Property: for properties of a struct
- *
- * Use one of pandecode_log, pandecode_msg, or pandecode_prop as syntax sugar.
- */
-
-enum pandecode_log_type {
- PANDECODE_RAW,
- PANDECODE_MESSAGE,
- PANDECODE_PROPERTY
-};
-
-#define pandecode_log(...) pandecode_log_typed(PANDECODE_RAW, __VA_ARGS__)
-#define pandecode_msg(...) pandecode_log_typed(PANDECODE_MESSAGE, __VA_ARGS__)
-#define pandecode_prop(...) pandecode_log_typed(PANDECODE_PROPERTY, __VA_ARGS__)
-
-unsigned pandecode_indent = 0;
-
-static void
-pandecode_make_indent(void)
-{
- for (unsigned i = 0; i < pandecode_indent; ++i)
- printf(" ");
-}
-
-static void
-pandecode_log_typed(enum pandecode_log_type type, const char *format, ...)
-{
- va_list ap;
-
- pandecode_make_indent();
-
- if (type == PANDECODE_MESSAGE)
- printf("// ");
- else if (type == PANDECODE_PROPERTY)
- printf(".");
-
- va_start(ap, format);
- vprintf(format, ap);
- va_end(ap);
-
- if (type == PANDECODE_PROPERTY)
- printf(",\n");
-}
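-
-/* Illustrative example of the resulting output (assuming pandecode_indent is
- * 1): pandecode_prop("width = %d", 32) prints an indented ".width = 32,"
- * line, while pandecode_msg prefixes the line with "// " so it reads as a
- * comment in the generated replay. */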
-
-static void
-pandecode_log_cont(const char *format, ...)
-{
- va_list ap;
-
- va_start(ap, format);
- vprintf(format, ap);
- va_end(ap);
-}
-
-struct pandecode_flag_info {
- u64 flag;
- const char *name;
-};
-
-static void
-pandecode_log_decoded_flags(const struct pandecode_flag_info *flag_info,
- u64 flags)
-{
- bool decodable_flags_found = false;
-
- for (int i = 0; flag_info[i].name; i++) {
- if ((flags & flag_info[i].flag) != flag_info[i].flag)
- continue;
-
- if (!decodable_flags_found) {
- decodable_flags_found = true;
- } else {
- pandecode_log_cont(" | ");
- }
-
- pandecode_log_cont("%s", flag_info[i].name);
-
- flags &= ~flag_info[i].flag;
- }
-
- if (decodable_flags_found) {
- if (flags)
- pandecode_log_cont(" | 0x%" PRIx64, flags);
- } else {
- pandecode_log_cont("0x%" PRIx64, flags);
- }
-}
-
-#define FLAG_INFO(flag) { MALI_##flag, "MALI_" #flag }
-static const struct pandecode_flag_info gl_enable_flag_info[] = {
- FLAG_INFO(OCCLUSION_QUERY),
- FLAG_INFO(OCCLUSION_PRECISE),
- FLAG_INFO(FRONT_CCW_TOP),
- FLAG_INFO(CULL_FACE_FRONT),
- FLAG_INFO(CULL_FACE_BACK),
- {}
-};
-#undef FLAG_INFO
-
-#define FLAG_INFO(flag) { MALI_CLEAR_##flag, "MALI_CLEAR_" #flag }
-static const struct pandecode_flag_info clear_flag_info[] = {
- FLAG_INFO(FAST),
- FLAG_INFO(SLOW),
- FLAG_INFO(SLOW_STENCIL),
- {}
-};
-#undef FLAG_INFO
-
-#define FLAG_INFO(flag) { MALI_MASK_##flag, "MALI_MASK_" #flag }
-static const struct pandecode_flag_info mask_flag_info[] = {
- FLAG_INFO(R),
- FLAG_INFO(G),
- FLAG_INFO(B),
- FLAG_INFO(A),
- {}
-};
-#undef FLAG_INFO
-
-#define FLAG_INFO(flag) { MALI_##flag, "MALI_" #flag }
-static const struct pandecode_flag_info u3_flag_info[] = {
- FLAG_INFO(HAS_MSAA),
- FLAG_INFO(CAN_DISCARD),
- FLAG_INFO(HAS_BLEND_SHADER),
- FLAG_INFO(DEPTH_TEST),
- {}
-};
-
-static const struct pandecode_flag_info u4_flag_info[] = {
- FLAG_INFO(NO_MSAA),
- FLAG_INFO(NO_DITHER),
- FLAG_INFO(DEPTH_RANGE_A),
- FLAG_INFO(DEPTH_RANGE_B),
- FLAG_INFO(STENCIL_TEST),
- FLAG_INFO(SAMPLE_ALPHA_TO_COVERAGE_NO_BLEND_SHADER),
- {}
-};
-#undef FLAG_INFO
-
-#define FLAG_INFO(flag) { MALI_FRAMEBUFFER_##flag, "MALI_FRAMEBUFFER_" #flag }
-static const struct pandecode_flag_info fb_fmt_flag_info[] = {
- FLAG_INFO(MSAA_A),
- FLAG_INFO(MSAA_B),
- FLAG_INFO(MSAA_8),
- {}
-};
-#undef FLAG_INFO
-
-#define FLAG_INFO(flag) { MALI_MFBD_FORMAT_##flag, "MALI_MFBD_FORMAT_" #flag }
-static const struct pandecode_flag_info mfbd_fmt_flag_info[] = {
- FLAG_INFO(MSAA),
- FLAG_INFO(SRGB),
- {}
-};
-#undef FLAG_INFO
-
-#define FLAG_INFO(flag) { MALI_EXTRA_##flag, "MALI_EXTRA_" #flag }
-static const struct pandecode_flag_info mfbd_extra_flag_info[] = {
- FLAG_INFO(PRESENT),
- FLAG_INFO(AFBC),
- FLAG_INFO(ZS),
- {}
-};
-#undef FLAG_INFO
-
-#define FLAG_INFO(flag) { MALI_##flag, "MALI_" #flag }
-static const struct pandecode_flag_info shader_midgard1_flag_info [] = {
- FLAG_INFO(EARLY_Z),
- FLAG_INFO(HELPER_INVOCATIONS),
- FLAG_INFO(READS_TILEBUFFER),
- FLAG_INFO(READS_ZS),
- {}
-};
-#undef FLAG_INFO
-
-#define FLAG_INFO(flag) { MALI_MFBD_##flag, "MALI_MFBD_" #flag }
-static const struct pandecode_flag_info mfbd_flag_info [] = {
- FLAG_INFO(DEPTH_WRITE),
- FLAG_INFO(EXTRA),
- {}
-};
-#undef FLAG_INFO
-
-
-extern char *replace_fragment;
-extern char *replace_vertex;
-
-static char *
-pandecode_job_type_name(enum mali_job_type type)
-{
-#define DEFINE_CASE(name) case JOB_TYPE_ ## name: return "JOB_TYPE_" #name
-
- switch (type) {
- DEFINE_CASE(NULL);
- DEFINE_CASE(SET_VALUE);
- DEFINE_CASE(CACHE_FLUSH);
- DEFINE_CASE(COMPUTE);
- DEFINE_CASE(VERTEX);
- DEFINE_CASE(TILER);
- DEFINE_CASE(FUSED);
- DEFINE_CASE(FRAGMENT);
-
- case JOB_NOT_STARTED:
- return "NOT_STARTED";
-
- default:
- pandecode_log("Warning! Unknown job type %x\n", type);
- return "!?!?!?";
- }
-
-#undef DEFINE_CASE
-}
-
-static char *
-pandecode_draw_mode_name(enum mali_draw_mode mode)
-{
-#define DEFINE_CASE(name) case MALI_ ## name: return "MALI_" #name
-
- switch (mode) {
- DEFINE_CASE(DRAW_NONE);
- DEFINE_CASE(POINTS);
- DEFINE_CASE(LINES);
- DEFINE_CASE(TRIANGLES);
- DEFINE_CASE(TRIANGLE_STRIP);
- DEFINE_CASE(TRIANGLE_FAN);
- DEFINE_CASE(LINE_STRIP);
- DEFINE_CASE(LINE_LOOP);
- DEFINE_CASE(POLYGON);
- DEFINE_CASE(QUADS);
- DEFINE_CASE(QUAD_STRIP);
-
- default:
- return "MALI_TRIANGLES /* XXX: Unknown GL mode, check dump */";
- }
-
-#undef DEFINE_CASE
-}
-
-#define DEFINE_CASE(name) case MALI_FUNC_ ## name: return "MALI_FUNC_" #name
-static char *
-pandecode_func_name(enum mali_func mode)
-{
- switch (mode) {
- DEFINE_CASE(NEVER);
- DEFINE_CASE(LESS);
- DEFINE_CASE(EQUAL);
- DEFINE_CASE(LEQUAL);
- DEFINE_CASE(GREATER);
- DEFINE_CASE(NOTEQUAL);
- DEFINE_CASE(GEQUAL);
- DEFINE_CASE(ALWAYS);
-
- default:
- return "MALI_FUNC_NEVER /* XXX: Unknown function, check dump */";
- }
-}
-#undef DEFINE_CASE
-
-/* Why is this duplicated? Who knows... */
-#define DEFINE_CASE(name) case MALI_ALT_FUNC_ ## name: return "MALI_ALT_FUNC_" #name
-static char *
-pandecode_alt_func_name(enum mali_alt_func mode)
-{
- switch (mode) {
- DEFINE_CASE(NEVER);
- DEFINE_CASE(LESS);
- DEFINE_CASE(EQUAL);
- DEFINE_CASE(LEQUAL);
- DEFINE_CASE(GREATER);
- DEFINE_CASE(NOTEQUAL);
- DEFINE_CASE(GEQUAL);
- DEFINE_CASE(ALWAYS);
-
- default:
-		return "MALI_ALT_FUNC_NEVER /* XXX: Unknown function, check dump */";
- }
-}
-#undef DEFINE_CASE
-
-#define DEFINE_CASE(name) case MALI_STENCIL_ ## name: return "MALI_STENCIL_" #name
-static char *
-pandecode_stencil_op_name(enum mali_stencil_op op)
-{
- switch (op) {
- DEFINE_CASE(KEEP);
- DEFINE_CASE(REPLACE);
- DEFINE_CASE(ZERO);
- DEFINE_CASE(INVERT);
- DEFINE_CASE(INCR_WRAP);
- DEFINE_CASE(DECR_WRAP);
- DEFINE_CASE(INCR);
- DEFINE_CASE(DECR);
-
- default:
- return "MALI_STENCIL_KEEP /* XXX: Unknown stencil op, check dump */";
- }
-}
-
-#undef DEFINE_CASE
-
-#define DEFINE_CASE(name) case MALI_ATTR_ ## name: return "MALI_ATTR_" #name
-static char *pandecode_attr_mode_name(enum mali_attr_mode mode)
-{
- switch(mode) {
- DEFINE_CASE(UNUSED);
- DEFINE_CASE(LINEAR);
- DEFINE_CASE(POT_DIVIDE);
- DEFINE_CASE(MODULO);
- DEFINE_CASE(NPOT_DIVIDE);
-	default: return "MALI_ATTR_UNUSED /* XXX: Unknown attribute mode, check dump */";
- }
-}
-
-#undef DEFINE_CASE
-
-#define DEFINE_CASE(name) case MALI_CHANNEL_## name: return "MALI_CHANNEL_" #name
-static char *
-pandecode_channel_name(enum mali_channel channel)
-{
- switch (channel) {
- DEFINE_CASE(RED);
- DEFINE_CASE(GREEN);
- DEFINE_CASE(BLUE);
- DEFINE_CASE(ALPHA);
- DEFINE_CASE(ZERO);
- DEFINE_CASE(ONE);
- DEFINE_CASE(RESERVED_0);
- DEFINE_CASE(RESERVED_1);
-
- default:
- return "MALI_CHANNEL_ZERO /* XXX: Unknown channel, check dump */";
- }
-}
-#undef DEFINE_CASE
-
-#define DEFINE_CASE(name) case MALI_WRAP_## name: return "MALI_WRAP_" #name
-static char *
-pandecode_wrap_mode_name(enum mali_wrap_mode op)
-{
- switch (op) {
- DEFINE_CASE(REPEAT);
- DEFINE_CASE(CLAMP_TO_EDGE);
- DEFINE_CASE(CLAMP_TO_BORDER);
- DEFINE_CASE(MIRRORED_REPEAT);
-
- default:
- return "MALI_WRAP_REPEAT /* XXX: Unknown wrap mode, check dump */";
- }
-}
-#undef DEFINE_CASE
-
-#define DEFINE_CASE(name) case MALI_TEX_## name: return "MALI_TEX_" #name
-static char *
-pandecode_texture_type(enum mali_texture_type type)
-{
- switch (type) {
- DEFINE_CASE(1D);
- DEFINE_CASE(2D);
- DEFINE_CASE(3D);
- DEFINE_CASE(CUBE);
-
- default:
- unreachable("Unknown case");
- }
-}
-#undef DEFINE_CASE
-
-#define DEFINE_CASE(name) case MALI_MFBD_BLOCK_## name: return "MALI_MFBD_BLOCK_" #name
-static char *
-pandecode_mfbd_block_format(enum mali_mfbd_block_format fmt)
-{
- switch (fmt) {
- DEFINE_CASE(TILED);
- DEFINE_CASE(UNKNOWN);
- DEFINE_CASE(LINEAR);
- DEFINE_CASE(AFBC);
-
- default:
- unreachable("Invalid case");
- }
-}
-#undef DEFINE_CASE
-
-static inline char *
-pandecode_decode_fbd_type(enum mali_fbd_type type)
-{
- if (type == MALI_SFBD) return "SFBD";
- else if (type == MALI_MFBD) return "MFBD";
- else return "WATFBD /* XXX */";
-}
-
-/* Midgard's tiler descriptor is embedded within the
- * larger FBD */
-
-static void
-pandecode_midgard_tiler_descriptor(const struct midgard_tiler_descriptor *t)
-{
- pandecode_log(".tiler = {\n");
- pandecode_indent++;
-
- pandecode_prop("hierarchy_mask = 0x%" PRIx16, t->hierarchy_mask);
- pandecode_prop("flags = 0x%" PRIx16, t->flags);
- pandecode_prop("polygon_list_size = 0x%x", t->polygon_list_size);
-
- MEMORY_PROP(t, polygon_list);
- MEMORY_PROP(t, polygon_list_body);
-
- MEMORY_PROP(t, heap_start);
-
- {
- /* Points to the end of a buffer */
- char *a = pointer_as_memory_reference(t->heap_end - 1);
- pandecode_prop("heap_end = %s + 1", a);
- free(a);
- }
-
- bool nonzero_weights = false;
-
- for (unsigned w = 0; w < ARRAY_SIZE(t->weights); ++w) {
- nonzero_weights |= t->weights[w] != 0x0;
- }
-
- if (nonzero_weights) {
- pandecode_log(".weights = {");
-
- for (unsigned w = 0; w < ARRAY_SIZE(t->weights); ++w) {
- pandecode_log("%d, ", t->weights[w]);
- }
-
- pandecode_log("},");
- }
-
- pandecode_indent--;
- pandecode_log("}\n");
-}
-
-static void
-pandecode_replay_sfbd(uint64_t gpu_va, int job_no)
-{
- struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(gpu_va);
- const struct mali_single_framebuffer *PANDECODE_PTR_VAR(s, mem, (mali_ptr) gpu_va);
-
- pandecode_log("struct mali_single_framebuffer framebuffer_%"PRIx64"_%d = {\n", gpu_va, job_no);
- pandecode_indent++;
-
- pandecode_prop("unknown1 = 0x%" PRIx32, s->unknown1);
- pandecode_prop("unknown2 = 0x%" PRIx32, s->unknown2);
-
- pandecode_log(".format = ");
- pandecode_log_decoded_flags(fb_fmt_flag_info, s->format);
- pandecode_log_cont(",\n");
-
- pandecode_prop("width = MALI_POSITIVE(%" PRId16 ")", s->width + 1);
- pandecode_prop("height = MALI_POSITIVE(%" PRId16 ")", s->height + 1);
-
- MEMORY_PROP(s, framebuffer);
- pandecode_prop("stride = %d", s->stride);
-
-	/* This appears earlier in the actual commandstream -- right before
-	 * width -- but we delay it here so the output flows more nicely */
-
- pandecode_log(".clear_flags = ");
- pandecode_log_decoded_flags(clear_flag_info, s->clear_flags);
- pandecode_log_cont(",\n");
-
- if (s->depth_buffer | s->depth_buffer_enable) {
- MEMORY_PROP(s, depth_buffer);
- pandecode_prop("depth_buffer_enable = %s", DS_ENABLE(s->depth_buffer_enable));
- }
-
- if (s->stencil_buffer | s->stencil_buffer_enable) {
- MEMORY_PROP(s, stencil_buffer);
- pandecode_prop("stencil_buffer_enable = %s", DS_ENABLE(s->stencil_buffer_enable));
- }
-
- if (s->clear_color_1 | s->clear_color_2 | s->clear_color_3 | s->clear_color_4) {
- pandecode_prop("clear_color_1 = 0x%" PRIx32, s->clear_color_1);
- pandecode_prop("clear_color_2 = 0x%" PRIx32, s->clear_color_2);
- pandecode_prop("clear_color_3 = 0x%" PRIx32, s->clear_color_3);
- pandecode_prop("clear_color_4 = 0x%" PRIx32, s->clear_color_4);
- }
-
- if (s->clear_depth_1 != 0 || s->clear_depth_2 != 0 || s->clear_depth_3 != 0 || s->clear_depth_4 != 0) {
- pandecode_prop("clear_depth_1 = %f", s->clear_depth_1);
- pandecode_prop("clear_depth_2 = %f", s->clear_depth_2);
- pandecode_prop("clear_depth_3 = %f", s->clear_depth_3);
- pandecode_prop("clear_depth_4 = %f", s->clear_depth_4);
- }
-
- if (s->clear_stencil) {
- pandecode_prop("clear_stencil = 0x%x", s->clear_stencil);
- }
-
- MEMORY_PROP(s, unknown_address_0);
- pandecode_midgard_tiler_descriptor(&s->tiler);
-
- pandecode_indent--;
- pandecode_log("};\n");
-
- pandecode_prop("zero0 = 0x%" PRIx64, s->zero0);
- pandecode_prop("zero1 = 0x%" PRIx64, s->zero1);
- pandecode_prop("zero2 = 0x%" PRIx32, s->zero2);
- pandecode_prop("zero4 = 0x%" PRIx32, s->zero4);
-
- printf(".zero3 = {");
-
- for (int i = 0; i < sizeof(s->zero3) / sizeof(s->zero3[0]); ++i)
- printf("%X, ", s->zero3[i]);
-
- printf("},\n");
-
- printf(".zero6 = {");
-
- for (int i = 0; i < sizeof(s->zero6) / sizeof(s->zero6[0]); ++i)
- printf("%X, ", s->zero6[i]);
-
- printf("},\n");
-}
-
-static void
-pandecode_u32_slide(unsigned name, const u32 *slide, unsigned count)
-{
- pandecode_log(".unknown%d = {", name);
-
- for (int i = 0; i < count; ++i)
- printf("%X, ", slide[i]);
-
- pandecode_log("},\n");
-}
-
-#define SHORT_SLIDE(num) \
- pandecode_u32_slide(num, s->unknown ## num, ARRAY_SIZE(s->unknown ## num))
-
-static void
-pandecode_compute_fbd(uint64_t gpu_va, int job_no)
-{
- struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(gpu_va);
- const struct mali_compute_fbd *PANDECODE_PTR_VAR(s, mem, (mali_ptr) gpu_va);
-
- pandecode_log("struct mali_compute_fbd framebuffer_%"PRIx64"_%d = {\n", gpu_va, job_no);
- pandecode_indent++;
-
- SHORT_SLIDE(1);
-
- pandecode_indent--;
- printf("},\n");
-}
-
-static void
-pandecode_replay_swizzle(unsigned swizzle)
-{
- pandecode_prop("swizzle = %s | (%s << 3) | (%s << 6) | (%s << 9)",
- pandecode_channel_name((swizzle >> 0) & 0x7),
- pandecode_channel_name((swizzle >> 3) & 0x7),
- pandecode_channel_name((swizzle >> 6) & 0x7),
- pandecode_channel_name((swizzle >> 9) & 0x7));
-}
-
-static void
-pandecode_rt_format(struct mali_rt_format format)
-{
- pandecode_log(".format = {\n");
- pandecode_indent++;
-
- pandecode_prop("unk1 = 0x%" PRIx32, format.unk1);
- pandecode_prop("unk2 = 0x%" PRIx32, format.unk2);
- pandecode_prop("unk3 = 0x%" PRIx32, format.unk3);
-
- pandecode_prop("block = %s",
- pandecode_mfbd_block_format(format.block));
-
- pandecode_prop("nr_channels = MALI_POSITIVE(%d)",
- MALI_NEGATIVE(format.nr_channels));
-
- pandecode_log(".flags = ");
- pandecode_log_decoded_flags(mfbd_fmt_flag_info, format.flags);
- pandecode_log_cont(",\n");
-
- pandecode_replay_swizzle(format.swizzle);
-
- pandecode_prop("unk4 = 0x%" PRIx32, format.unk4);
-
- pandecode_indent--;
- pandecode_log("},\n");
-}
-
-static void
-pandecode_render_target(uint64_t gpu_va, unsigned job_no, const struct bifrost_framebuffer *fb)
-{
- pandecode_log("struct bifrost_render_target rts_list_%"PRIx64"_%d[] = {\n", gpu_va, job_no);
- pandecode_indent++;
-
- for (int i = 0; i < MALI_NEGATIVE(fb->rt_count_1); i++) {
- mali_ptr rt_va = gpu_va + i * sizeof(struct bifrost_render_target);
- struct pandecode_mapped_memory *mem =
- pandecode_find_mapped_gpu_mem_containing(rt_va);
- const struct bifrost_render_target *PANDECODE_PTR_VAR(rt, mem, (mali_ptr) rt_va);
-
- pandecode_log("{\n");
- pandecode_indent++;
-
- pandecode_rt_format(rt->format);
-
- if (rt->format.block == MALI_MFBD_BLOCK_AFBC) {
- pandecode_log(".afbc = {\n");
- pandecode_indent++;
-
- char *a = pointer_as_memory_reference(rt->afbc.metadata);
- pandecode_prop("metadata = %s", a);
- free(a);
-
- pandecode_prop("stride = %d", rt->afbc.stride);
- pandecode_prop("unk = 0x%" PRIx32, rt->afbc.unk);
-
- pandecode_indent--;
- pandecode_log("},\n");
- } else {
- pandecode_log(".chunknown = {\n");
- pandecode_indent++;
-
- pandecode_prop("unk = 0x%" PRIx64, rt->chunknown.unk);
-
- char *a = pointer_as_memory_reference(rt->chunknown.pointer);
- pandecode_prop("pointer = %s", a);
- free(a);
-
- pandecode_indent--;
- pandecode_log("},\n");
- }
-
- MEMORY_PROP(rt, framebuffer);
- pandecode_prop("framebuffer_stride = %d", rt->framebuffer_stride);
-
- if (rt->clear_color_1 | rt->clear_color_2 | rt->clear_color_3 | rt->clear_color_4) {
- pandecode_prop("clear_color_1 = 0x%" PRIx32, rt->clear_color_1);
- pandecode_prop("clear_color_2 = 0x%" PRIx32, rt->clear_color_2);
- pandecode_prop("clear_color_3 = 0x%" PRIx32, rt->clear_color_3);
- pandecode_prop("clear_color_4 = 0x%" PRIx32, rt->clear_color_4);
- }
-
- if (rt->zero1 || rt->zero2 || rt->zero3) {
- pandecode_msg("render target zeros tripped\n");
- pandecode_prop("zero1 = 0x%" PRIx64, rt->zero1);
- pandecode_prop("zero2 = 0x%" PRIx32, rt->zero2);
- pandecode_prop("zero3 = 0x%" PRIx32, rt->zero3);
- }
-
- pandecode_indent--;
- pandecode_log("},\n");
- }
-
- pandecode_indent--;
- pandecode_log("};\n");
-}
-
-static unsigned
-pandecode_replay_mfbd_bfr(uint64_t gpu_va, int job_no, bool with_render_targets)
-{
- struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(gpu_va);
- const struct bifrost_framebuffer *PANDECODE_PTR_VAR(fb, mem, (mali_ptr) gpu_va);
-
- if (fb->sample_locations) {
- /* The blob stores all possible sample locations in a single buffer
- * allocated on startup, and just switches the pointer when switching
- * MSAA state. For now, we just put the data into the cmdstream, but we
- * should do something like what the blob does with a real driver.
- *
- * There seem to be 32 slots for sample locations, followed by another
- * 16. The second 16 is just the center location followed by 15 zeros
- * in all the cases I've identified (maybe shader vs. depth/color
- * samples?).
- */
-
- struct pandecode_mapped_memory *smem = pandecode_find_mapped_gpu_mem_containing(fb->sample_locations);
-
- const u16 *PANDECODE_PTR_VAR(samples, smem, fb->sample_locations);
-
- pandecode_log("uint16_t sample_locations_%d[] = {\n", job_no);
- pandecode_indent++;
-
- for (int i = 0; i < 32 + 16; i++) {
- pandecode_log("%d, %d,\n", samples[2 * i], samples[2 * i + 1]);
- }
-
- pandecode_indent--;
- pandecode_log("};\n");
- }
-
- pandecode_log("struct bifrost_framebuffer framebuffer_%"PRIx64"_%d = {\n", gpu_va, job_no);
- pandecode_indent++;
-
- pandecode_prop("unk0 = 0x%x", fb->unk0);
-
- if (fb->sample_locations)
- pandecode_prop("sample_locations = sample_locations_%d", job_no);
-
- /* Assume that unknown1 was emitted in the last job for
- * now */
- MEMORY_PROP(fb, unknown1);
-
- pandecode_prop("width1 = MALI_POSITIVE(%d)", fb->width1 + 1);
- pandecode_prop("height1 = MALI_POSITIVE(%d)", fb->height1 + 1);
- pandecode_prop("width2 = MALI_POSITIVE(%d)", fb->width2 + 1);
- pandecode_prop("height2 = MALI_POSITIVE(%d)", fb->height2 + 1);
-
- pandecode_prop("unk1 = 0x%x", fb->unk1);
- pandecode_prop("unk2 = 0x%x", fb->unk2);
- pandecode_prop("rt_count_1 = MALI_POSITIVE(%d)", fb->rt_count_1 + 1);
- pandecode_prop("rt_count_2 = %d", fb->rt_count_2);
-
- pandecode_log(".mfbd_flags = ");
- pandecode_log_decoded_flags(mfbd_flag_info, fb->mfbd_flags);
- pandecode_log_cont(",\n");
-
- pandecode_prop("clear_stencil = 0x%x", fb->clear_stencil);
- pandecode_prop("clear_depth = %f", fb->clear_depth);
-
- pandecode_prop("unknown2 = 0x%x", fb->unknown2);
- MEMORY_PROP(fb, scratchpad);
- pandecode_midgard_tiler_descriptor(&fb->tiler);
-
- if (fb->zero3 || fb->zero4) {
- pandecode_msg("framebuffer zeros tripped\n");
- pandecode_prop("zero3 = 0x%" PRIx32, fb->zero3);
- pandecode_prop("zero4 = 0x%" PRIx32, fb->zero4);
- }
-
- pandecode_indent--;
- pandecode_log("};\n");
-
- gpu_va += sizeof(struct bifrost_framebuffer);
-
- if ((fb->mfbd_flags & MALI_MFBD_EXTRA) && with_render_targets) {
- mem = pandecode_find_mapped_gpu_mem_containing(gpu_va);
- const struct bifrost_fb_extra *PANDECODE_PTR_VAR(fbx, mem, (mali_ptr) gpu_va);
-
- pandecode_log("struct bifrost_fb_extra fb_extra_%"PRIx64"_%d = {\n", gpu_va, job_no);
- pandecode_indent++;
-
- MEMORY_PROP(fbx, checksum);
-
- if (fbx->checksum_stride)
- pandecode_prop("checksum_stride = %d", fbx->checksum_stride);
-
- pandecode_log(".flags = ");
- pandecode_log_decoded_flags(mfbd_extra_flag_info, fbx->flags);
- pandecode_log_cont(",\n");
-
- if (fbx->flags & MALI_EXTRA_AFBC_ZS) {
- pandecode_log(".ds_afbc = {\n");
- pandecode_indent++;
-
- MEMORY_PROP((&fbx->ds_afbc), depth_stencil_afbc_metadata);
- pandecode_prop("depth_stencil_afbc_stride = %d",
- fbx->ds_afbc.depth_stencil_afbc_stride);
- MEMORY_PROP((&fbx->ds_afbc), depth_stencil);
-
- if (fbx->ds_afbc.zero1 || fbx->ds_afbc.padding) {
- pandecode_msg("Depth/stencil AFBC zeros tripped\n");
- pandecode_prop("zero1 = 0x%" PRIx32,
- fbx->ds_afbc.zero1);
- pandecode_prop("padding = 0x%" PRIx64,
- fbx->ds_afbc.padding);
- }
-
- pandecode_indent--;
- pandecode_log("},\n");
- } else {
- pandecode_log(".ds_linear = {\n");
- pandecode_indent++;
-
- if (fbx->ds_linear.depth) {
- MEMORY_PROP((&fbx->ds_linear), depth);
- pandecode_prop("depth_stride = %d",
- fbx->ds_linear.depth_stride);
- }
-
- if (fbx->ds_linear.stencil) {
- MEMORY_PROP((&fbx->ds_linear), stencil);
- pandecode_prop("stencil_stride = %d",
- fbx->ds_linear.stencil_stride);
- }
-
- if (fbx->ds_linear.depth_stride_zero ||
- fbx->ds_linear.stencil_stride_zero ||
- fbx->ds_linear.zero1 || fbx->ds_linear.zero2) {
- pandecode_msg("Depth/stencil zeros tripped\n");
- pandecode_prop("depth_stride_zero = 0x%x",
- fbx->ds_linear.depth_stride_zero);
- pandecode_prop("stencil_stride_zero = 0x%x",
- fbx->ds_linear.stencil_stride_zero);
- pandecode_prop("zero1 = 0x%" PRIx32,
- fbx->ds_linear.zero1);
- pandecode_prop("zero2 = 0x%" PRIx32,
- fbx->ds_linear.zero2);
- }
-
- pandecode_indent--;
- pandecode_log("},\n");
- }
-
- if (fbx->zero3 || fbx->zero4) {
- pandecode_msg("fb_extra zeros tripped\n");
- pandecode_prop("zero3 = 0x%" PRIx64, fbx->zero3);
- pandecode_prop("zero4 = 0x%" PRIx64, fbx->zero4);
- }
-
- pandecode_indent--;
- pandecode_log("};\n");
-
- gpu_va += sizeof(struct bifrost_fb_extra);
- }
-
- if (with_render_targets)
- pandecode_render_target(gpu_va, job_no, fb);
-
- /* Passback the render target count */
- return MALI_NEGATIVE(fb->rt_count_1);
-}
-
-/* Just add a comment decoding the shift/odd fields forming the padded vertices
- * count */
-
-static void
-pandecode_padded_vertices(unsigned shift, unsigned k)
-{
- unsigned odd = 2*k + 1;
- unsigned pot = 1 << shift;
- pandecode_msg("padded_num_vertices = %d\n", odd * pot);
-}
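-
-/* Worked example (illustrative): shift = 2 and k = 1 decode to pot = 4 and
- * odd = 3, so padded_num_vertices = 12. */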
-
-/* Given a magic divisor, recover what we were trying to divide by.
- *
- * Let m represent the magic divisor. By definition, m is an element of Z, where
- * 0 <= m < 2^N, for N bits in m.
- *
- * Let q represent the number we would like to divide by.
- *
- * By definition of a magic divisor for N-bit unsigned integers (a number you
- * multiply by to magically get division), m is a number such that:
- *
- * (m * x) & (2^N - 1) = floor(x/q).
- * for all x in Z where 0 <= x < 2^N
- *
- * Ignore the case where any of the above values equals zero; it is irrelevant
- * for our purposes (instanced arrays).
- *
- * Choose x = q. Then:
- *
- * (m * x) & (2^N - 1) = floor(x/q).
- * (m * q) & (2^N - 1) = floor(q/q).
- *
- * floor(q/q) = floor(1) = 1, therefore:
- *
- * (m * q) & (2^N - 1) = 1
- *
- * Recall the identity that the bitwise AND of one less than a power-of-two
- * equals the modulo with that power of two, i.e. for all x:
- *
- * x & (2^N - 1) = x % (2^N)
- *
- * Therefore:
- *
- * mq % (2^N) = 1
- *
- * By definition, a modular multiplicative inverse of a number m is the number
- * q such that, with respect to a modulus M:
- *
- * mq % M = 1
- *
- * Therefore, q is the modular multiplicative inverse of m with modulus 2^N.
- *
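- * Worked example of the recovery step (illustrative, with a small N for
- * readability): take N = 4, so arithmetic is mod 16, and suppose m = 11.
- * The q satisfying m * q = 1 (mod 16) is q = 3, since 11 * 3 = 33 = 1
- * (mod 16). The brute-force loop below performs essentially this search,
- * just with the real modulus.
- *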
- */
-
-static void
-pandecode_magic_divisor(uint32_t magic, unsigned shift, unsigned orig_divisor, unsigned extra)
-{
- /* Compute the modular inverse of `magic` with respect to 2^(32 -
- * shift) the most lame way possible... just repeatedly add.
-	 * Asymptotically slow, but nobody cares in practice, unless you have
- * massive numbers of vertices or high divisors. */
-
- unsigned inverse = 0;
-
- /* Magic implicitly has the highest bit set */
- magic |= (1 << 31);
-
- /* Depending on rounding direction */
- if (extra)
- magic++;
-
- for (;;) {
- uint32_t product = magic * inverse;
-
- if (shift) {
- product >>= shift;
- }
-
- if (product == 1)
- break;
-
- ++inverse;
- }
-
- pandecode_msg("dividing by %d (maybe off by two)\n", inverse);
-
- /* Recall we're supposed to divide by (gl_level_divisor *
- * padded_num_vertices) */
-
- unsigned padded_num_vertices = inverse / orig_divisor;
-
- pandecode_msg("padded_num_vertices = %d\n", padded_num_vertices);
-}
-
-static void
-pandecode_replay_attributes(const struct pandecode_mapped_memory *mem,
- mali_ptr addr, int job_no, char *suffix,
- int count, bool varying)
-{
- char *prefix = varying ? "varyings" : "attributes";
-
- union mali_attr *attr = pandecode_fetch_gpu_mem(mem, addr, sizeof(union mali_attr) * count);
-
- char base[128];
- snprintf(base, sizeof(base), "%s_data_%d%s", prefix, job_no, suffix);
-
- for (int i = 0; i < count; ++i) {
- enum mali_attr_mode mode = attr[i].elements & 7;
-
- if (mode == MALI_ATTR_UNUSED)
- continue;
-
- mali_ptr raw_elements = attr[i].elements & ~7;
-
- /* TODO: Do we maybe want to dump the attribute values
- * themselves given the specified format? Or is that too hard?
- * */
-
- char *a = pointer_as_memory_reference(raw_elements);
- pandecode_log("mali_ptr %s_%d_p = %s;\n", base, i, a);
- free(a);
- }
-
- pandecode_log("union mali_attr %s_%d[] = {\n", prefix, job_no);
- pandecode_indent++;
-
- for (int i = 0; i < count; ++i) {
- pandecode_log("{\n");
- pandecode_indent++;
-
- unsigned mode = attr[i].elements & 7;
- pandecode_prop("elements = (%s_%d_p) | %s", base, i, pandecode_attr_mode_name(mode));
- pandecode_prop("shift = %d", attr[i].shift);
- pandecode_prop("extra_flags = %d", attr[i].extra_flags);
- pandecode_prop("stride = 0x%" PRIx32, attr[i].stride);
- pandecode_prop("size = 0x%" PRIx32, attr[i].size);
-
- /* Decode further where possible */
-
- if (mode == MALI_ATTR_MODULO) {
- pandecode_padded_vertices(
- attr[i].shift,
- attr[i].extra_flags);
- }
-
- pandecode_indent--;
- pandecode_log("}, \n");
-
- if (mode == MALI_ATTR_NPOT_DIVIDE) {
- i++;
- pandecode_log("{\n");
- pandecode_indent++;
- pandecode_prop("unk = 0x%x", attr[i].unk);
- pandecode_prop("magic_divisor = 0x%08x", attr[i].magic_divisor);
- if (attr[i].zero != 0)
- pandecode_prop("zero = 0x%x /* XXX zero tripped */", attr[i].zero);
- pandecode_prop("divisor = %d", attr[i].divisor);
- pandecode_magic_divisor(attr[i].magic_divisor, attr[i - 1].shift, attr[i].divisor, attr[i - 1].extra_flags);
- pandecode_indent--;
- pandecode_log("}, \n");
- }
-
- }
-
- pandecode_indent--;
- pandecode_log("};\n");
-}
-
-static mali_ptr
-pandecode_replay_shader_address(const char *name, mali_ptr ptr)
-{
- /* TODO: Decode flags */
- mali_ptr shader_ptr = ptr & ~15;
-
- char *a = pointer_as_memory_reference(shader_ptr);
- pandecode_prop("%s = (%s) | %d", name, a, (int) (ptr & 15));
- free(a);
-
- return shader_ptr;
-}
-
-static bool
-all_zero(unsigned *buffer, unsigned count)
-{
- for (unsigned i = 0; i < count; ++i) {
- if (buffer[i])
- return false;
- }
-
- return true;
-}
-
-static void
-pandecode_replay_stencil(const char *name, const struct mali_stencil_test *stencil)
-{
-	if (all_zero((unsigned *) stencil, sizeof(*stencil) / sizeof(unsigned)))
- return;
-
- const char *func = pandecode_func_name(stencil->func);
- const char *sfail = pandecode_stencil_op_name(stencil->sfail);
- const char *dpfail = pandecode_stencil_op_name(stencil->dpfail);
- const char *dppass = pandecode_stencil_op_name(stencil->dppass);
-
- if (stencil->zero)
- pandecode_msg("Stencil zero tripped: %X\n", stencil->zero);
-
- pandecode_log(".stencil_%s = {\n", name);
- pandecode_indent++;
- pandecode_prop("ref = %d", stencil->ref);
- pandecode_prop("mask = 0x%02X", stencil->mask);
- pandecode_prop("func = %s", func);
- pandecode_prop("sfail = %s", sfail);
- pandecode_prop("dpfail = %s", dpfail);
- pandecode_prop("dppass = %s", dppass);
- pandecode_indent--;
- pandecode_log("},\n");
-}
-
-static void
-pandecode_replay_blend_equation(const struct mali_blend_equation *blend)
-{
- if (blend->zero1)
- pandecode_msg("Blend zero tripped: %X\n", blend->zero1);
-
- pandecode_log(".equation = {\n");
- pandecode_indent++;
-
- pandecode_prop("rgb_mode = 0x%X", blend->rgb_mode);
- pandecode_prop("alpha_mode = 0x%X", blend->alpha_mode);
-
- pandecode_log(".color_mask = ");
- pandecode_log_decoded_flags(mask_flag_info, blend->color_mask);
- pandecode_log_cont(",\n");
-
- pandecode_indent--;
- pandecode_log("},\n");
-}
-
-/* Decodes a Bifrost blend constant. See the notes in bifrost_blend_rt */
-
-static float
-decode_bifrost_constant(u16 constant)
-{
- float lo = (float) (constant & 0xFF);
- float hi = (float) (constant >> 8);
-
- return (hi / 255.0) + (lo / 65535.0);
-}
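-
-/* Illustrative decodes: constant = 0xFF00 gives hi = 255 and lo = 0, i.e.
- * roughly 1.0, while 0x8000 gives roughly 0.5. */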
-
-static mali_ptr
-pandecode_bifrost_blend(void *descs, int job_no, int rt_no)
-{
- struct bifrost_blend_rt *b =
- ((struct bifrost_blend_rt *) descs) + rt_no;
-
- pandecode_log("struct bifrost_blend_rt blend_rt_%d_%d = {\n", job_no, rt_no);
- pandecode_indent++;
-
- pandecode_prop("flags = 0x%" PRIx16, b->flags);
- pandecode_prop("constant = 0x%" PRIx8 " /* %f */",
- b->constant, decode_bifrost_constant(b->constant));
-
- /* TODO figure out blend shader enable bit */
- pandecode_replay_blend_equation(&b->equation);
- pandecode_prop("unk2 = 0x%" PRIx16, b->unk2);
- pandecode_prop("index = 0x%" PRIx16, b->index);
- pandecode_prop("shader = 0x%" PRIx32, b->shader);
-
- pandecode_indent--;
- pandecode_log("},\n");
-
- return 0;
-}
-
-static mali_ptr
-pandecode_midgard_blend(union midgard_blend *blend, bool is_shader)
-{
-	if (all_zero((unsigned *) blend, sizeof(*blend) / sizeof(unsigned)))
- return 0;
-
- pandecode_log(".blend = {\n");
- pandecode_indent++;
-
- if (is_shader) {
- pandecode_replay_shader_address("shader", blend->shader);
- } else {
- pandecode_replay_blend_equation(&blend->equation);
- pandecode_prop("constant = %f", blend->constant);
- }
-
- pandecode_indent--;
- pandecode_log("},\n");
-
- /* Return blend shader to disassemble if present */
- return is_shader ? (blend->shader & ~0xF) : 0;
-}
-
-static mali_ptr
-pandecode_midgard_blend_mrt(void *descs, int job_no, int rt_no)
-{
- struct midgard_blend_rt *b =
- ((struct midgard_blend_rt *) descs) + rt_no;
-
- /* Flags determine presence of blend shader */
- bool is_shader = (b->flags & 0xF) >= 0x2;
-
- pandecode_log("struct midgard_blend_rt blend_rt_%d_%d = {\n", job_no, rt_no);
- pandecode_indent++;
-
- pandecode_prop("flags = 0x%" PRIx64, b->flags);
-
- mali_ptr shader = pandecode_midgard_blend(&b->blend, is_shader);
-
- pandecode_indent--;
- pandecode_log("};\n");
-
- return shader;
-}
-
-static int
-pandecode_replay_attribute_meta(int job_no, int count, const struct mali_vertex_tiler_postfix *v, bool varying, char *suffix)
-{
- char base[128];
- char *prefix = varying ? "varying" : "attribute";
- unsigned max_index = 0;
- snprintf(base, sizeof(base), "%s_meta", prefix);
-
- pandecode_log("struct mali_attr_meta %s_%d%s[] = {\n", base, job_no, suffix);
- pandecode_indent++;
-
- struct mali_attr_meta *attr_meta;
- mali_ptr p = varying ? (v->varying_meta & ~0xF) : v->attribute_meta;
-
- struct pandecode_mapped_memory *attr_mem = pandecode_find_mapped_gpu_mem_containing(p);
-
- for (int i = 0; i < count; ++i, p += sizeof(struct mali_attr_meta)) {
- attr_meta = pandecode_fetch_gpu_mem(attr_mem, p,
-						    sizeof(*attr_meta));
-
- pandecode_log("{\n");
- pandecode_indent++;
- pandecode_prop("index = %d", attr_meta->index);
-
- if (attr_meta->index > max_index)
- max_index = attr_meta->index;
- pandecode_replay_swizzle(attr_meta->swizzle);
- pandecode_prop("format = %s", pandecode_format_name(attr_meta->format));
-
- pandecode_prop("unknown1 = 0x%" PRIx64, (u64) attr_meta->unknown1);
- pandecode_prop("unknown3 = 0x%" PRIx64, (u64) attr_meta->unknown3);
- pandecode_prop("src_offset = %d", attr_meta->src_offset);
- pandecode_indent--;
- pandecode_log("},\n");
-
- }
-
- pandecode_indent--;
- pandecode_log("};\n");
-
- return max_index;
-}
-
-static void
-pandecode_replay_indices(uintptr_t pindices, uint32_t index_count, int job_no)
-{
- struct pandecode_mapped_memory *imem = pandecode_find_mapped_gpu_mem_containing(pindices);
-
- if (imem) {
- /* Indices are literally just a u32 array :) */
-
- uint32_t *PANDECODE_PTR_VAR(indices, imem, pindices);
-
- pandecode_log("uint32_t indices_%d[] = {\n", job_no);
- pandecode_indent++;
-
- for (unsigned i = 0; i < (index_count + 1); i += 3)
- pandecode_log("%d, %d, %d,\n",
- indices[i],
- indices[i + 1],
- indices[i + 2]);
-
- pandecode_indent--;
- pandecode_log("};\n");
- }
-}
-
-/* return bits [lo, hi) of word */
-static u32
-bits(u32 word, u32 lo, u32 hi)
-{
- if (hi - lo >= 32)
- return word; // avoid undefined behavior with the shift
-
- return (word >> lo) & ((1 << (hi - lo)) - 1);
-}
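-
-/* Illustrative usage (hypothetical values): for a packed word such as
- *
- *     u32 word = 0x34;
- *     assert(bits(word, 2, 5) == 5);  // (0x34 >> 2) & 0x7
- *
- * which is how the invocation_count fields are unpacked below. */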
-
-static void
-pandecode_replay_vertex_tiler_prefix(struct mali_vertex_tiler_prefix *p, int job_no)
-{
- pandecode_log_cont("{\n");
- pandecode_indent++;
-
- pandecode_prop("invocation_count = 0x%" PRIx32, p->invocation_count);
- pandecode_prop("size_y_shift = %d", p->size_y_shift);
- pandecode_prop("size_z_shift = %d", p->size_z_shift);
- pandecode_prop("workgroups_x_shift = %d", p->workgroups_x_shift);
- pandecode_prop("workgroups_y_shift = %d", p->workgroups_y_shift);
- pandecode_prop("workgroups_z_shift = %d", p->workgroups_z_shift);
- pandecode_prop("workgroups_x_shift_2 = 0x%" PRIx32, p->workgroups_x_shift_2);
-
- /* Decode invocation_count. See the comment before the definition of
- * invocation_count for an explanation.
- */
- pandecode_msg("size: (%d, %d, %d)\n",
- bits(p->invocation_count, 0, p->size_y_shift) + 1,
- bits(p->invocation_count, p->size_y_shift, p->size_z_shift) + 1,
- bits(p->invocation_count, p->size_z_shift,
- p->workgroups_x_shift) + 1);
- pandecode_msg("workgroups: (%d, %d, %d)\n",
- bits(p->invocation_count, p->workgroups_x_shift,
- p->workgroups_y_shift) + 1,
- bits(p->invocation_count, p->workgroups_y_shift,
- p->workgroups_z_shift) + 1,
- bits(p->invocation_count, p->workgroups_z_shift,
- 32) + 1);
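-        /* In other words, the invocation_count word packs six fields back to
-         * back (local size x/y/z, then workgroup count x/y/z), each stored
-         * minus one; each *_shift field gives the bit at which the
-         * correspondingly named field starts.
-         */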
-
- /* TODO: Decode */
- if (p->unknown_draw)
- pandecode_prop("unknown_draw = 0x%" PRIx32, p->unknown_draw);
-
- pandecode_prop("workgroups_x_shift_3 = 0x%" PRIx32, p->workgroups_x_shift_3);
-
- pandecode_prop("draw_mode = %s", pandecode_draw_mode_name(p->draw_mode));
-
- /* Index count only exists for tiler jobs anyway */
-
- if (p->index_count)
- pandecode_prop("index_count = MALI_POSITIVE(%" PRId32 ")", p->index_count + 1);
-
- if (p->negative_start)
- pandecode_prop("negative_start = %d", p->negative_start);
-
- DYN_MEMORY_PROP(p, job_no, indices);
-
- if (p->zero1) {
- pandecode_msg("Zero tripped\n");
- pandecode_prop("zero1 = 0x%" PRIx32, p->zero1);
- }
-
- pandecode_indent--;
- pandecode_log("},\n");
-}
-
-static void
-pandecode_replay_uniform_buffers(mali_ptr pubufs, int ubufs_count, int job_no)
-{
- struct pandecode_mapped_memory *umem = pandecode_find_mapped_gpu_mem_containing(pubufs);
-
- struct mali_uniform_buffer_meta *PANDECODE_PTR_VAR(ubufs, umem, pubufs);
-
- for (int i = 0; i < ubufs_count; i++) {
- mali_ptr ptr = ubufs[i].ptr << 2;
- struct pandecode_mapped_memory *umem2 = pandecode_find_mapped_gpu_mem_containing(ptr);
- uint32_t *PANDECODE_PTR_VAR(ubuf, umem2, ptr);
- char name[50];
- snprintf(name, sizeof(name), "ubuf_%d", i);
- /* The blob uses ubuf 0 to upload internal stuff and
- * uniforms that won't fit/are accessed indirectly, so
- * it puts it in the batchbuffer.
- */
- pandecode_log("uint32_t %s_%d[] = {\n", name, job_no);
- pandecode_indent++;
-
- for (int j = 0; j <= ubufs[i].size; j++) {
- for (int k = 0; k < 4; k++) {
- if (k == 0)
- pandecode_log("0x%"PRIx32", ", ubuf[4 * j + k]);
- else
- pandecode_log_cont("0x%"PRIx32", ", ubuf[4 * j + k]);
-
- }
-
- pandecode_log_cont("\n");
- }
-
- pandecode_indent--;
- pandecode_log("};\n");
- }
-
- pandecode_log("struct mali_uniform_buffer_meta uniform_buffers_%d[] = {\n",
- job_no);
- pandecode_indent++;
-
- for (int i = 0; i < ubufs_count; i++) {
- pandecode_log("{\n");
- pandecode_indent++;
- pandecode_prop("size = MALI_POSITIVE(%d)", ubufs[i].size + 1);
- pandecode_prop("ptr = ubuf_%d_%d_p >> 2", i, job_no);
- pandecode_indent--;
- pandecode_log("},\n");
- }
-
- pandecode_indent--;
- pandecode_log("};\n");
-}
-
-static void
-pandecode_replay_scratchpad(uintptr_t pscratchpad, int job_no, char *suffix)
-{
-
- struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(pscratchpad);
-
- struct bifrost_scratchpad *PANDECODE_PTR_VAR(scratchpad, mem, pscratchpad);
-
- if (scratchpad->zero)
- pandecode_msg("XXX scratchpad zero tripped");
-
- pandecode_log("struct bifrost_scratchpad scratchpad_%"PRIx64"_%d%s = {\n", pscratchpad, job_no, suffix);
- pandecode_indent++;
-
- pandecode_prop("flags = 0x%x", scratchpad->flags);
- MEMORY_PROP(scratchpad, gpu_scratchpad);
-
- pandecode_indent--;
- pandecode_log("};\n");
-}
-
-static void
-pandecode_shader_disassemble(mali_ptr shader_ptr, int shader_no, int type,
- bool is_bifrost)
-{
- struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(shader_ptr);
- uint8_t *PANDECODE_PTR_VAR(code, mem, shader_ptr);
-
- /* Compute maximum possible size */
- size_t sz = mem->length - (shader_ptr - mem->gpu_va);
-
- /* Print some boilerplate to clearly denote the assembly (which doesn't
- * obey indentation rules), and actually do the disassembly! */
-
- printf("\n\n");
-
- if (is_bifrost) {
- disassemble_bifrost(code, sz, false);
- } else {
- disassemble_midgard(code, sz);
- }
-
- printf("\n\n");
-}
-
-static void
-pandecode_replay_vertex_tiler_postfix_pre(const struct mali_vertex_tiler_postfix *p,
- int job_no, enum mali_job_type job_type,
- char *suffix, bool is_bifrost)
-{
- mali_ptr shader_meta_ptr = (u64) (uintptr_t) (p->_shader_upper << 4);
- struct pandecode_mapped_memory *attr_mem;
-
- unsigned rt_count = 1;
-
- /* On Bifrost, since the tiler heap (for tiler jobs) and the scratchpad
- * are the only things actually needed from the FBD, vertex/tiler jobs
- * no longer reference the FBD -- instead, this field points to some
- * info about the scratchpad.
- */
- if (is_bifrost)
- pandecode_replay_scratchpad(p->framebuffer & ~FBD_TYPE, job_no, suffix);
- else if (p->framebuffer & MALI_MFBD)
- rt_count = pandecode_replay_mfbd_bfr((u64) ((uintptr_t) p->framebuffer) & FBD_MASK, job_no, false);
- else if (job_type == JOB_TYPE_COMPUTE)
- pandecode_compute_fbd((u64) (uintptr_t) p->framebuffer, job_no);
- else
- pandecode_replay_sfbd((u64) (uintptr_t) p->framebuffer, job_no);
-
- int varying_count = 0, attribute_count = 0, uniform_count = 0, uniform_buffer_count = 0;
- int texture_count = 0, sampler_count = 0;
-
- if (shader_meta_ptr) {
- struct pandecode_mapped_memory *smem = pandecode_find_mapped_gpu_mem_containing(shader_meta_ptr);
- struct mali_shader_meta *PANDECODE_PTR_VAR(s, smem, shader_meta_ptr);
-
- pandecode_log("struct mali_shader_meta shader_meta_%"PRIx64"_%d%s = {\n", shader_meta_ptr, job_no, suffix);
- pandecode_indent++;
-
- /* Save for dumps */
- attribute_count = s->attribute_count;
- varying_count = s->varying_count;
- texture_count = s->texture_count;
- sampler_count = s->sampler_count;
-
- if (is_bifrost) {
- uniform_count = s->bifrost2.uniform_count;
- uniform_buffer_count = s->bifrost1.uniform_buffer_count;
- } else {
- uniform_count = s->midgard1.uniform_count;
- uniform_buffer_count = s->midgard1.uniform_buffer_count;
- }
-
- mali_ptr shader_ptr = pandecode_replay_shader_address("shader", s->shader);
-
- pandecode_prop("texture_count = %" PRId16, s->texture_count);
- pandecode_prop("sampler_count = %" PRId16, s->sampler_count);
- pandecode_prop("attribute_count = %" PRId16, s->attribute_count);
- pandecode_prop("varying_count = %" PRId16, s->varying_count);
-
- if (is_bifrost) {
- pandecode_log(".bifrost1 = {\n");
- pandecode_indent++;
-
- pandecode_prop("uniform_buffer_count = %" PRId32, s->bifrost1.uniform_buffer_count);
- pandecode_prop("unk1 = 0x%" PRIx32, s->bifrost1.unk1);
-
- pandecode_indent--;
- pandecode_log("},\n");
- } else {
- pandecode_log(".midgard1 = {\n");
- pandecode_indent++;
-
- pandecode_prop("uniform_count = %" PRId16, s->midgard1.uniform_count);
- pandecode_prop("uniform_buffer_count = %" PRId16, s->midgard1.uniform_buffer_count);
- pandecode_prop("work_count = %" PRId16, s->midgard1.work_count);
-
- pandecode_log(".flags = ");
- pandecode_log_decoded_flags(shader_midgard1_flag_info, s->midgard1.flags);
- pandecode_log_cont(",\n");
-
- pandecode_prop("unknown2 = 0x%" PRIx32, s->midgard1.unknown2);
-
- pandecode_indent--;
- pandecode_log("},\n");
- }
-
- if (s->depth_units || s->depth_factor) {
- if (is_bifrost)
- pandecode_prop("depth_units = %f", s->depth_units);
- else
- pandecode_prop("depth_units = MALI_NEGATIVE(%f)", s->depth_units - 1.0f);
-
- pandecode_prop("depth_factor = %f", s->depth_factor);
- }
-
- if (s->alpha_coverage) {
- bool invert_alpha_coverage = s->alpha_coverage & 0xFFF0;
- uint16_t inverted_coverage = invert_alpha_coverage ? ~s->alpha_coverage : s->alpha_coverage;
-
- pandecode_prop("alpha_coverage = %sMALI_ALPHA_COVERAGE(%f)",
- invert_alpha_coverage ? "~" : "",
- MALI_GET_ALPHA_COVERAGE(inverted_coverage));
- }
-
- if (s->unknown2_3 || s->unknown2_4) {
- pandecode_log(".unknown2_3 = ");
-
- int unknown2_3 = s->unknown2_3;
- int unknown2_4 = s->unknown2_4;
-
- /* We're not quite sure what these flags mean without the depth test, if anything */
-
- if (unknown2_3 & (MALI_DEPTH_TEST | MALI_DEPTH_FUNC_MASK)) {
- const char *func = pandecode_func_name(MALI_GET_DEPTH_FUNC(unknown2_3));
- unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
-
- pandecode_log_cont("MALI_DEPTH_FUNC(%s) | ", func);
- }
-
- pandecode_log_decoded_flags(u3_flag_info, unknown2_3);
- pandecode_log_cont(",\n");
-
- pandecode_log(".unknown2_4 = ");
- pandecode_log_decoded_flags(u4_flag_info, unknown2_4);
- pandecode_log_cont(",\n");
- }
-
- if (s->stencil_mask_front || s->stencil_mask_back) {
- pandecode_prop("stencil_mask_front = 0x%02X", s->stencil_mask_front);
- pandecode_prop("stencil_mask_back = 0x%02X", s->stencil_mask_back);
- }
-
- pandecode_replay_stencil("front", &s->stencil_front);
- pandecode_replay_stencil("back", &s->stencil_back);
-
- if (is_bifrost) {
- pandecode_log(".bifrost2 = {\n");
- pandecode_indent++;
-
- pandecode_prop("unk3 = 0x%" PRIx32, s->bifrost2.unk3);
- pandecode_prop("preload_regs = 0x%" PRIx32, s->bifrost2.preload_regs);
- pandecode_prop("uniform_count = %" PRId32, s->bifrost2.uniform_count);
- pandecode_prop("unk4 = 0x%" PRIx32, s->bifrost2.unk4);
-
- pandecode_indent--;
- pandecode_log("},\n");
- } else if (s->midgard2.unknown2_7) {
- pandecode_log(".midgard2 = {\n");
- pandecode_indent++;
-
- pandecode_prop("unknown2_7 = 0x%" PRIx32, s->midgard2.unknown2_7);
- pandecode_indent--;
- pandecode_log("},\n");
- }
-
- if (s->unknown2_8)
- pandecode_prop("unknown2_8 = 0x%" PRIx32, s->unknown2_8);
-
- if (!is_bifrost) {
- /* TODO: Blend shaders routing/disasm */
-
- pandecode_midgard_blend(&s->blend, false);
- }
-
- pandecode_indent--;
- pandecode_log("};\n");
-
- /* MRT blend fields are used whenever MFBD is used, with
- * per-RT descriptors */
-
- if (job_type == JOB_TYPE_TILER) {
- void* blend_base = (void *) (s + 1);
-
- for (unsigned i = 0; i < rt_count; i++) {
- mali_ptr shader = 0;
-
- if (is_bifrost)
- shader = pandecode_bifrost_blend(blend_base, job_no, i);
- else
- shader = pandecode_midgard_blend_mrt(blend_base, job_no, i);
-
- if (shader)
- pandecode_shader_disassemble(shader, job_no, job_type, false);
- }
- }
-
- pandecode_shader_disassemble(shader_ptr, job_no, job_type, is_bifrost);
- } else
- pandecode_msg("<no shader>\n");
-
- if (p->viewport) {
- struct pandecode_mapped_memory *fmem = pandecode_find_mapped_gpu_mem_containing(p->viewport);
- struct mali_viewport *PANDECODE_PTR_VAR(f, fmem, p->viewport);
-
- pandecode_log("struct mali_viewport viewport_%d%s = {\n", job_no, suffix);
- pandecode_indent++;
-
- pandecode_prop("clip_minx = %f", f->clip_minx);
- pandecode_prop("clip_miny = %f", f->clip_miny);
- pandecode_prop("clip_minz = %f", f->clip_minz);
- pandecode_prop("clip_maxx = %f", f->clip_maxx);
- pandecode_prop("clip_maxy = %f", f->clip_maxy);
- pandecode_prop("clip_maxz = %f", f->clip_maxz);
-
- /* Only the higher coordinates are MALI_POSITIVE scaled */
-
- pandecode_prop("viewport0 = { %d, %d }",
- f->viewport0[0], f->viewport0[1]);
-
- pandecode_prop("viewport1 = { MALI_POSITIVE(%d), MALI_POSITIVE(%d) }",
- f->viewport1[0] + 1, f->viewport1[1] + 1);
-
- pandecode_indent--;
- pandecode_log("};\n");
- }
-
- if (p->attribute_meta) {
- unsigned max_attr_index = pandecode_replay_attribute_meta(job_no, attribute_count, p, false, suffix);
-
- attr_mem = pandecode_find_mapped_gpu_mem_containing(p->attributes);
- pandecode_replay_attributes(attr_mem, p->attributes, job_no, suffix, max_attr_index + 1, false);
- }
-
- /* Varyings are encoded like attributes but not actually sent; we just
- * pass a zero buffer with the right stride/size set, (or whatever)
- * since the GPU will write to it itself */
-
- if (p->varyings) {
- attr_mem = pandecode_find_mapped_gpu_mem_containing(p->varyings);
-
- /* Number of descriptors depends on whether there are
- * non-internal varyings */
-
- pandecode_replay_attributes(attr_mem, p->varyings, job_no, suffix, varying_count > 1 ? 4 : 1, true);
- }
-
- if (p->varying_meta) {
- pandecode_replay_attribute_meta(job_no, varying_count, p, true, suffix);
- }
-
- bool is_compute = job_type == JOB_TYPE_COMPUTE;
-
- if (p->uniforms && !is_compute) {
- int rows = uniform_count, width = 4;
- size_t sz = rows * width * sizeof(float);
-
- struct pandecode_mapped_memory *uniform_mem = pandecode_find_mapped_gpu_mem_containing(p->uniforms);
- pandecode_fetch_gpu_mem(uniform_mem, p->uniforms, sz);
- u32 *PANDECODE_PTR_VAR(uniforms, uniform_mem, p->uniforms);
-
- pandecode_log("u32 uniforms_%d%s[] = {\n", job_no, suffix);
-
- pandecode_indent++;
-
- for (int row = 0; row < rows; row++) {
- for (int i = 0; i < width; i++) {
- u32 v = uniforms[i];
- float f;
- memcpy(&f, &v, sizeof(v));
- pandecode_log_cont("%X /* %f */, ", v, f);
- }
-
- pandecode_log_cont("\n");
-
- uniforms += width;
- }
-
- pandecode_indent--;
- pandecode_log("};\n");
- } else if (p->uniforms) {
- int rows = uniform_count * 2;
- size_t sz = rows * sizeof(mali_ptr);
-
- struct pandecode_mapped_memory *uniform_mem = pandecode_find_mapped_gpu_mem_containing(p->uniforms);
- pandecode_fetch_gpu_mem(uniform_mem, p->uniforms, sz);
- mali_ptr *PANDECODE_PTR_VAR(uniforms, uniform_mem, p->uniforms);
-
- pandecode_log("mali_ptr uniforms_%d%s[] = {\n", job_no, suffix);
-
- pandecode_indent++;
-
- for (int row = 0; row < rows; row++) {
- char *a = pointer_as_memory_reference(uniforms[row]);
- pandecode_log("%s,\n", a);
- free(a);
- }
-
- pandecode_indent--;
- pandecode_log("};\n");
-
- }
-
- if (p->uniform_buffers) {
- pandecode_replay_uniform_buffers(p->uniform_buffers, uniform_buffer_count, job_no);
- }
-
- if (p->texture_trampoline) {
- struct pandecode_mapped_memory *mmem = pandecode_find_mapped_gpu_mem_containing(p->texture_trampoline);
-
- if (mmem) {
- mali_ptr *PANDECODE_PTR_VAR(u, mmem, p->texture_trampoline);
-
- pandecode_log("uint64_t texture_trampoline_%d[] = {\n", job_no);
- pandecode_indent++;
-
- for (int tex = 0; tex < texture_count; ++tex) {
- mali_ptr *PANDECODE_PTR_VAR(u, mmem, p->texture_trampoline + tex * sizeof(mali_ptr));
- char *a = pointer_as_memory_reference(*u);
- pandecode_log("%s,\n", a);
- free(a);
- }
-
- pandecode_indent--;
- pandecode_log("};\n");
-
- /* Now, finally, descend down into the texture descriptor */
- for (int tex = 0; tex < texture_count; ++tex) {
- mali_ptr *PANDECODE_PTR_VAR(u, mmem, p->texture_trampoline + tex * sizeof(mali_ptr));
- struct pandecode_mapped_memory *tmem = pandecode_find_mapped_gpu_mem_containing(*u);
-
- if (tmem) {
- struct mali_texture_descriptor *PANDECODE_PTR_VAR(t, tmem, *u);
-
- pandecode_log("struct mali_texture_descriptor texture_descriptor_%"PRIx64"_%d_%d = {\n", *u, job_no, tex);
- pandecode_indent++;
-
- pandecode_prop("width = MALI_POSITIVE(%" PRId16 ")", t->width + 1);
- pandecode_prop("height = MALI_POSITIVE(%" PRId16 ")", t->height + 1);
- pandecode_prop("depth = MALI_POSITIVE(%" PRId16 ")", t->depth + 1);
- pandecode_prop("array_size = MALI_POSITIVE(%" PRId16 ")", t->array_size + 1);
- pandecode_prop("unknown3 = %" PRId16, t->unknown3);
- pandecode_prop("unknown3A = %" PRId8, t->unknown3A);
- pandecode_prop("nr_mipmap_levels = %" PRId8, t->nr_mipmap_levels);
-
- struct mali_texture_format f = t->format;
-
- pandecode_log(".format = {\n");
- pandecode_indent++;
-
- pandecode_replay_swizzle(f.swizzle);
- pandecode_prop("format = %s", pandecode_format_name(f.format));
- pandecode_prop("type = %s", pandecode_texture_type(f.type));
- pandecode_prop("srgb = %" PRId32, f.srgb);
- pandecode_prop("unknown1 = %" PRId32, f.unknown1);
- pandecode_prop("usage2 = 0x%" PRIx32, f.usage2);
-
- pandecode_indent--;
- pandecode_log("},\n");
-
- pandecode_replay_swizzle(t->swizzle);
-
- if (t->swizzle_zero) {
- /* Shouldn't happen */
- pandecode_msg("Swizzle zero tripped but replay will be fine anyway");
- pandecode_prop("swizzle_zero = %d", t->swizzle_zero);
- }
-
- pandecode_prop("unknown3 = 0x%" PRIx32, t->unknown3);
-
- pandecode_prop("unknown5 = 0x%" PRIx32, t->unknown5);
- pandecode_prop("unknown6 = 0x%" PRIx32, t->unknown6);
- pandecode_prop("unknown7 = 0x%" PRIx32, t->unknown7);
-
- pandecode_log(".payload = {\n");
- pandecode_indent++;
-
- /* A bunch of bitmap pointers follow.
- * We work out the correct number,
- * based on the mipmap/cubemap
- * properties, but dump extra
- * possibilities to futureproof */
-
- int bitmap_count = MALI_NEGATIVE(t->nr_mipmap_levels);
- bool manual_stride = f.usage2 & MALI_TEX_MANUAL_STRIDE;
-
- /* Miptree for each face */
- if (f.type == MALI_TEX_CUBE)
- bitmap_count *= 6;
-
- /* Array of textures */
- bitmap_count *= MALI_NEGATIVE(t->array_size);
-
- /* Stride for each element */
- if (manual_stride)
- bitmap_count *= 2;
-
- /* Sanity check the size */
- int max_count = sizeof(t->payload) / sizeof(t->payload[0]);
- assert (bitmap_count <= max_count);
-
- /* Dump more to be safe, but not _that_ much more */
- int safe_count = MIN2(bitmap_count * 2, max_count);
-
- for (int i = 0; i < safe_count; ++i) {
- char *prefix = (i >= bitmap_count) ? "// " : "";
-
- /* How we dump depends if this is a stride or a pointer */
-
- if ((f.usage2 & MALI_TEX_MANUAL_STRIDE) && (i & 1)) {
- /* signed 32-bit snuck in as a 64-bit pointer */
- uint64_t stride_set = t->payload[i];
- uint32_t clamped_stride = stride_set;
- int32_t stride = clamped_stride;
- assert(stride_set == clamped_stride);
- pandecode_log("%s(mali_ptr) %d /* stride */, \n", prefix, stride);
- } else {
- char *a = pointer_as_memory_reference(t->payload[i]);
- pandecode_log("%s%s, \n", prefix, a);
- free(a);
- }
- }
-
- pandecode_indent--;
- pandecode_log("},\n");
-
- pandecode_indent--;
- pandecode_log("};\n");
- }
- }
- }
- }
-
- if (p->sampler_descriptor) {
- struct pandecode_mapped_memory *smem = pandecode_find_mapped_gpu_mem_containing(p->sampler_descriptor);
-
- if (smem) {
- struct mali_sampler_descriptor *s;
-
- mali_ptr d = p->sampler_descriptor;
-
- for (int i = 0; i < sampler_count; ++i) {
- s = pandecode_fetch_gpu_mem(smem, d + sizeof(*s) * i, sizeof(*s));
-
- pandecode_log("struct mali_sampler_descriptor sampler_descriptor_%d_%d = {\n", job_no, i);
- pandecode_indent++;
-
- /* Only the lower two bits are understood right now; the rest we display as hex */
- pandecode_log(".filter_mode = MALI_TEX_MIN(%s) | MALI_TEX_MAG(%s) | 0x%" PRIx32",\n",
- MALI_FILTER_NAME(s->filter_mode & MALI_TEX_MIN_MASK),
- MALI_FILTER_NAME(s->filter_mode & MALI_TEX_MAG_MASK),
- s->filter_mode & ~3);
-
- pandecode_prop("min_lod = FIXED_16(%f)", DECODE_FIXED_16(s->min_lod));
- pandecode_prop("max_lod = FIXED_16(%f)", DECODE_FIXED_16(s->max_lod));
-
- pandecode_prop("wrap_s = %s", pandecode_wrap_mode_name(s->wrap_s));
- pandecode_prop("wrap_t = %s", pandecode_wrap_mode_name(s->wrap_t));
- pandecode_prop("wrap_r = %s", pandecode_wrap_mode_name(s->wrap_r));
-
- pandecode_prop("compare_func = %s", pandecode_alt_func_name(s->compare_func));
-
- if (s->zero || s->zero2) {
- pandecode_msg("Zero tripped\n");
- pandecode_prop("zero = 0x%X, 0x%X\n", s->zero, s->zero2);
- }
-
- pandecode_prop("seamless_cube_map = %d", s->seamless_cube_map);
-
- pandecode_prop("border_color = { %f, %f, %f, %f }",
- s->border_color[0],
- s->border_color[1],
- s->border_color[2],
- s->border_color[3]);
-
- pandecode_indent--;
- pandecode_log("};\n");
- }
- }
- }
-}
-
-static void
-pandecode_replay_vertex_tiler_postfix(const struct mali_vertex_tiler_postfix *p, int job_no, bool is_bifrost)
-{
- pandecode_log_cont("{\n");
- pandecode_indent++;
-
- MEMORY_PROP(p, position_varying);
- DYN_MEMORY_PROP(p, job_no, uniform_buffers);
- DYN_MEMORY_PROP(p, job_no, texture_trampoline);
- DYN_MEMORY_PROP(p, job_no, sampler_descriptor);
- DYN_MEMORY_PROP(p, job_no, uniforms);
- DYN_MEMORY_PROP(p, job_no, attributes);
- DYN_MEMORY_PROP(p, job_no, attribute_meta);
- DYN_MEMORY_PROP(p, job_no, varyings);
- DYN_MEMORY_PROP(p, job_no, varying_meta);
- DYN_MEMORY_PROP(p, job_no, viewport);
- DYN_MEMORY_PROP(p, job_no, occlusion_counter);
-
- if (is_bifrost)
- pandecode_prop("framebuffer = scratchpad_%d_p", job_no);
- else
- pandecode_prop("framebuffer = framebuffer_%d_p | %s", job_no, p->framebuffer & MALI_MFBD ? "MALI_MFBD" : "0");
-
- pandecode_prop("_shader_upper = (shader_meta_%d_p) >> 4", job_no);
- pandecode_prop("flags = %d", p->flags);
-
- pandecode_indent--;
- pandecode_log("},\n");
-}
-
-static void
-pandecode_replay_vertex_only_bfr(struct bifrost_vertex_only *v)
-{
- pandecode_log_cont("{\n");
- pandecode_indent++;
-
- pandecode_prop("unk2 = 0x%x", v->unk2);
-
- if (v->zero0 || v->zero1) {
- pandecode_msg("vertex only zero tripped");
- pandecode_prop("zero0 = 0x%" PRIx32, v->zero0);
- pandecode_prop("zero1 = 0x%" PRIx64, v->zero1);
- }
-
- pandecode_indent--;
- pandecode_log("}\n");
-}
-
-static void
-pandecode_replay_tiler_heap_meta(mali_ptr gpu_va, int job_no)
-{
-
- struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(gpu_va);
- const struct bifrost_tiler_heap_meta *PANDECODE_PTR_VAR(h, mem, gpu_va);
-
- pandecode_log("struct mali_tiler_heap_meta tiler_heap_meta_%d = {\n", job_no);
- pandecode_indent++;
-
- if (h->zero) {
- pandecode_msg("tiler heap zero tripped\n");
- pandecode_prop("zero = 0x%x", h->zero);
- }
-
- for (int i = 0; i < 12; i++) {
- if (h->zeros[i] != 0) {
- pandecode_msg("tiler heap zero %d tripped, value %x\n",
- i, h->zeros[i]);
- }
- }
-
- pandecode_prop("heap_size = 0x%x", h->heap_size);
- MEMORY_PROP(h, tiler_heap_start);
- MEMORY_PROP(h, tiler_heap_free);
-
- /* this might point to the beginning of another buffer, when it's
- * really the end of the tiler heap buffer, so we have to be careful
- * here.
- */
- char *a = pointer_as_memory_reference(h->tiler_heap_end - 1);
- pandecode_prop("tiler_heap_end = %s + 1", a);
- free(a);
-
- pandecode_indent--;
- pandecode_log("};\n");
-}
-
-static void
-pandecode_replay_tiler_meta(mali_ptr gpu_va, int job_no)
-{
- struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(gpu_va);
- const struct bifrost_tiler_meta *PANDECODE_PTR_VAR(t, mem, gpu_va);
-
- pandecode_replay_tiler_heap_meta(t->tiler_heap_meta, job_no);
-
- pandecode_log("struct bifrost_tiler_meta tiler_meta_%d = {\n", job_no);
- pandecode_indent++;
-
- if (t->zero0 || t->zero1) {
- pandecode_msg("tiler meta zero tripped");
- pandecode_prop("zero0 = 0x%" PRIx64, t->zero0);
- pandecode_prop("zero1 = 0x%" PRIx64, t->zero1);
- }
-
- pandecode_prop("hierarchy_mask = 0x%" PRIx16, t->hierarchy_mask);
- pandecode_prop("flags = 0x%" PRIx16, t->flags);
-
- pandecode_prop("width = MALI_POSITIVE(%d)", t->width + 1);
- pandecode_prop("height = MALI_POSITIVE(%d)", t->height + 1);
- DYN_MEMORY_PROP(t, job_no, tiler_heap_meta);
-
- for (int i = 0; i < 12; i++) {
- if (t->zeros[i] != 0) {
- pandecode_msg("tiler heap zero %d tripped, value %" PRIx64 "\n",
- i, t->zeros[i]);
- }
- }
-
- pandecode_indent--;
- pandecode_log("};\n");
-}
-
-static void
-pandecode_replay_gl_enables(uint32_t gl_enables, int job_type)
-{
- pandecode_log(".gl_enables = ");
-
- pandecode_log_decoded_flags(gl_enable_flag_info, gl_enables);
-
- pandecode_log_cont(",\n");
-}
-
-static void
-pandecode_replay_primitive_size(union midgard_primitive_size u, bool constant)
-{
- if (u.pointer == 0x0)
- return;
-
- pandecode_log(".primitive_size = {\n");
- pandecode_indent++;
-
- if (constant) {
- pandecode_prop("constant = %f", u.constant);
- } else {
- MEMORY_PROP((&u), pointer);
- }
-
- pandecode_indent--;
- pandecode_log("},\n");
-}
-
-static void
-pandecode_replay_tiler_only_bfr(const struct bifrost_tiler_only *t, int job_no)
-{
- pandecode_log_cont("{\n");
- pandecode_indent++;
-
- /* TODO: gl_PointSize on Bifrost */
- pandecode_replay_primitive_size(t->primitive_size, true);
-
- DYN_MEMORY_PROP(t, job_no, tiler_meta);
- pandecode_replay_gl_enables(t->gl_enables, JOB_TYPE_TILER);
-
- if (t->zero1 || t->zero2 || t->zero3 || t->zero4 || t->zero5
- || t->zero6 || t->zero7 || t->zero8) {
- pandecode_msg("tiler only zero tripped");
- pandecode_prop("zero1 = 0x%" PRIx64, t->zero1);
- pandecode_prop("zero2 = 0x%" PRIx64, t->zero2);
- pandecode_prop("zero3 = 0x%" PRIx64, t->zero3);
- pandecode_prop("zero4 = 0x%" PRIx64, t->zero4);
- pandecode_prop("zero5 = 0x%" PRIx64, t->zero5);
- pandecode_prop("zero6 = 0x%" PRIx64, t->zero6);
- pandecode_prop("zero7 = 0x%" PRIx32, t->zero7);
- pandecode_prop("zero8 = 0x%" PRIx64, t->zero8);
- }
-
- pandecode_indent--;
- pandecode_log("},\n");
-}
-
-static int
-pandecode_replay_vertex_job_bfr(const struct mali_job_descriptor_header *h,
- const struct pandecode_mapped_memory *mem,
- mali_ptr payload, int job_no)
-{
- struct bifrost_payload_vertex *PANDECODE_PTR_VAR(v, mem, payload);
-
- pandecode_replay_vertex_tiler_postfix_pre(&v->postfix, job_no, h->job_type, "", true);
-
- pandecode_log("struct bifrost_payload_vertex payload_%d = {\n", job_no);
- pandecode_indent++;
-
- pandecode_log(".prefix = ");
- pandecode_replay_vertex_tiler_prefix(&v->prefix, job_no);
-
- pandecode_log(".vertex = ");
- pandecode_replay_vertex_only_bfr(&v->vertex);
-
- pandecode_log(".postfix = ");
- pandecode_replay_vertex_tiler_postfix(&v->postfix, job_no, true);
-
- pandecode_indent--;
- pandecode_log("};\n");
-
- return sizeof(*v);
-}
-
-static int
-pandecode_replay_tiler_job_bfr(const struct mali_job_descriptor_header *h,
- const struct pandecode_mapped_memory *mem,
- mali_ptr payload, int job_no)
-{
- struct bifrost_payload_tiler *PANDECODE_PTR_VAR(t, mem, payload);
-
- pandecode_replay_vertex_tiler_postfix_pre(&t->postfix, job_no, h->job_type, "", true);
-
- pandecode_replay_indices(t->prefix.indices, t->prefix.index_count, job_no);
- pandecode_replay_tiler_meta(t->tiler.tiler_meta, job_no);
-
- pandecode_log("struct bifrost_payload_tiler payload_%d = {\n", job_no);
- pandecode_indent++;
-
- pandecode_log(".prefix = ");
- pandecode_replay_vertex_tiler_prefix(&t->prefix, job_no);
-
- pandecode_log(".tiler = ");
- pandecode_replay_tiler_only_bfr(&t->tiler, job_no);
-
- pandecode_log(".postfix = ");
- pandecode_replay_vertex_tiler_postfix(&t->postfix, job_no, true);
-
- pandecode_indent--;
- pandecode_log("};\n");
-
- return sizeof(*t);
-}
-
-static int
-pandecode_replay_vertex_or_tiler_job_mdg(const struct mali_job_descriptor_header *h,
- const struct pandecode_mapped_memory *mem,
- mali_ptr payload, int job_no)
-{
- struct midgard_payload_vertex_tiler *PANDECODE_PTR_VAR(v, mem, payload);
-
- pandecode_replay_vertex_tiler_postfix_pre(&v->postfix, job_no, h->job_type, "", false);
-
- pandecode_replay_indices(v->prefix.indices, v->prefix.index_count, job_no);
-
- pandecode_log("struct midgard_payload_vertex_tiler payload_%d = {\n", job_no);
- pandecode_indent++;
-
- bool has_primitive_pointer = v->prefix.unknown_draw & MALI_DRAW_VARYING_SIZE;
- pandecode_replay_primitive_size(v->primitive_size, !has_primitive_pointer);
-
- pandecode_log(".prefix = ");
- pandecode_replay_vertex_tiler_prefix(&v->prefix, job_no);
-
- pandecode_replay_gl_enables(v->gl_enables, h->job_type);
-
- if (v->instance_shift || v->instance_odd) {
- pandecode_prop("instance_shift = 0x%d /* %d */",
- v->instance_shift, 1 << v->instance_shift);
- pandecode_prop("instance_odd = 0x%X /* %d */",
- v->instance_odd, (2 * v->instance_odd) + 1);
-
- pandecode_padded_vertices(v->instance_shift, v->instance_odd);
- }
-
- if (v->draw_start)
- pandecode_prop("draw_start = %d", v->draw_start);
-
-#ifndef __LP64__
-
- if (v->zero3) {
- pandecode_msg("Zero tripped\n");
- pandecode_prop("zero3 = 0x%" PRIx32, v->zero3);
- }
-
-#endif
-
- if (v->zero5) {
- pandecode_msg("Zero tripped\n");
- pandecode_prop("zero5 = 0x%" PRIx64, v->zero5);
- }
-
- pandecode_log(".postfix = ");
- pandecode_replay_vertex_tiler_postfix(&v->postfix, job_no, false);
-
- pandecode_indent--;
- pandecode_log("};\n");
-
- return sizeof(*v);
-}
-
-static int
-pandecode_replay_fragment_job(const struct pandecode_mapped_memory *mem,
- mali_ptr payload, int job_no,
- bool is_bifrost)
-{
- const struct mali_payload_fragment *PANDECODE_PTR_VAR(s, mem, payload);
-
- bool fbd_dumped = false;
-
- if (!is_bifrost && (s->framebuffer & FBD_TYPE) == MALI_SFBD) {
- /* Only SFBDs are understood, not MFBDs. We're speculating,
- * based on the versioning, kernel code, etc, that the
- * difference is between Single FrameBuffer Descriptor and
-                 * Multiple FrameBuffer Descriptor; the change apparently lines
- * up with multi-framebuffer support being added (T7xx onwards,
- * including Gxx). In any event, there's some field shuffling
- * that we haven't looked into yet. */
-
- pandecode_replay_sfbd(s->framebuffer & FBD_MASK, job_no);
- fbd_dumped = true;
- } else if ((s->framebuffer & FBD_TYPE) == MALI_MFBD) {
- /* We don't know if Bifrost supports SFBD's at all, since the
- * driver never uses them. And the format is different from
- * Midgard anyways, due to the tiler heap and scratchpad being
- * moved out into separate structures, so it's not clear what a
-                attr_meta = pandecode_fetch_gpu_mem(attr_mem, p,
-                                                    sizeof(*attr_meta));
- */
-
- pandecode_replay_mfbd_bfr(s->framebuffer & FBD_MASK, job_no, true);
- fbd_dumped = true;
- }
-
- uintptr_t p = (uintptr_t) s->framebuffer & FBD_MASK;
- pandecode_log("struct mali_payload_fragment payload_%"PRIx64"_%d = {\n", payload, job_no);
- pandecode_indent++;
-
- /* See the comments by the macro definitions for mathematical context
- * on why this is so weird */
-
- if (MALI_TILE_COORD_FLAGS(s->max_tile_coord) || MALI_TILE_COORD_FLAGS(s->min_tile_coord))
- pandecode_msg("Tile coordinate flag missed, replay wrong\n");
-
- pandecode_prop("min_tile_coord = MALI_COORDINATE_TO_TILE_MIN(%d, %d)",
- MALI_TILE_COORD_X(s->min_tile_coord) << MALI_TILE_SHIFT,
- MALI_TILE_COORD_Y(s->min_tile_coord) << MALI_TILE_SHIFT);
-
- pandecode_prop("max_tile_coord = MALI_COORDINATE_TO_TILE_MAX(%d, %d)",
- (MALI_TILE_COORD_X(s->max_tile_coord) + 1) << MALI_TILE_SHIFT,
- (MALI_TILE_COORD_Y(s->max_tile_coord) + 1) << MALI_TILE_SHIFT);
-
- /* If the FBD was just decoded, we can refer to it by pointer. If not,
- * we have to fallback on offsets. */
-
- const char *fbd_type = s->framebuffer & MALI_MFBD ? "MALI_MFBD" : "MALI_SFBD";
-
- if (fbd_dumped)
- pandecode_prop("framebuffer = framebuffer_%d_p | %s", job_no, fbd_type);
- else
- pandecode_prop("framebuffer = %s | %s", pointer_as_memory_reference(p), fbd_type);
-
- pandecode_indent--;
- pandecode_log("};\n");
-
- return sizeof(*s);
-}
-
-static int job_descriptor_number = 0;
-
-int
-pandecode_replay_jc(mali_ptr jc_gpu_va, bool bifrost)
-{
- struct mali_job_descriptor_header *h;
-
- int start_number = 0;
-
- bool first = true;
- bool last_size;
-
- do {
- struct pandecode_mapped_memory *mem =
- pandecode_find_mapped_gpu_mem_containing(jc_gpu_va);
-
- void *payload;
-
- h = PANDECODE_PTR(mem, jc_gpu_va, struct mali_job_descriptor_header);
-
- /* On Midgard, for 32-bit jobs except for fragment jobs, the
- * high 32-bits of the 64-bit pointer are reused to store
- * something else.
- */
- int offset = h->job_descriptor_size == MALI_JOB_32 &&
- h->job_type != JOB_TYPE_FRAGMENT ? 4 : 0;
- mali_ptr payload_ptr = jc_gpu_va + sizeof(*h) - offset;
-
- payload = pandecode_fetch_gpu_mem(mem, payload_ptr,
- MALI_PAYLOAD_SIZE);
-
- int job_no = job_descriptor_number++;
-
- if (first)
- start_number = job_no;
-
- pandecode_log("struct mali_job_descriptor_header job_%"PRIx64"_%d = {\n", jc_gpu_va, job_no);
- pandecode_indent++;
-
- pandecode_prop("job_type = %s", pandecode_job_type_name(h->job_type));
-
- /* Save for next job fixing */
- last_size = h->job_descriptor_size;
-
- if (h->job_descriptor_size)
- pandecode_prop("job_descriptor_size = %d", h->job_descriptor_size);
-
- if (h->exception_status != 0x1)
- pandecode_prop("exception_status = %x (source ID: 0x%x access: 0x%x exception: 0x%x)",
- h->exception_status,
- (h->exception_status >> 16) & 0xFFFF,
- (h->exception_status >> 8) & 0x3,
- h->exception_status & 0xFF);
-
- if (h->first_incomplete_task)
- pandecode_prop("first_incomplete_task = %d", h->first_incomplete_task);
-
- if (h->fault_pointer)
- pandecode_prop("fault_pointer = 0x%" PRIx64, h->fault_pointer);
-
- if (h->job_barrier)
- pandecode_prop("job_barrier = %d", h->job_barrier);
-
- pandecode_prop("job_index = %d", h->job_index);
-
- if (h->unknown_flags)
- pandecode_prop("unknown_flags = %d", h->unknown_flags);
-
- if (h->job_dependency_index_1)
- pandecode_prop("job_dependency_index_1 = %d", h->job_dependency_index_1);
-
- if (h->job_dependency_index_2)
- pandecode_prop("job_dependency_index_2 = %d", h->job_dependency_index_2);
-
- pandecode_indent--;
- pandecode_log("};\n");
-
- /* Do not touch the field yet -- decode the payload first, and
- * don't touch that either. This is essential for the uploads
- * to occur in sequence and therefore be dynamically allocated
- * correctly. Do note the size, however, for that related
- * reason. */
-
- switch (h->job_type) {
- case JOB_TYPE_SET_VALUE: {
- struct mali_payload_set_value *s = payload;
- pandecode_log("struct mali_payload_set_value payload_%"PRIx64"_%d = {\n", payload_ptr, job_no);
- pandecode_indent++;
- MEMORY_PROP(s, out);
- pandecode_prop("unknown = 0x%" PRIX64, s->unknown);
- pandecode_indent--;
- pandecode_log("};\n");
-
- break;
- }
-
- case JOB_TYPE_TILER:
- case JOB_TYPE_VERTEX:
- case JOB_TYPE_COMPUTE:
- if (bifrost) {
- if (h->job_type == JOB_TYPE_TILER)
- pandecode_replay_tiler_job_bfr(h, mem, payload_ptr, job_no);
- else
- pandecode_replay_vertex_job_bfr(h, mem, payload_ptr, job_no);
- } else
- pandecode_replay_vertex_or_tiler_job_mdg(h, mem, payload_ptr, job_no);
-
- break;
-
- case JOB_TYPE_FRAGMENT:
- pandecode_replay_fragment_job(mem, payload_ptr, job_no, bifrost);
- break;
-
- default:
- break;
- }
-
- /* Handle linkage */
-
- if (!first) {
- pandecode_log("((struct mali_job_descriptor_header *) (uintptr_t) job_%d_p)->", job_no - 1);
-
- if (last_size)
- pandecode_log_cont("next_job_64 = job_%d_p;\n\n", job_no);
- else
- pandecode_log_cont("next_job_32 = (u32) (uintptr_t) job_%d_p;\n\n", job_no);
- }
-
- first = false;
-
- } while ((jc_gpu_va = h->job_descriptor_size ? h->next_job_64 : h->next_job_32));
-
- return start_number;
-}
+++ /dev/null
-/*
- * Copyright (C) 2017-2019 Lyude Paul
- * Copyright (C) 2017-2019 Alyssa Rosenzweig
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#ifndef __PAN_DECODE_H__
-#define __PAN_DECODE_H__
-
-#include <stdlib.h>
-#include <stddef.h>
-#include <panfrost-job.h>
-#include "util/list.h"
-
-struct pandecode_mapped_memory {
- struct list_head node;
-
- size_t length;
-
- void *addr;
- mali_ptr gpu_va;
-
- char name[32];
-};
-
-void pandecode_initialize(void);
-
-char *pointer_as_memory_reference(mali_ptr ptr);
-
-struct pandecode_mapped_memory *pandecode_find_mapped_gpu_mem_containing(mali_ptr addr);
-
-void
-pandecode_inject_mmap(mali_ptr gpu_va, void *cpu, unsigned sz, const char *name);
-
-static inline void *
-__pandecode_fetch_gpu_mem(const struct pandecode_mapped_memory *mem,
- mali_ptr gpu_va, size_t size,
- int line, const char *filename)
-{
- if (!mem)
- mem = pandecode_find_mapped_gpu_mem_containing(gpu_va);
-
- if (!mem) {
- fprintf(stderr, "Access to unknown memory %" PRIx64 " in %s:%d",
- gpu_va, filename, line);
- assert(0);
- }
-
- assert(mem);
- assert(size + (gpu_va - mem->gpu_va) <= mem->length);
-
- return mem->addr + gpu_va - mem->gpu_va;
-}
-
-#define pandecode_fetch_gpu_mem(mem, gpu_va, size) \
- __pandecode_fetch_gpu_mem(mem, gpu_va, size, __LINE__, __FILE__)
-
-/* Returns a validated pointer to mapped GPU memory with the given pointer type,
- * size automatically determined from the pointer type
- */
-#define PANDECODE_PTR(mem, gpu_va, type) \
- ((type*)(__pandecode_fetch_gpu_mem(mem, gpu_va, sizeof(type), \
- __LINE__, __FILE__)))
-
-/* Usage: <variable type> PANDECODE_PTR_VAR(name, mem, gpu_va) */
-#define PANDECODE_PTR_VAR(name, mem, gpu_va) \
- name = __pandecode_fetch_gpu_mem(mem, gpu_va, sizeof(*name), \
- __LINE__, __FILE__)
-
-/* Common entrypoint */
-int pandecode_replay_jc(mali_ptr jc_gpu_va, bool bifrost);
-
-#endif /* __PAN_DECODE_H__ */
--- /dev/null
+/*
+ * Copyright (C) 2019 Connor Abbott <cwabbott0@gmail.com>
+ * Copyright (C) 2019 Lyude Paul <thatslyude@gmail.com>
+ * Copyright (C) 2019 Ryan Houdek <Sonicadvance1@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __bifrost_h__
+#define __bifrost_h__
+
+#include <stdint.h>
+#include <stdbool.h>
+
+struct bifrost_header {
+ unsigned unk0 : 7;
+ // If true, convert any infinite result of any floating-point operation to
+ // the biggest representable number.
+ unsigned suppress_inf: 1;
+ // Convert any NaN results to 0.
+ unsigned suppress_nan : 1;
+ unsigned unk1 : 2;
+ // true if the execution mask of the next clause is the same as the mask of
+ // the current clause.
+ unsigned back_to_back : 1;
+ unsigned no_end_of_shader: 1;
+ unsigned unk2 : 2;
+ // Set to true for fragment shaders, to implement this bit of spec text
+ // from section 7.1.5 of the GLSL ES spec:
+ //
+ // "Stores to image and buffer variables performed by helper invocations
+ // have no effect on the underlying image or buffer memory."
+ //
+ // Helper invocations are threads (invocations) corresponding to pixels in
+ // a quad that aren't actually part of the triangle, but are included to
+ // make derivatives work correctly. They're usually turned on, but they
+ // need to be masked off for GLSL-level stores. This bit seems to be the
+ // only bit that's actually different between fragment shaders and other
+ // shaders, so this is probably what it's doing.
+ unsigned elide_writes : 1;
+ // If backToBack is off:
+ // - true for conditional branches and fallthrough
+ // - false for unconditional branches
+ // The blob seems to always set it to true if back-to-back is on.
+ unsigned branch_cond : 1;
+ // This bit is set when the next clause writes to the data register of some
+ // previous clause.
+ unsigned datareg_writebarrier: 1;
+ unsigned datareg : 6;
+ unsigned scoreboard_deps: 8;
+ unsigned scoreboard_index: 3;
+ unsigned clause_type: 4;
+ unsigned unk3 : 1; // part of clauseType?
+ unsigned next_clause_type: 4;
+ unsigned unk4 : 1; // part of nextClauseType?
+};
+
+struct bifrost_fma_inst {
+ unsigned src0 : 3;
+ unsigned op : 20;
+};
+
+struct bifrost_add_inst {
+ unsigned src0 : 3;
+ unsigned op : 17;
+};
+
+#endif
--- /dev/null
+/*
+ * Copyright (C) 2019 Ryan Houdek <Sonicadvance1@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "main/mtypes.h"
+#include "compiler/glsl/standalone.h"
+#include "compiler/glsl/glsl_to_nir.h"
+#include "compiler/nir_types.h"
+#include "disassemble.h"
+#include "util/u_dynarray.h"
+
+static void
+disassemble(const char *filename)
+{
+ FILE *fp = fopen(filename, "rb");
+ assert(fp);
+
+ fseek(fp, 0, SEEK_END);
+ int filesize = ftell(fp);
+ rewind(fp);
+
+ unsigned char *code = malloc(filesize);
+ int res = fread(code, 1, filesize, fp);
+ if (res != filesize) {
+ printf("Couldn't read full file\n");
+ }
+ fclose(fp);
+
+ disassemble_bifrost(code, filesize, false);
+ free(code);
+}
+
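+/*
+ * Minimal driver for the standalone tool: the only command understood so far
+ * is "disasm", which reads a raw Bifrost binary from the named file and
+ * prints its disassembly, e.g.
+ *
+ *    <this binary> disasm shader.bin
+ *
+ * where shader.bin stands in for whatever file holds the raw clause words.
+ */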
+int
+main(int argc, char **argv)
+{
+ if (argc < 2) {
+ printf("Pass a command\n");
+ exit(1);
+ }
+ if (strcmp(argv[1], "disasm") == 0) {
+ disassemble(argv[2]);
+ }
+ return 0;
+}
--- /dev/null
+/*
+ * Copyright (C) 2019 Connor Abbott <cwabbott0@gmail.com>
+ * Copyright (C) 2019 Lyude Paul <thatslyude@gmail.com>
+ * Copyright (C) 2019 Ryan Houdek <Sonicadvance1@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <assert.h>
+#include <inttypes.h>
+#include <string.h>
+
+#include "bifrost.h"
+#include "disassemble.h"
+#include "util/macros.h"
+
+// return bits [lo, high) of word
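+// e.g. bits(0xabcd, 4, 12) == 0xbc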
+static uint64_t bits(uint32_t word, unsigned lo, unsigned high)
+{
+ if (high == 32)
+ return word >> lo;
+        return (word & ((1ull << high) - 1)) >> lo;
+}
+
+// each of these structs represents an instruction that's dispatched in one
+// cycle. Note that these instructions are packed in funny ways within the
+// clause, hence the need for a separate struct.
+struct bifrost_alu_inst {
+ uint32_t fma_bits;
+ uint32_t add_bits;
+ uint64_t reg_bits;
+};
+
+struct bifrost_regs {
+ unsigned uniform_const : 8;
+ unsigned reg2 : 6;
+ unsigned reg3 : 6;
+ unsigned reg0 : 5;
+ unsigned reg1 : 6;
+ unsigned ctrl : 4;
+};
+
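+// Decoding of the register ports. When ctrl is 0, part of reg1 is repurposed
+// as control bits and reg0 gains an extra bit from it; otherwise the two
+// fields seem to be order-sensitive: if the encoded reg0 is larger than reg1,
+// both are stored complemented (63 - reg). This is a best-effort reading of
+// the bits rather than anything confirmed.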
+static unsigned get_reg0(struct bifrost_regs regs)
+{
+ if (regs.ctrl == 0)
+ return regs.reg0 | ((regs.reg1 & 0x1) << 5);
+
+ return regs.reg0 <= regs.reg1 ? regs.reg0 : 63 - regs.reg0;
+}
+
+static unsigned get_reg1(struct bifrost_regs regs)
+{
+ return regs.reg0 <= regs.reg1 ? regs.reg1 : 63 - regs.reg1;
+}
+
+enum bifrost_reg_write_unit {
+ REG_WRITE_NONE = 0, // don't write
+ REG_WRITE_TWO, // write using reg2
+ REG_WRITE_THREE, // write using reg3
+};
+
+// this represents the decoded version of the ctrl register field.
+struct bifrost_reg_ctrl {
+ bool read_reg0;
+ bool read_reg1;
+ bool read_reg3;
+ enum bifrost_reg_write_unit fma_write_unit;
+ enum bifrost_reg_write_unit add_write_unit;
+ bool clause_start;
+};
+
+enum fma_src_type {
+ FMA_ONE_SRC,
+ FMA_TWO_SRC,
+ FMA_FADD,
+ FMA_FMINMAX,
+ FMA_FADD16,
+ FMA_FMINMAX16,
+ FMA_FCMP,
+ FMA_FCMP16,
+ FMA_THREE_SRC,
+ FMA_FMA,
+ FMA_FMA16,
+ FMA_FOUR_SRC,
+ FMA_FMA_MSCALE,
+ FMA_SHIFT_ADD64,
+};
+
+struct fma_op_info {
+ unsigned op;
+ char name[30];
+ enum fma_src_type src_type;
+};
+
+enum add_src_type {
+ ADD_ONE_SRC,
+ ADD_TWO_SRC,
+ ADD_FADD,
+ ADD_FMINMAX,
+ ADD_FADD16,
+ ADD_FMINMAX16,
+ ADD_THREE_SRC,
+ ADD_FADDMscale,
+ ADD_FCMP,
+ ADD_FCMP16,
+ ADD_TEX_COMPACT, // texture instruction with embedded sampler
+ ADD_TEX, // texture instruction with sampler/etc. in uniform port
+ ADD_VARYING_INTERP,
+ ADD_BLENDING,
+ ADD_LOAD_ATTR,
+ ADD_VARYING_ADDRESS,
+ ADD_BRANCH,
+};
+
+struct add_op_info {
+ unsigned op;
+ char name[30];
+ enum add_src_type src_type;
+ bool has_data_reg;
+};
+
+struct bifrost_tex_ctrl {
+ unsigned sampler_index : 4; // also used to signal indirects
+ unsigned tex_index : 7;
+ bool no_merge_index : 1; // whether to merge (direct) sampler & texture indices
+ bool filter : 1; // use the usual filtering pipeline (0 for texelFetch & textureGather)
+ unsigned unk0 : 2;
+ bool texel_offset : 1; // *Offset()
+ bool is_shadow : 1;
+ bool is_array : 1;
+ unsigned tex_type : 2; // 2D, 3D, Cube, Buffer
+ bool compute_lod : 1; // 0 for *Lod()
+ bool not_supply_lod : 1; // 0 for *Lod() or when a bias is applied
+ bool calc_gradients : 1; // 0 for *Grad()
+ unsigned unk1 : 1;
+ unsigned result_type : 4; // integer, unsigned, float TODO: why is this 4 bits?
+ unsigned unk2 : 4;
+};
+
+struct bifrost_dual_tex_ctrl {
+ unsigned sampler_index0 : 2;
+ unsigned unk0 : 2;
+ unsigned tex_index0 : 2;
+ unsigned sampler_index1 : 2;
+ unsigned tex_index1 : 2;
+ unsigned unk1 : 22;
+};
+
+enum branch_cond {
+ BR_COND_LT = 0,
+ BR_COND_LE = 1,
+ BR_COND_GE = 2,
+ BR_COND_GT = 3,
+ // Equal vs. not-equal determined by src0/src1 comparison
+ BR_COND_EQ = 4,
+ // floating-point comparisons
+ // Becomes UNE when you flip the arguments
+ BR_COND_OEQ = 5,
+ // TODO what happens when you flip the arguments?
+ BR_COND_OGT = 6,
+ BR_COND_OLT = 7,
+};
+
+enum branch_bit_size {
+ BR_SIZE_32 = 0,
+ BR_SIZE_16XX = 1,
+ BR_SIZE_16YY = 2,
+ // For the above combinations of bitsize and location, an extra bit is
+ // encoded via comparing the sources. The only possible source of ambiguity
+ // would be if the sources were the same, but then the branch condition
+ // would be always true or always false anyways, so we can ignore it. But
+ // this no longer works when comparing the y component to the x component,
+ // since it's valid to compare the y component of a source against its own
+ // x component. Instead, the extra bit is encoded via an extra bitsize.
+ BR_SIZE_16YX0 = 3,
+ BR_SIZE_16YX1 = 4,
+ BR_SIZE_32_AND_16X = 5,
+ BR_SIZE_32_AND_16Y = 6,
+ // Used for comparisons with zero and always-true, see below. I think this
+ // only works for integer comparisons.
+ BR_SIZE_ZERO = 7,
+};
+
+enum branch_code {
+ BR_ALWAYS = 63,
+};
+
+void dump_header(struct bifrost_header header, bool verbose);
+void dump_instr(const struct bifrost_alu_inst *instr, struct bifrost_regs next_regs, uint64_t *consts,
+ unsigned data_reg, unsigned offset, bool verbose);
+bool dump_clause(uint32_t *words, unsigned *size, unsigned offset, bool verbose);
+
+void dump_header(struct bifrost_header header, bool verbose)
+{
+ if (header.clause_type != 0) {
+ printf("id(%du) ", header.scoreboard_index);
+ }
+
+ if (header.scoreboard_deps != 0) {
+ printf("next-wait(");
+ bool first = true;
+ for (unsigned i = 0; i < 8; i++) {
+ if (header.scoreboard_deps & (1 << i)) {
+ if (!first) {
+ printf(", ");
+ }
+ printf("%d", i);
+ first = false;
+ }
+ }
+ printf(") ");
+ }
+
+ if (header.datareg_writebarrier)
+ printf("data-reg-barrier ");
+
+ if (!header.no_end_of_shader)
+ printf("eos ");
+
+ if (!header.back_to_back) {
+ printf("nbb ");
+ if (header.branch_cond)
+ printf("branch-cond ");
+ else
+ printf("branch-uncond ");
+ }
+
+ if (header.elide_writes)
+ printf("we ");
+
+ if (header.suppress_inf)
+ printf("suppress-inf ");
+ if (header.suppress_nan)
+ printf("suppress-nan ");
+
+ if (header.unk0)
+ printf("unk0 ");
+ if (header.unk1)
+ printf("unk1 ");
+ if (header.unk2)
+ printf("unk2 ");
+ if (header.unk3)
+ printf("unk3 ");
+ if (header.unk4)
+ printf("unk4 ");
+
+ printf("\n");
+
+ if (verbose) {
+ printf("# clause type %d, next clause type %d\n",
+ header.clause_type, header.next_clause_type);
+ }
+}
+
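+// Decode the ctrl field of the register block. ctrl == 0 is special: the real
+// control value is stolen from the top of reg1, port 1 is never read and one
+// reg1 bit says whether port 0 is. The switch below then maps the control
+// value to which port (2 or 3) the FMA and ADD units write, whether port 3 is
+// read, and whether this starts a new clause; only the combinations seen so
+// far are filled in, anything else is just reported as unknown.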
+static struct bifrost_reg_ctrl DecodeRegCtrl(struct bifrost_regs regs)
+{
+ struct bifrost_reg_ctrl decoded = {};
+ unsigned ctrl;
+ if (regs.ctrl == 0) {
+ ctrl = regs.reg1 >> 2;
+ decoded.read_reg0 = !(regs.reg1 & 0x2);
+ decoded.read_reg1 = false;
+ } else {
+ ctrl = regs.ctrl;
+ decoded.read_reg0 = decoded.read_reg1 = true;
+ }
+ switch (ctrl) {
+ case 1:
+ decoded.fma_write_unit = REG_WRITE_TWO;
+ break;
+ case 3:
+ decoded.fma_write_unit = REG_WRITE_TWO;
+ decoded.read_reg3 = true;
+ break;
+ case 4:
+ decoded.read_reg3 = true;
+ break;
+ case 5:
+ decoded.add_write_unit = REG_WRITE_TWO;
+ break;
+ case 6:
+ decoded.add_write_unit = REG_WRITE_TWO;
+ decoded.read_reg3 = true;
+ break;
+ case 8:
+ decoded.clause_start = true;
+ break;
+ case 9:
+ decoded.fma_write_unit = REG_WRITE_TWO;
+ decoded.clause_start = true;
+ break;
+ case 11:
+ break;
+ case 12:
+ decoded.read_reg3 = true;
+ decoded.clause_start = true;
+ break;
+ case 13:
+ decoded.add_write_unit = REG_WRITE_TWO;
+ decoded.clause_start = true;
+ break;
+ case 15:
+ decoded.fma_write_unit = REG_WRITE_THREE;
+ decoded.add_write_unit = REG_WRITE_TWO;
+ break;
+ default:
+ printf("# unknown reg ctrl %d\n", ctrl);
+ }
+
+ return decoded;
+}
+
+// Pass in the add_write_unit or fma_write_unit, and this returns which register
+// the ADD/FMA units are writing to
+static unsigned GetRegToWrite(enum bifrost_reg_write_unit unit, struct bifrost_regs regs)
+{
+ switch (unit) {
+ case REG_WRITE_TWO:
+ return regs.reg2;
+ case REG_WRITE_THREE:
+ return regs.reg3;
+ default: /* REG_WRITE_NONE */
+ assert(0);
+ return 0;
+ }
+}
+
+static void dump_regs(struct bifrost_regs srcs)
+{
+ struct bifrost_reg_ctrl ctrl = DecodeRegCtrl(srcs);
+ printf("# ");
+ if (ctrl.read_reg0)
+ printf("port 0: R%d ", get_reg0(srcs));
+ if (ctrl.read_reg1)
+ printf("port 1: R%d ", get_reg1(srcs));
+
+ if (ctrl.fma_write_unit == REG_WRITE_TWO)
+ printf("port 2: R%d (write FMA) ", srcs.reg2);
+ else if (ctrl.add_write_unit == REG_WRITE_TWO)
+ printf("port 2: R%d (write ADD) ", srcs.reg2);
+
+ if (ctrl.fma_write_unit == REG_WRITE_THREE)
+ printf("port 3: R%d (write FMA) ", srcs.reg3);
+ else if (ctrl.add_write_unit == REG_WRITE_THREE)
+ printf("port 3: R%d (write ADD) ", srcs.reg3);
+ else if (ctrl.read_reg3)
+ printf("port 3: R%d (read) ", srcs.reg3);
+
+ if (srcs.uniform_const) {
+ if (srcs.uniform_const & 0x80) {
+ printf("uniform: U%d", (srcs.uniform_const & 0x7f) * 2);
+ }
+ }
+
+ printf("\n");
+}
+
+static void dump_const_imm(uint32_t imm)
+{
+ union {
+ float f;
+ uint32_t i;
+ } fi;
+ fi.i = imm;
+ printf("0x%08x /* %f */", imm, fi.f);
+}
+
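+// Fetch the 64-bit constant selected by the uniform/const field: the upper
+// bits pick one of the constants embedded at the end of the clause (in the
+// odd order handled below) and the low four bits are OR'd in. What those low
+// bits mean at the hardware level is not clear yet.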
+static uint64_t get_const(uint64_t *consts, struct bifrost_regs srcs)
+{
+ unsigned low_bits = srcs.uniform_const & 0xf;
+ uint64_t imm;
+ switch (srcs.uniform_const >> 4) {
+ case 4: imm = consts[0]; break;
+ case 5: imm = consts[1]; break;
+ case 6: imm = consts[2]; break;
+ case 7: imm = consts[3]; break;
+ case 2: imm = consts[4]; break;
+ case 3: imm = consts[5]; break;
+ default: assert(0); break;
+ }
+ return imm | low_bits;
+}
+
+static void dump_uniform_const_src(struct bifrost_regs srcs, uint64_t *consts, bool high32)
+{
+ if (srcs.uniform_const & 0x80) {
+ unsigned uniform = (srcs.uniform_const & 0x7f) * 2;
+ printf("U%d", uniform + (high32 ? 1 : 0));
+ } else if (srcs.uniform_const >= 0x20) {
+ uint64_t imm = get_const(consts, srcs);
+ if (high32)
+ dump_const_imm(imm >> 32);
+ else
+ dump_const_imm(imm);
+ } else {
+ switch (srcs.uniform_const) {
+ case 0: printf("0"); break;
+ case 5: printf("atest-data"); break;
+ case 6: printf("sample-ptr"); break;
+ case 8:
+ case 9:
+ case 10:
+ case 11:
+ case 12:
+ case 13:
+ case 14:
+ case 15:
+ printf("blend-descriptor%u", (unsigned) srcs.uniform_const - 8);
+ break;
+ default:
+ printf("unkConst%u", (unsigned) srcs.uniform_const);
+ break;
+ }
+
+ if (high32)
+ printf(".y");
+ else
+ printf(".x");
+ }
+}
+
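+// The 3-bit source selector: 0 and 1 read the two register ports, 2 reads
+// port 3 (reg3), 3 is the constant zero on the FMA unit but the current FMA
+// result ("T") on the ADD unit, 4 and 5 are the low and high halves of the
+// uniform/constant port, and 6/7 appear to be the passthrough results (T0/T1)
+// of the previous instruction.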
+static void dump_src(unsigned src, struct bifrost_regs srcs, uint64_t *consts, bool isFMA)
+{
+ switch (src) {
+ case 0: printf("R%d", get_reg0(srcs)); break;
+ case 1: printf("R%d", get_reg1(srcs)); break;
+ case 2: printf("R%d", srcs.reg3); break;
+ case 3:
+ if (isFMA)
+ printf("0");
+ else
+ printf("T"); // i.e. the output of FMA this cycle
+ break;
+ case 4:
+ dump_uniform_const_src(srcs, consts, false);
+ break;
+ case 5:
+ dump_uniform_const_src(srcs, consts, true);
+ break;
+ case 6: printf("T0"); break;
+ case 7: printf("T1"); break;
+ }
+}
+
+static void dump_output_mod(unsigned mod)
+{
+ switch (mod) {
+ case 0:
+ break;
+ case 1:
+ printf(".clamp_0_inf"); break; // max(out, 0)
+ case 2:
+ printf(".clamp_m1_1"); break; // clamp(out, -1, 1)
+ case 3:
+ printf(".clamp_0_1"); break; // clamp(out, 0, 1)
+ default:
+ break;
+ }
+}
+
+static void dump_minmax_mode(unsigned mod)
+{
+ switch (mod) {
+ case 0:
+ /* Same as fmax() and fmin() -- return the other number if any
+ * number is NaN. Also always return +0 if one argument is +0 and
+ * the other is -0.
+ */
+ break;
+ case 1:
+ /* Instead of never returning a NaN, always return one. The
+ * "greater"/"lesser" NaN is always returned, first by checking the
+ * sign and then the mantissa bits.
+ */
+ printf(".nan_wins"); break;
+ case 2:
+ /* For max, implement src0 > src1 ? src0 : src1
+ * For min, implement src0 < src1 ? src0 : src1
+ *
+ * This includes handling NaN's and signedness of 0 differently
+ * from above, since +0 and -0 compare equal and comparisons always
+ * return false for NaN's. As a result, this mode is *not*
+ * commutative.
+ */
+ printf(".src1_wins"); break;
+ case 3:
+ /* For max, implement src0 < src1 ? src1 : src0
+ * For min, implement src0 > src1 ? src1 : src0
+ */
+ printf(".src0_wins"); break;
+ default:
+ break;
+ }
+}
+
+static void dump_round_mode(unsigned mod)
+{
+ switch (mod) {
+ case 0:
+ /* roundTiesToEven, the IEEE default. */
+ break;
+ case 1:
+ /* roundTowardPositive in the IEEE spec. */
+ printf(".round_pos"); break;
+ case 2:
+ /* roundTowardNegative in the IEEE spec. */
+ printf(".round_neg"); break;
+ case 3:
+ /* roundTowardZero in the IEEE spec. */
+ printf(".round_zero"); break;
+ default:
+ break;
+ }
+}
+
+static const struct fma_op_info FMAOpInfos[] = {
+ { 0x00000, "FMA.f32", FMA_FMA },
+ { 0x40000, "MAX.f32", FMA_FMINMAX },
+ { 0x44000, "MIN.f32", FMA_FMINMAX },
+ { 0x48000, "FCMP.GL", FMA_FCMP },
+ { 0x4c000, "FCMP.D3D", FMA_FCMP },
+ { 0x4ff98, "ADD.i32", FMA_TWO_SRC },
+ { 0x4ffd8, "SUB.i32", FMA_TWO_SRC },
+ { 0x4fff0, "SUBB.i32", FMA_TWO_SRC },
+ { 0x50000, "FMA_MSCALE", FMA_FMA_MSCALE },
+ { 0x58000, "ADD.f32", FMA_FADD },
+ { 0x5c000, "CSEL.FEQ.f32", FMA_FOUR_SRC },
+ { 0x5c200, "CSEL.FGT.f32", FMA_FOUR_SRC },
+ { 0x5c400, "CSEL.FGE.f32", FMA_FOUR_SRC },
+ { 0x5c600, "CSEL.IEQ.f32", FMA_FOUR_SRC },
+ { 0x5c800, "CSEL.IGT.i32", FMA_FOUR_SRC },
+ { 0x5ca00, "CSEL.IGE.i32", FMA_FOUR_SRC },
+ { 0x5cc00, "CSEL.UGT.i32", FMA_FOUR_SRC },
+ { 0x5ce00, "CSEL.UGE.i32", FMA_FOUR_SRC },
+ { 0x5d8d0, "ICMP.D3D.GT.v2i16", FMA_TWO_SRC },
+ { 0x5d9d0, "UCMP.D3D.GT.v2i16", FMA_TWO_SRC },
+ { 0x5dad0, "ICMP.D3D.GE.v2i16", FMA_TWO_SRC },
+ { 0x5dbd0, "UCMP.D3D.GE.v2i16", FMA_TWO_SRC },
+ { 0x5dcd0, "ICMP.D3D.EQ.v2i16", FMA_TWO_SRC },
+ { 0x5de40, "ICMP.GL.GT.i32", FMA_TWO_SRC }, // src0 > src1 ? 1 : 0
+ { 0x5de48, "ICMP.GL.GE.i32", FMA_TWO_SRC },
+ { 0x5de50, "UCMP.GL.GT.i32", FMA_TWO_SRC },
+ { 0x5de58, "UCMP.GL.GE.i32", FMA_TWO_SRC },
+ { 0x5de60, "ICMP.GL.EQ.i32", FMA_TWO_SRC },
+ { 0x5dec0, "ICMP.D3D.GT.i32", FMA_TWO_SRC }, // src0 > src1 ? ~0 : 0
+ { 0x5dec8, "ICMP.D3D.GE.i32", FMA_TWO_SRC },
+ { 0x5ded0, "UCMP.D3D.GT.i32", FMA_TWO_SRC },
+ { 0x5ded8, "UCMP.D3D.GE.i32", FMA_TWO_SRC },
+ { 0x5dee0, "ICMP.D3D.EQ.i32", FMA_TWO_SRC },
+ { 0x60200, "RSHIFT_NAND.i32", FMA_THREE_SRC },
+ { 0x603c0, "RSHIFT_NAND.v2i16", FMA_THREE_SRC },
+ { 0x60e00, "RSHIFT_OR.i32", FMA_THREE_SRC },
+ { 0x60fc0, "RSHIFT_OR.v2i16", FMA_THREE_SRC },
+ { 0x61200, "RSHIFT_AND.i32", FMA_THREE_SRC },
+ { 0x613c0, "RSHIFT_AND.v2i16", FMA_THREE_SRC },
+ { 0x61e00, "RSHIFT_NOR.i32", FMA_THREE_SRC }, // ~((src0 << src2) | src1)
+ { 0x61fc0, "RSHIFT_NOR.v2i16", FMA_THREE_SRC }, // ~((src0 << src2) | src1)
+ { 0x62200, "LSHIFT_NAND.i32", FMA_THREE_SRC },
+ { 0x623c0, "LSHIFT_NAND.v2i16", FMA_THREE_SRC },
+ { 0x62e00, "LSHIFT_OR.i32", FMA_THREE_SRC }, // (src0 << src2) | src1
+ { 0x62fc0, "LSHIFT_OR.v2i16", FMA_THREE_SRC }, // (src0 << src2) | src1
+ { 0x63200, "LSHIFT_AND.i32", FMA_THREE_SRC }, // (src0 << src2) & src1
+ { 0x633c0, "LSHIFT_AND.v2i16", FMA_THREE_SRC },
+ { 0x63e00, "LSHIFT_NOR.i32", FMA_THREE_SRC },
+ { 0x63fc0, "LSHIFT_NOR.v2i16", FMA_THREE_SRC },
+ { 0x64200, "RSHIFT_XOR.i32", FMA_THREE_SRC },
+ { 0x643c0, "RSHIFT_XOR.v2i16", FMA_THREE_SRC },
+ { 0x64600, "RSHIFT_XNOR.i32", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1)
+ { 0x647c0, "RSHIFT_XNOR.v2i16", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1)
+ { 0x64a00, "LSHIFT_XOR.i32", FMA_THREE_SRC },
+ { 0x64bc0, "LSHIFT_XOR.v2i16", FMA_THREE_SRC },
+ { 0x64e00, "LSHIFT_XNOR.i32", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1)
+ { 0x64fc0, "LSHIFT_XNOR.v2i16", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1)
+ { 0x65200, "LSHIFT_ADD.i32", FMA_THREE_SRC },
+ { 0x65600, "LSHIFT_SUB.i32", FMA_THREE_SRC }, // (src0 << src2) - src1
+ { 0x65a00, "LSHIFT_RSUB.i32", FMA_THREE_SRC }, // src1 - (src0 << src2)
+ { 0x65e00, "RSHIFT_ADD.i32", FMA_THREE_SRC },
+ { 0x66200, "RSHIFT_SUB.i32", FMA_THREE_SRC },
+ { 0x66600, "RSHIFT_RSUB.i32", FMA_THREE_SRC },
+ { 0x66a00, "ARSHIFT_ADD.i32", FMA_THREE_SRC },
+ { 0x66e00, "ARSHIFT_SUB.i32", FMA_THREE_SRC },
+ { 0x67200, "ARSHIFT_RSUB.i32", FMA_THREE_SRC },
+ { 0x80000, "FMA.v2f16", FMA_FMA16 },
+ { 0xc0000, "MAX.v2f16", FMA_FMINMAX16 },
+ { 0xc4000, "MIN.v2f16", FMA_FMINMAX16 },
+ { 0xc8000, "FCMP.GL", FMA_FCMP16 },
+ { 0xcc000, "FCMP.D3D", FMA_FCMP16 },
+ { 0xcf900, "ADD.v2i16", FMA_TWO_SRC },
+ { 0xcfc10, "ADDC.i32", FMA_TWO_SRC },
+ { 0xcfd80, "ADD.i32.i16.X", FMA_TWO_SRC },
+ { 0xcfd90, "ADD.i32.u16.X", FMA_TWO_SRC },
+ { 0xcfdc0, "ADD.i32.i16.Y", FMA_TWO_SRC },
+ { 0xcfdd0, "ADD.i32.u16.Y", FMA_TWO_SRC },
+ { 0xd8000, "ADD.v2f16", FMA_FADD16 },
+ { 0xdc000, "CSEL.FEQ.v2f16", FMA_FOUR_SRC },
+ { 0xdc200, "CSEL.FGT.v2f16", FMA_FOUR_SRC },
+ { 0xdc400, "CSEL.FGE.v2f16", FMA_FOUR_SRC },
+ { 0xdc600, "CSEL.IEQ.v2f16", FMA_FOUR_SRC },
+ { 0xdc800, "CSEL.IGT.v2i16", FMA_FOUR_SRC },
+ { 0xdca00, "CSEL.IGE.v2i16", FMA_FOUR_SRC },
+ { 0xdcc00, "CSEL.UGT.v2i16", FMA_FOUR_SRC },
+ { 0xdce00, "CSEL.UGE.v2i16", FMA_FOUR_SRC },
+ { 0xdd000, "F32_TO_F16", FMA_TWO_SRC },
+ { 0xe0046, "F16_TO_I16.XX", FMA_ONE_SRC },
+ { 0xe0047, "F16_TO_U16.XX", FMA_ONE_SRC },
+ { 0xe004e, "F16_TO_I16.YX", FMA_ONE_SRC },
+ { 0xe004f, "F16_TO_U16.YX", FMA_ONE_SRC },
+ { 0xe0056, "F16_TO_I16.XY", FMA_ONE_SRC },
+ { 0xe0057, "F16_TO_U16.XY", FMA_ONE_SRC },
+ { 0xe005e, "F16_TO_I16.YY", FMA_ONE_SRC },
+ { 0xe005f, "F16_TO_U16.YY", FMA_ONE_SRC },
+ { 0xe00c0, "I16_TO_F16.XX", FMA_ONE_SRC },
+ { 0xe00c1, "U16_TO_F16.XX", FMA_ONE_SRC },
+ { 0xe00c8, "I16_TO_F16.YX", FMA_ONE_SRC },
+ { 0xe00c9, "U16_TO_F16.YX", FMA_ONE_SRC },
+ { 0xe00d0, "I16_TO_F16.XY", FMA_ONE_SRC },
+ { 0xe00d1, "U16_TO_F16.XY", FMA_ONE_SRC },
+ { 0xe00d8, "I16_TO_F16.YY", FMA_ONE_SRC },
+ { 0xe00d9, "U16_TO_F16.YY", FMA_ONE_SRC },
+ { 0xe0136, "F32_TO_I32", FMA_ONE_SRC },
+ { 0xe0137, "F32_TO_U32", FMA_ONE_SRC },
+ { 0xe0178, "I32_TO_F32", FMA_ONE_SRC },
+ { 0xe0179, "U32_TO_F32", FMA_ONE_SRC },
+ { 0xe0198, "I16_TO_I32.X", FMA_ONE_SRC },
+ { 0xe0199, "U16_TO_U32.X", FMA_ONE_SRC },
+ { 0xe019a, "I16_TO_I32.Y", FMA_ONE_SRC },
+ { 0xe019b, "U16_TO_U32.Y", FMA_ONE_SRC },
+ { 0xe019c, "I16_TO_F32.X", FMA_ONE_SRC },
+ { 0xe019d, "U16_TO_F32.X", FMA_ONE_SRC },
+ { 0xe019e, "I16_TO_F32.Y", FMA_ONE_SRC },
+ { 0xe019f, "U16_TO_F32.Y", FMA_ONE_SRC },
+ { 0xe01a2, "F16_TO_F32.X", FMA_ONE_SRC },
+ { 0xe01a3, "F16_TO_F32.Y", FMA_ONE_SRC },
+ { 0xe032c, "NOP", FMA_ONE_SRC },
+ { 0xe032d, "MOV", FMA_ONE_SRC },
+ { 0xe032f, "SWZ.YY.v2i16", FMA_ONE_SRC },
+ // From the ARM patent US20160364209A1:
+ // "Decompose v (the input) into numbers x1 and s such that v = x1 * 2^s,
+ // and x1 is a floating point value in a predetermined range where the
+ // value 1 is within the range and not at one extremity of the range (e.g.
+ // choose a range where 1 is towards middle of range)."
+ //
+ // This computes x1.
+ { 0xe0345, "LOG_FREXPM", FMA_ONE_SRC },
+ // Given a floating point number m * 2^e, returns m * 2^{-1}. This is
+ // exactly the same as the mantissa part of frexp().
+ { 0xe0365, "FRCP_FREXPM", FMA_ONE_SRC },
+ // Given a floating point number m * 2^e, returns m * 2^{-2} if e is even,
+ // and m * 2^{-1} if e is odd. In other words, scales by powers of 4 until
+ // within the range [0.25, 1). Used for square-root and reciprocal
+ // square-root.
+ { 0xe0375, "FSQRT_FREXPM", FMA_ONE_SRC },
+ // Given a floating point number m * 2^e, computes -e - 1 as an integer.
+ // Zero and infinity/NaN return 0.
+ { 0xe038d, "FRCP_FREXPE", FMA_ONE_SRC },
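+ // (Presumably this pairs with FRCP_TABLE in the ADD table below, which
+ // approximates 2/m: (2/m) * 2^(-e - 1) = 1/(m * 2^e). For example, for
+ // 4.0 = 1.0 * 2^2 this returns -3, and 2.0 * 2^-3 = 0.25 = 1/4.0.)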
+ // Computes floor(e/2) + 1.
+ { 0xe03a5, "FSQRT_FREXPE", FMA_ONE_SRC },
+ // Given a floating point number m * 2^e, computes -floor(e/2) - 1 as an
+ // integer.
+ { 0xe03ad, "FRSQ_FREXPE", FMA_ONE_SRC },
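+ // (These presumably pair with FSQRT_FREXPM above: since that rescales the
+ // mantissa by an even power of two, sqrt(m * 2^e) = sqrt(m') * 2^(floor(e/2) + 1)
+ // and 1/sqrt(m * 2^e) = (1/sqrt(m')) * 2^(-floor(e/2) - 1), where m' is the
+ // FSQRT_FREXPM result.)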
+ { 0xe03c5, "LOG_FREXPE", FMA_ONE_SRC },
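+ // (FLOG_FREXPE in the ADD table below is documented as computing s in the
+ // decomposition quoted above, so presumably log2(v) is then reconstructed
+ // as log2(x1) + s, with LOG_FREXPM providing x1.)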
+ { 0xe0b80, "IMAX3", FMA_THREE_SRC },
+ { 0xe0bc0, "UMAX3", FMA_THREE_SRC },
+ { 0xe0c00, "IMIN3", FMA_THREE_SRC },
+ { 0xe0c40, "UMIN3", FMA_THREE_SRC },
+ { 0xe0f40, "CSEL", FMA_THREE_SRC }, // src2 != 0 ? src1 : src0
+ { 0xe0fc0, "MUX.i32", FMA_THREE_SRC }, // see ADD comment
+ { 0xe1845, "CEIL", FMA_ONE_SRC },
+ { 0xe1885, "FLOOR", FMA_ONE_SRC },
+ { 0xe19b0, "ATAN_LDEXP.Y.f32", FMA_TWO_SRC },
+ { 0xe19b8, "ATAN_LDEXP.X.f32", FMA_TWO_SRC },
+ // These instructions in the FMA slot, together with LSHIFT_ADD_HIGH32.i32
+ // in the ADD slot, allow one to do a 64-bit addition with an extra small
+ // shift on one of the sources. There are three possible scenarios:
+ //
+ // 1) Full 64-bit addition. Do:
+ // out.x = LSHIFT_ADD_LOW32.i64 src1.x, src2.x, shift
+ // out.y = LSHIFT_ADD_HIGH32.i32 src1.y, src2.y
+ //
+ // The shift amount is applied to src2 before adding. The shift amount, and
+ // any extra bits from src2 plus the overflow bit, are sent directly from
+ // FMA to ADD instead of being passed explicitly. Hence, these two must be
+ // bundled together into the same instruction.
+ //
+ // 2) Add a 64-bit value src1 to a zero-extended 32-bit value src2. Do:
+ // out.x = LSHIFT_ADD_LOW32.u32 src1.x, src2, shift
+ // out.y = LSHIFT_ADD_HIGH32.i32 src1.y, 0
+ //
+ // Note that in this case, the second argument to LSHIFT_ADD_HIGH32 is
+ // ignored, so it can actually be anything. As before, the shift is applied
+ // to src2 before adding.
+ //
+ // 3) Add a 64-bit value to a sign-extended 32-bit value src2. Do:
+ // out.x = LSHIFT_ADD_LOW32.i32 src1.x, src2, shift
+ // out.y = LSHIFT_ADD_HIGH32.i32 src1.y, 0
+ //
+ // The only difference is the .i32 instead of .u32. Otherwise, this is
+ // exactly the same as before.
+ //
+ // In all these instructions, the shift amount is stored where the third
+ // source would be, so the shift has to be a small immediate from 0 to 7.
+ // This is fine for the expected use-case of these instructions, which is
+ // manipulating 64-bit pointers.
+ //
+ // These instructions can also be combined with various load/store
+ // instructions which normally take a 64-bit pointer in order to add a
+ // 32-bit or 64-bit offset to the pointer before doing the operation,
+ // optionally shifting the offset. The load/store op implicitly does
+ // LSHIFT_ADD_HIGH32.i32 internally. Letting ptr be the pointer, and offset
+ // the desired offset, the cases go as follows:
+ //
+ // 1) Add a 64-bit offset:
+ // LSHIFT_ADD_LOW32.i64 ptr.x, offset.x, shift
+ // ld_st_op ptr.y, offset.y, ...
+ //
+ // Note that the output of LSHIFT_ADD_LOW32.i64 is not used, instead being
+ // implicitly sent to the load/store op to serve as the low 32 bits of the
+ // pointer.
+ //
+ // 2) Add a 32-bit unsigned offset:
+ // temp = LSHIFT_ADD_LOW32.u32 ptr.x, offset, shift
+ // ld_st_op temp, ptr.y, ...
+ //
+ // Now, the low 32 bits of ptr + (offset << shift) are passed explicitly to
+ // the ld_st_op, to match the case where there is no offset and ld_st_op is
+ // called directly.
+ //
+ // 3) Add a 32-bit signed offset:
+ // temp = LSHIFT_ADD_LOW32.i32 ptr.x, offset, shift
+ // ld_st_op temp, ptr.y, ...
+ //
+ // Again, the same as the unsigned case except for the offset.
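+ //
+ // As a worked example (assuming the recipes above), indexing a 64-bit
+ // pointer `base' by a 32-bit unsigned index `idx' scaled by 8 bytes would
+ // be case 2) with shift = 3:
+ //
+ // temp = LSHIFT_ADD_LOW32.u32 base.x, idx, 3
+ // ld_st_op temp, base.y, ...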
+ { 0xe1c80, "LSHIFT_ADD_LOW32.u32", FMA_SHIFT_ADD64 },
+ { 0xe1cc0, "LSHIFT_ADD_LOW32.i64", FMA_SHIFT_ADD64 },
+ { 0xe1d80, "LSHIFT_ADD_LOW32.i32", FMA_SHIFT_ADD64 },
+ { 0xe1e00, "SEL.XX.i16", FMA_TWO_SRC },
+ { 0xe1e08, "SEL.YX.i16", FMA_TWO_SRC },
+ { 0xe1e10, "SEL.XY.i16", FMA_TWO_SRC },
+ { 0xe1e18, "SEL.YY.i16", FMA_TWO_SRC },
+ { 0xe7800, "IMAD", FMA_THREE_SRC },
+ { 0xe78db, "POPCNT", FMA_ONE_SRC },
+};
+
+static struct fma_op_info find_fma_op_info(unsigned op)
+{
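+ // The low bits of each opcode encode its sources and modifiers, so mask
+ // them off (according to the format) before comparing against the table.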
+ for (unsigned i = 0; i < ARRAY_SIZE(FMAOpInfos); i++) {
+ unsigned opCmp = ~0;
+ switch (FMAOpInfos[i].src_type) {
+ case FMA_ONE_SRC:
+ opCmp = op;
+ break;
+ case FMA_TWO_SRC:
+ opCmp = op & ~0x7;
+ break;
+ case FMA_FCMP:
+ case FMA_FCMP16:
+ opCmp = op & ~0x1fff;
+ break;
+ case FMA_THREE_SRC:
+ case FMA_SHIFT_ADD64:
+ opCmp = op & ~0x3f;
+ break;
+ case FMA_FADD:
+ case FMA_FMINMAX:
+ case FMA_FADD16:
+ case FMA_FMINMAX16:
+ opCmp = op & ~0x3fff;
+ break;
+ case FMA_FMA:
+ case FMA_FMA16:
+ opCmp = op & ~0x3ffff;
+ break;
+ case FMA_FOUR_SRC:
+ opCmp = op & ~0x1ff;
+ break;
+ case FMA_FMA_MSCALE:
+ opCmp = op & ~0x7fff;
+ break;
+ default:
+ opCmp = ~0;
+ break;
+ }
+ if (FMAOpInfos[i].op == opCmp)
+ return FMAOpInfos[i];
+ }
+
+ struct fma_op_info info;
+ snprintf(info.name, sizeof(info.name), "op%04x", op);
+ info.op = op;
+ info.src_type = FMA_THREE_SRC;
+ return info;
+}
+
+static void dump_fcmp(unsigned op)
+{
+ switch (op) {
+ case 0:
+ printf(".OEQ");
+ break;
+ case 1:
+ printf(".OGT");
+ break;
+ case 2:
+ printf(".OGE");
+ break;
+ case 3:
+ printf(".UNE");
+ break;
+ case 4:
+ printf(".OLT");
+ break;
+ case 5:
+ printf(".OLE");
+ break;
+ default:
+ printf(".unk%d", op);
+ break;
+ }
+}
+
+static void dump_16swizzle(unsigned swiz)
+{
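+ // Bit 0 selects the .x/.y half for the first component and bit 1 for the
+ // second; swizzle 2 (.xy) is the identity, so nothing is printed for it.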
+ if (swiz == 2)
+ return;
+ printf(".%c%c", "xy"[swiz & 1], "xy"[(swiz >> 1) & 1]);
+}
+
+static void dump_fma_expand_src0(unsigned ctrl)
+{
+ switch (ctrl) {
+ case 3:
+ case 4:
+ case 6:
+ printf(".x");
+ break;
+ case 5:
+ case 7:
+ printf(".y");
+ break;
+ case 0:
+ case 1:
+ case 2:
+ break;
+ default:
+ printf(".unk");
+ break;
+ }
+}
+
+static void dump_fma_expand_src1(unsigned ctrl)
+{
+ switch (ctrl) {
+ case 1:
+ case 3:
+ printf(".x");
+ break;
+ case 2:
+ case 4:
+ case 5:
+ printf(".y");
+ break;
+ case 0:
+ case 6:
+ case 7:
+ break;
+ default:
+ printf(".unk");
+ break;
+ }
+}
+
+static void dump_fma(uint64_t word, struct bifrost_regs regs, struct bifrost_regs next_regs, uint64_t *consts, bool verbose)
+{
+ if (verbose) {
+ printf("# FMA: %016" PRIx64 "\n", word);
+ }
+ struct bifrost_fma_inst FMA;
+ memcpy((char *) &FMA, (char *) &word, sizeof(struct bifrost_fma_inst));
+ struct fma_op_info info = find_fma_op_info(FMA.op);
+
+ printf("%s", info.name);
+ if (info.src_type == FMA_FADD ||
+ info.src_type == FMA_FMINMAX ||
+ info.src_type == FMA_FMA ||
+ info.src_type == FMA_FADD16 ||
+ info.src_type == FMA_FMINMAX16 ||
+ info.src_type == FMA_FMA16) {
+ dump_output_mod(bits(FMA.op, 12, 14));
+ switch (info.src_type) {
+ case FMA_FADD:
+ case FMA_FMA:
+ case FMA_FADD16:
+ case FMA_FMA16:
+ dump_round_mode(bits(FMA.op, 10, 12));
+ break;
+ case FMA_FMINMAX:
+ case FMA_FMINMAX16:
+ dump_minmax_mode(bits(FMA.op, 10, 12));
+ break;
+ default:
+ assert(0);
+ }
+ } else if (info.src_type == FMA_FCMP || info.src_type == FMA_FCMP16) {
+ dump_fcmp(bits(FMA.op, 10, 13));
+ if (info.src_type == FMA_FCMP)
+ printf(".f32");
+ else
+ printf(".v2f16");
+ } else if (info.src_type == FMA_FMA_MSCALE) {
+ if (FMA.op & (1 << 11)) {
+ switch ((FMA.op >> 9) & 0x3) {
+ case 0:
+ /* This mode seems to do a few things:
+ * - Makes 0 * infinity (and incidentally 0 * nan) return 0,
+ * since generating a nan would poison the result of
+ * 1/infinity and 1/0.
+ * - Fiddles with which nan is returned in nan * nan,
+ * presumably to make sure that the same exact nan is
+ * returned for 1/nan.
+ */
+ printf(".rcp_mode");
+ break;
+ case 3:
+ /* Similar to the above, but src0 always wins when multiplying
+ * 0 by infinity.
+ */
+ printf(".sqrt_mode");
+ break;
+ default:
+ printf(".unk%d_mode", (int) (FMA.op >> 9) & 0x3);
+ }
+ } else {
+ dump_output_mod(bits(FMA.op, 9, 11));
+ }
+ }
+
+ printf(" ");
+
+ struct bifrost_reg_ctrl next_ctrl = DecodeRegCtrl(next_regs);
+ if (next_ctrl.fma_write_unit != REG_WRITE_NONE) {
+ printf("{R%d, T0}, ", GetRegToWrite(next_ctrl.fma_write_unit, next_regs));
+ } else {
+ printf("T0, ");
+ }
+
+ switch (info.src_type) {
+ case FMA_ONE_SRC:
+ dump_src(FMA.src0, regs, consts, true);
+ break;
+ case FMA_TWO_SRC:
+ dump_src(FMA.src0, regs, consts, true);
+ printf(", ");
+ dump_src(FMA.op & 0x7, regs, consts, true);
+ break;
+ case FMA_FADD:
+ case FMA_FMINMAX:
+ if (FMA.op & 0x10)
+ printf("-");
+ if (FMA.op & 0x200)
+ printf("abs(");
+ dump_src(FMA.src0, regs, consts, true);
+ dump_fma_expand_src0((FMA.op >> 6) & 0x7);
+ if (FMA.op & 0x200)
+ printf(")");
+ printf(", ");
+ if (FMA.op & 0x20)
+ printf("-");
+ if (FMA.op & 0x8)
+ printf("abs(");
+ dump_src(FMA.op & 0x7, regs, consts, true);
+ dump_fma_expand_src1((FMA.op >> 6) & 0x7);
+ if (FMA.op & 0x8)
+ printf(")");
+ break;
+ case FMA_FADD16:
+ case FMA_FMINMAX16: {
+ bool abs1 = FMA.op & 0x8;
+ bool abs2 = (FMA.op & 0x7) < FMA.src0;
+ if (FMA.op & 0x10)
+ printf("-");
+ if (abs1 || abs2)
+ printf("abs(");
+ dump_src(FMA.src0, regs, consts, true);
+ dump_16swizzle((FMA.op >> 6) & 0x3);
+ if (abs1 || abs2)
+ printf(")");
+ printf(", ");
+ if (FMA.op & 0x20)
+ printf("-");
+ if (abs1 && abs2)
+ printf("abs(");
+ dump_src(FMA.op & 0x7, regs, consts, true);
+ dump_16swizzle((FMA.op >> 8) & 0x3);
+ if (abs1 && abs2)
+ printf(")");
+ break;
+ }
+ case FMA_FCMP:
+ if (FMA.op & 0x200)
+ printf("abs(");
+ dump_src(FMA.src0, regs, consts, true);
+ dump_fma_expand_src0((FMA.op >> 6) & 0x7);
+ if (FMA.op & 0x200)
+ printf(")");
+ printf(", ");
+ if (FMA.op & 0x20)
+ printf("-");
+ if (FMA.op & 0x8)
+ printf("abs(");
+ dump_src(FMA.op & 0x7, regs, consts, true);
+ dump_fma_expand_src1((FMA.op >> 6) & 0x7);
+ if (FMA.op & 0x8)
+ printf(")");
+ break;
+ case FMA_FCMP16:
+ dump_src(FMA.src0, regs, consts, true);
+ // Note: this is kind of a guess; I haven't seen the blob set this to
+ // anything other than the identity, but it matches FMA_TWO_SRCFmod16
+ dump_16swizzle((FMA.op >> 6) & 0x3);
+ printf(", ");
+ dump_src(FMA.op & 0x7, regs, consts, true);
+ dump_16swizzle((FMA.op >> 8) & 0x3);
+ break;
+ case FMA_SHIFT_ADD64:
+ dump_src(FMA.src0, regs, consts, true);
+ printf(", ");
+ dump_src(FMA.op & 0x7, regs, consts, true);
+ printf(", ");
+ printf("shift:%u", (FMA.op >> 3) & 0x7);
+ break;
+ case FMA_THREE_SRC:
+ dump_src(FMA.src0, regs, consts, true);
+ printf(", ");
+ dump_src(FMA.op & 0x7, regs, consts, true);
+ printf(", ");
+ dump_src((FMA.op >> 3) & 0x7, regs, consts, true);
+ break;
+ case FMA_FMA:
+ if (FMA.op & (1 << 14))
+ printf("-");
+ if (FMA.op & (1 << 9))
+ printf("abs(");
+ dump_src(FMA.src0, regs, consts, true);
+ dump_fma_expand_src0((FMA.op >> 6) & 0x7);
+ if (FMA.op & (1 << 9))
+ printf(")");
+ printf(", ");
+ if (FMA.op & (1 << 16))
+ printf("abs(");
+ dump_src(FMA.op & 0x7, regs, consts, true);
+ dump_fma_expand_src1((FMA.op >> 6) & 0x7);
+ if (FMA.op & (1 << 16))
+ printf(")");
+ printf(", ");
+ if (FMA.op & (1 << 15))
+ printf("-");
+ if (FMA.op & (1 << 17))
+ printf("abs(");
+ dump_src((FMA.op >> 3) & 0x7, regs, consts, true);
+ if (FMA.op & (1 << 17))
+ printf(")");
+ break;
+ case FMA_FMA16:
+ if (FMA.op & (1 << 14))
+ printf("-");
+ dump_src(FMA.src0, regs, consts, true);
+ dump_16swizzle((FMA.op >> 6) & 0x3);
+ printf(", ");
+ dump_src(FMA.op & 0x7, regs, consts, true);
+ dump_16swizzle((FMA.op >> 8) & 0x3);
+ printf(", ");
+ if (FMA.op & (1 << 15))
+ printf("-");
+ dump_src((FMA.op >> 3) & 0x7, regs, consts, true);
+ dump_16swizzle((FMA.op >> 16) & 0x3);
+ break;
+ case FMA_FOUR_SRC:
+ dump_src(FMA.src0, regs, consts, true);
+ printf(", ");
+ dump_src(FMA.op & 0x7, regs, consts, true);
+ printf(", ");
+ dump_src((FMA.op >> 3) & 0x7, regs, consts, true);
+ printf(", ");
+ dump_src((FMA.op >> 6) & 0x7, regs, consts, true);
+ break;
+ case FMA_FMA_MSCALE:
+ if (FMA.op & (1 << 12))
+ printf("abs(");
+ dump_src(FMA.src0, regs, consts, true);
+ if (FMA.op & (1 << 12))
+ printf(")");
+ printf(", ");
+ if (FMA.op & (1 << 13))
+ printf("-");
+ dump_src(FMA.op & 0x7, regs, consts, true);
+ printf(", ");
+ if (FMA.op & (1 << 14))
+ printf("-");
+ dump_src((FMA.op >> 3) & 0x7, regs, consts, true);
+ printf(", ");
+ dump_src((FMA.op >> 6) & 0x7, regs, consts, true);
+ break;
+ }
+ printf("\n");
+}
+
+static const struct add_op_info add_op_infos[] = {
+ { 0x00000, "MAX.f32", ADD_FMINMAX },
+ { 0x02000, "MIN.f32", ADD_FMINMAX },
+ { 0x04000, "ADD.f32", ADD_FADD },
+ { 0x06000, "FCMP.GL", ADD_FCMP },
+ { 0x07000, "FCMP.D3D", ADD_FCMP },
+ { 0x07856, "F16_TO_I16", ADD_ONE_SRC },
+ { 0x07857, "F16_TO_U16", ADD_ONE_SRC },
+ { 0x078c0, "I16_TO_F16.XX", ADD_ONE_SRC },
+ { 0x078c1, "U16_TO_F16.XX", ADD_ONE_SRC },
+ { 0x078c8, "I16_TO_F16.YX", ADD_ONE_SRC },
+ { 0x078c9, "U16_TO_F16.YX", ADD_ONE_SRC },
+ { 0x078d0, "I16_TO_F16.XY", ADD_ONE_SRC },
+ { 0x078d1, "U16_TO_F16.XY", ADD_ONE_SRC },
+ { 0x078d8, "I16_TO_F16.YY", ADD_ONE_SRC },
+ { 0x078d9, "U16_TO_F16.YY", ADD_ONE_SRC },
+ { 0x07936, "F32_TO_I32", ADD_ONE_SRC },
+ { 0x07937, "F32_TO_U32", ADD_ONE_SRC },
+ { 0x07978, "I32_TO_F32", ADD_ONE_SRC },
+ { 0x07979, "U32_TO_F32", ADD_ONE_SRC },
+ { 0x07998, "I16_TO_I32.X", ADD_ONE_SRC },
+ { 0x07999, "U16_TO_U32.X", ADD_ONE_SRC },
+ { 0x0799a, "I16_TO_I32.Y", ADD_ONE_SRC },
+ { 0x0799b, "U16_TO_U32.Y", ADD_ONE_SRC },
+ { 0x0799c, "I16_TO_F32.X", ADD_ONE_SRC },
+ { 0x0799d, "U16_TO_F32.X", ADD_ONE_SRC },
+ { 0x0799e, "I16_TO_F32.Y", ADD_ONE_SRC },
+ { 0x0799f, "U16_TO_F32.Y", ADD_ONE_SRC },
+ // take the low 16 bits and expand them to a 32-bit float
+ { 0x079a2, "F16_TO_F32.X", ADD_ONE_SRC },
+ // take the high 16 bits, ...
+ { 0x079a3, "F16_TO_F32.Y", ADD_ONE_SRC },
+ { 0x07b2b, "SWZ.YX.v2i16", ADD_ONE_SRC },
+ { 0x07b2c, "NOP", ADD_ONE_SRC },
+ { 0x07b29, "SWZ.XX.v2i16", ADD_ONE_SRC },
+ // Logically, this should be SWZ.XY, but that's equivalent to a move, and
+ // this seems to be the canonical way the blob generates a MOV.
+ { 0x07b2d, "MOV", ADD_ONE_SRC },
+ { 0x07b2f, "SWZ.YY.v2i16", ADD_ONE_SRC },
+ // Given a floating point number m * 2^e, returns m * 2^{-1}.
+ { 0x07b65, "FRCP_FREXPM", ADD_ONE_SRC },
+ { 0x07b75, "FSQRT_FREXPM", ADD_ONE_SRC },
+ { 0x07b8d, "FRCP_FREXPE", ADD_ONE_SRC },
+ { 0x07ba5, "FSQRT_FREXPE", ADD_ONE_SRC },
+ { 0x07bad, "FRSQ_FREXPE", ADD_ONE_SRC },
+ // From the ARM patent US20160364209A1:
+ // "Decompose v (the input) into numbers x1 and s such that v = x1 * 2^s,
+ // and x1 is a floating point value in a predetermined range where the
+ // value 1 is within the range and not at one extremity of the range (e.g.
+ // choose a range where 1 is towards middle of range)."
+ //
+ // This computes s.
+ { 0x07bc5, "FLOG_FREXPE", ADD_ONE_SRC },
+ { 0x07d45, "CEIL", ADD_ONE_SRC },
+ { 0x07d85, "FLOOR", ADD_ONE_SRC },
+ { 0x07f18, "LSHIFT_ADD_HIGH32.i32", ADD_TWO_SRC },
+ { 0x08000, "LD_ATTR.f16", ADD_LOAD_ATTR, true },
+ { 0x08100, "LD_ATTR.v2f16", ADD_LOAD_ATTR, true },
+ { 0x08200, "LD_ATTR.v3f16", ADD_LOAD_ATTR, true },
+ { 0x08300, "LD_ATTR.v4f16", ADD_LOAD_ATTR, true },
+ { 0x08400, "LD_ATTR.f32", ADD_LOAD_ATTR, true },
+ { 0x08500, "LD_ATTR.v2f32", ADD_LOAD_ATTR, true },
+ { 0x08600, "LD_ATTR.v3f32", ADD_LOAD_ATTR, true },
+ { 0x08700, "LD_ATTR.v4f32", ADD_LOAD_ATTR, true },
+ { 0x08800, "LD_ATTR.i32", ADD_LOAD_ATTR, true },
+ { 0x08900, "LD_ATTR.v2i32", ADD_LOAD_ATTR, true },
+ { 0x08a00, "LD_ATTR.v3i32", ADD_LOAD_ATTR, true },
+ { 0x08b00, "LD_ATTR.v4i32", ADD_LOAD_ATTR, true },
+ { 0x08c00, "LD_ATTR.u32", ADD_LOAD_ATTR, true },
+ { 0x08d00, "LD_ATTR.v2u32", ADD_LOAD_ATTR, true },
+ { 0x08e00, "LD_ATTR.v3u32", ADD_LOAD_ATTR, true },
+ { 0x08f00, "LD_ATTR.v4u32", ADD_LOAD_ATTR, true },
+ { 0x0a000, "LD_VAR.32", ADD_VARYING_INTERP, true },
+ { 0x0b000, "TEX", ADD_TEX_COMPACT, true },
+ { 0x0c188, "LOAD.i32", ADD_TWO_SRC, true },
+ { 0x0c1a0, "LD_UBO.i32", ADD_TWO_SRC, true },
+ { 0x0c1b8, "LD_SCRATCH.v2i32", ADD_TWO_SRC, true },
+ { 0x0c1c8, "LOAD.v2i32", ADD_TWO_SRC, true },
+ { 0x0c1e0, "LD_UBO.v2i32", ADD_TWO_SRC, true },
+ { 0x0c1f8, "LD_SCRATCH.v2i32", ADD_TWO_SRC, true },
+ { 0x0c208, "LOAD.v4i32", ADD_TWO_SRC, true },
+ // src0 = offset, src1 = binding
+ { 0x0c220, "LD_UBO.v4i32", ADD_TWO_SRC, true },
+ { 0x0c238, "LD_SCRATCH.v4i32", ADD_TWO_SRC, true },
+ { 0x0c248, "STORE.v4i32", ADD_TWO_SRC, true },
+ { 0x0c278, "ST_SCRATCH.v4i32", ADD_TWO_SRC, true },
+ { 0x0c588, "STORE.i32", ADD_TWO_SRC, true },
+ { 0x0c5b8, "ST_SCRATCH.i32", ADD_TWO_SRC, true },
+ { 0x0c5c8, "STORE.v2i32", ADD_TWO_SRC, true },
+ { 0x0c5f8, "ST_SCRATCH.v2i32", ADD_TWO_SRC, true },
+ { 0x0c648, "LOAD.u16", ADD_TWO_SRC, true }, // zero-extends
+ { 0x0ca88, "LOAD.v3i32", ADD_TWO_SRC, true },
+ { 0x0caa0, "LD_UBO.v3i32", ADD_TWO_SRC, true },
+ { 0x0cab8, "LD_SCRATCH.v3i32", ADD_TWO_SRC, true },
+ { 0x0cb88, "STORE.v3i32", ADD_TWO_SRC, true },
+ { 0x0cbb8, "ST_SCRATCH.v3i32", ADD_TWO_SRC, true },
+ // *_FAST does not exist on G71 (added to G51, G72, and everything after)
+ { 0x0cc00, "FRCP_FAST.f32", ADD_ONE_SRC },
+ { 0x0cc20, "FRSQ_FAST.f32", ADD_ONE_SRC },
+ // Given a floating point number m * 2^e, produces a table-based
+ // approximation of 2/m using the top 17 bits of the mantissa. Includes
+ // special cases for infinity, NaN, and zero, and copies the sign bit.
+ { 0x0ce00, "FRCP_TABLE", ADD_ONE_SRC },
+ // Exists on G71
+ { 0x0ce10, "FRCP_FAST.f16.X", ADD_ONE_SRC },
+ // A similar table for inverse square root, using the high 17 bits of the
+ // mantissa as well as the low bit of the exponent.
+ { 0x0ce20, "FRSQ_TABLE", ADD_ONE_SRC },
+ { 0x0ce30, "FRCP_FAST.f16.Y", ADD_ONE_SRC },
+ { 0x0ce50, "FRSQ_FAST.f16.X", ADD_ONE_SRC },
+ // Used in the argument reduction for log. Given a floating-point number
+ // m * 2^e, uses the top 4 bits of m to produce an approximation to 1/m
+ // with the exponent forced to 0 and only the top 5 bits nonzero. 0,
+ // infinity, and NaN all return 1.0.
+ // See the ARM patent for more information.
+ { 0x0ce60, "FRCP_APPROX", ADD_ONE_SRC },
+ { 0x0ce70, "FRSQ_FAST.f16.Y", ADD_ONE_SRC },
+ { 0x0cf40, "ATAN_ASSIST", ADD_TWO_SRC },
+ { 0x0cf48, "ATAN_TABLE", ADD_TWO_SRC },
+ { 0x0cf50, "SIN_TABLE", ADD_ONE_SRC },
+ { 0x0cf51, "COS_TABLE", ADD_ONE_SRC },
+ { 0x0cf58, "EXP_TABLE", ADD_ONE_SRC },
+ { 0x0cf60, "FLOG2_TABLE", ADD_ONE_SRC },
+ { 0x0cf64, "FLOGE_TABLE", ADD_ONE_SRC },
+ { 0x0d000, "BRANCH", ADD_BRANCH },
+ // For each bit i, return src2[i] ? src0[i] : src1[i]. In other words, this
+ // is the same as (src2 & src0) | (~src2 & src1).
+ { 0x0e8c0, "MUX", ADD_THREE_SRC },
+ { 0x0e9b0, "ATAN_LDEXP.Y.f32", ADD_TWO_SRC },
+ { 0x0e9b8, "ATAN_LDEXP.X.f32", ADD_TWO_SRC },
+ { 0x0ea60, "SEL.XX.i16", ADD_TWO_SRC },
+ { 0x0ea70, "SEL.XY.i16", ADD_TWO_SRC },
+ { 0x0ea68, "SEL.YX.i16", ADD_TWO_SRC },
+ { 0x0ea78, "SEL.YY.i16", ADD_TWO_SRC },
+ { 0x0ec00, "F32_TO_F16", ADD_TWO_SRC },
+ { 0x0f640, "ICMP.GL.GT", ADD_TWO_SRC }, // src0 > src1 ? 1 : 0
+ { 0x0f648, "ICMP.GL.GE", ADD_TWO_SRC },
+ { 0x0f650, "UCMP.GL.GT", ADD_TWO_SRC },
+ { 0x0f658, "UCMP.GL.GE", ADD_TWO_SRC },
+ { 0x0f660, "ICMP.GL.EQ", ADD_TWO_SRC },
+ { 0x0f6c0, "ICMP.D3D.GT", ADD_TWO_SRC }, // src0 > src1 ? ~0 : 0
+ { 0x0f6c8, "ICMP.D3D.GE", ADD_TWO_SRC },
+ { 0x0f6d0, "UCMP.D3D.GT", ADD_TWO_SRC },
+ { 0x0f6d8, "UCMP.D3D.GE", ADD_TWO_SRC },
+ { 0x0f6e0, "ICMP.D3D.EQ", ADD_TWO_SRC },
+ { 0x10000, "MAX.v2f16", ADD_FMINMAX16 },
+ { 0x11000, "ADD_MSCALE.f32", ADD_FADDMscale },
+ { 0x12000, "MIN.v2f16", ADD_FMINMAX16 },
+ { 0x14000, "ADD.v2f16", ADD_FADD16 },
+ { 0x17000, "FCMP.D3D", ADD_FCMP16 },
+ { 0x178c0, "ADD.i32", ADD_TWO_SRC },
+ { 0x17900, "ADD.v2i16", ADD_TWO_SRC },
+ { 0x17ac0, "SUB.i32", ADD_TWO_SRC },
+ { 0x17c10, "ADDC.i32", ADD_TWO_SRC }, // adds src0 to the bottom bit of src1
+ { 0x17d80, "ADD.i32.i16.X", ADD_TWO_SRC },
+ { 0x17d90, "ADD.i32.u16.X", ADD_TWO_SRC },
+ { 0x17dc0, "ADD.i32.i16.Y", ADD_TWO_SRC },
+ { 0x17dd0, "ADD.i32.u16.Y", ADD_TWO_SRC },
+ // Compute varying address and datatype (for storing in the vertex shader),
+ // and store the vec3 result in the data register. The result is passed as
+ // the 3 normal arguments to ST_VAR.
+ { 0x18000, "LD_VAR_ADDR.f16", ADD_VARYING_ADDRESS, true },
+ { 0x18100, "LD_VAR_ADDR.f32", ADD_VARYING_ADDRESS, true },
+ { 0x18200, "LD_VAR_ADDR.i32", ADD_VARYING_ADDRESS, true },
+ { 0x18300, "LD_VAR_ADDR.u32", ADD_VARYING_ADDRESS, true },
+ // Implements alpha-to-coverage, as well as possibly the late depth and
+ // stencil tests. The first source is the existing sample mask in R60
+ // (possibly modified by gl_SampleMask), and the second source is the alpha
+ // value. The sample mask is written right away based on the
+ // alpha-to-coverage result using the normal register write mechanism,
+ // since that doesn't need to read from any memory, and then written again
+ // later based on the result of the stencil and depth tests using the
+ // special register.
+ { 0x191e8, "ATEST.f32", ADD_TWO_SRC, true },
+ { 0x191f0, "ATEST.X.f16", ADD_TWO_SRC, true },
+ { 0x191f8, "ATEST.Y.f16", ADD_TWO_SRC, true },
+ // store a varying given the address and datatype from LD_VAR_ADDR
+ { 0x19300, "ST_VAR.v1", ADD_THREE_SRC, true },
+ { 0x19340, "ST_VAR.v2", ADD_THREE_SRC, true },
+ { 0x19380, "ST_VAR.v3", ADD_THREE_SRC, true },
+ { 0x193c0, "ST_VAR.v4", ADD_THREE_SRC, true },
+ // This takes the sample coverage mask (computed by ATEST above) as a
+ // regular argument, in addition to the vec4 color in the special register.
+ { 0x1952c, "BLEND", ADD_BLENDING, true },
+ { 0x1a000, "LD_VAR.16", ADD_VARYING_INTERP, true },
+ { 0x1ae60, "TEX", ADD_TEX, true },
+ { 0x1c000, "RSHIFT_NAND.i32", ADD_THREE_SRC },
+ { 0x1c300, "RSHIFT_OR.i32", ADD_THREE_SRC },
+ { 0x1c400, "RSHIFT_AND.i32", ADD_THREE_SRC },
+ { 0x1c700, "RSHIFT_NOR.i32", ADD_THREE_SRC },
+ { 0x1c800, "LSHIFT_NAND.i32", ADD_THREE_SRC },
+ { 0x1cb00, "LSHIFT_OR.i32", ADD_THREE_SRC },
+ { 0x1cc00, "LSHIFT_AND.i32", ADD_THREE_SRC },
+ { 0x1cf00, "LSHIFT_NOR.i32", ADD_THREE_SRC },
+ { 0x1d000, "RSHIFT_XOR.i32", ADD_THREE_SRC },
+ { 0x1d100, "RSHIFT_XNOR.i32", ADD_THREE_SRC },
+ { 0x1d200, "LSHIFT_XOR.i32", ADD_THREE_SRC },
+ { 0x1d300, "LSHIFT_XNOR.i32", ADD_THREE_SRC },
+ { 0x1d400, "LSHIFT_ADD.i32", ADD_THREE_SRC },
+ { 0x1d500, "LSHIFT_SUB.i32", ADD_THREE_SRC },
+ { 0x1d600, "LSHIFT_RSUB.i32", ADD_THREE_SRC },
+ { 0x1d700, "RSHIFT_ADD.i32", ADD_THREE_SRC },
+ { 0x1d800, "RSHIFT_SUB.i32", ADD_THREE_SRC },
+ { 0x1d900, "RSHIFT_RSUB.i32", ADD_THREE_SRC },
+ { 0x1da00, "ARSHIFT_ADD.i32", ADD_THREE_SRC },
+ { 0x1db00, "ARSHIFT_SUB.i32", ADD_THREE_SRC },
+ { 0x1dc00, "ARSHIFT_RSUB.i32", ADD_THREE_SRC },
+ { 0x1dd18, "OR.i32", ADD_TWO_SRC },
+ { 0x1dd20, "AND.i32", ADD_TWO_SRC },
+ { 0x1dd60, "LSHIFT.i32", ADD_TWO_SRC },
+ { 0x1dd50, "XOR.i32", ADD_TWO_SRC },
+ { 0x1dd80, "RSHIFT.i32", ADD_TWO_SRC },
+ { 0x1dda0, "ARSHIFT.i32", ADD_TWO_SRC },
+};
+
+static struct add_op_info find_add_op_info(unsigned op)
+{
+ for (unsigned i = 0; i < ARRAY_SIZE(add_op_infos); i++) {
+ unsigned opCmp = ~0;
+ switch (add_op_infos[i].src_type) {
+ case ADD_ONE_SRC:
+ case ADD_BLENDING:
+ opCmp = op;
+ break;
+ case ADD_TWO_SRC:
+ opCmp = op & ~0x7;
+ break;
+ case ADD_THREE_SRC:
+ opCmp = op & ~0x3f;
+ break;
+ case ADD_TEX:
+ opCmp = op & ~0xf;
+ break;
+ case ADD_FADD:
+ case ADD_FMINMAX:
+ case ADD_FADD16:
+ opCmp = op & ~0x1fff;
+ break;
+ case ADD_FMINMAX16:
+ case ADD_FADDMscale:
+ opCmp = op & ~0xfff;
+ break;
+ case ADD_FCMP:
+ case ADD_FCMP16:
+ opCmp = op & ~0x7ff;
+ break;
+ case ADD_TEX_COMPACT:
+ opCmp = op & ~0x3ff;
+ break;
+ case ADD_VARYING_INTERP:
+ opCmp = op & ~0x7ff;
+ break;
+ case ADD_VARYING_ADDRESS:
+ opCmp = op & ~0xff;
+ break;
+ case ADD_LOAD_ATTR:
+ opCmp = op & ~0x7f;
+ break;
+ case ADD_BRANCH:
+ opCmp = op & ~0xfff;
+ break;
+ default:
+ opCmp = ~0;
+ break;
+ }
+ if (add_op_infos[i].op == opCmp)
+ return add_op_infos[i];
+ }
+
+ struct add_op_info info;
+ snprintf(info.name, sizeof(info.name), "op%04x", op);
+ info.op = op;
+ info.src_type = ADD_TWO_SRC;
+ info.has_data_reg = true;
+ return info;
+}
+
+static void dump_add(uint64_t word, struct bifrost_regs regs, struct bifrost_regs next_regs, uint64_t *consts,
+ unsigned data_reg, unsigned offset, bool verbose)
+{
+ if (verbose) {
+ printf("# ADD: %016" PRIx64 "\n", word);
+ }
+ struct bifrost_add_inst ADD;
+ memcpy((char *) &ADD, (char *) &word, sizeof(ADD));
+ struct add_op_info info = find_add_op_info(ADD.op);
+
+ printf("%s", info.name);
+
+ // float16 seems like it doesn't support output modifiers
+ if (info.src_type == ADD_FADD || info.src_type == ADD_FMINMAX) {
+ // output modifiers
+ dump_output_mod(bits(ADD.op, 8, 10));
+ if (info.src_type == ADD_FADD)
+ dump_round_mode(bits(ADD.op, 10, 12));
+ else
+ dump_minmax_mode(bits(ADD.op, 10, 12));
+ } else if (info.src_type == ADD_FCMP || info.src_type == ADD_FCMP16) {
+ dump_fcmp(bits(ADD.op, 3, 6));
+ if (info.src_type == ADD_FCMP)
+ printf(".f32");
+ else
+ printf(".v2f16");
+ } else if (info.src_type == ADD_FADDMscale) {
+ switch ((ADD.op >> 6) & 0x7) {
+ case 0: break;
+ // causes GPU hangs on G71
+ case 1: printf(".invalid"); break;
+ // Same as usual outmod value.
+ case 2: printf(".clamp_0_1"); break;
+ // If src0 is infinite or NaN, flush it to zero so that the other
+ // source is passed through unmodified.
+ case 3: printf(".flush_src0_inf_nan"); break;
+ // Vice versa.
+ case 4: printf(".flush_src1_inf_nan"); break;
+ // Every other case seems to behave the same as the above?
+ default: printf(".unk%d", (ADD.op >> 6) & 0x7); break;
+ }
+ } else if (info.src_type == ADD_VARYING_INTERP) {
+ if (ADD.op & 0x200)
+ printf(".reuse");
+ if (ADD.op & 0x400)
+ printf(".flat");
+ switch ((ADD.op >> 7) & 0x3) {
+ case 0: printf(".per_frag"); break;
+ case 1: printf(".centroid"); break;
+ case 2: break;
+ case 3: printf(".explicit"); break;
+ }
+ printf(".v%d", ((ADD.op >> 5) & 0x3) + 1);
+ } else if (info.src_type == ADD_BRANCH) {
+ enum branch_code branchCode = (enum branch_code) ((ADD.op >> 6) & 0x3f);
+ if (branchCode == BR_ALWAYS) {
+ // unconditional branch
+ } else {
+ enum branch_cond cond = (enum branch_cond) ((ADD.op >> 6) & 0x7);
+ enum branch_bit_size size = (enum branch_bit_size) ((ADD.op >> 9) & 0x7);
+ bool portSwapped = (ADD.op & 0x7) < ADD.src0;
+ // See the comment in branch_bit_size
+ if (size == BR_SIZE_16YX0)
+ portSwapped = true;
+ if (size == BR_SIZE_16YX1)
+ portSwapped = false;
+ // These sizes are only for floating point comparisons, so the
+ // non-floating-point comparisons are reused to encode the flipped
+ // versions.
+ if (size == BR_SIZE_32_AND_16X || size == BR_SIZE_32_AND_16Y)
+ portSwapped = false;
+ // There's only one argument, so we reuse the extra argument to
+ // encode this.
+ if (size == BR_SIZE_ZERO)
+ portSwapped = !(ADD.op & 1);
+
+ switch (cond) {
+ case BR_COND_LT:
+ if (portSwapped)
+ printf(".LT.u");
+ else
+ printf(".LT.i");
+ break;
+ case BR_COND_LE:
+ if (size == BR_SIZE_32_AND_16X || size == BR_SIZE_32_AND_16Y) {
+ printf(".UNE.f");
+ } else {
+ if (portSwapped)
+ printf(".LE.u");
+ else
+ printf(".LE.i");
+ }
+ break;
+ case BR_COND_GT:
+ if (portSwapped)
+ printf(".GT.u");
+ else
+ printf(".GT.i");
+ break;
+ case BR_COND_GE:
+ if (portSwapped)
+ printf(".GE.u");
+ else
+ printf(".GE.i");
+ break;
+ case BR_COND_EQ:
+ if (portSwapped)
+ printf(".NE.i");
+ else
+ printf(".EQ.i");
+ break;
+ case BR_COND_OEQ:
+ if (portSwapped)
+ printf(".UNE.f");
+ else
+ printf(".OEQ.f");
+ break;
+ case BR_COND_OGT:
+ if (portSwapped)
+ printf(".OGT.unk.f");
+ else
+ printf(".OGT.f");
+ break;
+ case BR_COND_OLT:
+ if (portSwapped)
+ printf(".OLT.unk.f");
+ else
+ printf(".OLT.f");
+ break;
+ }
+ switch (size) {
+ case BR_SIZE_32:
+ case BR_SIZE_32_AND_16X:
+ case BR_SIZE_32_AND_16Y:
+ printf("32");
+ break;
+ case BR_SIZE_16XX:
+ case BR_SIZE_16YY:
+ case BR_SIZE_16YX0:
+ case BR_SIZE_16YX1:
+ printf("16");
+ break;
+ case BR_SIZE_ZERO: {
+ unsigned ctrl = (ADD.op >> 1) & 0x3;
+ if (ctrl == 0)
+ printf("32.Z");
+ else
+ printf("16.Z");
+ break;
+ }
+ }
+ }
+ }
+ printf(" ");
+
+ struct bifrost_reg_ctrl next_ctrl = DecodeRegCtrl(next_regs);
+ if (next_ctrl.add_write_unit != REG_WRITE_NONE) {
+ printf("{R%d, T1}, ", GetRegToWrite(next_ctrl.add_write_unit, next_regs));
+ } else {
+ printf("T1, ");
+ }
+
+ switch (info.src_type) {
+ case ADD_BLENDING:
+ // Note: in this case, regs.uniform_const == location | 0x8
+ // This probably means we can't load uniforms or immediates in the
+ // same instruction. This re-uses the encoding that normally means
+ // "disabled", where the low 4 bits are ignored. Perhaps the extra
+ // 0x8 or'd in indicates this is happening.
+ printf("location:%d, ", regs.uniform_const & 0x7);
+ // fallthrough
+ case ADD_ONE_SRC:
+ dump_src(ADD.src0, regs, consts, false);
+ break;
+ case ADD_TEX:
+ case ADD_TEX_COMPACT: {
+ int tex_index;
+ int sampler_index;
+ bool dualTex = false;
+ if (info.src_type == ADD_TEX_COMPACT) {
+ tex_index = (ADD.op >> 3) & 0x7;
+ sampler_index = (ADD.op >> 7) & 0x7;
+ bool unknown = (ADD.op & 0x40);
+ // TODO: figure out if the unknown bit is ever 0
+ if (!unknown)
+ printf("unknown ");
+ } else {
+ uint64_t constVal = get_const(consts, regs);
+ uint32_t controlBits = (ADD.op & 0x8) ? (constVal >> 32) : constVal;
+ struct bifrost_tex_ctrl ctrl;
+ memcpy((char *) &ctrl, (char *) &controlBits, sizeof(ctrl));
+
+ // TODO: figure out what actually triggers dual-tex
+ if (ctrl.result_type == 9) {
+ struct bifrost_dual_tex_ctrl dualCtrl;
+ memcpy((char *) &dualCtrl, (char *) &controlBits, sizeof(ctrl));
+ printf("(dualtex) tex0:%d samp0:%d tex1:%d samp1:%d ",
+ dualCtrl.tex_index0, dualCtrl.sampler_index0,
+ dualCtrl.tex_index1, dualCtrl.sampler_index1);
+ if (dualCtrl.unk0 != 3)
+ printf("unk:%d ", dualCtrl.unk0);
+ dualTex = true;
+ } else {
+ if (ctrl.no_merge_index) {
+ tex_index = ctrl.tex_index;
+ sampler_index = ctrl.sampler_index;
+ } else {
+ tex_index = sampler_index = ctrl.tex_index;
+ unsigned unk = ctrl.sampler_index >> 2;
+ if (unk != 3)
+ printf("unk:%d ", unk);
+ if (ctrl.sampler_index & 1)
+ tex_index = -1;
+ if (ctrl.sampler_index & 2)
+ sampler_index = -1;
+ }
+
+ if (ctrl.unk0 != 3)
+ printf("unk0:%d ", ctrl.unk0);
+ if (ctrl.unk1)
+ printf("unk1 ");
+ if (ctrl.unk2 != 0xf)
+ printf("unk2:%x ", ctrl.unk2);
+
+ switch (ctrl.result_type) {
+ case 0x4:
+ printf("f32 "); break;
+ case 0xe:
+ printf("i32 "); break;
+ case 0xf:
+ printf("u32 "); break;
+ default:
+ printf("unktype(%x) ", ctrl.result_type);
+ }
+
+ switch (ctrl.tex_type) {
+ case 0:
+ printf("cube "); break;
+ case 1:
+ printf("buffer "); break;
+ case 2:
+ printf("2D "); break;
+ case 3:
+ printf("3D "); break;
+ }
+
+ if (ctrl.is_shadow)
+ printf("shadow ");
+ if (ctrl.is_array)
+ printf("array ");
+
+ if (!ctrl.filter) {
+ if (ctrl.calc_gradients) {
+ int comp = (controlBits >> 20) & 0x3;
+ printf("txg comp:%d ", comp);
+ } else {
+ printf("txf ");
+ }
+ } else {
+ if (!ctrl.not_supply_lod) {
+ if (ctrl.compute_lod)
+ printf("lod_bias ");
+ else
+ printf("lod ");
+ }
+
+ if (!ctrl.calc_gradients)
+ printf("grad ");
+ }
+
+ if (ctrl.texel_offset)
+ printf("offset ");
+ }
+ }
+
+ if (!dualTex) {
+ if (tex_index == -1)
+ printf("tex:indirect ");
+ else
+ printf("tex:%d ", tex_index);
+
+ if (sampler_index == -1)
+ printf("samp:indirect ");
+ else
+ printf("samp:%d ", sampler_index);
+ }
+ break;
+ }
+ case ADD_VARYING_INTERP: {
+ unsigned addr = ADD.op & 0x1f;
+ if (addr < 0b10100) {
+ // direct addr
+ printf("%d", addr);
+ } else if (addr < 0b11000) {
+ if (addr == 22)
+ printf("fragw");
+ else if (addr == 23)
+ printf("fragz");
+ else
+ printf("unk%d", addr);
+ } else {
+ dump_src(ADD.op & 0x7, regs, consts, false);
+ }
+ printf(", ");
+ dump_src(ADD.src0, regs, consts, false);
+ break;
+ }
+ case ADD_VARYING_ADDRESS: {
+ dump_src(ADD.src0, regs, consts, false);
+ printf(", ");
+ dump_src(ADD.op & 0x7, regs, consts, false);
+ printf(", ");
+ unsigned location = (ADD.op >> 3) & 0x1f;
+ if (location < 16) {
+ printf("location:%d", location);
+ } else if (location == 20) {
+ printf("location:%u", (uint32_t) get_const(consts, regs));
+ } else if (location == 21) {
+ printf("location:%u", (uint32_t) (get_const(consts, regs) >> 32));
+ } else {
+ printf("location:%d(unk)", location);
+ }
+ break;
+ }
+ case ADD_LOAD_ATTR:
+ printf("location:%d, ", (ADD.op >> 3) & 0xf);
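+ // fallthrough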
+ case ADD_TWO_SRC:
+ dump_src(ADD.src0, regs, consts, false);
+ printf(", ");
+ dump_src(ADD.op & 0x7, regs, consts, false);
+ break;
+ case ADD_THREE_SRC:
+ dump_src(ADD.src0, regs, consts, false);
+ printf(", ");
+ dump_src(ADD.op & 0x7, regs, consts, false);
+ printf(", ");
+ dump_src((ADD.op >> 3) & 0x7, regs, consts, false);
+ break;
+ case ADD_FADD:
+ case ADD_FMINMAX:
+ if (ADD.op & 0x10)
+ printf("-");
+ if (ADD.op & 0x1000)
+ printf("abs(");
+ dump_src(ADD.src0, regs, consts, false);
+ switch ((ADD.op >> 6) & 0x3) {
+ case 3:
+ printf(".x");
+ break;
+ default:
+ break;
+ }
+ if (ADD.op & 0x1000)
+ printf(")");
+ printf(", ");
+ if (ADD.op & 0x20)
+ printf("-");
+ if (ADD.op & 0x8)
+ printf("abs(");
+ dump_src(ADD.op & 0x7, regs, consts, false);
+ switch ((ADD.op >> 6) & 0x3) {
+ case 1:
+ case 3:
+ printf(".x");
+ break;
+ case 2:
+ printf(".y");
+ break;
+ case 0:
+ break;
+ default:
+ printf(".unk");
+ break;
+ }
+ if (ADD.op & 0x8)
+ printf(")");
+ break;
+ case ADD_FADD16:
+ if (ADD.op & 0x10)
+ printf("-");
+ if (ADD.op & 0x1000)
+ printf("abs(");
+ dump_src(ADD.src0, regs, consts, false);
+ if (ADD.op & 0x1000)
+ printf(")");
+ dump_16swizzle((ADD.op >> 6) & 0x3);
+ printf(", ");
+ if (ADD.op & 0x20)
+ printf("-");
+ if (ADD.op & 0x8)
+ printf("abs(");
+ dump_src(ADD.op & 0x7, regs, consts, false);
+ dump_16swizzle((ADD.op >> 8) & 0x3);
+ if (ADD.op & 0x8)
+ printf(")");
+ break;
+ case ADD_FMINMAX16: {
+ bool abs1 = ADD.op & 0x8;
+ bool abs2 = (ADD.op & 0x7) < ADD.src0;
+ if (ADD.op & 0x10)
+ printf("-");
+ if (abs1 || abs2)
+ printf("abs(");
+ dump_src(ADD.src0, regs, consts, false);
+ dump_16swizzle((ADD.op >> 6) & 0x3);
+ if (abs1 || abs2)
+ printf(")");
+ printf(", ");
+ if (ADD.op & 0x20)
+ printf("-");
+ if (abs1 && abs2)
+ printf("abs(");
+ dump_src(ADD.op & 0x7, regs, consts, false);
+ dump_16swizzle((ADD.op >> 8) & 0x3);
+ if (abs1 && abs2)
+ printf(")");
+ break;
+ }
+ case ADD_FADDMscale: {
+ if (ADD.op & 0x400)
+ printf("-");
+ if (ADD.op & 0x200)
+ printf("abs(");
+ dump_src(ADD.src0, regs, consts, false);
+ if (ADD.op & 0x200)
+ printf(")");
+
+ printf(", ");
+
+ if (ADD.op & 0x800)
+ printf("-");
+ dump_src(ADD.op & 0x7, regs, consts, false);
+
+ printf(", ");
+
+ dump_src((ADD.op >> 3) & 0x7, regs, consts, false);
+ break;
+ }
+ case ADD_FCMP:
+ if (ADD.op & 0x400) {
+ printf("-");
+ }
+ if (ADD.op & 0x100) {
+ printf("abs(");
+ }
+ dump_src(ADD.src0, regs, consts, false);
+ switch ((ADD.op >> 6) & 0x3) {
+ case 3:
+ printf(".x");
+ break;
+ default:
+ break;
+ }
+ if (ADD.op & 0x100) {
+ printf(")");
+ }
+ printf(", ");
+ if (ADD.op & 0x200) {
+ printf("abs(");
+ }
+ dump_src(ADD.op & 0x7, regs, consts, false);
+ switch ((ADD.op >> 6) & 0x3) {
+ case 1:
+ case 3:
+ printf(".x");
+ break;
+ case 2:
+ printf(".y");
+ break;
+ case 0:
+ break;
+ default:
+ printf(".unk");
+ break;
+ }
+ if (ADD.op & 0x200) {
+ printf(")");
+ }
+ break;
+ case ADD_FCMP16:
+ dump_src(ADD.src0, regs, consts, false);
+ dump_16swizzle((ADD.op >> 6) & 0x3);
+ printf(", ");
+ dump_src(ADD.op & 0x7, regs, consts, false);
+ dump_16swizzle((ADD.op >> 8) & 0x3);
+ break;
+ case ADD_BRANCH: {
+ enum branch_code code = (enum branch_code) ((ADD.op >> 6) & 0x3f);
+ enum branch_bit_size size = (enum branch_bit_size) ((ADD.op >> 9) & 0x7);
+ if (code != BR_ALWAYS) {
+ dump_src(ADD.src0, regs, consts, false);
+ switch (size) {
+ case BR_SIZE_16XX:
+ printf(".x");
+ break;
+ case BR_SIZE_16YY:
+ case BR_SIZE_16YX0:
+ case BR_SIZE_16YX1:
+ printf(".y");
+ break;
+ case BR_SIZE_ZERO: {
+ unsigned ctrl = (ADD.op >> 1) & 0x3;
+ switch (ctrl) {
+ case 1:
+ printf(".y");
+ break;
+ case 2:
+ printf(".x");
+ break;
+ default:
+ break;
+ }
+ }
+ default:
+ break;
+ }
+ printf(", ");
+ }
+ if (code != BR_ALWAYS && size != BR_SIZE_ZERO) {
+ dump_src(ADD.op & 0x7, regs, consts, false);
+ switch (size) {
+ case BR_SIZE_16XX:
+ case BR_SIZE_16YX0:
+ case BR_SIZE_16YX1:
+ case BR_SIZE_32_AND_16X:
+ printf(".x");
+ break;
+ case BR_SIZE_16YY:
+ case BR_SIZE_32_AND_16Y:
+ printf(".y");
+ break;
+ default:
+ break;
+ }
+ printf(", ");
+ }
+ // I haven't had the chance to test if this actually specifies the
+ // branch offset, since I couldn't get it to produce values other
+ // than 5 (uniform/const high), but these three bits are always
+ // consistent across branch instructions, so it makes sense...
+ int offsetSrc = (ADD.op >> 3) & 0x7;
+ if (offsetSrc == 4 || offsetSrc == 5) {
+ // If the offset is known/constant, we can decode it
+ uint32_t raw_offset;
+ if (offsetSrc == 4)
+ raw_offset = get_const(consts, regs);
+ else
+ raw_offset = get_const(consts, regs) >> 32;
+ // The high 4 bits are flags, while the rest is the
+ // twos-complement offset in bytes (here we convert to
+ // clauses).
+ int32_t branch_offset = ((int32_t) raw_offset << 4) >> 8;
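+ // (The << 4 drops the flag bits described below; the arithmetic >> 8 then
+ // sign-extends the 28-bit field and divides by 16, converting the byte
+ // offset into 128-bit instruction groups to match `offset'.)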
+
+ // If high4 is the high 4 bits of the last 64-bit constant,
+ // this is calculated as (high4 + 4) & 0xf, or 0 if the branch
+ // offset itself is the last constant. Not sure if this is
+ // actually used, or just garbage in unused bits, but in any
+ // case, we can just ignore it here since it's redundant. Note
+ // that if there is any padding, this will be 4 since the
+ // padding counts as the last constant.
+ unsigned flags = raw_offset >> 28;
+ (void) flags;
+
+ // Note: the offset is in bytes, relative to the beginning of the
+ // current clause, so a zero offset would be a loop back to the
+ // same clause (annoyingly different from Midgard).
+ printf("clause_%d", offset + branch_offset);
+ } else {
+ dump_src(offsetSrc, regs, consts, false);
+ }
+ }
+ }
+ if (info.has_data_reg) {
+ printf(", R%d", data_reg);
+ }
+ printf("\n");
+}
+
+void dump_instr(const struct bifrost_alu_inst *instr, struct bifrost_regs next_regs, uint64_t *consts,
+ unsigned data_reg, unsigned offset, bool verbose)
+{
+ struct bifrost_regs regs;
+ memcpy((char *) &regs, (char *) &instr->reg_bits, sizeof(regs));
+
+ if (verbose) {
+ printf("# regs: %016" PRIx64 "\n", instr->reg_bits);
+ dump_regs(regs);
+ }
+ dump_fma(instr->fma_bits, regs, next_regs, consts, verbose);
+ dump_add(instr->add_bits, regs, next_regs, consts, data_reg, offset, verbose);
+}
+
+bool dump_clause(uint32_t *words, unsigned *size, unsigned offset, bool verbose) {
+ // State for a decoded clause
+ struct bifrost_alu_inst instrs[8] = {};
+ uint64_t consts[6] = {};
+ unsigned num_instrs = 0;
+ unsigned num_consts = 0;
+ uint64_t header_bits = 0;
+ bool stopbit = false;
+
+ unsigned i;
+ for (i = 0; ; i++, words += 4) {
+ if (verbose) {
+ printf("# ");
+ for (int j = 0; j < 4; j++)
+ printf("%08x ", words[3 - j]); // low bit on the right
+ printf("\n");
+ }
+ unsigned tag = bits(words[0], 0, 8);
+
+ // speculatively decode some things that are common between many formats, so we can share some code
+ struct bifrost_alu_inst main_instr = {};
+ // 20 bits (the low 17 here; the top 3 are filled in from the tag-specific bits below)
+ main_instr.add_bits = bits(words[2], 2, 32 - 13);
+ // 23 bits
+ main_instr.fma_bits = bits(words[1], 11, 32) | bits(words[2], 0, 2) << (32 - 11);
+ // 35 bits
+ main_instr.reg_bits = ((uint64_t) bits(words[1], 0, 11)) << 24 | (uint64_t) bits(words[0], 8, 32);
+
+ uint64_t const0 = bits(words[0], 8, 32) << 4 | (uint64_t) words[1] << 28 | bits(words[2], 0, 4) << 60;
+ uint64_t const1 = bits(words[2], 4, 32) << 4 | (uint64_t) words[3] << 32;
+
+ bool stop = tag & 0x40;
+
+ if (verbose) {
+ printf("# tag: 0x%02x\n", tag);
+ }
+ if (tag & 0x80) {
+ unsigned idx = stop ? 5 : 2;
+ main_instr.add_bits |= ((tag >> 3) & 0x7) << 17;
+ instrs[idx + 1] = main_instr;
+ instrs[idx].add_bits = bits(words[3], 0, 17) | ((tag & 0x7) << 17);
+ instrs[idx].fma_bits |= bits(words[2], 19, 32) << 10;
+ consts[0] = bits(words[3], 17, 32) << 4;
+ } else {
+ bool done = false;
+ switch ((tag >> 3) & 0x7) {
+ case 0x0:
+ switch (tag & 0x7) {
+ case 0x3:
+ main_instr.add_bits |= bits(words[3], 29, 32) << 17;
+ instrs[1] = main_instr;
+ num_instrs = 2;
+ done = stop;
+ break;
+ case 0x4:
+ instrs[2].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17;
+ instrs[2].fma_bits |= bits(words[2], 19, 32) << 10;
+ consts[0] = const0;
+ num_instrs = 3;
+ num_consts = 1;
+ done = stop;
+ break;
+ case 0x1:
+ case 0x5:
+ instrs[2].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17;
+ instrs[2].fma_bits |= bits(words[2], 19, 32) << 10;
+ main_instr.add_bits |= bits(words[3], 26, 29) << 17;
+ instrs[3] = main_instr;
+ if ((tag & 0x7) == 0x5) {
+ num_instrs = 4;
+ done = stop;
+ }
+ break;
+ case 0x6:
+ instrs[5].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17;
+ instrs[5].fma_bits |= bits(words[2], 19, 32) << 10;
+ consts[0] = const0;
+ num_instrs = 6;
+ num_consts = 1;
+ done = stop;
+ break;
+ case 0x7:
+ instrs[5].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17;
+ instrs[5].fma_bits |= bits(words[2], 19, 32) << 10;
+ main_instr.add_bits |= bits(words[3], 26, 29) << 17;
+ instrs[6] = main_instr;
+ num_instrs = 7;
+ done = stop;
+ break;
+ default:
+ printf("unknown tag bits 0x%02x\n", tag);
+ }
+ break;
+ case 0x2:
+ case 0x3: {
+ unsigned idx = ((tag >> 3) & 0x7) == 2 ? 4 : 7;
+ main_instr.add_bits |= (tag & 0x7) << 17;
+ instrs[idx] = main_instr;
+ consts[0] |= (bits(words[2], 19, 32) | ((uint64_t) words[3] << 13)) << 19;
+ num_consts = 1;
+ num_instrs = idx + 1;
+ done = stop;
+ break;
+ }
+ case 0x4: {
+ unsigned idx = stop ? 4 : 1;
+ main_instr.add_bits |= (tag & 0x7) << 17;
+ instrs[idx] = main_instr;
+ instrs[idx + 1].fma_bits |= bits(words[3], 22, 32);
+ instrs[idx + 1].reg_bits = bits(words[2], 19, 32) | (bits(words[3], 0, 22) << (32 - 19));
+ break;
+ }
+ case 0x1:
+ // only constants can come after this
+ num_instrs = 1;
+ done = stop;
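+ // fallthrough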
+ case 0x5:
+ header_bits = bits(words[2], 19, 32) | ((uint64_t) words[3] << (32 - 19));
+ main_instr.add_bits |= (tag & 0x7) << 17;
+ instrs[0] = main_instr;
+ break;
+ case 0x6:
+ case 0x7: {
+ unsigned pos = tag & 0xf;
+ // note that `pos' encodes both the total number of
+ // instructions and the position in the constant stream,
+ // presumably because decoded constants and instructions
+ // share a buffer in the decoder, but we only care about
+ // the position in the constant stream; the total number of
+ // instructions is redundant.
+ unsigned const_idx = 7;
+ switch (pos) {
+ case 0:
+ case 1:
+ case 2:
+ case 6:
+ const_idx = 0;
+ break;
+ case 3:
+ case 4:
+ case 7:
+ case 9:
+ const_idx = 1;
+ break;
+ case 5:
+ case 0xa:
+ const_idx = 2;
+ break;
+ case 8:
+ case 0xb:
+ case 0xc:
+ const_idx = 3;
+ break;
+ case 0xd:
+ const_idx = 4;
+ break;
+ default:
+ printf("# unknown pos 0x%x\n", pos);
+ }
+ if (num_consts < const_idx + 2)
+ num_consts = const_idx + 2;
+ consts[const_idx] = const0;
+ consts[const_idx + 1] = const1;
+ done = stop;
+ break;
+ }
+ default:
+ break;
+ }
+
+ if (done)
+ break;
+ }
+ }
+
+ *size = i + 1;
+
+ if (verbose) {
+ printf("# header: %012" PRIx64 "\n", header_bits);
+ }
+
+ struct bifrost_header header;
+ memcpy((char *) &header, (char *) &header_bits, sizeof(struct bifrost_header));
+ dump_header(header, verbose);
+ if (!header.no_end_of_shader)
+ stopbit = true;
+
+ printf("{\n");
+ for (i = 0; i < num_instrs; i++) {
+ struct bifrost_regs next_regs;
+ if (i + 1 == num_instrs) {
+ memcpy((char *) &next_regs, (char *) &instrs[0].reg_bits,
+ sizeof(next_regs));
+ } else {
+ memcpy((char *) &next_regs, (char *) &instrs[i + 1].reg_bits,
+ sizeof(next_regs));
+ }
+
+ dump_instr(&instrs[i], next_regs, consts, header.datareg, offset, verbose);
+ }
+ printf("}\n");
+
+ if (verbose) {
+ for (unsigned i = 0; i < num_consts; i++) {
+ printf("# const%d: %08" PRIx64 "\n", 2 * i, consts[i] & 0xffffffff);
+ printf("# const%d: %08" PRIx64 "\n", 2 * i + 1, consts[i] >> 32);
+ }
+ }
+ return stopbit;
+}
+
+void disassemble_bifrost(uint8_t *code, size_t size, bool verbose)
+{
+ uint32_t *words = (uint32_t *) code;
+ uint32_t *words_end = words + (size / 4);
+ // used for displaying branch targets
+ unsigned offset = 0;
+ while (words != words_end)
+ {
+ // we don't know what the program-end bit is quite yet, so for now just
+ // assume that an all-0 quadword is padding
+ uint32_t zero[4] = {};
+ if (memcmp(words, zero, 4 * sizeof(uint32_t)) == 0)
+ break;
+ printf("clause_%d:\n", offset);
+ unsigned size;
+ if (dump_clause(words, &size, offset, verbose) == true) {
+ break;
+ }
+ words += size * 4;
+ offset += size;
+ }
+}
+
--- /dev/null
+/*
+ * Copyright (C) 2019 Connor Abbott <cwabbott0@gmail.com>
+ * Copyright (C) 2019 Lyude Paul <thatslyude@gmail.com>
+ * Copyright (C) 2019 Ryan Houdek <Sonicadvance1@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+void disassemble_bifrost(uint8_t *code, size_t size, bool verbose);
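+
+/* A minimal usage sketch (the buffer name and size below are placeholders):
+ * given a pointer to the raw Bifrost shader binary and its length in bytes,
+ * the whole program can be dumped with
+ *
+ *    disassemble_bifrost(shader_binary, shader_size, false);
+ *
+ * passing true as the last argument instead to get verbose per-word output.
+ */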
--- /dev/null
+# Copyright © 2018 Rob Clark
+# Copyright © 2019 Collabora
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+libpanfrost_bifrost_files = files(
+ 'disassemble.c',
+)
+
+libpanfrost_bifrost = static_library(
+ 'panfrost_bifrost',
+ [libpanfrost_bifrost_files],
+ include_directories : [inc_common],
+ c_args : [c_vis_args, no_override_init_args],
+ cpp_args : [cpp_vis_args],
+ build_by_default : false,
+)
--- /dev/null
+/*
+ * © Copyright 2017-2018 Alyssa Rosenzweig
+ * © Copyright 2017-2018 Connor Abbott
+ * © Copyright 2017-2018 Lyude Paul
+ * © Copyright 2019 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __PANFROST_JOB_H__
+#define __PANFROST_JOB_H__
+
+#include <stdint.h>
+#include <panfrost-misc.h>
+
+#define MALI_SHORT_PTR_BITS (sizeof(uintptr_t)*8)
+
+#define MALI_FBD_HIERARCHY_WEIGHTS 8
+
+#define MALI_PAYLOAD_SIZE 256
+
+typedef u32 mali_jd_core_req;
+
+enum mali_job_type {
+ JOB_NOT_STARTED = 0,
+ JOB_TYPE_NULL = 1,
+ JOB_TYPE_SET_VALUE = 2,
+ JOB_TYPE_CACHE_FLUSH = 3,
+ JOB_TYPE_COMPUTE = 4,
+ JOB_TYPE_VERTEX = 5,
+ JOB_TYPE_GEOMETRY = 6,
+ JOB_TYPE_TILER = 7,
+ JOB_TYPE_FUSED = 8,
+ JOB_TYPE_FRAGMENT = 9,
+};
+
+enum mali_draw_mode {
+ MALI_DRAW_NONE = 0x0,
+ MALI_POINTS = 0x1,
+ MALI_LINES = 0x2,
+ MALI_LINE_STRIP = 0x4,
+ MALI_LINE_LOOP = 0x6,
+ MALI_TRIANGLES = 0x8,
+ MALI_TRIANGLE_STRIP = 0xA,
+ MALI_TRIANGLE_FAN = 0xC,
+ MALI_POLYGON = 0xD,
+ MALI_QUADS = 0xE,
+ MALI_QUAD_STRIP = 0xF,
+
+ /* All other modes invalid */
+};
+
+/* Applies to tiler_gl_enables */
+
+
+#define MALI_OCCLUSION_QUERY (1 << 3)
+#define MALI_OCCLUSION_PRECISE (1 << 4)
+
+/* Set for a glFrontFace(GL_CCW) in a Y=0=TOP coordinate system (like Gallium).
+ * In OpenGL, this would correspond to glFrontFace(GL_CW). Mesa and the blob
+ * disagree about how to do viewport flipping, so the blob actually sets this
+ * for GL_CW but then has a negative viewport stride */
+#define MALI_FRONT_CCW_TOP (1 << 5)
+
+#define MALI_CULL_FACE_FRONT (1 << 6)
+#define MALI_CULL_FACE_BACK (1 << 7)
+
+/* TODO: Might this actually be a finer bitfield? */
+#define MALI_DEPTH_STENCIL_ENABLE 0x6400
+
+#define DS_ENABLE(field) \
+ (field == MALI_DEPTH_STENCIL_ENABLE) \
+ ? "MALI_DEPTH_STENCIL_ENABLE" \
+ : (field == 0) ? "0" \
+ : "0 /* XXX: Unknown, check hexdump */"
+
+/* Used in stencil and depth tests */
+
+enum mali_func {
+ MALI_FUNC_NEVER = 0,
+ MALI_FUNC_LESS = 1,
+ MALI_FUNC_EQUAL = 2,
+ MALI_FUNC_LEQUAL = 3,
+ MALI_FUNC_GREATER = 4,
+ MALI_FUNC_NOTEQUAL = 5,
+ MALI_FUNC_GEQUAL = 6,
+ MALI_FUNC_ALWAYS = 7
+};
+
+/* Same as OpenGL, but mixed up. Why? Because forget me, that's why! */
+
+enum mali_alt_func {
+ MALI_ALT_FUNC_NEVER = 0,
+ MALI_ALT_FUNC_GREATER = 1,
+ MALI_ALT_FUNC_EQUAL = 2,
+ MALI_ALT_FUNC_GEQUAL = 3,
+ MALI_ALT_FUNC_LESS = 4,
+ MALI_ALT_FUNC_NOTEQUAL = 5,
+ MALI_ALT_FUNC_LEQUAL = 6,
+ MALI_ALT_FUNC_ALWAYS = 7
+};
+
+/* Flags apply to unknown2_3? */
+
+#define MALI_HAS_MSAA (1 << 0)
+#define MALI_CAN_DISCARD (1 << 5)
+
+/* Applies on SFBD systems, specifying that programmable blending is in use */
+#define MALI_HAS_BLEND_SHADER (1 << 6)
+
+/* func is mali_func */
+#define MALI_DEPTH_FUNC(func) (func << 8)
+#define MALI_GET_DEPTH_FUNC(flags) ((flags >> 8) & 0x7)
+#define MALI_DEPTH_FUNC_MASK MALI_DEPTH_FUNC(0x7)
+
+#define MALI_DEPTH_TEST (1 << 11)
+
+/* Next flags to unknown2_4 */
+#define MALI_STENCIL_TEST (1 << 0)
+
+/* What?! */
+#define MALI_SAMPLE_ALPHA_TO_COVERAGE_NO_BLEND_SHADER (1 << 1)
+
+#define MALI_NO_DITHER (1 << 9)
+#define MALI_DEPTH_RANGE_A (1 << 12)
+#define MALI_DEPTH_RANGE_B (1 << 13)
+#define MALI_NO_MSAA (1 << 14)
+
+/* Stencil test state is all encoded in a single u32, just with a lot of
+ * enums... */
+
+enum mali_stencil_op {
+ MALI_STENCIL_KEEP = 0,
+ MALI_STENCIL_REPLACE = 1,
+ MALI_STENCIL_ZERO = 2,
+ MALI_STENCIL_INVERT = 3,
+ MALI_STENCIL_INCR_WRAP = 4,
+ MALI_STENCIL_DECR_WRAP = 5,
+ MALI_STENCIL_INCR = 6,
+ MALI_STENCIL_DECR = 7
+};
+
+struct mali_stencil_test {
+ unsigned ref : 8;
+ unsigned mask : 8;
+ enum mali_func func : 3;
+ enum mali_stencil_op sfail : 3;
+ enum mali_stencil_op dpfail : 3;
+ enum mali_stencil_op dppass : 3;
+ unsigned zero : 4;
+} __attribute__((packed));
+
+#define MALI_MASK_R (1 << 0)
+#define MALI_MASK_G (1 << 1)
+#define MALI_MASK_B (1 << 2)
+#define MALI_MASK_A (1 << 3)
+
+enum mali_nondominant_mode {
+ MALI_BLEND_NON_MIRROR = 0,
+ MALI_BLEND_NON_ZERO = 1
+};
+
+enum mali_dominant_blend {
+ MALI_BLEND_DOM_SOURCE = 0,
+ MALI_BLEND_DOM_DESTINATION = 1
+};
+
+enum mali_dominant_factor {
+ MALI_DOMINANT_UNK0 = 0,
+ MALI_DOMINANT_ZERO = 1,
+ MALI_DOMINANT_SRC_COLOR = 2,
+ MALI_DOMINANT_DST_COLOR = 3,
+ MALI_DOMINANT_UNK4 = 4,
+ MALI_DOMINANT_SRC_ALPHA = 5,
+ MALI_DOMINANT_DST_ALPHA = 6,
+ MALI_DOMINANT_CONSTANT = 7,
+};
+
+enum mali_blend_modifier {
+ MALI_BLEND_MOD_UNK0 = 0,
+ MALI_BLEND_MOD_NORMAL = 1,
+ MALI_BLEND_MOD_SOURCE_ONE = 2,
+ MALI_BLEND_MOD_DEST_ONE = 3,
+};
+
+struct mali_blend_mode {
+ enum mali_blend_modifier clip_modifier : 2;
+ unsigned unused_0 : 1;
+ unsigned negate_source : 1;
+
+ enum mali_dominant_blend dominant : 1;
+
+ enum mali_nondominant_mode nondominant_mode : 1;
+
+ unsigned unused_1 : 1;
+
+ unsigned negate_dest : 1;
+
+ enum mali_dominant_factor dominant_factor : 3;
+ unsigned complement_dominant : 1;
+} __attribute__((packed));
+
+struct mali_blend_equation {
+ /* Of type mali_blend_mode */
+ unsigned rgb_mode : 12;
+ unsigned alpha_mode : 12;
+
+ unsigned zero1 : 4;
+
+ /* Corresponds to MALI_MASK_* above and glColorMask arguments */
+
+ unsigned color_mask : 4;
+} __attribute__((packed));
+
+/* Used with channel swizzling */
+enum mali_channel {
+ MALI_CHANNEL_RED = 0,
+ MALI_CHANNEL_GREEN = 1,
+ MALI_CHANNEL_BLUE = 2,
+ MALI_CHANNEL_ALPHA = 3,
+ MALI_CHANNEL_ZERO = 4,
+ MALI_CHANNEL_ONE = 5,
+ MALI_CHANNEL_RESERVED_0 = 6,
+ MALI_CHANNEL_RESERVED_1 = 7,
+};
+
+struct mali_channel_swizzle {
+ enum mali_channel r : 3;
+ enum mali_channel g : 3;
+ enum mali_channel b : 3;
+ enum mali_channel a : 3;
+} __attribute__((packed));
+
+/* Compressed per-pixel formats. Each of these formats expands to one to four
+ * floating-point or integer numbers, as defined by the OpenGL specification.
+ * There are various places in OpenGL where the user can specify a compressed
+ * format in memory, which all use the same 8-bit enum in the various
+ * descriptors, although different hardware units support different formats.
+ */
+
+/* The top 3 bits specify how the bits of each component are interpreted. */
+
+/* e.g. R11F_G11F_B10F */
+#define MALI_FORMAT_SPECIAL (2 << 5)
+
+/* signed normalized, e.g. RGBA8_SNORM */
+#define MALI_FORMAT_SNORM (3 << 5)
+
+/* e.g. RGBA8UI */
+#define MALI_FORMAT_UINT (4 << 5)
+
+/* e.g. RGBA8 and RGBA32F */
+#define MALI_FORMAT_UNORM (5 << 5)
+
+/* e.g. RGBA8I and RGBA16F */
+#define MALI_FORMAT_SINT (6 << 5)
+
+/* These formats seem to largely duplicate the others. They're used at least
+ * for Bifrost framebuffer output.
+ */
+#define MALI_FORMAT_SPECIAL2 (7 << 5)
+
+/* If the high 3 bits are 3 to 6, these two bits say how many components
+ * there are.
+ */
+#define MALI_NR_CHANNELS(n) ((n - 1) << 3)
+
+/* If the high 3 bits are 3 to 6, then the low 3 bits say how big each
+ * component is, except the special MALI_CHANNEL_FLOAT which overrides what the
+ * bits mean.
+ */
+
+#define MALI_CHANNEL_4 2
+
+#define MALI_CHANNEL_8 3
+
+#define MALI_CHANNEL_16 4
+
+#define MALI_CHANNEL_32 5
+
+/* For MALI_FORMAT_SINT it means a half-float (e.g. RG16F). For
+ * MALI_FORMAT_UNORM, it means a 32-bit float.
+ */
+#define MALI_CHANNEL_FLOAT 7
+
+enum mali_format {
+ MALI_RGB565 = MALI_FORMAT_SPECIAL | 0x0,
+ MALI_RGB5_A1_UNORM = MALI_FORMAT_SPECIAL | 0x2,
+ MALI_RGB10_A2_UNORM = MALI_FORMAT_SPECIAL | 0x3,
+ MALI_RGB10_A2_SNORM = MALI_FORMAT_SPECIAL | 0x5,
+ MALI_RGB10_A2UI = MALI_FORMAT_SPECIAL | 0x7,
+ MALI_RGB10_A2I = MALI_FORMAT_SPECIAL | 0x9,
+
+ /* YUV formats */
+ MALI_NV12 = MALI_FORMAT_SPECIAL | 0xc,
+
+ MALI_Z32_UNORM = MALI_FORMAT_SPECIAL | 0xD,
+ MALI_R32_FIXED = MALI_FORMAT_SPECIAL | 0x11,
+ MALI_RG32_FIXED = MALI_FORMAT_SPECIAL | 0x12,
+ MALI_RGB32_FIXED = MALI_FORMAT_SPECIAL | 0x13,
+ MALI_RGBA32_FIXED = MALI_FORMAT_SPECIAL | 0x14,
+ MALI_R11F_G11F_B10F = MALI_FORMAT_SPECIAL | 0x19,
+ MALI_R9F_G9F_B9F_E5F = MALI_FORMAT_SPECIAL | 0x1b,
+ /* Only used for varyings, to indicate the transformed gl_Position */
+ MALI_VARYING_POS = MALI_FORMAT_SPECIAL | 0x1e,
+ /* Only used for varyings, to indicate that the write should be
+ * discarded.
+ */
+ MALI_VARYING_DISCARD = MALI_FORMAT_SPECIAL | 0x1f,
+
+ MALI_R8_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8,
+ MALI_R16_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16,
+ MALI_R32_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32,
+ MALI_RG8_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8,
+ MALI_RG16_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16,
+ MALI_RG32_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32,
+ MALI_RGB8_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8,
+ MALI_RGB16_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16,
+ MALI_RGB32_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32,
+ MALI_RGBA8_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8,
+ MALI_RGBA16_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16,
+ MALI_RGBA32_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32,
+
+ MALI_R8UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8,
+ MALI_R16UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16,
+ MALI_R32UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32,
+ MALI_RG8UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8,
+ MALI_RG16UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16,
+ MALI_RG32UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32,
+ MALI_RGB8UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8,
+ MALI_RGB16UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16,
+ MALI_RGB32UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32,
+ MALI_RGBA8UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8,
+ MALI_RGBA16UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16,
+ MALI_RGBA32UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32,
+
+ MALI_R8_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8,
+ MALI_R16_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16,
+ MALI_R32_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32,
+ MALI_R32F = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_FLOAT,
+ MALI_RG8_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8,
+ MALI_RG16_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16,
+ MALI_RG32_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32,
+ MALI_RG32F = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_FLOAT,
+ MALI_RGB8_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8,
+ MALI_RGB16_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16,
+ MALI_RGB32_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32,
+ MALI_RGB32F = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_FLOAT,
+ MALI_RGBA4_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_4,
+ MALI_RGBA8_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8,
+ MALI_RGBA16_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16,
+ MALI_RGBA32_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32,
+ MALI_RGBA32F = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_FLOAT,
+
+ MALI_R8I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8,
+ MALI_R16I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16,
+ MALI_R32I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32,
+ MALI_R16F = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_FLOAT,
+ MALI_RG8I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8,
+ MALI_RG16I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16,
+ MALI_RG32I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32,
+ MALI_RG16F = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_FLOAT,
+ MALI_RGB8I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8,
+ MALI_RGB16I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16,
+ MALI_RGB32I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32,
+ MALI_RGB16F = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_FLOAT,
+ MALI_RGBA8I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8,
+ MALI_RGBA16I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16,
+ MALI_RGBA32I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32,
+ MALI_RGBA16F = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_FLOAT,
+
+ MALI_RGBA4 = MALI_FORMAT_SPECIAL2 | 0x8,
+ MALI_RGBA8_2 = MALI_FORMAT_SPECIAL2 | 0xd,
+ MALI_RGB10_A2_2 = MALI_FORMAT_SPECIAL2 | 0xe,
+};
+
+
+/* Alpha coverage is encoded as 4-bits (from a clampf), with inversion
+ * literally performing a bitwise invert. This function produces slightly wrong
+ * results and I'm not sure why; some rounding issue I suppose... */
+
+#define MALI_ALPHA_COVERAGE(clampf) ((uint16_t) (int) (clampf * 15.0f))
+#define MALI_GET_ALPHA_COVERAGE(nibble) ((float) nibble / 15.0f)
+
+/* Applies to midgard1.flags */
+
+/* Should the hardware perform early-Z testing? Normally this should be set
+ * for performance reasons. Clear it if the shader uses discard,
+ * alpha-to-coverage, and so on. It's also possible this disables
+ * forward-pixel kill; we're not quite sure which bit is which yet.
+ * TODO: How does this interact with blending? */
+
+#define MALI_EARLY_Z (1 << 6)
+
+/* Should the hardware calculate derivatives (via helper invocations)? Set in a
+ * fragment shader that uses texturing or derivative functions */
+
+#define MALI_HELPER_INVOCATIONS (1 << 7)
+
+/* Flags denoting the fragment shader's use of tilebuffer readback. If the
+ * shader might read any part of the tilebuffer, set MALI_READS_TILEBUFFER. If
+ * it might read depth/stencil in particular, also set MALI_READS_ZS */
+
+#define MALI_READS_ZS (1 << 8)
+#define MALI_READS_TILEBUFFER (1 << 12)
+
+/* The raw Midgard blend payload can either be an equation or a shader
+ * address, depending on the context */
+
+union midgard_blend {
+ mali_ptr shader;
+
+ struct {
+ struct mali_blend_equation equation;
+ float constant;
+ };
+};
+
+/* On MRT Midgard systems (using an MFBD), each render target gets its own
+ * blend descriptor */
+
+#define MALI_BLEND_SRGB (0x400)
+
+struct midgard_blend_rt {
+ /* Flags base value of 0x200 to enable the render target.
+ * OR with 0x1 for blending (anything other than REPLACE).
+ * OR with 0x2 for programmable blending with 0-2 registers
+ * OR with 0x3 for programmable blending with 2+ registers
+ * OR with MALI_BLEND_SRGB for implicit sRGB
+ */
+
+ u64 flags;
+ union midgard_blend blend;
+} __attribute__((packed));
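+
+/* A minimal sketch of composing the flags word described above, illustrative
+ * only: the values are taken from the comment, and whether any other bits
+ * matter is unknown. */
+
+static inline u64
+midgard_blend_rt_flags_sketch(unsigned blending_enabled, unsigned is_srgb)
+{
+        u64 flags = 0x200; /* enable the render target */
+
+        if (blending_enabled)
+                flags |= 0x1; /* any blend mode other than REPLACE */
+
+        if (is_srgb)
+                flags |= MALI_BLEND_SRGB; /* implicit sRGB conversion */
+
+        return flags;
+}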
+
+/* On Bifrost systems (all MRT), each render target gets one of these
+ * descriptors */
+
+struct bifrost_blend_rt {
+ /* This is likely an analogue of the flags on
+ * midgard_blend_rt */
+
+ u16 flags; // = 0x200
+
+ /* Single-channel blend constants are encoded in a sort of
+ * fixed-point. Basically, the float is mapped to a byte, becoming
+ * a high byte, and then the lower-byte is added for precision.
+ * For the original float f:
+ *
+ * f = (constant_hi / 255) + (constant_lo / 65535)
+ *
+         * constant_hi = int(f * 255)
+ * constant_lo = 65535*f - (65535/255) * constant_hi
+ */
+
+ u16 constant;
+
+ struct mali_blend_equation equation;
+ /*
+ * - 0x19 normally
+ * - 0x3 when this slot is unused (everything else is 0 except the index)
+ * - 0x11 when this is the fourth slot (and it's used)
+         * - 0 when there is a blend shader
+ */
+ u16 unk2;
+ /* increments from 0 to 3 */
+ u16 index;
+
+ union {
+ struct {
+ /* So far, I've only seen:
+ * - R001 for 1-component formats
+ * - RG01 for 2-component formats
+ * - RGB1 for 3-component formats
+ * - RGBA for 4-component formats
+ */
+ u32 swizzle : 12;
+ enum mali_format format : 8;
+
+ /* Type of the shader output variable. Note, this can
+ * be different from the format.
+ *
+ * 0: f16 (mediump float)
+ * 1: f32 (highp float)
+ * 2: i32 (highp int)
+ * 3: u32 (highp uint)
+ * 4: i16 (mediump int)
+ * 5: u16 (mediump uint)
+ */
+ u32 shader_type : 3;
+ u32 zero : 9;
+ };
+
+ /* Only the low 32 bits of the blend shader are stored, the
+ * high 32 bits are implicitly the same as the original shader.
+ * According to the kernel driver, the program counter for
+ * shaders is actually only 24 bits, so shaders cannot cross
+ * the 2^24-byte boundary, and neither can the blend shader.
+ * The blob handles this by allocating a 2^24 byte pool for
+ * shaders, and making sure that any blend shaders are stored
+ * in the same pool as the original shader. The kernel will
+ * make sure this allocation is aligned to 2^24 bytes.
+ */
+ u32 shader;
+ };
+} __attribute__((packed));
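+
+/* A sketch of packing a single-channel blend constant per the formulas in the
+ * comment above. The (hi << 8) | lo byte order is an assumption, and inputs
+ * outside [0, 1] are simply clamped; this is illustrative, not the blob's
+ * exact rounding. */
+
+static inline u16
+bifrost_pack_blend_constant_sketch(float f)
+{
+        if (f < 0.0f) f = 0.0f;
+        if (f > 1.0f) f = 1.0f;
+
+        unsigned hi = (unsigned) (f * 255.0f);
+        int lo = (int) (f * 65535.0f) - (int) (hi * (65535 / 255));
+
+        /* Keep the low part within a byte */
+        if (lo < 0) lo = 0;
+        if (lo > 255) lo = 255;
+
+        return (u16) ((hi << 8) | lo);
+}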
+
+/* Descriptor for the shader. Following this is at least one, up to four blend
+ * descriptors for each active render target */
+
+struct mali_shader_meta {
+ mali_ptr shader;
+ u16 texture_count;
+ u16 sampler_count;
+ u16 attribute_count;
+ u16 varying_count;
+
+ union {
+ struct {
+ u32 uniform_buffer_count : 4;
+ u32 unk1 : 28; // = 0x800000 for vertex, 0x958020 for tiler
+ } bifrost1;
+ struct {
+ unsigned uniform_buffer_count : 4;
+ unsigned flags : 12;
+
+ /* Whole number of uniform registers used, times two;
+ * whole number of work registers used (no scale).
+ */
+ unsigned work_count : 5;
+ unsigned uniform_count : 5;
+ unsigned unknown2 : 6;
+ } midgard1;
+ };
+
+ /* On bifrost: Exactly the same as glPolygonOffset() for both.
+ * On midgard: Depth factor is exactly as passed to glPolygonOffset.
+         * Depth units is equal to the units value passed to glPolygonOffset + 1.0f
+ * (use MALI_NEGATIVE)
+ */
+ float depth_units;
+ float depth_factor;
+
+ u32 unknown2_2;
+
+ u16 alpha_coverage;
+ u16 unknown2_3;
+
+ u8 stencil_mask_front;
+ u8 stencil_mask_back;
+ u16 unknown2_4;
+
+ struct mali_stencil_test stencil_front;
+ struct mali_stencil_test stencil_back;
+
+ union {
+ struct {
+ u32 unk3 : 7;
+ /* On Bifrost, some system values are preloaded in
+ * registers R55-R62 by the thread dispatcher prior to
+ * the start of shader execution. This is a bitfield
+ * with one entry for each register saying which
+ * registers need to be preloaded. Right now, the known
+ * values are:
+ *
+ * Vertex/compute:
+ * - R55 : gl_LocalInvocationID.xy
+ * - R56 : gl_LocalInvocationID.z + unknown in high 16 bits
+ * - R57 : gl_WorkGroupID.x
+ * - R58 : gl_WorkGroupID.y
+ * - R59 : gl_WorkGroupID.z
+ * - R60 : gl_GlobalInvocationID.x
+ * - R61 : gl_GlobalInvocationID.y/gl_VertexID (without base)
+ * - R62 : gl_GlobalInvocationID.z/gl_InstanceID (without base)
+ *
+ * Fragment:
+ * - R55 : unknown, never seen (but the bit for this is
+ * always set?)
+ * - R56 : unknown (bit always unset)
+ * - R57 : gl_PrimitiveID
+ * - R58 : gl_FrontFacing in low bit, potentially other stuff
+ * - R59 : u16 fragment coordinates (used to compute
+ * gl_FragCoord.xy, together with sample positions)
+ * - R60 : gl_SampleMask (used in epilog, so pretty
+ * much always used, but the bit is always 0 -- is
+ * this just always pushed?)
+ * - R61 : gl_SampleMaskIn and gl_SampleID, used by
+ * varying interpolation.
+ * - R62 : unknown (bit always unset).
+ */
+ u32 preload_regs : 8;
+ /* In units of 8 bytes or 64 bits, since the
+ * uniform/const port loads 64 bits at a time.
+ */
+ u32 uniform_count : 7;
+ u32 unk4 : 10; // = 2
+ } bifrost2;
+ struct {
+ u32 unknown2_7;
+ } midgard2;
+ };
+
+ /* zero on bifrost */
+ u32 unknown2_8;
+
+ /* Blending information for the older non-MRT Midgard HW. Check for
+ * MALI_HAS_BLEND_SHADER to decide how to interpret.
+ */
+
+ union midgard_blend blend;
+} __attribute__((packed));
+
+/* This only concerns hardware jobs */
+
+/* Possible values for job_descriptor_size */
+
+#define MALI_JOB_32 0
+#define MALI_JOB_64 1
+
+struct mali_job_descriptor_header {
+ u32 exception_status;
+ u32 first_incomplete_task;
+ u64 fault_pointer;
+ u8 job_descriptor_size : 1;
+ enum mali_job_type job_type : 7;
+ u8 job_barrier : 1;
+ u8 unknown_flags : 7;
+ u16 job_index;
+ u16 job_dependency_index_1;
+ u16 job_dependency_index_2;
+
+ union {
+ u64 next_job_64;
+ u32 next_job_32;
+ };
+} __attribute__((packed));
+
+struct mali_payload_set_value {
+ u64 out;
+ u64 unknown;
+} __attribute__((packed));
+
+/* Special attributes have a fixed index */
+#define MALI_SPECIAL_ATTRIBUTE_BASE 16
+#define MALI_VERTEX_ID (MALI_SPECIAL_ATTRIBUTE_BASE + 0)
+#define MALI_INSTANCE_ID (MALI_SPECIAL_ATTRIBUTE_BASE + 1)
+
+/*
+ * Mali Attributes
+ *
+ * This structure lets the attribute unit compute the address of an attribute
+ * given the vertex and instance ID. Unfortunately, the way this works is
+ * rather complicated when instancing is enabled.
+ *
+ * To explain this, first we need to explain how compute and vertex threads are
+ * dispatched. This is a guess (although a pretty firm guess!) since the
+ * details are mostly hidden from the driver, except for attribute instancing.
+ * When a quad is dispatched, it receives a single, linear index. However, we
+ * need to translate that index into a (vertex id, instance id) pair, or a
+ * (local id x, local id y, local id z) triple for compute shaders (although
+ * vertex shaders and compute shaders are handled almost identically).
+ * Focusing on vertex shaders, one option would be to do:
+ *
+ * vertex_id = linear_id % num_vertices
+ * instance_id = linear_id / num_vertices
+ *
+ * but this involves a costly division and modulus by an arbitrary number.
+ * Instead, we could pad num_vertices. We dispatch padded_num_vertices *
+ * num_instances threads instead of num_vertices * num_instances, which results
+ * in some "extra" threads with vertex_id >= num_vertices, which we have to
+ * discard. The more we pad num_vertices, the more "wasted" threads we
+ * dispatch, but the division is potentially easier.
+ *
+ * One straightforward choice is to pad num_vertices to the next power of two,
+ * which means that the division and modulus are just simple bit shifts and
+ * masking. But the actual algorithm is a bit more complicated. The thread
+ * dispatcher has special support for dividing by 3, 5, 7, and 9, in addition
+ * to dividing by a power of two. This is possibly using the technique
+ * described in patent US20170010862A1. As a result, padded_num_vertices can be
+ * 1, 3, 5, 7, or 9 times a power of two. This results in fewer wasted threads,
+ * since we need less padding.
+ *
+ * padded_num_vertices is picked by the hardware. The driver just specifies the
+ * actual number of vertices. At least for Mali G71, the first few cases are
+ * given by:
+ *
+ * num_vertices | padded_num_vertices
+ * 3 | 4
+ * 4-7 | 8
+ * 8-11 | 12 (3 * 4)
+ * 12-15 | 16
+ * 16-19 | 20 (5 * 4)
+ *
+ * Note that padded_num_vertices is a multiple of four (presumably because
+ * threads are dispatched in groups of 4). Also, padded_num_vertices is always
+ * at least one more than num_vertices, which seems like a quirk of the
+ * hardware. For larger num_vertices, the hardware uses the following
+ * algorithm: using the binary representation of num_vertices, we look at the
+ * most significant set bit as well as the following 3 bits. Let n be the
+ * number of bits after those 4 bits. Then we set padded_num_vertices according
+ * to the following table:
+ *
+ * high bits | padded_num_vertices
+ * 1000 | 9 * 2^n
+ * 1001 | 5 * 2^(n+1)
+ * 101x | 3 * 2^(n+2)
+ * 110x | 7 * 2^(n+1)
+ * 111x | 2^(n+4)
+ *
+ * For example, if num_vertices = 70 is passed to glDraw(), its binary
+ * representation is 1000110, so n = 3 and the high bits are 1000, and
+ * therefore padded_num_vertices = 9 * 2^3 = 72.
+ *
+ * The attribute unit works in terms of the original linear_id. If
+ * num_instances = 1, then they are the same, and everything is simple.
+ * However, with instancing things get more complicated. There are four
+ * possible modes, two of them we can group together:
+ *
+ * 1. Use the linear_id directly. Only used when there is no instancing.
+ *
+ * 2. Use the linear_id modulo a constant. This is used for per-vertex
+ * attributes with instancing enabled by making the constant equal to
+ * padded_num_vertices. Because the modulus is always padded_num_vertices, this
+ * mode only supports a modulus that is a power of 2 times 1, 3, 5, 7, or 9.
+ * The shift field specifies the power of two, while the extra_flags field
+ * specifies the odd number. If shift = n and extra_flags = m, then the modulus
+ * is (2m + 1) * 2^n. As an example, if num_vertices = 70, then as computed
+ * above, padded_num_vertices = 9 * 2^3, so we should set extra_flags = 4 and
+ * shift = 3. Note that we must exactly follow the hardware algorithm used to
+ * get padded_num_vertices in order to correctly implement per-vertex
+ * attributes.
+ *
+ * 3. Divide the linear_id by a constant. In order to correctly implement
+ * instance divisors, we have to divide linear_id by padded_num_vertices times
+ * the user-specified divisor. So first we compute padded_num_vertices, again
+ * following the exact same algorithm that the hardware uses, then multiply it
+ * by the GL-level divisor to get the hardware-level divisor. This case is
+ * further divided into two more cases. If the hardware-level divisor is a
+ * power of two, then we just need to shift. The shift amount is specified by
+ * the shift field, so that the hardware-level divisor is just 2^shift.
+ *
+ * If it isn't a power of two, then we have to divide by an arbitrary integer.
+ * For that, we use the well-known technique of multiplying by an approximation
+ * of the inverse. The driver must compute the magic multiplier and shift
+ * amount, and then the hardware does the multiplication and shift. The
+ * hardware and driver also use the "round-down" optimization as described in
+ * http://ridiculousfish.com/files/faster_unsigned_division_by_constants.pdf.
+ * The hardware further assumes the multiplier is between 2^31 and 2^32, so the
+ * high bit is implicitly set to 1 even though it is set to 0 by the driver --
+ * presumably this simplifies the hardware multiplier a little. The hardware
+ * first multiplies linear_id by the multiplier and takes the high 32 bits,
+ * then applies the round-down correction if extra_flags = 1, then finally
+ * shifts right by the shift field.
+ *
+ * There are some differences between ridiculousfish's algorithm and the Mali
+ * hardware algorithm, which means that the reference code from ridiculousfish
+ * doesn't always produce the right constants. Mali does not use the pre-shift
+ * optimization, since that would make a hardware implementation slower (it
+ * would have to always do the pre-shift, multiply, and post-shift operations).
+ * It also forces the multiplier to be at least 2^31, which means that the
+ * exponent is entirely fixed, so there is no trial-and-error. Altogether,
+ * given the divisor d, the algorithm the driver must follow is:
+ *
+ * 1. Set shift = floor(log2(d)).
+ * 2. Compute m = ceil(2^(shift + 32) / d) and e = 2^(shift + 32) % d.
+ * 3. If e <= 2^shift, then we need to use the round-down algorithm. Set
+ * magic_divisor = m - 1 and extra_flags = 1.
+ * 4. Otherwise, set magic_divisor = m and extra_flags = 0.
+ */
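+
+/* Two illustrative sketches of the driver-side computations described above.
+ * They follow the documented algorithm only: the small vertex counts covered
+ * by the explicit table are not handled, and this is not the blob's code. */
+
+static inline unsigned
+mali_padded_vertex_count_sketch(unsigned num_vertices)
+{
+        /* Position of the most significant set bit */
+        unsigned msb = 31 - __builtin_clz(num_vertices);
+
+        /* n = number of bits after the top four */
+        unsigned n = (msb >= 3) ? (msb - 3) : 0;
+
+        /* The top four bits themselves */
+        unsigned high = num_vertices >> n;
+
+        switch (high) {
+        case 0x8: return 9 << n;       /* 1000 -> 9 * 2^n */
+        case 0x9: return 5 << (n + 1); /* 1001 -> 5 * 2^(n+1) */
+        case 0xA:
+        case 0xB: return 3 << (n + 2); /* 101x -> 3 * 2^(n+2) */
+        case 0xC:
+        case 0xD: return 7 << (n + 1); /* 110x -> 7 * 2^(n+1) */
+        default:  return 1 << (n + 4); /* 111x -> 2^(n+4) */
+        }
+}
+
+/* Compute the shift / magic_divisor / extra_flags values for an arbitrary
+ * non-power-of-two hardware-level divisor d (padded_num_vertices times the GL
+ * divisor), following steps 1-4 above. Per the text, the hardware treats the
+ * top bit of the multiplier as implicitly set, so it is stored cleared here;
+ * that masking is an interpretation of the comment, not verified. */
+
+static inline void
+mali_magic_divisor_sketch(uint32_t d, unsigned *shift,
+                          uint32_t *magic_divisor, unsigned *extra_flags)
+{
+        /* 1. shift = floor(log2(d)) */
+        unsigned s = 31 - __builtin_clz(d);
+
+        /* 2. m = ceil(2^(shift + 32) / d), e = 2^(shift + 32) mod d */
+        uint64_t num = 1ull << (s + 32);
+        uint64_t m = (num + d - 1) / d;
+        uint64_t e = num % d;
+
+        /* 3./4. Use the round-down correction when the error is small */
+        if (e <= (1ull << s)) {
+                *magic_divisor = ((uint32_t) (m - 1)) & ~(1u << 31);
+                *extra_flags = 1;
+        } else {
+                *magic_divisor = ((uint32_t) m) & ~(1u << 31);
+                *extra_flags = 0;
+        }
+
+        *shift = s;
+}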
+
+enum mali_attr_mode {
+ MALI_ATTR_UNUSED = 0,
+ MALI_ATTR_LINEAR = 1,
+ MALI_ATTR_POT_DIVIDE = 2,
+ MALI_ATTR_MODULO = 3,
+ MALI_ATTR_NPOT_DIVIDE = 4,
+};
+
+/* This magic "pseudo-address" is used as `elements` to implement
+ * gl_PointCoord. When read from a fragment shader, it generates a point
+ * coordinate per the OpenGL ES 2.0 specification. Flipped coordinate spaces
+ * require an affine transformation in the shader. */
+
+#define MALI_VARYING_POINT_COORD (0x60)
+
+union mali_attr {
+ /* This is used for actual attributes. */
+ struct {
+ /* The bottom 3 bits are the mode */
+ mali_ptr elements : 64 - 8;
+ u32 shift : 5;
+ u32 extra_flags : 3;
+ u32 stride;
+ u32 size;
+ };
+ /* The entry after an NPOT_DIVIDE entry has this format. It stores
+ * extra information that wouldn't fit in a normal entry.
+ */
+ struct {
+ u32 unk; /* = 0x20 */
+ u32 magic_divisor;
+ u32 zero;
+ /* This is the original, GL-level divisor. */
+ u32 divisor;
+ };
+} __attribute__((packed));
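+
+/* A sketch of filling an attribute record for MALI_ATTR_MODULO. The padded
+ * vertex count is always an odd number (1, 3, 5, 7 or 9) times a power of
+ * two, so it decomposes exactly into the shift/extra_flags encoding described
+ * above; e.g. 72 = 9 * 2^3 gives shift = 3, extra_flags = 4, matching the
+ * worked example. The elements field is assumed to already hold the pointer
+ * with its low mode bits clear. */
+
+static inline void
+mali_attr_set_modulo_sketch(union mali_attr *attr, unsigned padded_num_vertices)
+{
+        /* padded_num_vertices = (2 * extra_flags + 1) << shift */
+        unsigned shift = __builtin_ctz(padded_num_vertices);
+        unsigned odd = padded_num_vertices >> shift;
+
+        attr->elements |= MALI_ATTR_MODULO;
+        attr->shift = shift;
+        attr->extra_flags = (odd - 1) >> 1;
+}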
+
+struct mali_attr_meta {
+ /* Vertex buffer index */
+ u8 index;
+
+ unsigned unknown1 : 2;
+ unsigned swizzle : 12;
+ enum mali_format format : 8;
+
+ /* Always observed to be zero at the moment */
+ unsigned unknown3 : 2;
+
+ /* When packing multiple attributes in a buffer, offset addresses by
+ * this value. Obscurely, this is signed. */
+ int32_t src_offset;
+} __attribute__((packed));
+
+enum mali_fbd_type {
+ MALI_SFBD = 0,
+ MALI_MFBD = 1,
+};
+
+#define FBD_TYPE (1)
+#define FBD_MASK (~0x3f)
+
+struct mali_uniform_buffer_meta {
+ /* This is actually the size minus 1 (MALI_POSITIVE), in units of 16
+ * bytes. This gives a maximum of 2^14 bytes, which just so happens to
+ * be the GL minimum-maximum for GL_MAX_UNIFORM_BLOCK_SIZE.
+ */
+ u64 size : 10;
+
+ /* This is missing the bottom 2 bits and top 8 bits. The top 8 bits
+ * should be 0 for userspace pointers, according to
+ * https://lwn.net/Articles/718895/. By reusing these bits, we can make
+ * each entry in the table only 64 bits.
+ */
+ mali_ptr ptr : 64 - 10;
+};
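+
+/* A sketch of filling one uniform buffer entry. The 16-byte size granularity
+ * and the dropped pointer bits follow the comments above; that the incoming
+ * size is already a multiple of 16 bytes is an assumption. */
+
+static inline struct mali_uniform_buffer_meta
+mali_pack_ubo_sketch(mali_ptr gpu_va, unsigned size_bytes)
+{
+        struct mali_uniform_buffer_meta meta = {
+                /* Size is stored minus one, in units of 16 bytes */
+                .size = (size_bytes / 16) - 1,
+
+                /* The bottom 2 bits and top 8 bits of the pointer are
+                 * dropped, so the buffer must be at least 4-byte aligned
+                 * and be a userspace address with a zero top byte */
+                .ptr = gpu_va >> 2,
+        };
+
+        return meta;
+}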
+
+/* On Bifrost, these fields are the same between the vertex and tiler payloads.
+ * They also seem to be the same between Bifrost and Midgard. They're shared in
+ * fused payloads.
+ */
+
+/* Applies to unknown_draw */
+
+#define MALI_DRAW_INDEXED_UINT8 (0x10)
+#define MALI_DRAW_INDEXED_UINT16 (0x20)
+#define MALI_DRAW_INDEXED_UINT32 (0x30)
+#define MALI_DRAW_VARYING_SIZE (0x100)
+#define MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX (0x10000)
+
+struct mali_vertex_tiler_prefix {
+ /* This is a dynamic bitfield containing the following things in this order:
+ *
+ * - gl_WorkGroupSize.x
+ * - gl_WorkGroupSize.y
+ * - gl_WorkGroupSize.z
+ * - gl_NumWorkGroups.x
+ * - gl_NumWorkGroups.y
+ * - gl_NumWorkGroups.z
+ *
+ * The number of bits allocated for each number is based on the *_shift
+ * fields below. For example, workgroups_y_shift gives the bit that
+ * gl_NumWorkGroups.y starts at, and workgroups_z_shift gives the bit
+ * that gl_NumWorkGroups.z starts at (and therefore one after the bit
+ * that gl_NumWorkGroups.y ends at). The actual value for each gl_*
+ * value is one more than the stored value, since if any of the values
+ * are zero, then there would be no invocations (and hence no job). If
+ * there were 0 bits allocated to a given field, then it must be zero,
+ * and hence the real value is one.
+ *
+ * Vertex jobs reuse the same job dispatch mechanism as compute jobs,
+ * effectively doing glDispatchCompute(1, vertex_count, instance_count)
+ * where vertex count is the number of vertices.
+ */
+ u32 invocation_count;
+
+ u32 size_y_shift : 5;
+ u32 size_z_shift : 5;
+ u32 workgroups_x_shift : 6;
+ u32 workgroups_y_shift : 6;
+ u32 workgroups_z_shift : 6;
+ /* This is max(workgroups_x_shift, 2) in all the cases I've seen. */
+ u32 workgroups_x_shift_2 : 4;
+
+ u32 draw_mode : 4;
+ u32 unknown_draw : 22;
+
+        /* This is the same as workgroups_x_shift_2 in compute shaders, but
+ * always 5 for vertex jobs and 6 for tiler jobs. I suspect this has
+ * something to do with how many quads get put in the same execution
+ * engine, which is a balance (you don't want to starve the engine, but
+ * you also want to distribute work evenly).
+ */
+ u32 workgroups_x_shift_3 : 6;
+
+
+ /* Negative of draw_start for TILER jobs from what I've seen */
+ int32_t negative_start;
+ u32 zero1;
+
+ /* Like many other strictly nonzero quantities, index_count is
+ * subtracted by one. For an indexed cube, this is equal to 35 = 6
+ * faces * 2 triangles/per face * 3 vertices/per triangle - 1. That is,
+ * for an indexed draw, index_count is the number of actual vertices
+ * rendered whereas invocation_count is the number of unique vertices
+ * rendered (the number of times the vertex shader must be invoked).
+ * For non-indexed draws, this is just equal to invocation_count. */
+
+ u32 index_count;
+
+ /* No hidden structure; literally just a pointer to an array of uint
+ * indices (width depends on flags). Thanks, guys, for not making my
+ * life insane for once! NULL for non-indexed draws. */
+
+ uintptr_t indices;
+} __attribute__((packed));
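+
+/* A sketch of packing the invocation bitfield described above. The bit splits
+ * chosen here (just enough bits per value) are an assumption; the blob may
+ * pick different shifts, and the workgroups_x_shift_2 handling is only what
+ * has been observed. */
+
+static inline void
+mali_pack_invocations_sketch(struct mali_vertex_tiler_prefix *prefix,
+                             unsigned sx, unsigned sy, unsigned sz,
+                             unsigned gx, unsigned gy, unsigned gz)
+{
+        /* Values are stored minus one */
+        unsigned vals[6] = { sx - 1, sy - 1, sz - 1, gx - 1, gy - 1, gz - 1 };
+        unsigned shifts[6];
+        unsigned bit = 0;
+        u32 packed = 0;
+
+        for (unsigned i = 0; i < 6; ++i) {
+                shifts[i] = bit;
+                packed |= vals[i] << bit;
+
+                /* A stored value of zero (a count of one) needs no bits */
+                bit += vals[i] ? (32 - __builtin_clz(vals[i])) : 0;
+        }
+
+        prefix->invocation_count = packed;
+
+        /* gl_WorkGroupSize.x implicitly starts at bit 0, so shifts[0] is not
+         * stored anywhere */
+        prefix->size_y_shift = shifts[1];
+        prefix->size_z_shift = shifts[2];
+        prefix->workgroups_x_shift = shifts[3];
+        prefix->workgroups_y_shift = shifts[4];
+        prefix->workgroups_z_shift = shifts[5];
+
+        /* Observed to be max(workgroups_x_shift, 2) */
+        prefix->workgroups_x_shift_2 = shifts[3] > 2 ? shifts[3] : 2;
+}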
+
+/* Point size / line width can either be specified as a 32-bit float (for
+ * constant size) or as a [machine word size]-bit GPU pointer (for varying size). If a pointer
+ * is selected by setting the appropriate MALI_DRAW_VARYING_SIZE bit in the tiler
+ * payload, the contents of varying_pointer will be interpreted as an array of
+ * fp16 sizes, one for each vertex. gl_PointSize is therefore implemented by
+ * creating a special MALI_R16F varying writing to varying_pointer. */
+
+union midgard_primitive_size {
+ float constant;
+ uintptr_t pointer;
+};
+
+struct bifrost_vertex_only {
+ u32 unk2; /* =0x2 */
+
+ u32 zero0;
+
+ u64 zero1;
+} __attribute__((packed));
+
+struct bifrost_tiler_heap_meta {
+ u32 zero;
+ u32 heap_size;
+ /* note: these are just guesses! */
+ mali_ptr tiler_heap_start;
+ mali_ptr tiler_heap_free;
+ mali_ptr tiler_heap_end;
+
+ /* hierarchy weights? but they're still 0 after the job has run... */
+ u32 zeros[12];
+} __attribute__((packed));
+
+struct bifrost_tiler_meta {
+ u64 zero0;
+ u16 hierarchy_mask;
+ u16 flags;
+ u16 width;
+ u16 height;
+ u64 zero1;
+ mali_ptr tiler_heap_meta;
+ /* TODO what is this used for? */
+ u64 zeros[20];
+} __attribute__((packed));
+
+struct bifrost_tiler_only {
+ /* 0x20 */
+ union midgard_primitive_size primitive_size;
+
+ mali_ptr tiler_meta;
+
+ u64 zero1, zero2, zero3, zero4, zero5, zero6;
+
+ u32 gl_enables;
+ u32 zero7;
+ u64 zero8;
+} __attribute__((packed));
+
+struct bifrost_scratchpad {
+ u32 zero;
+ u32 flags; // = 0x1f
+ /* This is a pointer to a CPU-inaccessible buffer, 16 pages, allocated
+ * during startup. It seems to serve the same purpose as the
+ * gpu_scratchpad in the SFBD for Midgard, although it's slightly
+ * larger.
+ */
+ mali_ptr gpu_scratchpad;
+} __attribute__((packed));
+
+struct mali_vertex_tiler_postfix {
+ /* Zero for vertex jobs. Pointer to the position (gl_Position) varying
+ * output from the vertex shader for tiler jobs.
+ */
+
+ uintptr_t position_varying;
+
+ /* An array of mali_uniform_buffer_meta's. The size is given by the
+ * shader_meta.
+ */
+ uintptr_t uniform_buffers;
+
+ /* This is a pointer to an array of pointers to the texture
+ * descriptors, number of pointers bounded by number of textures. The
+         * indirection is needed to accommodate varying numbers and sizes of
+ * texture descriptors */
+ uintptr_t texture_trampoline;
+
+ /* For OpenGL, from what I've seen, this is intimately connected to
+         * texture_meta. cwabbott says this is not the case under Vulkan, which
+         * is why this field is separate (Midgard is Vulkan capable). Pointer to
+ * array of sampler descriptors (which are uniform in size) */
+ uintptr_t sampler_descriptor;
+
+ uintptr_t uniforms;
+ u8 flags : 4;
+ uintptr_t _shader_upper : MALI_SHORT_PTR_BITS - 4; /* struct shader_meta */
+ uintptr_t attributes; /* struct attribute_buffer[] */
+ uintptr_t attribute_meta; /* attribute_meta[] */
+ uintptr_t varyings; /* struct attr */
+ uintptr_t varying_meta; /* pointer */
+ uintptr_t viewport;
+ uintptr_t occlusion_counter; /* A single bit as far as I can tell */
+
+ /* Note: on Bifrost, this isn't actually the FBD. It points to
+ * bifrost_scratchpad instead. However, it does point to the same thing
+ * in vertex and tiler jobs.
+ */
+ mali_ptr framebuffer;
+} __attribute__((packed));
+
+struct midgard_payload_vertex_tiler {
+#ifndef __LP64__
+ union midgard_primitive_size primitive_size;
+#endif
+
+ struct mali_vertex_tiler_prefix prefix;
+
+#ifndef __LP64__
+ u32 zero3;
+#endif
+
+ u16 gl_enables; // 0x5
+
+ /* Both zero for non-instanced draws. For instanced draws, a
+ * decomposition of padded_num_vertices. See the comments about the
+ * corresponding fields in mali_attr for context. */
+
+ unsigned instance_shift : 5;
+ unsigned instance_odd : 3;
+
+ u8 zero4;
+
+ /* Offset for first vertex in buffer */
+ u32 draw_start;
+
+ uintptr_t zero5;
+
+ struct mali_vertex_tiler_postfix postfix;
+
+#ifdef __LP64__
+ union midgard_primitive_size primitive_size;
+#endif
+} __attribute__((packed));
+
+struct bifrost_payload_vertex {
+ struct mali_vertex_tiler_prefix prefix;
+ struct bifrost_vertex_only vertex;
+ struct mali_vertex_tiler_postfix postfix;
+} __attribute__((packed));
+
+struct bifrost_payload_tiler {
+ struct mali_vertex_tiler_prefix prefix;
+ struct bifrost_tiler_only tiler;
+ struct mali_vertex_tiler_postfix postfix;
+} __attribute__((packed));
+
+struct bifrost_payload_fused {
+ struct mali_vertex_tiler_prefix prefix;
+ struct bifrost_tiler_only tiler;
+ struct mali_vertex_tiler_postfix tiler_postfix;
+ u64 padding; /* zero */
+ struct bifrost_vertex_only vertex;
+ struct mali_vertex_tiler_postfix vertex_postfix;
+} __attribute__((packed));
+
+/* Purposeful off-by-one in width, height fields. For example, a (64, 64)
+ * texture is stored as (63, 63) in these fields. This adjusts for that.
+ * There's an identical pattern in the framebuffer descriptor. Even vertex
+ * count fields work this way, hence the generic name -- integral fields that
+ * are strictly positive generally need this adjustment. */
+
+#define MALI_POSITIVE(dim) (dim - 1)
+
+/* Opposite of MALI_POSITIVE, found in the depth_units field */
+
+#define MALI_NEGATIVE(dim) (dim + 1)
+
+/* Used with wrapping. Incomplete (this is a 4-bit field...) */
+
+enum mali_wrap_mode {
+ MALI_WRAP_REPEAT = 0x8,
+ MALI_WRAP_CLAMP_TO_EDGE = 0x9,
+ MALI_WRAP_CLAMP_TO_BORDER = 0xB,
+ MALI_WRAP_MIRRORED_REPEAT = 0xC
+};
+
+/* Shared across both command stream and Midgard, and even with Bifrost */
+
+enum mali_texture_type {
+ MALI_TEX_CUBE = 0x0,
+ MALI_TEX_1D = 0x1,
+ MALI_TEX_2D = 0x2,
+ MALI_TEX_3D = 0x3
+};
+
+/* 8192x8192 */
+#define MAX_MIP_LEVELS (13)
+
+/* Cubemap bloats everything up */
+#define MAX_CUBE_FACES (6)
+
+/* For each pointer, there is an address and optionally also a stride */
+#define MAX_ELEMENTS (2)
+
+/* Corresponds to the type passed to glTexImage2D and so forth */
+
+/* Flags for usage2 */
+#define MALI_TEX_MANUAL_STRIDE (0x20)
+
+struct mali_texture_format {
+ unsigned swizzle : 12;
+ enum mali_format format : 8;
+
+ unsigned srgb : 1;
+ unsigned unknown1 : 1;
+
+ enum mali_texture_type type : 2;
+
+ unsigned usage2 : 8;
+} __attribute__((packed));
+
+struct mali_texture_descriptor {
+ uint16_t width;
+ uint16_t height;
+ uint16_t depth;
+ uint16_t array_size;
+
+ struct mali_texture_format format;
+
+ uint16_t unknown3;
+
+ /* One for non-mipmapped, zero for mipmapped */
+ uint8_t unknown3A;
+
+ /* Zero for non-mipmapped, (number of levels - 1) for mipmapped */
+ uint8_t nr_mipmap_levels;
+
+ /* Swizzling is a single 32-bit word, broken up here for convenience.
+ * Here, swizzling refers to the ES 3.0 texture parameters for channel
+ * level swizzling, not the internal pixel-level swizzling which is
+ * below OpenGL's reach */
+
+ unsigned swizzle : 12;
+ unsigned swizzle_zero : 20;
+
+ uint32_t unknown5;
+ uint32_t unknown6;
+ uint32_t unknown7;
+
+ mali_ptr payload[MAX_MIP_LEVELS * MAX_CUBE_FACES * MAX_ELEMENTS];
+} __attribute__((packed));
+
+/* Used as part of filter_mode */
+
+#define MALI_LINEAR 0
+#define MALI_NEAREST 1
+#define MALI_MIP_LINEAR (0x18)
+
+/* Used to construct low bits of filter_mode */
+
+#define MALI_TEX_MAG(mode) (((mode) & 1) << 0)
+#define MALI_TEX_MIN(mode) (((mode) & 1) << 1)
+
+#define MALI_TEX_MAG_MASK (1)
+#define MALI_TEX_MIN_MASK (2)
+
+#define MALI_FILTER_NAME(filter) (filter ? "MALI_NEAREST" : "MALI_LINEAR")
+
+/* Used for lod encoding. Thanks @urjaman for pointing out these routines can
+ * be cleaned up a lot. */
+
+#define DECODE_FIXED_16(x) ((float) (x / 256.0))
+
+static inline uint16_t
+FIXED_16(float x)
+{
+ /* Clamp inputs, accounting for float error */
+ float max_lod = (32.0 - (1.0 / 512.0));
+
+ x = ((x > max_lod) ? max_lod : ((x < 0.0) ? 0.0 : x));
+
+ return (int) (x * 256.0);
+}
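+
+/* Worked example (illustrative only): encoding an LOD range of [0.0, 5.5]
+ * with the helper above, as would be stored in the min_lod/max_lod fields of
+ * the sampler descriptor below. */
+
+static inline void
+mali_lod_range_example(uint16_t *min_lod, uint16_t *max_lod)
+{
+        *min_lod = FIXED_16(0.0f); /* 0x0000 */
+        *max_lod = FIXED_16(5.5f); /* 5.5 * 256 = 0x0580 */
+}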
+
+struct mali_sampler_descriptor {
+ uint32_t filter_mode;
+
+ /* Fixed point. Upper 8-bits is before the decimal point, although it
+ * caps [0-31]. Lower 8-bits is after the decimal point: int(round(x *
+ * 256)) */
+
+ uint16_t min_lod;
+ uint16_t max_lod;
+
+ /* All one word in reality, but packed a bit */
+
+ enum mali_wrap_mode wrap_s : 4;
+ enum mali_wrap_mode wrap_t : 4;
+ enum mali_wrap_mode wrap_r : 4;
+ enum mali_alt_func compare_func : 3;
+
+ /* No effect on 2D textures. For cubemaps, set for ES3 and clear for
+ * ES2, controlling seamless cubemapping */
+ unsigned seamless_cube_map : 1;
+
+ unsigned zero : 16;
+
+ uint32_t zero2;
+ float border_color[4];
+} __attribute__((packed));
+
+/* viewport0/viewport1 form the arguments to glViewport. viewport1 is
+ * modified by MALI_POSITIVE; viewport0 is as-is.
+ */
+
+struct mali_viewport {
+ /* XY clipping planes */
+ float clip_minx;
+ float clip_miny;
+ float clip_maxx;
+ float clip_maxy;
+
+ /* Depth clipping planes */
+ float clip_minz;
+ float clip_maxz;
+
+ u16 viewport0[2];
+ u16 viewport1[2];
+} __attribute__((packed));
+
+/* From presentations, 16x16 tiles externally. Use shift for fast computation
+ * of tile numbers. */
+
+#define MALI_TILE_SHIFT 4
+#define MALI_TILE_LENGTH (1 << MALI_TILE_SHIFT)
+
+/* Tile coordinates are stored as a compact u32, as only 12 bits are needed for
+ * each component. Notice that this provides a theoretical upper bound of (1 <<
+ * 12) = 4096 tiles in each direction, addressing a maximum framebuffer of size
+ * 65536x65536. Multiplying that together, times another four given that Mali
+ * framebuffers are 32-bit ARGB8888, means that this upper bound would take 16
+ * gigabytes of RAM just to store the uncompressed framebuffer itself, let
+ * alone rendering in real-time to such a buffer.
+ *
+ * Nice job, guys. */
+
+/* From mali_kbase_10969_workaround.c */
+#define MALI_X_COORD_MASK 0x00000FFF
+#define MALI_Y_COORD_MASK 0x0FFF0000
+
+/* Extract parts of a tile coordinate */
+
+#define MALI_TILE_COORD_X(coord) ((coord) & MALI_X_COORD_MASK)
+#define MALI_TILE_COORD_Y(coord) (((coord) & MALI_Y_COORD_MASK) >> 16)
+#define MALI_TILE_COORD_FLAGS(coord) ((coord) & ~(MALI_X_COORD_MASK | MALI_Y_COORD_MASK))
+
+/* No known flags yet, but just in case...? */
+
+#define MALI_TILE_NO_FLAG (0)
+
+/* Helpers to generate tile coordinates based on the boundary coordinates in
+ * screen space. So, with the bounds (0, 0) to (128, 128) for the screen, these
+ * functions would convert it to the bounding tiles (0, 0) to (7, 7).
+ * Intentional "off-by-one"; finding the tile number is a form of fencepost
+ * problem. */
+
+#define MALI_MAKE_TILE_COORDS(X, Y) ((X) | ((Y) << 16))
+#define MALI_BOUND_TO_TILE(B, bias) ((B - bias) >> MALI_TILE_SHIFT)
+#define MALI_COORDINATE_TO_TILE(W, H, bias) MALI_MAKE_TILE_COORDS(MALI_BOUND_TO_TILE(W, bias), MALI_BOUND_TO_TILE(H, bias))
+#define MALI_COORDINATE_TO_TILE_MIN(W, H) MALI_COORDINATE_TO_TILE(W, H, 0)
+#define MALI_COORDINATE_TO_TILE_MAX(W, H) MALI_COORDINATE_TO_TILE(W, H, 1)
+
+struct mali_payload_fragment {
+ u32 min_tile_coord;
+ u32 max_tile_coord;
+ mali_ptr framebuffer;
+} __attribute__((packed));
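+
+/* A sketch of filling the fragment payload's tile bounds for a render area
+ * spanning (0, 0) to (width, height) pixels, using the helpers above. For
+ * example, a 128x128 framebuffer yields tiles (0, 0) through (7, 7). */
+
+static inline void
+mali_set_fragment_tile_bounds_sketch(struct mali_payload_fragment *payload,
+                                     unsigned width, unsigned height)
+{
+        payload->min_tile_coord = MALI_COORDINATE_TO_TILE_MIN(0, 0);
+        payload->max_tile_coord = MALI_COORDINATE_TO_TILE_MAX(width, height);
+}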
+
+/* Single Framebuffer Descriptor */
+
+/* Flags apply to format. With just MSAA_A and MSAA_B, the framebuffer is
+ * configured for 4x. With MSAA_8, it is configured for 8x. */
+
+#define MALI_FRAMEBUFFER_MSAA_8 (1 << 3)
+#define MALI_FRAMEBUFFER_MSAA_A (1 << 4)
+#define MALI_FRAMEBUFFER_MSAA_B (1 << 23)
+
+/* Fast/slow based on whether all three buffers are cleared at once */
+
+#define MALI_CLEAR_FAST (1 << 18)
+#define MALI_CLEAR_SLOW (1 << 28)
+#define MALI_CLEAR_SLOW_STENCIL (1 << 31)
+
+/* Configures hierarchical tiling on Midgard for both SFBD/MFBD (embedded
+ * within the larger framebuffer descriptor). Analogous to
+ * bifrost_tiler_heap_meta and bifrost_tiler_meta */
+
+struct midgard_tiler_descriptor {
+ /* Size of the entire polygon list; see pan_tiler.c for the
+ * computation. It's based on hierarchical tiling */
+
+ u32 polygon_list_size;
+
+ /* Name known from the replay workaround in the kernel. What exactly is
+         * flagged here is less known. We do know that (tiler_hierarchy_mask & 0x1ff)
+         * specifies a mask of hierarchy weights, which explains some of the
+         * performance mysteries around setting it. We also see the bottom bit
+         * of tiler_flags set in the kernel, but there is no comment explaining why. */
+
+ u16 hierarchy_mask;
+ u16 flags;
+
+ /* See mali_tiler.c for an explanation */
+ mali_ptr polygon_list;
+ mali_ptr polygon_list_body;
+
+        /* Names based on the symmetry we see with replay jobs, which name
+         * these explicitly */
+
+ mali_ptr heap_start; /* tiler heap_free_address */
+ mali_ptr heap_end;
+
+ /* Hierarchy weights. We know these are weights based on the kernel,
+ * but I've never seen them be anything other than zero */
+ u32 weights[8];
+};
+
+struct mali_single_framebuffer {
+ u32 unknown1;
+ u32 unknown2;
+ u64 unknown_address_0;
+ u64 zero1;
+ u64 zero0;
+
+        /* The exact format is ironically not known, since EGL is finicky with
+         * the blob. MSAA, colourspace, etc. are configured here. */
+
+ u32 format;
+
+ u32 clear_flags;
+ u32 zero2;
+
+ /* Purposeful off-by-one in these fields should be accounted for by the
+ * MALI_DIMENSION macro */
+
+ u16 width;
+ u16 height;
+
+ u32 zero3[8];
+
+ /* By default, the framebuffer is upside down from OpenGL's
+ * perspective. Set framebuffer to the end and negate the stride to
+ * flip in the Y direction */
+
+ mali_ptr framebuffer;
+ int32_t stride;
+
+ u32 zero4;
+
+ /* Depth and stencil buffers are interleaved, it appears, as they are
+ * set to the same address in captures. Both fields set to zero if the
+ * buffer is not being cleared. Depending on GL_ENABLE magic, you might
+ * get a zero enable despite the buffer being present; that still is
+ * disabled. */
+
+ mali_ptr depth_buffer; // not SAME_VA
+ u64 depth_buffer_enable;
+
+ mali_ptr stencil_buffer; // not SAME_VA
+ u64 stencil_buffer_enable;
+
+ u32 clear_color_1; // RGBA8888 from glClear, actually used by hardware
+ u32 clear_color_2; // always equal, but unclear function?
+ u32 clear_color_3; // always equal, but unclear function?
+ u32 clear_color_4; // always equal, but unclear function?
+
+ /* Set to zero if not cleared */
+
+ float clear_depth_1; // float32, ditto
+ float clear_depth_2; // float32, ditto
+ float clear_depth_3; // float32, ditto
+ float clear_depth_4; // float32, ditto
+
+ u32 clear_stencil; // Exactly as it appears in OpenGL
+
+ u32 zero6[7];
+
+ struct midgard_tiler_descriptor tiler;
+
+ /* More below this, maybe */
+} __attribute__((packed));
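+
+/* A sketch of the Y-flip described in the comment above: point the
+ * framebuffer at the start of its last row and negate the stride. Assumes a
+ * tightly packed colour buffer of `height` rows of `stride` bytes each. */
+
+static inline void
+mali_sfbd_set_yflip_sketch(struct mali_single_framebuffer *fb,
+                           mali_ptr buffer, int32_t stride, unsigned height)
+{
+        fb->framebuffer = buffer + (u64) stride * (height - 1);
+        fb->stride = -stride;
+}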
+
+/* On Midgard, this "framebuffer descriptor" is used for the framebuffer field
+ * of compute jobs. Superficially resembles a single framebuffer descriptor */
+
+struct mali_compute_fbd {
+ u32 unknown1[16];
+} __attribute__((packed));
+
+/* Format bits for the render target flags */
+
+#define MALI_MFBD_FORMAT_MSAA (1 << 1)
+#define MALI_MFBD_FORMAT_SRGB (1 << 2)
+
+enum mali_mfbd_block_format {
+ MALI_MFBD_BLOCK_TILED = 0x0,
+ MALI_MFBD_BLOCK_UNKNOWN = 0x1,
+ MALI_MFBD_BLOCK_LINEAR = 0x2,
+ MALI_MFBD_BLOCK_AFBC = 0x3,
+};
+
+struct mali_rt_format {
+ unsigned unk1 : 32;
+ unsigned unk2 : 3;
+
+ unsigned nr_channels : 2; /* MALI_POSITIVE */
+
+ unsigned unk3 : 5;
+ enum mali_mfbd_block_format block : 2;
+ unsigned flags : 4;
+
+ unsigned swizzle : 12;
+
+ unsigned unk4 : 4;
+} __attribute__((packed));
+
+struct bifrost_render_target {
+ struct mali_rt_format format;
+
+ u64 zero1;
+
+ union {
+ struct {
+ /* Stuff related to ARM Framebuffer Compression. When AFBC is enabled,
+ * there is an extra metadata buffer that contains 16 bytes per tile.
+ * The framebuffer needs to be the same size as before, since we don't
+ * know ahead of time how much space it will take up. The
+ * framebuffer_stride is set to 0, since the data isn't stored linearly
+ * anymore.
+ */
+
+ mali_ptr metadata;
+ u32 stride; // stride in units of tiles
+ u32 unk; // = 0x20000
+ } afbc;
+
+ struct {
+ /* Heck if I know */
+ u64 unk;
+ mali_ptr pointer;
+ } chunknown;
+ };
+
+ mali_ptr framebuffer;
+
+ u32 zero2 : 4;
+ u32 framebuffer_stride : 28; // in units of bytes
+ u32 zero3;
+
+ u32 clear_color_1; // RGBA8888 from glClear, actually used by hardware
+ u32 clear_color_2; // always equal, but unclear function?
+ u32 clear_color_3; // always equal, but unclear function?
+ u32 clear_color_4; // always equal, but unclear function?
+} __attribute__((packed));
+
+/* An optional part of bifrost_framebuffer. It comes between the main structure
+ * and the array of render targets. It must be included if any of these are
+ * enabled:
+ *
+ * - Transaction Elimination
+ * - Depth/stencil
+ * - TODO: Anything else?
+ */
+
+/* Flags field: note, these are guesses */
+
+#define MALI_EXTRA_PRESENT (0x400)
+#define MALI_EXTRA_AFBC (0x20)
+#define MALI_EXTRA_AFBC_ZS (0x10)
+#define MALI_EXTRA_ZS (0x4)
+
+struct bifrost_fb_extra {
+ mali_ptr checksum;
+ /* Each tile has an 8 byte checksum, so the stride is "width in tiles * 8" */
+ u32 checksum_stride;
+
+ u32 flags;
+
+ union {
+ /* Note: AFBC is only allowed for 24/8 combined depth/stencil. */
+ struct {
+ mali_ptr depth_stencil_afbc_metadata;
+ u32 depth_stencil_afbc_stride; // in units of tiles
+ u32 zero1;
+
+ mali_ptr depth_stencil;
+
+ u64 padding;
+ } ds_afbc;
+
+ struct {
+ /* Depth becomes depth/stencil in case of combined D/S */
+ mali_ptr depth;
+ u32 depth_stride_zero : 4;
+ u32 depth_stride : 28;
+ u32 zero1;
+
+ mali_ptr stencil;
+ u32 stencil_stride_zero : 4;
+ u32 stencil_stride : 28;
+ u32 zero2;
+ } ds_linear;
+ };
+
+
+ u64 zero3, zero4;
+} __attribute__((packed));
+
+/* Flags for mfbd_flags */
+
+/* Enables writing depth results back to main memory (rather than keeping them
+ * on-chip in the tile buffer and then discarding) */
+
+#define MALI_MFBD_DEPTH_WRITE (1 << 10)
+
+/* The MFBD contains the extra bifrost_fb_extra section */
+
+#define MALI_MFBD_EXTRA (1 << 13)
+
+struct bifrost_framebuffer {
+ u32 unk0; // = 0x10
+
+ u32 unknown2; // = 0x1f, same as SFBD
+ mali_ptr scratchpad;
+
+ /* 0x10 */
+ mali_ptr sample_locations;
+ mali_ptr unknown1;
+ /* 0x20 */
+ u16 width1, height1;
+ u32 zero3;
+ u16 width2, height2;
+ u32 unk1 : 19; // = 0x01000
+ u32 rt_count_1 : 2; // off-by-one (use MALI_POSITIVE)
+ u32 unk2 : 3; // = 0
+ u32 rt_count_2 : 3; // no off-by-one
+ u32 zero4 : 5;
+ /* 0x30 */
+ u32 clear_stencil : 8;
+ u32 mfbd_flags : 24; // = 0x100
+ float clear_depth;
+
+ struct midgard_tiler_descriptor tiler;
+
+ /* optional: struct bifrost_fb_extra extra */
+ /* struct bifrost_render_target rts[] */
+} __attribute__((packed));
+
+#endif /* __PANFROST_JOB_H__ */
--- /dev/null
+/*
+ * © Copyright 2017-2018 The Panfrost Community
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __PANFROST_MISC_H__
+#define __PANFROST_MISC_H__
+
+#include <inttypes.h>
+
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+
+typedef int8_t s8;
+typedef int16_t s16;
+typedef int32_t s32;
+typedef int64_t s64;
+
+typedef uint64_t mali_ptr;
+
+#define MALI_PTR_FMT "0x%" PRIx64
+
+/* FIXME: put this somewhere more fitting */
+#define MALI_MEM_MAP_TRACKING_HANDLE (3ull << 12)
+
+#endif
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
-inc_panfrost = include_directories(['.', 'shared'])
+inc_panfrost_hw = include_directories([
+ 'include'
+])
+
+inc_panfrost = include_directories([
+ '.', 'include', 'shared', 'midgard', 'bifrost'
+])
subdir('shared')
+subdir('midgard')
+subdir('bifrost')
+subdir('pandecode')
+
+files_pandecode = files(
+ 'pandecode/cmdline.c',
+ 'pandecode/common.c',
+ 'pandecode/decode.c',
+ 'pandecode/pan_pretty_print.c',
+
+ 'midgard/disassemble.c',
+ 'midgard/midgard_ops.c',
+ 'bifrost/disassemble.c',
+)
+
+pandecode = executable(
+ 'pandecoder',
+ files_pandecode,
+ include_directories : [inc_common, inc_include, inc_src, inc_panfrost],
+ dependencies : [
+ dep_thread,
+ ],
+ link_with : [
+ libmesa_util
+ ],
+ build_by_default : true
+)
+
+files_bifrost = files(
+ 'bifrost/cmdline.c',
+)
+
+bifrost_compiler = executable(
+ 'bifrost_compiler',
+ [files_bifrost],
+ include_directories : [
+ inc_common,
+ inc_include,
+ inc_src,
+ inc_panfrost,
+ ],
+ dependencies : [
+ dep_thread,
+ idep_nir
+ ],
+ link_with : [
+ libglsl_standalone,
+ libmesa_util,
+ libpanfrost_bifrost
+ ],
+ build_by_default : true
+)
--- /dev/null
+/*
+ * Copyright (C) 2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _MDG_COMPILER_H
+#define _MDG_COMPILER_H
+
+#include "midgard.h"
+#include "helpers.h"
+#include "midgard_compile.h"
+
+/* For malloc/memcpy used by the MIR helpers below */
+#include <stdlib.h>
+#include <string.h>
+
+#include "util/hash_table.h"
+#include "util/u_dynarray.h"
+#include "util/set.h"
+#include "util/list.h"
+
+#include "main/mtypes.h"
+#include "compiler/nir_types.h"
+#include "compiler/nir/nir.h"
+
+/* Forward declare */
+struct midgard_block;
+
+/* Target types. Defaults to TARGET_GOTO (the type corresponding directly to
+ * the hardware), hence why that must be zero. TARGET_DISCARD signals this
+ * instruction is actually a discard op. */
+
+#define TARGET_GOTO 0
+#define TARGET_BREAK 1
+#define TARGET_CONTINUE 2
+#define TARGET_DISCARD 3
+
+typedef struct midgard_branch {
+ /* If conditional, the condition is specified in r31.w */
+ bool conditional;
+
+ /* For conditionals, if this is true, we branch on FALSE. If false, we branch on TRUE. */
+ bool invert_conditional;
+
+ /* Branch targets: the start of a block, the start of a loop (continue), the end of a loop (break). Value is one of TARGET_ */
+ unsigned target_type;
+
+ /* The actual target */
+ union {
+ int target_block;
+ int target_break;
+ int target_continue;
+ };
+} midgard_branch;
+
+/* Instruction arguments represented as block-local SSA indices, rather than
+ * registers. Negative values mean unused. */
+
+typedef struct {
+ int src0;
+ int src1;
+ int dest;
+
+ /* src1 is -not- SSA but instead a 16-bit inline constant to be smudged
+ * in. Only valid for ALU ops. */
+ bool inline_constant;
+} ssa_args;
+
+/* Generic in-memory data type representing a single logical instruction, rather
+ * than a single instruction group. This is the preferred form for code gen.
+ * Multiple midgard_instructions will later be combined during scheduling,
+ * though this is not represented in this structure. Its format bridges
+ * the low-level binary representation with the higher level semantic meaning.
+ *
+ * Notably, it allows registers to be specified as block local SSA, for code
+ * emitted before the register allocation pass.
+ */
+
+typedef struct midgard_instruction {
+ /* Must be first for casting */
+ struct list_head link;
+
+ unsigned type; /* ALU, load/store, texture */
+
+ /* If the register allocator has not run yet... */
+ ssa_args ssa_args;
+
+ /* Special fields for an ALU instruction */
+ midgard_reg_info registers;
+
+ /* I.e. (1 << alu_bit) */
+ int unit;
+
+ /* When emitting bundle, should this instruction have a break forced
+ * before it? Used for r31 writes which are valid only within a single
+ * bundle and *need* to happen as early as possible... this is a hack,
+ * TODO remove when we have a scheduler */
+ bool precede_break;
+
+ bool has_constants;
+ float constants[4];
+ uint16_t inline_constant;
+ bool has_blend_constant;
+
+ bool compact_branch;
+ bool writeout;
+ bool prepacked_branch;
+
+ /* Masks in a saneish format. One bit per channel, not packed fancy.
+ * Use this instead of the op specific ones, and switch over at emit
+ * time */
+ uint16_t mask;
+
+ union {
+ midgard_load_store_word load_store;
+ midgard_vector_alu alu;
+ midgard_texture_word texture;
+ midgard_branch_extended branch_extended;
+ uint16_t br_compact;
+
+ /* General branch, rather than packed br_compact. Higher level
+ * than the other components */
+ midgard_branch branch;
+ };
+} midgard_instruction;
+
+typedef struct midgard_block {
+ /* Link to next block. Must be first for mir_get_block */
+ struct list_head link;
+
+ /* List of midgard_instructions emitted for the current block */
+ struct list_head instructions;
+
+ bool is_scheduled;
+
+ /* List of midgard_bundles emitted (after the scheduler has run) */
+ struct util_dynarray bundles;
+
+ /* Number of quadwords _actually_ emitted, as determined after scheduling */
+ unsigned quadword_count;
+
+ /* Successors: always one forward (the block after us), maybe
+ * one backwards (for a backward branch). No need for a second
+ * forward, since graph traversal would get there eventually
+ * anyway */
+ struct midgard_block *successors[2];
+ unsigned nr_successors;
+
+        /* The successors pointers form a graph, and in the case of
+         * complex control flow, this graph can have cycles. To aid
+         * traversal during liveness analysis, we have a "visited"
+         * boolean for passes to use as they see fit, provided they
+         * clean up afterwards */
+ bool visited;
+} midgard_block;
+
+typedef struct midgard_bundle {
+ /* Tag for the overall bundle */
+ int tag;
+
+ /* Instructions contained by the bundle */
+ int instruction_count;
+ midgard_instruction *instructions[5];
+
+ /* Bundle-wide ALU configuration */
+ int padding;
+ int control;
+ bool has_embedded_constants;
+ float constants[4];
+ bool has_blend_constant;
+} midgard_bundle;
+
+typedef struct compiler_context {
+ nir_shader *nir;
+ gl_shader_stage stage;
+
+ /* Is internally a blend shader? Depends on stage == FRAGMENT */
+ bool is_blend;
+
+ /* Tracking for blend constant patching */
+ int blend_constant_offset;
+
+ /* Current NIR function */
+ nir_function *func;
+
+ /* Unordered list of midgard_blocks */
+ int block_count;
+ struct list_head blocks;
+
+ midgard_block *initial_block;
+ midgard_block *previous_source_block;
+ midgard_block *final_block;
+
+ /* The block we are currently emitting instructions into */
+ midgard_block *current_block;
+
+ /* The current "depth" of the loop, for disambiguating breaks/continues
+ * when using nested loops */
+ int current_loop_depth;
+
+ /* Total number of loops for shader-db */
+ unsigned loop_count;
+
+ /* Constants which have been loaded, for later inlining */
+ struct hash_table_u64 *ssa_constants;
+
+ /* SSA values / registers which have been aliased. Naively, these
+ * demand a fmov output; instead, we alias them in a later pass to
+ * avoid the wasted op.
+ *
+ * A note on encoding: to avoid dynamic memory management here, rather
+ * than mapping to a pointer, we map to the source index; the key
+ * itself is just the destination index. */
+
+ struct hash_table_u64 *ssa_to_alias;
+ struct set *leftover_ssa_to_alias;
+
+ /* Actual SSA-to-register for RA */
+ struct hash_table_u64 *ssa_to_register;
+
+ /* Mapping of hashes computed from NIR indices to the sequential temp indices ultimately used in MIR */
+ struct hash_table_u64 *hash_to_temp;
+ int temp_count;
+ int max_hash;
+
+ /* Just the count of the max register used. Higher count => higher
+ * register pressure */
+ int work_registers;
+
+ /* Used for cont/last hinting. Increase when a tex op is added.
+ * Decrease when a tex op is removed. */
+ int texture_op_count;
+
+ /* Mapping of texture register -> SSA index for unaliasing */
+ int texture_index[2];
+
+ /* If any path hits a discard instruction */
+ bool can_discard;
+
+ /* The number of uniforms allowable for the fast path */
+ int uniform_cutoff;
+
+ /* Count of instructions emitted from NIR overall, across all blocks */
+ int instruction_count;
+
+ /* Alpha ref value passed in */
+ float alpha_ref;
+
+ /* The index corresponding to the fragment output */
+ unsigned fragment_output;
+
+ /* The mapping of sysvals to uniforms, the count, and the off-by-one inverse */
+ unsigned sysvals[MAX_SYSVAL_COUNT];
+ unsigned sysval_count;
+ struct hash_table_u64 *sysval_to_id;
+} compiler_context;
+
+/* Helpers for manipulating the above structures (forming the driver IR) */
+
+/* Append instruction to end of current block */
+
+static inline midgard_instruction *
+mir_upload_ins(struct midgard_instruction ins)
+{
+ midgard_instruction *heap = malloc(sizeof(ins));
+ memcpy(heap, &ins, sizeof(ins));
+ return heap;
+}
+
+static inline void
+emit_mir_instruction(struct compiler_context *ctx, struct midgard_instruction ins)
+{
+ list_addtail(&(mir_upload_ins(ins))->link, &ctx->current_block->instructions);
+}
+
+static inline void
+mir_insert_instruction_before(struct midgard_instruction *tag, struct midgard_instruction ins)
+{
+ list_addtail(&(mir_upload_ins(ins))->link, &tag->link);
+}
+
+static inline void
+mir_remove_instruction(struct midgard_instruction *ins)
+{
+ list_del(&ins->link);
+}
+
+static inline midgard_instruction*
+mir_prev_op(struct midgard_instruction *ins)
+{
+ return list_last_entry(&(ins->link), midgard_instruction, link);
+}
+
+static inline midgard_instruction*
+mir_next_op(struct midgard_instruction *ins)
+{
+ return list_first_entry(&(ins->link), midgard_instruction, link);
+}
+
+#define mir_foreach_block(ctx, v) \
+ list_for_each_entry(struct midgard_block, v, &ctx->blocks, link)
+
+#define mir_foreach_block_from(ctx, from, v) \
+ list_for_each_entry_from(struct midgard_block, v, from, &ctx->blocks, link)
+
+#define mir_foreach_instr(ctx, v) \
+ list_for_each_entry(struct midgard_instruction, v, &ctx->current_block->instructions, link)
+
+#define mir_foreach_instr_safe(ctx, v) \
+ list_for_each_entry_safe(struct midgard_instruction, v, &ctx->current_block->instructions, link)
+
+#define mir_foreach_instr_in_block(block, v) \
+ list_for_each_entry(struct midgard_instruction, v, &block->instructions, link)
+
+#define mir_foreach_instr_in_block_safe(block, v) \
+ list_for_each_entry_safe(struct midgard_instruction, v, &block->instructions, link)
+
+#define mir_foreach_instr_in_block_safe_rev(block, v) \
+ list_for_each_entry_safe_rev(struct midgard_instruction, v, &block->instructions, link)
+
+#define mir_foreach_instr_in_block_from(block, v, from) \
+ list_for_each_entry_from(struct midgard_instruction, v, from, &block->instructions, link)
+
+#define mir_foreach_instr_in_block_from_rev(block, v, from) \
+ list_for_each_entry_from_rev(struct midgard_instruction, v, from, &block->instructions, link)
+
+#define mir_foreach_bundle_in_block(block, v) \
+ util_dynarray_foreach(&block->bundles, midgard_bundle, v)
+
+#define mir_foreach_instr_global(ctx, v) \
+ mir_foreach_block(ctx, v_block) \
+ mir_foreach_instr_in_block(v_block, v)
+
+
+static inline midgard_instruction *
+mir_last_in_block(struct midgard_block *block)
+{
+ return list_last_entry(&block->instructions, struct midgard_instruction, link);
+}
+
+static inline midgard_block *
+mir_get_block(compiler_context *ctx, int idx)
+{
+ struct list_head *lst = &ctx->blocks;
+
+ while ((idx--) + 1)
+ lst = lst->next;
+
+ return (struct midgard_block *) lst;
+}
+
+static inline bool
+mir_is_alu_bundle(midgard_bundle *bundle)
+{
+ return IS_ALU(bundle->tag);
+}
+
+/* MIR manipulation */
+
+void mir_rewrite_index(compiler_context *ctx, unsigned old, unsigned new);
+void mir_rewrite_index_src(compiler_context *ctx, unsigned old, unsigned new);
+void mir_rewrite_index_dst(compiler_context *ctx, unsigned old, unsigned new);
+
+/* MIR printing */
+
+void mir_print_instruction(midgard_instruction *ins);
+void mir_print_bundle(midgard_bundle *bundle);
+void mir_print_block(midgard_block *block);
+void mir_print_shader(compiler_context *ctx);
+
+/* MIR goodies */
+
+static const midgard_vector_alu_src blank_alu_src = {
+ .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
+};
+
+static const midgard_vector_alu_src blank_alu_src_xxxx = {
+ .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_X, COMPONENT_X, COMPONENT_X),
+};
+
+static const midgard_scalar_alu_src blank_scalar_alu_src = {
+ .full = true
+};
+
+/* Used for encoding the unused source of 1-op instructions */
+static const midgard_vector_alu_src zero_alu_src = { 0 };
+
+/* 'Intrinsic' move for aliasing */
+
+static inline midgard_instruction
+v_mov(unsigned src, midgard_vector_alu_src mod, unsigned dest)
+{
+ midgard_instruction ins = {
+ .type = TAG_ALU_4,
+ .mask = 0xF,
+ .ssa_args = {
+ .src0 = SSA_UNUSED_1,
+ .src1 = src,
+ .dest = dest,
+ },
+ .alu = {
+ .op = midgard_alu_op_imov,
+ .reg_mode = midgard_reg_mode_32,
+ .dest_override = midgard_dest_override_none,
+ .outmod = midgard_outmod_int_wrap,
+ .src1 = vector_alu_srco_unsigned(zero_alu_src),
+ .src2 = vector_alu_srco_unsigned(mod)
+ },
+ };
+
+ return ins;
+}
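+
+/* A rough usage sketch (not an actual call site): copying SSA value 5 into
+ * SSA value 7 with the identity swizzle would be emitted as
+ * v_mov(5, blank_alu_src, 7), with blank_alu_src above supplying the xyzw
+ * swizzle. */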
+
+/* Scheduling */
+
+void schedule_program(compiler_context *ctx);
+
+/* Register allocation */
+
+struct ra_graph;
+
+struct ra_graph* allocate_registers(compiler_context *ctx);
+void install_registers(compiler_context *ctx, struct ra_graph *g);
+bool mir_is_live_after(compiler_context *ctx, midgard_block *block, midgard_instruction *start, int src);
+bool mir_has_multiple_writes(compiler_context *ctx, int src);
+
+void mir_create_pipeline_registers(compiler_context *ctx);
+
+/* Final emission */
+
+void emit_binary_bundle(
+ compiler_context *ctx,
+ midgard_bundle *bundle,
+ struct util_dynarray *emission,
+ int next_tag);
+
+/* NIR stuff */
+
+bool
+nir_undef_to_zero(nir_shader *shader);
+
+#endif
--- /dev/null
+struct exec_list;
+
+bool do_mat_op_to_vec(struct exec_list *instructions);
+
+extern "C" {
+ bool c_do_mat_op_to_vec(struct exec_list *instructions) {
+ return do_mat_op_to_vec(instructions);
+ }
+};
--- /dev/null
+/* Author(s):
+ * Connor Abbott
+ * Alyssa Rosenzweig
+ *
+ * Copyright (c) 2013 Connor Abbott (connor@abbott.cx)
+ * Copyright (c) 2018 Alyssa Rosenzweig (alyssa@rosenzweig.io)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <assert.h>
+#include <inttypes.h>
+#include <ctype.h>
+#include <string.h>
+#include "midgard.h"
+#include "midgard-parse.h"
+#include "midgard_ops.h"
+#include "disassemble.h"
+#include "helpers.h"
+#include "util/half_float.h"
+#include "util/u_math.h"
+
+#define DEFINE_CASE(define, str) case define: { printf(str); break; }
+
+static bool is_instruction_int = false;
+
+/* Prints a short form of the tag for branching, the minimum needed to be
+ * legible and unambiguous */
+
+static void
+print_tag_short(unsigned tag)
+{
+ switch (midgard_word_types[tag]) {
+ case midgard_word_type_texture:
+ printf("tex/%X", tag);
+ break;
+
+ case midgard_word_type_load_store:
+ printf("ldst");
+ break;
+
+ case midgard_word_type_alu:
+ printf("alu%d/%X", midgard_word_size[tag], tag);
+ break;
+
+ default:
+ printf("%s%X", (tag > 0) ? "" : "unk", tag);
+ break;
+ }
+}
+
+static void
+print_alu_opcode(midgard_alu_op op)
+{
+ bool int_op = false;
+
+ if (alu_opcode_props[op].name) {
+ printf("%s", alu_opcode_props[op].name);
+
+ int_op = midgard_is_integer_op(op);
+ } else
+ printf("alu_op_%02X", op);
+
+ /* For constant analysis */
+ is_instruction_int = int_op;
+}
+
+static void
+print_ld_st_opcode(midgard_load_store_op op)
+{
+ if (load_store_opcode_names[op])
+ printf("%s", load_store_opcode_names[op]);
+ else
+ printf("ldst_op_%02X", op);
+}
+
+static bool is_embedded_constant_half = false;
+static bool is_embedded_constant_int = false;
+
+static char
+prefix_for_bits(unsigned bits)
+{
+ switch (bits) {
+ case 8:
+ return 'q';
+ case 16:
+ return 'h';
+ case 64:
+ return 'd';
+ default:
+ return 0;
+ }
+}
+
+static void
+print_reg(unsigned reg, unsigned bits)
+{
+ /* Perform basic static analysis for expanding constants correctly */
+
+ if (reg == 26) {
+ is_embedded_constant_int = is_instruction_int;
+ is_embedded_constant_half = (bits < 32);
+ }
+
+ char prefix = prefix_for_bits(bits);
+
+ if (prefix)
+ putchar(prefix);
+
+ printf("r%u", reg);
+}
+
+static char *outmod_names_float[4] = {
+ "",
+ ".pos",
+ ".unk2",
+ ".sat"
+};
+
+static char *outmod_names_int[4] = {
+ ".isat",
+ ".usat",
+ "",
+ ".hi"
+};
+
+static char *srcmod_names_int[4] = {
+ "sext(",
+ "zext(",
+ "",
+ "("
+};
+
+static void
+print_outmod(unsigned outmod, bool is_int)
+{
+ printf("%s", is_int ? outmod_names_int[outmod] :
+ outmod_names_float[outmod]);
+}
+
+static void
+print_quad_word(uint32_t *words, unsigned tabs)
+{
+ unsigned i;
+
+ for (i = 0; i < 4; i++)
+ printf("0x%08X%s ", words[i], i == 3 ? "" : ",");
+
+ printf("\n");
+}
+
+static const char components[16] = "xyzwefghijklmnop";
+
+/* Helper to print 4 chars of a swizzle */
+static void
+print_swizzle_helper(unsigned swizzle, bool upper)
+{
+ for (unsigned i = 0; i < 4; ++i) {
+ unsigned c = (swizzle >> (i * 2)) & 3;
+ c += upper*4;
+ printf("%c", components[c]);
+ }
+}
+
+/* Helper to print 8 chars of a swizzle, duplicating over */
+static void
+print_swizzle_helper_8(unsigned swizzle, bool upper)
+{
+ for (unsigned i = 0; i < 4; ++i) {
+ unsigned c = (swizzle >> (i * 2)) & 3;
+ c *= 2;
+ c += upper*8;
+ printf("%c%c", components[c], components[c+1]);
+ }
+}
+
+static void
+print_swizzle_vec16(unsigned swizzle, bool rep_high, bool rep_low,
+ midgard_dest_override override)
+{
+ printf(".");
+
+ if (override == midgard_dest_override_upper) {
+ if (rep_high)
+ printf(" /* rep_high */ ");
+ if (rep_low)
+ printf(" /* rep_low */ ");
+
+ if (!rep_high && rep_low)
+ print_swizzle_helper_8(swizzle, true);
+ else
+ print_swizzle_helper_8(swizzle, false);
+ } else {
+ print_swizzle_helper_8(swizzle, rep_high & 1);
+ print_swizzle_helper_8(swizzle, !rep_low & 1);
+ }
+}
+
+static void
+print_swizzle_vec8(unsigned swizzle, bool rep_high, bool rep_low)
+{
+ printf(".");
+
+ print_swizzle_helper(swizzle, rep_high & 1);
+ print_swizzle_helper(swizzle, !rep_low & 1);
+}
+
+static void
+print_swizzle_vec4(unsigned swizzle, bool rep_high, bool rep_low)
+{
+ if (rep_high)
+ printf(" /* rep_high */ ");
+ if (rep_low)
+ printf(" /* rep_low */ ");
+
+ if (swizzle == 0xE4) return; /* xyzw */
+
+ printf(".");
+ print_swizzle_helper(swizzle, 0);
+}
+static void
+print_swizzle_vec2(unsigned swizzle, bool rep_high, bool rep_low)
+{
+ if (rep_high)
+ printf(" /* rep_high */ ");
+ if (rep_low)
+ printf(" /* rep_low */ ");
+
+ if (swizzle == 0xE4) return; /* XY */
+
+ printf(".");
+
+ for (unsigned i = 0; i < 4; i += 2) {
+ unsigned a = (swizzle >> (i * 2)) & 3;
+ unsigned b = (swizzle >> ((i+1) * 2)) & 3;
+
+ /* Normally we're adjacent, but if there's an issue, don't make
+ * it ambiguous */
+
+ if (a & 0x1)
+ printf("[%c%c]", components[a], components[b]);
+ else if (a == b)
+ printf("%c", components[a >> 1]);
+ else if (b == (a + 1))
+ printf("%c", "XY"[a >> 1]);
+ else
+ printf("[%c%c]", components[a], components[b]);
+ }
+}
+
+static int
+bits_for_mode(midgard_reg_mode mode)
+{
+ switch (mode) {
+ case midgard_reg_mode_8:
+ return 8;
+ case midgard_reg_mode_16:
+ return 16;
+ case midgard_reg_mode_32:
+ return 32;
+ case midgard_reg_mode_64:
+ return 64;
+ default:
+ return 0;
+ }
+}
+
+static int
+bits_for_mode_halved(midgard_reg_mode mode, bool half)
+{
+ unsigned bits = bits_for_mode(mode);
+
+ if (half)
+ bits >>= 1;
+
+ return bits;
+}
+
+static void
+print_vector_src(unsigned src_binary,
+ midgard_reg_mode mode, unsigned reg,
+ midgard_dest_override override, bool is_int)
+{
+ midgard_vector_alu_src *src = (midgard_vector_alu_src *)&src_binary;
+
+ /* Modifiers change meaning depending on the op's context */
+
+ midgard_int_mod int_mod = src->mod;
+
+ if (is_int) {
+ printf("%s", srcmod_names_int[int_mod]);
+ } else {
+ if (src->mod & MIDGARD_FLOAT_MOD_NEG)
+ printf("-");
+
+ if (src->mod & MIDGARD_FLOAT_MOD_ABS)
+ printf("abs(");
+ }
+
+ //register
+ unsigned bits = bits_for_mode_halved(mode, src->half);
+ print_reg(reg, bits);
+
+ //swizzle
+ if (bits == 16)
+ print_swizzle_vec8(src->swizzle, src->rep_high, src->rep_low);
+ else if (bits == 8)
+ print_swizzle_vec16(src->swizzle, src->rep_high, src->rep_low, override);
+ else if (bits == 32)
+ print_swizzle_vec4(src->swizzle, src->rep_high, src->rep_low);
+ else if (bits == 64)
+ print_swizzle_vec2(src->swizzle, src->rep_high, src->rep_low);
+
+ /* Since we wrapped with a function-looking thing */
+
+ if (is_int && int_mod == midgard_int_shift)
+ printf(") << %d", bits);
+ else if ((is_int && (int_mod != midgard_int_normal))
+ || (!is_int && src->mod & MIDGARD_FLOAT_MOD_ABS))
+ printf(")");
+}
+
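+/* Reassembles the 16-bit inline immediate from the src2 register-select field
+ * and the upper bits of the src2 source descriptor (see the call in
+ * print_vector_field); print_immediate then renders it as an integer or
+ * half-float. */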
+static uint16_t
+decode_vector_imm(unsigned src2_reg, unsigned imm)
+{
+ uint16_t ret;
+ ret = src2_reg << 11;
+ ret |= (imm & 0x7) << 8;
+ ret |= (imm >> 3) & 0xFF;
+ return ret;
+}
+
+static void
+print_immediate(uint16_t imm)
+{
+ if (is_instruction_int)
+ printf("#%d", imm);
+ else
+ printf("#%g", _mesa_half_to_float(imm));
+}
+
+static unsigned
+print_dest(unsigned reg, midgard_reg_mode mode, midgard_dest_override override)
+{
+ /* Depending on the mode and override, we determine the type of
+ * destination addressed. Absent an override, we address just the
+ * type of the operation itself */
+
+ unsigned bits = bits_for_mode(mode);
+
+ if (override != midgard_dest_override_none)
+ bits /= 2;
+
+ print_reg(reg, bits);
+
+ return bits;
+}
+
+static void
+print_mask_vec16(uint8_t mask, midgard_dest_override override)
+{
+ printf(".");
+
+ if (override == midgard_dest_override_none) {
+ for (unsigned i = 0; i < 8; i++) {
+ if (mask & (1 << i))
+ printf("%c%c",
+ components[i*2 + 0],
+ components[i*2 + 1]);
+ }
+ } else {
+ bool upper = (override == midgard_dest_override_upper);
+
+ for (unsigned i = 0; i < 8; i++) {
+ if (mask & (1 << i))
+ printf("%c", components[i + (upper ? 8 : 0)]);
+ }
+ }
+}
+
+/* For 16-bit+ masks, we read off from the 8-bit mask field. For 16-bit (vec8),
+ * it's just one bit per channel, easy peasy. For 32-bit (vec4), it's one bit
+ * per channel with one duplicate bit in the middle. For 64-bit (vec2), it's
+ * one bit per channel with _3_ duplicate bits in the middle. Basically, just
+ * subdividing the 128-bit word into 16-bit increments. For 64-bit, we uppercase
+ * the mask to make it obvious what happened */
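+
+/* Worked examples, following print_mask below: in 32-bit mode a mask of 0x0F
+ * prints as ".xy" and 0xFF is skipped as a complete mask; in 64-bit mode 0x0F
+ * prints as ".X". */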
+
+static void
+print_mask(uint8_t mask, unsigned bits, midgard_dest_override override)
+{
+ if (bits == 8) {
+ print_mask_vec16(mask, override);
+ return;
+ }
+
+ /* Skip 'complete' masks */
+
+ if (bits >= 32 && mask == 0xFF) return;
+
+ if (bits == 16) {
+ if (mask == 0x0F)
+ return;
+ else if (mask == 0xF0) {
+ printf("'");
+ return;
+ }
+ }
+
+ printf(".");
+
+ unsigned skip = (bits / 16);
+ bool uppercase = bits > 32;
+ bool tripped = false;
+
+ for (unsigned i = 0; i < 8; i += skip) {
+ bool a = (mask & (1 << i)) != 0;
+
+ for (unsigned j = 1; j < skip; ++j) {
+ bool dupe = (mask & (1 << (i + j))) != 0;
+ tripped |= (dupe != a);
+ }
+
+ if (a) {
+ char c = components[i / skip];
+
+ if (uppercase)
+ c = toupper(c);
+
+ printf("%c", c);
+ }
+ }
+
+ if (tripped)
+ printf(" /* %X */", mask);
+}
+
+/* Prints the 4-bit masks found in texture and load/store ops, as opposed to
+ * the 8-bit masks found in (vector) ALU ops */
+
+static void
+print_mask_4(unsigned mask)
+{
+ if (mask == 0xF) return;
+
+ printf(".");
+
+ for (unsigned i = 0; i < 4; ++i) {
+ bool a = (mask & (1 << i)) != 0;
+ if (a)
+ printf("%c", components[i]);
+ }
+}
+
+static void
+print_vector_field(const char *name, uint16_t *words, uint16_t reg_word,
+ unsigned tabs)
+{
+ midgard_reg_info *reg_info = (midgard_reg_info *)&reg_word;
+ midgard_vector_alu *alu_field = (midgard_vector_alu *) words;
+ midgard_reg_mode mode = alu_field->reg_mode;
+ unsigned override = alu_field->dest_override;
+
+ /* For now, prefix instruction names with their unit, until we
+ * understand how this works on a deeper level */
+ printf("%s.", name);
+
+ print_alu_opcode(alu_field->op);
+
+ /* Postfix with the size to disambiguate if necessary */
+ char postfix = prefix_for_bits(bits_for_mode(mode));
+ bool size_ambiguous = override != midgard_dest_override_none;
+
+ if (size_ambiguous)
+ printf("%c", postfix ? postfix : 'r');
+
+ /* Print the outmod, if there is one */
+ print_outmod(alu_field->outmod,
+ midgard_is_integer_out_op(alu_field->op));
+
+ printf(" ");
+
+ /* Mask denoting status of 8-lanes */
+ uint8_t mask = alu_field->mask;
+
+ /* First, print the destination */
+ unsigned dest_size =
+ print_dest(reg_info->out_reg, mode, alu_field->dest_override);
+
+ /* Apply the destination override to the mask */
+
+ if (mode == midgard_reg_mode_32 || mode == midgard_reg_mode_64) {
+ if (override == midgard_dest_override_lower)
+ mask &= 0x0F;
+ else if (override == midgard_dest_override_upper)
+ mask &= 0xF0;
+ } else if (mode == midgard_reg_mode_16
+ && override == midgard_dest_override_lower) {
+ /* stub */
+ }
+
+ if (override != midgard_dest_override_none) {
+ bool modeable = (mode != midgard_reg_mode_8);
+ bool known = override != 0x3; /* Unused value */
+
+ if (!(modeable && known))
+ printf("/* do%d */ ", override);
+ }
+
+ print_mask(mask, dest_size, override);
+
+ printf(", ");
+
+ bool is_int = midgard_is_integer_op(alu_field->op);
+ print_vector_src(alu_field->src1, mode, reg_info->src1_reg, override, is_int);
+
+ printf(", ");
+
+ if (reg_info->src2_imm) {
+ uint16_t imm = decode_vector_imm(reg_info->src2_reg, alu_field->src2 >> 2);
+ print_immediate(imm);
+ } else {
+ print_vector_src(alu_field->src2, mode,
+ reg_info->src2_reg, override, is_int);
+ }
+
+ printf("\n");
+}
+
+static void
+print_scalar_src(unsigned src_binary, unsigned reg)
+{
+ midgard_scalar_alu_src *src = (midgard_scalar_alu_src *)&src_binary;
+
+ if (src->negate)
+ printf("-");
+
+ if (src->abs)
+ printf("abs(");
+
+ print_reg(reg, src->full ? 32 : 16);
+
+ unsigned c = src->component;
+
+ if (src->full) {
+ assert((c & 1) == 0);
+ c >>= 1;
+ }
+
+ printf(".%c", components[c]);
+
+ if (src->abs)
+ printf(")");
+
+}
+
+static uint16_t
+decode_scalar_imm(unsigned src2_reg, unsigned imm)
+{
+ uint16_t ret;
+ ret = src2_reg << 11;
+ ret |= (imm & 3) << 9;
+ ret |= (imm & 4) << 6;
+ ret |= (imm & 0x38) << 2;
+ ret |= imm >> 6;
+ return ret;
+}
+
+static void
+print_scalar_field(const char *name, uint16_t *words, uint16_t reg_word,
+ unsigned tabs)
+{
+ midgard_reg_info *reg_info = (midgard_reg_info *)&reg_word;
+ midgard_scalar_alu *alu_field = (midgard_scalar_alu *) words;
+
+ if (alu_field->unknown)
+ printf("scalar ALU unknown bit set\n");
+
+ printf("%s.", name);
+ print_alu_opcode(alu_field->op);
+ print_outmod(alu_field->outmod,
+ midgard_is_integer_out_op(alu_field->op));
+ printf(" ");
+
+ bool full = alu_field->output_full;
+ print_reg(reg_info->out_reg, full ? 32 : 16);
+ unsigned c = alu_field->output_component;
+
+ if (full) {
+ assert((c & 1) == 0);
+ c >>= 1;
+ }
+
+ printf(".%c, ", components[c]);
+
+ print_scalar_src(alu_field->src1, reg_info->src1_reg);
+
+ printf(", ");
+
+ if (reg_info->src2_imm) {
+ uint16_t imm = decode_scalar_imm(reg_info->src2_reg,
+ alu_field->src2);
+ print_immediate(imm);
+ } else
+ print_scalar_src(alu_field->src2, reg_info->src2_reg);
+
+ printf("\n");
+}
+
+static void
+print_branch_op(int op)
+{
+ switch (op) {
+ case midgard_jmp_writeout_op_branch_uncond:
+ printf("uncond.");
+ break;
+
+ case midgard_jmp_writeout_op_branch_cond:
+ printf("cond.");
+ break;
+
+ case midgard_jmp_writeout_op_writeout:
+ printf("write.");
+ break;
+
+ case midgard_jmp_writeout_op_tilebuffer_pending:
+ printf("tilebuffer.");
+ break;
+
+ case midgard_jmp_writeout_op_discard:
+ printf("discard.");
+ break;
+
+ default:
+ printf("unk%d.", op);
+ break;
+ }
+}
+
+static void
+print_branch_cond(int cond)
+{
+ switch (cond) {
+ case midgard_condition_write0:
+ printf("write0");
+ break;
+
+ case midgard_condition_false:
+ printf("false");
+ break;
+
+ case midgard_condition_true:
+ printf("true");
+ break;
+
+ case midgard_condition_always:
+ printf("always");
+ break;
+
+ default:
+ printf("unk%X", cond);
+ break;
+ }
+}
+
+static void
+print_compact_branch_writeout_field(uint16_t word)
+{
+ midgard_jmp_writeout_op op = word & 0x7;
+
+ switch (op) {
+ case midgard_jmp_writeout_op_branch_uncond: {
+ midgard_branch_uncond br_uncond;
+ memcpy((char *) &br_uncond, (char *) &word, sizeof(br_uncond));
+ printf("br.uncond ");
+
+ if (br_uncond.unknown != 1)
+ printf("unknown:%d, ", br_uncond.unknown);
+
+ if (br_uncond.offset >= 0)
+ printf("+");
+
+ printf("%d -> ", br_uncond.offset);
+ print_tag_short(br_uncond.dest_tag);
+ printf("\n");
+
+ break;
+ }
+
+ case midgard_jmp_writeout_op_branch_cond:
+ case midgard_jmp_writeout_op_writeout:
+ case midgard_jmp_writeout_op_discard:
+ default: {
+ midgard_branch_cond br_cond;
+ memcpy((char *) &br_cond, (char *) &word, sizeof(br_cond));
+
+ printf("br.");
+
+ print_branch_op(br_cond.op);
+ print_branch_cond(br_cond.cond);
+
+ printf(" ");
+
+ if (br_cond.offset >= 0)
+ printf("+");
+
+ printf("%d -> ", br_cond.offset);
+ print_tag_short(br_cond.dest_tag);
+ printf("\n");
+
+ break;
+ }
+ }
+}
+
+static void
+print_extended_branch_writeout_field(uint8_t *words)
+{
+ midgard_branch_extended br;
+ memcpy((char *) &br, (char *) words, sizeof(br));
+
+ printf("brx.");
+
+ print_branch_op(br.op);
+
+ /* Condition repeated 8 times in all known cases. Check this. */
+
+ unsigned cond = br.cond & 0x3;
+
+ for (unsigned i = 0; i < 16; i += 2) {
+ assert(((br.cond >> i) & 0x3) == cond);
+ }
+
+ print_branch_cond(cond);
+
+ if (br.unknown)
+ printf(".unknown%d", br.unknown);
+
+ printf(" ");
+
+ if (br.offset >= 0)
+ printf("+");
+
+ printf("%d -> ", br.offset);
+ print_tag_short(br.dest_tag);
+ printf("\n");
+}
+
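+/* Counts how many of the five ALU slots (vmul, sadd, vadd, smul, lut) are
+ * enabled; the bit positions checked here match the ALU_ENAB_* control word
+ * defines in helpers.h. */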
+static unsigned
+num_alu_fields_enabled(uint32_t control_word)
+{
+ unsigned ret = 0;
+
+ if ((control_word >> 17) & 1)
+ ret++;
+
+ if ((control_word >> 19) & 1)
+ ret++;
+
+ if ((control_word >> 21) & 1)
+ ret++;
+
+ if ((control_word >> 23) & 1)
+ ret++;
+
+ if ((control_word >> 25) & 1)
+ ret++;
+
+ return ret;
+}
+
+static float
+float_bitcast(uint32_t integer)
+{
+ union {
+ uint32_t i;
+ float f;
+ } v;
+
+ v.i = integer;
+ return v.f;
+}
+
+static void
+print_alu_word(uint32_t *words, unsigned num_quad_words,
+ unsigned tabs)
+{
+ uint32_t control_word = words[0];
+ uint16_t *beginning_ptr = (uint16_t *)(words + 1);
+ unsigned num_fields = num_alu_fields_enabled(control_word);
+ uint16_t *word_ptr = beginning_ptr + num_fields;
+ unsigned num_words = 2 + num_fields;
+
+ if ((control_word >> 16) & 1)
+ printf("unknown bit 16 enabled\n");
+
+ if ((control_word >> 17) & 1) {
+ print_vector_field("vmul", word_ptr, *beginning_ptr, tabs);
+ beginning_ptr += 1;
+ word_ptr += 3;
+ num_words += 3;
+ }
+
+ if ((control_word >> 18) & 1)
+ printf("unknown bit 18 enabled\n");
+
+ if ((control_word >> 19) & 1) {
+ print_scalar_field("sadd", word_ptr, *beginning_ptr, tabs);
+ beginning_ptr += 1;
+ word_ptr += 2;
+ num_words += 2;
+ }
+
+ if ((control_word >> 20) & 1)
+ printf("unknown bit 20 enabled\n");
+
+ if ((control_word >> 21) & 1) {
+ print_vector_field("vadd", word_ptr, *beginning_ptr, tabs);
+ beginning_ptr += 1;
+ word_ptr += 3;
+ num_words += 3;
+ }
+
+ if ((control_word >> 22) & 1)
+ printf("unknown bit 22 enabled\n");
+
+ if ((control_word >> 23) & 1) {
+ print_scalar_field("smul", word_ptr, *beginning_ptr, tabs);
+ beginning_ptr += 1;
+ word_ptr += 2;
+ num_words += 2;
+ }
+
+ if ((control_word >> 24) & 1)
+ printf("unknown bit 24 enabled\n");
+
+ if ((control_word >> 25) & 1) {
+ print_vector_field("lut", word_ptr, *beginning_ptr, tabs);
+ beginning_ptr += 1;
+ word_ptr += 3;
+ num_words += 3;
+ }
+
+ if ((control_word >> 26) & 1) {
+ print_compact_branch_writeout_field(*word_ptr);
+ word_ptr += 1;
+ num_words += 1;
+ }
+
+ if ((control_word >> 27) & 1) {
+ print_extended_branch_writeout_field((uint8_t *) word_ptr);
+ word_ptr += 3;
+ num_words += 3;
+ }
+
+ if (num_quad_words > (num_words + 7) / 8) {
+ assert(num_quad_words == (num_words + 15) / 8);
+ //Assume that the extra quadword is constants
+ void *consts = words + (4 * num_quad_words - 4);
+
+ if (is_embedded_constant_int) {
+ if (is_embedded_constant_half) {
+ int16_t *sconsts = (int16_t *) consts;
+ printf("sconstants %d, %d, %d, %d\n",
+ sconsts[0],
+ sconsts[1],
+ sconsts[2],
+ sconsts[3]);
+ } else {
+ int32_t *iconsts = (int32_t *) consts;
+ printf("iconstants %d, %d, %d, %d\n",
+ iconsts[0],
+ iconsts[1],
+ iconsts[2],
+ iconsts[3]);
+ }
+ } else {
+ if (is_embedded_constant_half) {
+ uint16_t *hconsts = (uint16_t *) consts;
+ printf("hconstants %g, %g, %g, %g\n",
+ _mesa_half_to_float(hconsts[0]),
+ _mesa_half_to_float(hconsts[1]),
+ _mesa_half_to_float(hconsts[2]),
+ _mesa_half_to_float(hconsts[3]));
+ } else {
+ uint32_t *fconsts = (uint32_t *) consts;
+ printf("fconstants %g, %g, %g, %g\n",
+ float_bitcast(fconsts[0]),
+ float_bitcast(fconsts[1]),
+ float_bitcast(fconsts[2]),
+ float_bitcast(fconsts[3]));
+ }
+
+ }
+ }
+}
+
+static void
+print_varying_parameters(midgard_load_store_word *word)
+{
+ midgard_varying_parameter param;
+ unsigned v = word->varying_parameters;
+ memcpy(&param, &v, sizeof(param));
+
+ if (param.is_varying) {
+ /* If a varying, there are qualifiers */
+ if (param.flat)
+ printf(".flat");
+
+ if (param.interpolation != midgard_interp_default) {
+ if (param.interpolation == midgard_interp_centroid)
+ printf(".centroid");
+ else
+ printf(".interp%d", param.interpolation);
+ }
+
+ if (param.modifier != midgard_varying_mod_none) {
+ if (param.modifier == midgard_varying_mod_perspective_w)
+ printf(".perspectivew");
+ else if (param.modifier == midgard_varying_mod_perspective_z)
+ printf(".perspectivez");
+ else
+ printf(".mod%d", param.modifier);
+ }
+ } else if (param.flat || param.interpolation || param.modifier) {
+ printf(" /* is_varying not set but varying metadata attached */");
+ }
+
+ if (param.zero0 || param.zero1 || param.zero2)
+ printf(" /* zero tripped, %d %d %d */ ", param.zero0, param.zero1, param.zero2);
+}
+
+static bool
+is_op_varying(unsigned op)
+{
+ switch (op) {
+ case midgard_op_st_vary_16:
+ case midgard_op_st_vary_32:
+ case midgard_op_ld_vary_16:
+ case midgard_op_ld_vary_32:
+ return true;
+ }
+
+ return false;
+}
+
+static void
+print_load_store_instr(uint64_t data,
+ unsigned tabs)
+{
+ midgard_load_store_word *word = (midgard_load_store_word *) &data;
+
+ print_ld_st_opcode(word->op);
+
+ if (is_op_varying(word->op))
+ print_varying_parameters(word);
+
+ printf(" r%d", word->reg);
+ print_mask_4(word->mask);
+
+ int address = word->address;
+
+ if (word->op == midgard_op_ld_uniform_32) {
+ /* Uniforms use their own addressing scheme */
+
+ int lo = word->varying_parameters >> 7;
+ int hi = word->address;
+
+ /* TODO: Combine fields logically */
+ address = (hi << 3) | lo;
+ }
+
+ printf(", %d", address);
+
+ print_swizzle_vec4(word->swizzle, false, false);
+
+ printf(", 0x%X /* %X */\n", word->unknown, word->varying_parameters);
+}
+
+static void
+print_load_store_word(uint32_t *word, unsigned tabs)
+{
+ midgard_load_store *load_store = (midgard_load_store *) word;
+
+ if (load_store->word1 != 3) {
+ print_load_store_instr(load_store->word1, tabs);
+ }
+
+ if (load_store->word2 != 3) {
+ print_load_store_instr(load_store->word2, tabs);
+ }
+}
+
+static void
+print_texture_reg(bool full, bool select, bool upper)
+{
+ if (full)
+ printf("r%d", REG_TEX_BASE + select);
+ else
+ printf("hr%d", (REG_TEX_BASE + select) * 2 + upper);
+
+ if (full && upper)
+ printf("// error: out full / upper mutually exclusive\n");
+
+}
+
+static void
+print_texture_reg_triple(unsigned triple)
+{
+ bool full = triple & 1;
+ bool select = triple & 2;
+ bool upper = triple & 4;
+
+ print_texture_reg(full, select, upper);
+}
+
+static void
+print_texture_format(int format)
+{
+ /* Act like a modifier */
+ printf(".");
+
+ switch (format) {
+ DEFINE_CASE(MALI_TEX_1D, "1d");
+ DEFINE_CASE(MALI_TEX_2D, "2d");
+ DEFINE_CASE(MALI_TEX_3D, "3d");
+ DEFINE_CASE(MALI_TEX_CUBE, "cube");
+
+ default:
+ unreachable("Bad format");
+ }
+}
+
+static void
+print_texture_op(unsigned op, bool gather)
+{
+ /* Act like a bare name, like ESSL functions */
+
+ if (gather) {
+ printf("textureGather");
+
+ unsigned component = op >> 4;
+ unsigned bottom = op & 0xF;
+
+ if (bottom != 0x2)
+ printf("_unk%d", bottom);
+
+ printf(".%c", components[component]);
+ return;
+ }
+
+ switch (op) {
+ DEFINE_CASE(TEXTURE_OP_NORMAL, "texture");
+ DEFINE_CASE(TEXTURE_OP_LOD, "textureLod");
+ DEFINE_CASE(TEXTURE_OP_TEXEL_FETCH, "texelFetch");
+
+ default:
+ printf("tex_%d", op);
+ break;
+ }
+}
+
+static bool
+texture_op_takes_bias(unsigned op)
+{
+ return op == TEXTURE_OP_NORMAL;
+}
+
+static char
+sampler_type_name(enum mali_sampler_type t)
+{
+ switch (t) {
+ case MALI_SAMPLER_FLOAT:
+ return 'f';
+ case MALI_SAMPLER_UNSIGNED:
+ return 'u';
+ case MALI_SAMPLER_SIGNED:
+ return 'i';
+ default:
+ return '?';
+ }
+
+}
+
+#undef DEFINE_CASE
+
+static void
+print_texture_word(uint32_t *word, unsigned tabs)
+{
+ midgard_texture_word *texture = (midgard_texture_word *) word;
+
+ /* Broad category of texture operation in question */
+ print_texture_op(texture->op, texture->is_gather);
+
+ /* Specific format in question */
+ print_texture_format(texture->format);
+
+ assert(texture->zero == 0);
+
+ /* Instruction "modifiers" parallel the ALU instructions. */
+
+ if (texture->shadow)
+ printf(".shadow");
+
+ if (texture->cont)
+ printf(".cont");
+
+ if (texture->last)
+ printf(".last");
+
+ printf(" ");
+
+ print_texture_reg(texture->out_full, texture->out_reg_select, texture->out_upper);
+ print_mask_4(texture->mask);
+ printf(", ");
+
+ printf("texture%d, ", texture->texture_handle);
+
+ /* Print the type, GL style */
+ printf("%c", sampler_type_name(texture->sampler_type));
+ printf("sampler%d", texture->sampler_handle);
+ print_swizzle_vec4(texture->swizzle, false, false);
+ printf(", ");
+
+ print_texture_reg(texture->in_reg_full, texture->in_reg_select, texture->in_reg_upper);
+ print_swizzle_vec4(texture->in_reg_swizzle, false, false);
+
+ /* There is *always* an offset attached. Of
+ * course, that offset is just immediate #0 for a
+ * GLES call that doesn't take an offset. If there
+ * is a non-negative non-zero offset, this is
+ * specified in immediate offset mode, with the
+ * values in the offset_* fields as immediates. If
+ * this is a negative offset, we instead switch to
+ * a register offset mode, where the offset_*
+ * fields become register triplets */
+
+ if (texture->offset_register) {
+ printf(" + ");
+ print_texture_reg_triple(texture->offset_x);
+
+ /* The fewer questions you ask, the better. */
+
+ unsigned swizzle_lo, swizzle_hi;
+ unsigned orig_y = texture->offset_y;
+ unsigned orig_z = texture->offset_z;
+
+ memcpy(&swizzle_lo, &orig_y, sizeof(unsigned));
+ memcpy(&swizzle_hi, &orig_z, sizeof(unsigned));
+
+ /* Duplicate hi swizzle over */
+ assert(swizzle_hi < 4);
+ swizzle_hi = (swizzle_hi << 2) | swizzle_hi;
+
+ unsigned swiz = (swizzle_lo << 4) | swizzle_hi;
+ unsigned reversed = util_bitreverse(swiz) >> 24;
+ print_swizzle_vec4(reversed, false, false);
+
+ printf(", ");
+ } else if (texture->offset_x || texture->offset_y || texture->offset_z) {
+ /* Only select ops allow negative immediate offsets, verify */
+
+ bool neg_x = texture->offset_x < 0;
+ bool neg_y = texture->offset_y < 0;
+ bool neg_z = texture->offset_z < 0;
+ bool any_neg = neg_x || neg_y || neg_z;
+
+ if (any_neg && texture->op != TEXTURE_OP_TEXEL_FETCH)
+ printf("/* invalid negative */ ");
+
+ /* Regardless, just print the immediate offset */
+
+ printf(" + <%d, %d, %d>, ",
+ texture->offset_x,
+ texture->offset_y,
+ texture->offset_z);
+ } else {
+ printf(", ");
+ }
+
+ char lod_operand = texture_op_takes_bias(texture->op) ? '+' : '=';
+
+ if (texture->lod_register) {
+ midgard_tex_register_select sel;
+ uint8_t raw = texture->bias;
+ memcpy(&sel, &raw, sizeof(raw));
+
+ unsigned c = (sel.component_hi << 1) | sel.component_lo;
+
+ printf("lod %c ", lod_operand);
+ print_texture_reg(sel.full, sel.select, sel.upper);
+ printf(".%c, ", components[c]);
+
+ if (!sel.component_hi)
+ printf(" /* gradient? */");
+
+ if (texture->bias_int)
+ printf(" /* bias_int = 0x%X */", texture->bias_int);
+
+ if (sel.zero)
+ printf(" /* sel.zero = 0x%X */", sel.zero);
+ } else if (texture->op == TEXTURE_OP_TEXEL_FETCH) {
+ /* For texel fetch, the int LOD is in the fractional place and
+ * there is no fraction / possibility of bias. We *always* have
+ * an explicit LOD, even if it's zero. */
+
+ if (texture->bias_int)
+ printf(" /* bias_int = 0x%X */ ", texture->bias_int);
+
+ printf("lod = %d, ", texture->bias);
+ } else if (texture->bias || texture->bias_int) {
+ signed bias_int = texture->bias_int;
+ float bias_frac = texture->bias / 256.0f;
+ float bias = bias_int + bias_frac;
+
+ bool is_bias = texture_op_takes_bias(texture->op);
+ char sign = (bias >= 0.0) ? '+' : '-';
+ char operand = is_bias ? sign : '=';
+
+ printf("lod %c %f, ", operand, fabsf(bias));
+ }
+
+ printf("\n");
+
+ /* While not zero in general, for these simple instructions the
+ * following unknowns are zero, so we don't include them */
+
+ if (texture->unknown2 ||
+ texture->unknown4 ||
+ texture->unknownA ||
+ texture->unknown8) {
+ printf("// unknown2 = 0x%x\n", texture->unknown2);
+ printf("// unknown4 = 0x%x\n", texture->unknown4);
+ printf("// unknownA = 0x%x\n", texture->unknownA);
+ printf("// unknown8 = 0x%x\n", texture->unknown8);
+ }
+}
+
+void
+disassemble_midgard(uint8_t *code, size_t size)
+{
+ uint32_t *words = (uint32_t *) code;
+ unsigned num_words = size / 4;
+ int tabs = 0;
+
+ bool prefetch_flag = false;
+
+ unsigned i = 0;
+
+ while (i < num_words) {
+ unsigned tag = words[i] & 0xF;
+ unsigned num_quad_words = midgard_word_size[tag];
+
+ switch (midgard_word_types[tag]) {
+ case midgard_word_type_texture:
+ print_texture_word(&words[i], tabs);
+ break;
+
+ case midgard_word_type_load_store:
+ print_load_store_word(&words[i], tabs);
+ break;
+
+ case midgard_word_type_alu:
+ print_alu_word(&words[i], num_quad_words, tabs);
+
+ if (prefetch_flag)
+ return;
+
+ /* Reset word static analysis state */
+ is_embedded_constant_half = false;
+ is_embedded_constant_int = false;
+
+ break;
+
+ default:
+ printf("Unknown word type %u:\n", words[i] & 0xF);
+ num_quad_words = 1;
+ print_quad_word(&words[i], tabs);
+ printf("\n");
+ break;
+ }
+
+ printf("\n");
+
+ unsigned next = (words[i] & 0xF0) >> 4;
+
+ i += 4 * num_quad_words;
+
+ /* Break based on instruction prefetch flag */
+
+ if (i < num_words && next == 1) {
+ prefetch_flag = true;
+
+ if (midgard_word_types[words[i] & 0xF] != midgard_word_type_alu)
+ return;
+ }
+ }
+
+ return;
+}
--- /dev/null
+#include <stddef.h>
+void disassemble_midgard(uint8_t *code, size_t size);
--- /dev/null
+/* Copyright (c) 2018-2019 Alyssa Rosenzweig (alyssa@rosenzweig.io)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef __MDG_HELPERS_H
+#define __MDG_HELPERS_H
+
+#include "util/macros.h"
+#include <string.h>
+
+#define OP_IS_STORE_VARY(op) (\
+ op == midgard_op_st_vary_16 || \
+ op == midgard_op_st_vary_32 \
+ )
+
+#define OP_IS_STORE(op) (\
+ OP_IS_STORE_VARY(op) || \
+ op == midgard_op_st_cubemap_coords \
+ )
+
+#define OP_IS_MOVE(op) ( \
+ op == midgard_alu_op_fmov || \
+ op == midgard_alu_op_imov \
+ )
+
+/* ALU control words are single bit fields with a lot of space */
+
+#define ALU_ENAB_VEC_MUL (1 << 17)
+#define ALU_ENAB_SCAL_ADD (1 << 19)
+#define ALU_ENAB_VEC_ADD (1 << 21)
+#define ALU_ENAB_SCAL_MUL (1 << 23)
+#define ALU_ENAB_VEC_LUT (1 << 25)
+#define ALU_ENAB_BR_COMPACT (1 << 26)
+#define ALU_ENAB_BRANCH (1 << 27)
+
+/* Other opcode properties that don't conflict with the ALU_ENABs, non-ISA */
+
+/* Denotes an opcode that takes a vector input with a fixed number of
+ * channels, but outputs to only a single output channel, like dot products.
+ * For these, to determine the effective mask, this quirk can be set. We have
+ * an intentional off-by-one (a la MALI_POSITIVE), since 0-channel makes no
+ * sense but we need to fit 4 channels in 2 bits. Similarly, 1-channel doesn't
+ * make sense (since then why are we quirked?), so that corresponds to "no
+ * count set" */
+
+#define OP_CHANNEL_COUNT(c) ((c - 1) << 0)
+#define GET_CHANNEL_COUNT(c) ((c & (0x3 << 0)) ? ((c & (0x3 << 0)) + 1) : 0)
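+
+/* Worked example of the off-by-one: a 3-channel op encodes
+ * OP_CHANNEL_COUNT(3) = 2 in its props, and GET_CHANNEL_COUNT recovers
+ * 2 + 1 = 3; props without the quirk decode to 0 ("no count set"). */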
+
+/* For instructions that take a single argument, normally the first argument
+ * slot is used for the argument and the second slot is a dummy #0 constant.
+ * However, there are exceptions: instructions like fmov store their argument
+ * in the _second_ slot and store a dummy r24 in the first slot, designated by
+ * QUIRK_FLIPPED_R24 */
+
+#define QUIRK_FLIPPED_R24 (1 << 2)
+
+/* Is the op commutative? */
+#define OP_COMMUTES (1 << 3)
+
+/* Does the op convert types between int- and float- space (i2f/f2u/etc) */
+#define OP_TYPE_CONVERT (1 << 4)
+
+/* Vector-independent shorthands for the above; these numbers are arbitrary and
+ * not from the ISA. Convert to the above with unit_enum_to_midgard */
+
+#define UNIT_MUL 0
+#define UNIT_ADD 1
+#define UNIT_LUT 2
+
+/* 4-bit type tags */
+
+#define TAG_TEXTURE_4_VTX 0x2
+#define TAG_TEXTURE_4 0x3
+#define TAG_LOAD_STORE_4 0x5
+#define TAG_ALU_4 0x8
+#define TAG_ALU_8 0x9
+#define TAG_ALU_12 0xA
+#define TAG_ALU_16 0xB
+
+static inline int
+quadword_size(int tag)
+{
+ switch (tag) {
+ case TAG_ALU_4:
+ case TAG_LOAD_STORE_4:
+ case TAG_TEXTURE_4:
+ case TAG_TEXTURE_4_VTX:
+ return 1;
+ case TAG_ALU_8:
+ return 2;
+ case TAG_ALU_12:
+ return 3;
+ case TAG_ALU_16:
+ return 4;
+ default:
+ unreachable("Unknown tag");
+ }
+}
+
+#define IS_ALU(tag) (tag == TAG_ALU_4 || tag == TAG_ALU_8 || \
+ tag == TAG_ALU_12 || tag == TAG_ALU_16)
+
+/* Special register aliases */
+
+#define MAX_WORK_REGISTERS 16
+
+/* Uniforms begin at (REGISTER_UNIFORMS - uniform_count) */
+#define REGISTER_UNIFORMS 24
+
+#define REGISTER_UNUSED 24
+#define REGISTER_CONSTANT 26
+#define REGISTER_VARYING_BASE 26
+#define REGISTER_OFFSET 27
+#define REGISTER_TEXTURE_BASE 28
+#define REGISTER_SELECT 31
+
+/* SSA helper aliases to mimic the registers. UNUSED_0 encoded as an inline
+ * constant. UNUSED_1 encoded as REGISTER_UNUSED */
+
+#define SSA_UNUSED_0 0
+#define SSA_UNUSED_1 -2
+
+#define SSA_FIXED_SHIFT 24
+#define SSA_FIXED_REGISTER(reg) ((1 + reg) << SSA_FIXED_SHIFT)
+#define SSA_REG_FROM_FIXED(reg) ((reg >> SSA_FIXED_SHIFT) - 1)
+#define SSA_FIXED_MINIMUM SSA_FIXED_REGISTER(0)
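+
+/* E.g. SSA_FIXED_REGISTER(2) = 3 << 24 = 0x3000000 and SSA_REG_FROM_FIXED
+ * recovers the 2, so indices at or above SSA_FIXED_MINIMUM can be told apart
+ * from ordinary SSA indices. */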
+
+/* Swizzle support */
+
+#define SWIZZLE(A, B, C, D) ((D << 6) | (C << 4) | (B << 2) | (A << 0))
+#define SWIZZLE_FROM_ARRAY(r) SWIZZLE(r[0], r[1], r[2], r[3])
+#define COMPONENT_X 0x0
+#define COMPONENT_Y 0x1
+#define COMPONENT_Z 0x2
+#define COMPONENT_W 0x3
+
+#define SWIZZLE_XXXX SWIZZLE(COMPONENT_X, COMPONENT_X, COMPONENT_X, COMPONENT_X)
+#define SWIZZLE_XYXX SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_X, COMPONENT_X)
+#define SWIZZLE_XYZX SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_X)
+#define SWIZZLE_XYZW SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W)
+#define SWIZZLE_XYXZ SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_X, COMPONENT_Z)
+#define SWIZZLE_XYZZ SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_Z)
+#define SWIZZLE_WWWW SWIZZLE(COMPONENT_W, COMPONENT_W, COMPONENT_W, COMPONENT_W)
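+
+/* Note SWIZZLE_XYZW evaluates to 0xE4, the identity value the disassembler
+ * special-cases when deciding whether to print a swizzle at all. */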
+
+static inline unsigned
+swizzle_of(unsigned comp)
+{
+ switch (comp) {
+ case 1:
+ return SWIZZLE_XXXX;
+ case 2:
+ return SWIZZLE_XYXX;
+ case 3:
+ return SWIZZLE_XYZX;
+ case 4:
+ return SWIZZLE_XYZW;
+ default:
+ unreachable("Invalid component count");
+ }
+}
+
+static inline unsigned
+mask_of(unsigned nr_comp)
+{
+ return (1 << nr_comp) - 1;
+}
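+
+/* E.g. swizzle_of(3) = SWIZZLE_XYZX and mask_of(3) = 0x7 (xyz enabled). */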
+
+
+/* See ISA notes */
+
+#define LDST_NOP (3)
+
+/* There are five ALU units: VMUL, VADD, SMUL, SADD, LUT. A given opcode is
+ * implemented on some subset of these units (or occasionally all of them).
+ * This table encodes a bit mask of valid units for each opcode, so the
+ * scheduler can figure where to plonk the instruction. */
+
+/* Shorthands for each unit */
+#define UNIT_VMUL ALU_ENAB_VEC_MUL
+#define UNIT_SADD ALU_ENAB_SCAL_ADD
+#define UNIT_VADD ALU_ENAB_VEC_ADD
+#define UNIT_SMUL ALU_ENAB_SCAL_MUL
+#define UNIT_VLUT ALU_ENAB_VEC_LUT
+
+/* Shorthands for usual combinations of units */
+
+#define UNITS_MUL (UNIT_VMUL | UNIT_SMUL)
+#define UNITS_ADD (UNIT_VADD | UNIT_SADD)
+#define UNITS_MOST (UNITS_MUL | UNITS_ADD)
+#define UNITS_ALL (UNITS_MOST | UNIT_VLUT)
+#define UNITS_SCALAR (UNIT_SADD | UNIT_SMUL)
+#define UNITS_VECTOR (UNIT_VMUL | UNIT_VADD)
+#define UNITS_ANY_VECTOR (UNITS_VECTOR | UNIT_VLUT)
+
+struct mir_op_props {
+ const char *name;
+ unsigned props;
+};
+
+/* This file is common, so don't define the tables themselves. #include
+ * midgard_ops.h if you need that, or edit midgard_ops.c directly */
+
+/* Duplicate bits to convert a 4-bit writemask to duplicated 8-bit format,
+ * which is used for 32-bit vector units */
+
+static inline unsigned
+expand_writemask_32(unsigned mask)
+{
+ unsigned o = 0;
+
+ for (int i = 0; i < 4; ++i)
+ if (mask & (1 << i))
+ o |= (3 << (2 * i));
+
+ return o;
+}
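+
+/* E.g. a vec4 .xz writemask of 0x5 expands to 0x33: two bits set per enabled
+ * 32-bit channel. */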
+
+/* Coerce structs to integer */
+
+static inline unsigned
+vector_alu_srco_unsigned(midgard_vector_alu_src src)
+{
+ unsigned u;
+ memcpy(&u, &src, sizeof(src));
+ return u;
+}
+
+static inline midgard_vector_alu_src
+vector_alu_from_unsigned(unsigned u)
+{
+ midgard_vector_alu_src s;
+ memcpy(&s, &u, sizeof(s));
+ return s;
+}
+
+/* Composes two swizzles */
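+/* Concretely, result lane c reads right[left[c]]; e.g. composing
+ * SWIZZLE_WWWW with SWIZZLE_XYZX yields SWIZZLE_XXXX, since right[3] = X. */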
+static inline unsigned
+pan_compose_swizzle(unsigned left, unsigned right)
+{
+ unsigned out = 0;
+
+ for (unsigned c = 0; c < 4; ++c) {
+ unsigned s = (left >> (2*c)) & 0x3;
+ unsigned q = (right >> (2*s)) & 0x3;
+
+ out |= (q << (2*c));
+ }
+
+ return out;
+}
+
+/* Applies a swizzle to an ALU source */
+
+static inline unsigned
+vector_alu_apply_swizzle(unsigned src, unsigned swizzle)
+{
+ midgard_vector_alu_src s =
+ vector_alu_from_unsigned(src);
+
+ s.swizzle = pan_compose_swizzle(s.swizzle, swizzle);
+
+ return vector_alu_srco_unsigned(s);
+}
+
+#endif
--- /dev/null
+# Copyright © 2018 Rob Clark
+# Copyright © 2019 Collabora
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+libpanfrost_midgard_files = files(
+ 'midgard_compile.c',
+ 'mir.c',
+ 'midgard_print.c',
+ 'midgard_schedule.c',
+ 'midgard_emit.c',
+ 'midgard_ra.c',
+ 'midgard_ra_pipeline.c',
+ 'midgard_liveness.c',
+ 'midgard_ops.c',
+ 'cppwrap.cpp',
+ 'disassemble.c',
+)
+
+midgard_nir_algebraic_c = custom_target(
+ 'midgard_nir_algebraic.c',
+ input : 'midgard_nir_algebraic.py',
+ output : 'midgard_nir_algebraic.c',
+ command : [
+ prog_python, '@INPUT@',
+ '-p', join_paths(meson.source_root(), 'src/compiler/nir/'),
+ ],
+ capture : true,
+ depend_files : nir_algebraic_py,
+)
+
+libpanfrost_midgard = static_library(
+ 'panfrost_midgard',
+ [libpanfrost_midgard_files, midgard_nir_algebraic_c],
+ include_directories : [
+ inc_common,
+ inc_include,
+ inc_src,
+ inc_panfrost_hw,
+ ],
+ dependencies: [
+ idep_nir
+ ],
+ c_args : [c_vis_args, no_override_init_args],
+ cpp_args : [cpp_vis_args],
+ build_by_default : false,
+)
--- /dev/null
+/* Author(s):
+ * Connor Abbott
+ * Alyssa Rosenzweig
+ *
+ * Copyright (c) 2013 Connor Abbott (connor@abbott.cx)
+ * Copyright (c) 2018 Alyssa Rosenzweig (alyssa@rosenzweig.io)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef __midgard_parse_h__
+#define __midgard_parse_h__
+
+/* Additional metadata for parsing Midgard binaries, not needed for compilation */
+
+static midgard_word_type midgard_word_types[16] = {
+ midgard_word_type_unknown, /* 0x0 */
+ midgard_word_type_unknown, /* 0x1 */
+ midgard_word_type_texture, /* 0x2 */
+ midgard_word_type_texture, /* 0x3 */
+ midgard_word_type_unknown, /* 0x4 */
+ midgard_word_type_load_store, /* 0x5 */
+ midgard_word_type_unknown, /* 0x6 */
+ midgard_word_type_unknown, /* 0x7 */
+ midgard_word_type_alu, /* 0x8 */
+ midgard_word_type_alu, /* 0x9 */
+ midgard_word_type_alu, /* 0xA */
+ midgard_word_type_alu, /* 0xB */
+ midgard_word_type_alu, /* 0xC */
+ midgard_word_type_alu, /* 0xD */
+ midgard_word_type_alu, /* 0xE */
+ midgard_word_type_alu, /* 0xF */
+};
+
+static unsigned midgard_word_size[16] = {
+ 0, /* 0x0 */
+ 0, /* 0x1 */
+ 1, /* 0x2 */
+ 1, /* 0x3 */
+ 0, /* 0x4 */
+ 1, /* 0x5 */
+ 0, /* 0x6 */
+ 0, /* 0x7 */
+ 1, /* 0x8 */
+ 2, /* 0x9 */
+ 3, /* 0xA */
+ 4, /* 0xB */
+ 1, /* 0xC */
+ 2, /* 0xD */
+ 3, /* 0xE */
+ 4, /* 0xF */
+};
+
+#endif
--- /dev/null
+/* Author(s):
+ * Connor Abbott
+ * Alyssa Rosenzweig
+ *
+ * Copyright (c) 2013 Connor Abbott (connor@abbott.cx)
+ * Copyright (c) 2018 Alyssa Rosenzweig (alyssa@rosenzweig.io)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef __midgard_h__
+#define __midgard_h__
+
+#include <stdint.h>
+#include <stdbool.h>
+#include "panfrost-job.h"
+
+#define MIDGARD_DBG_MSGS 0x0001
+#define MIDGARD_DBG_SHADERS 0x0002
+#define MIDGARD_DBG_SHADERDB 0x0004
+
+extern int midgard_debug;
+
+typedef enum {
+ midgard_word_type_alu,
+ midgard_word_type_load_store,
+ midgard_word_type_texture,
+ midgard_word_type_unknown
+} midgard_word_type;
+
+typedef enum {
+ midgard_alu_vmul,
+ midgard_alu_sadd,
+ midgard_alu_smul,
+ midgard_alu_vadd,
+ midgard_alu_lut
+} midgard_alu;
+
+/*
+ * ALU words
+ */
+
+typedef enum {
+ midgard_alu_op_fadd = 0x10,
+ midgard_alu_op_fmul = 0x14,
+
+ midgard_alu_op_fmin = 0x28,
+ midgard_alu_op_fmax = 0x2C,
+
+ midgard_alu_op_fmov = 0x30, /* fmov_rte */
+ midgard_alu_op_fmov_rtz = 0x31,
+ midgard_alu_op_fmov_rtn = 0x32,
+ midgard_alu_op_fmov_rtp = 0x33,
+ midgard_alu_op_froundeven = 0x34,
+ midgard_alu_op_ftrunc = 0x35,
+ midgard_alu_op_ffloor = 0x36,
+ midgard_alu_op_fceil = 0x37,
+ midgard_alu_op_ffma = 0x38,
+ midgard_alu_op_fdot3 = 0x3C,
+ midgard_alu_op_fdot3r = 0x3D,
+ midgard_alu_op_fdot4 = 0x3E,
+ midgard_alu_op_freduce = 0x3F,
+
+ midgard_alu_op_iadd = 0x40,
+ midgard_alu_op_ishladd = 0x41,
+ midgard_alu_op_isub = 0x46,
+ midgard_alu_op_iaddsat = 0x48,
+ midgard_alu_op_uaddsat = 0x49,
+ midgard_alu_op_isubsat = 0x4E,
+ midgard_alu_op_usubsat = 0x4F,
+
+ midgard_alu_op_imul = 0x58,
+
+ midgard_alu_op_imin = 0x60,
+ midgard_alu_op_umin = 0x61,
+ midgard_alu_op_imax = 0x62,
+ midgard_alu_op_umax = 0x63,
+ midgard_alu_op_ihadd = 0x64,
+ midgard_alu_op_uhadd = 0x65,
+ midgard_alu_op_irhadd = 0x66,
+ midgard_alu_op_urhadd = 0x67,
+ midgard_alu_op_iasr = 0x68,
+ midgard_alu_op_ilsr = 0x69,
+ midgard_alu_op_ishl = 0x6E,
+
+ midgard_alu_op_iand = 0x70,
+ midgard_alu_op_ior = 0x71,
+ midgard_alu_op_inand = 0x72, /* ~(a & b), for inot let a = b */
+ midgard_alu_op_inor = 0x73, /* ~(a | b) */
+ midgard_alu_op_iandnot = 0x74, /* (a & ~b), used for not/b2f */
+ midgard_alu_op_iornot = 0x75, /* (a | ~b) */
+ midgard_alu_op_ixor = 0x76,
+ midgard_alu_op_inxor = 0x77, /* ~(a & b) */
+ midgard_alu_op_iclz = 0x78, /* Number of zeroes on left */
+ midgard_alu_op_ibitcount8 = 0x7A, /* Counts bits in 8-bit increments */
+ midgard_alu_op_imov = 0x7B,
+ midgard_alu_op_iabsdiff = 0x7C,
+ midgard_alu_op_uabsdiff = 0x7D,
+ midgard_alu_op_ichoose = 0x7E, /* vector, component number - dupe for shuffle() */
+
+ midgard_alu_op_feq = 0x80,
+ midgard_alu_op_fne = 0x81,
+ midgard_alu_op_flt = 0x82,
+ midgard_alu_op_fle = 0x83,
+ midgard_alu_op_fball_eq = 0x88,
+ midgard_alu_op_bball_eq = 0x89,
+ midgard_alu_op_fball_lt = 0x8A, /* all(lessThan(.., ..)) */
+ midgard_alu_op_fball_lte = 0x8B, /* all(lessThanEqual(.., ..)) */
+
+ midgard_alu_op_bbany_neq = 0x90, /* used for bvec4(1) */
+ midgard_alu_op_fbany_neq = 0x91, /* bvec4(0) also */
+ midgard_alu_op_fbany_lt = 0x92, /* any(lessThan(.., ..)) */
+ midgard_alu_op_fbany_lte = 0x93, /* any(lessThanEqual(.., ..)) */
+
+ midgard_alu_op_f2i_rte = 0x98,
+ midgard_alu_op_f2i_rtz = 0x99,
+ midgard_alu_op_f2i_rtn = 0x9A,
+ midgard_alu_op_f2i_rtp = 0x9B,
+ midgard_alu_op_f2u_rte = 0x9C,
+ midgard_alu_op_f2u_rtz = 0x9D,
+ midgard_alu_op_f2u_rtn = 0x9E,
+ midgard_alu_op_f2u_rtp = 0x9F,
+
+ midgard_alu_op_ieq = 0xA0,
+ midgard_alu_op_ine = 0xA1,
+ midgard_alu_op_ult = 0xA2,
+ midgard_alu_op_ule = 0xA3,
+ midgard_alu_op_ilt = 0xA4,
+ midgard_alu_op_ile = 0xA5,
+ midgard_alu_op_iball_eq = 0xA8,
+ midgard_alu_op_iball_neq = 0xA9,
+ midgard_alu_op_uball_lt = 0xAA,
+ midgard_alu_op_uball_lte = 0xAB,
+ midgard_alu_op_iball_lt = 0xAC,
+ midgard_alu_op_iball_lte = 0xAD,
+
+ midgard_alu_op_ibany_eq = 0xB0,
+ midgard_alu_op_ibany_neq = 0xB1,
+ midgard_alu_op_ubany_lt = 0xB2,
+ midgard_alu_op_ubany_lte = 0xB3,
+ midgard_alu_op_ibany_lt = 0xB4, /* any(lessThan(.., ..)) */
+ midgard_alu_op_ibany_lte = 0xB5, /* any(lessThanEqual(.., ..)) */
+ midgard_alu_op_i2f_rte = 0xB8,
+ midgard_alu_op_i2f_rtz = 0xB9,
+ midgard_alu_op_i2f_rtn = 0xBA,
+ midgard_alu_op_i2f_rtp = 0xBB,
+ midgard_alu_op_u2f_rte = 0xBC,
+ midgard_alu_op_u2f_rtz = 0xBD,
+ midgard_alu_op_u2f_rtn = 0xBE,
+ midgard_alu_op_u2f_rtp = 0xBF,
+
+ midgard_alu_op_icsel_v = 0xC0, /* condition code r31 */
+ midgard_alu_op_icsel = 0xC1, /* condition code r31.w */
+ midgard_alu_op_fcsel_v = 0xC4,
+ midgard_alu_op_fcsel = 0xC5,
+ midgard_alu_op_fround = 0xC6,
+
+ midgard_alu_op_fatan_pt2 = 0xE8,
+ midgard_alu_op_fpow_pt1 = 0xEC,
+ midgard_alu_op_fpown_pt1 = 0xED,
+ midgard_alu_op_fpowr_pt1 = 0xEE,
+
+ midgard_alu_op_frcp = 0xF0,
+ midgard_alu_op_frsqrt = 0xF2,
+ midgard_alu_op_fsqrt = 0xF3,
+ midgard_alu_op_fexp2 = 0xF4,
+ midgard_alu_op_flog2 = 0xF5,
+ midgard_alu_op_fsin = 0xF6,
+ midgard_alu_op_fcos = 0xF7,
+ midgard_alu_op_fatan2_pt1 = 0xF9,
+} midgard_alu_op;
+
+typedef enum {
+ midgard_outmod_none = 0,
+ midgard_outmod_pos = 1,
+ /* 0x2 unknown */
+ midgard_outmod_sat = 3
+} midgard_outmod_float;
+
+typedef enum {
+ midgard_outmod_int_saturate = 0,
+ midgard_outmod_uint_saturate = 1,
+ midgard_outmod_int_wrap = 2,
+ midgard_outmod_int_high = 3, /* Overflowed portion */
+} midgard_outmod_int;
+
+typedef enum {
+ midgard_reg_mode_8 = 0,
+ midgard_reg_mode_16 = 1,
+ midgard_reg_mode_32 = 2,
+ midgard_reg_mode_64 = 3
+} midgard_reg_mode;
+
+typedef enum {
+ midgard_dest_override_lower = 0,
+ midgard_dest_override_upper = 1,
+ midgard_dest_override_none = 2
+} midgard_dest_override;
+
+typedef enum {
+ midgard_int_sign_extend = 0,
+ midgard_int_zero_extend = 1,
+ midgard_int_normal = 2,
+ midgard_int_shift = 3
+} midgard_int_mod;
+
+#define MIDGARD_FLOAT_MOD_ABS (1 << 0)
+#define MIDGARD_FLOAT_MOD_NEG (1 << 1)
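+
+/* A note on the 2-bit `mod` field below: as an illustration (an assumption
+ * based on vector_alu_modifiers() in the compiler), a float source that is
+ * both negated and absolute-valued would carry
+ * mod = MIDGARD_FLOAT_MOD_ABS | MIDGARD_FLOAT_MOD_NEG = 0x3, whereas an
+ * integer source stores one of the midgard_int_mod values instead. */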
+
+typedef struct
+__attribute__((__packed__))
+{
+ /* Either a midgard_int_mod or a mask of MIDGARD_FLOAT_MOD_* bits,
+ * depending on the type of op */
+ unsigned mod : 2;
+
+ /* replicate lower half if dest = half, or low/high half selection if
+ * dest = full
+ */
+ bool rep_low : 1;
+ bool rep_high : 1; /* unused if dest = full */
+ bool half : 1; /* only matters if dest = full */
+ unsigned swizzle : 8;
+}
+midgard_vector_alu_src;
+
+typedef struct
+__attribute__((__packed__))
+{
+ midgard_alu_op op : 8;
+ midgard_reg_mode reg_mode : 2;
+ unsigned src1 : 13;
+ unsigned src2 : 13;
+ midgard_dest_override dest_override : 2;
+ midgard_outmod_float outmod : 2;
+ unsigned mask : 8;
+}
+midgard_vector_alu;
+
+typedef struct
+__attribute__((__packed__))
+{
+ bool abs : 1;
+ bool negate : 1;
+ bool full : 1; /* 0 = half, 1 = full */
+ unsigned component : 3;
+}
+midgard_scalar_alu_src;
+
+typedef struct
+__attribute__((__packed__))
+{
+ midgard_alu_op op : 8;
+ unsigned src1 : 6;
+ unsigned src2 : 11;
+ unsigned unknown : 1;
+ unsigned outmod : 2;
+ bool output_full : 1;
+ unsigned output_component : 3;
+}
+midgard_scalar_alu;
+
+typedef struct
+__attribute__((__packed__))
+{
+ unsigned src1_reg : 5;
+ unsigned src2_reg : 5;
+ unsigned out_reg : 5;
+ bool src2_imm : 1;
+}
+midgard_reg_info;
+
+/* In addition to conditional branches and jumps (unconditional branches),
+ * Midgard implements a bit of fixed function functionality used in fragment
+ * shaders via specially crafted branches. These have special branch opcodes,
+ * which perform a fixed-function operation and/or use the results of a
+ * fixed-function operation as the branch condition. */
+
+typedef enum {
+ /* Regular branches */
+ midgard_jmp_writeout_op_branch_uncond = 1,
+ midgard_jmp_writeout_op_branch_cond = 2,
+
+ /* In a fragment shader, execute a discard_if instruction with the
+ * corresponding condition code. This terminates the shader, so the
+ * branch target should generally point past the end of the shader */
+ midgard_jmp_writeout_op_discard = 4,
+
+ /* Branch if the tilebuffer is not yet ready. At the beginning of a
+ * fragment shader that reads from the tile buffer, for instance via
+ * ARM_shader_framebuffer_fetch or EXT_pixel_local_storage, this branch
+ * operation should be used as a loop. An instruction like
+ * "br.tilebuffer.always -1" does the trick, corresponding to
+ * "while(!is_tilebuffer_ready) */
+ midgard_jmp_writeout_op_tilebuffer_pending = 6,
+
+ /* In a fragment shader, try to write out the value pushed to r0 to the
+ * tilebuffer, subject to unknown state in r1.z and r1.w. If this
+ * succeeds, the shader terminates. If it fails, it branches to the
+ * specified branch target. Generally, this should be used in a loop to
+ * itself, acting as "do { write(r0); } while(!write_successful);" */
+ midgard_jmp_writeout_op_writeout = 7,
+} midgard_jmp_writeout_op;
+
+typedef enum {
+ midgard_condition_write0 = 0,
+
+ /* These condition codes denote a conditional branch on FALSE and on
+ * TRUE respectively */
+ midgard_condition_false = 1,
+ midgard_condition_true = 2,
+
+ /* This condition code always branches. For a pure branch, the
+ * unconditional branch coding should be used instead, but for
+ * fixed-function branch opcodes, this is still useful */
+ midgard_condition_always = 3,
+} midgard_condition;
+
+typedef struct
+__attribute__((__packed__))
+{
+ midgard_jmp_writeout_op op : 3; /* == branch_uncond */
+ unsigned dest_tag : 4; /* tag of branch destination */
+ unsigned unknown : 2;
+ int offset : 7;
+}
+midgard_branch_uncond;
+
+typedef struct
+__attribute__((__packed__))
+{
+ midgard_jmp_writeout_op op : 3; /* == branch_cond */
+ unsigned dest_tag : 4; /* tag of branch destination */
+ int offset : 7;
+ midgard_condition cond : 2;
+}
+midgard_branch_cond;
+
+typedef struct
+__attribute__((__packed__))
+{
+ midgard_jmp_writeout_op op : 3; /* == branch_cond */
+ unsigned dest_tag : 4; /* tag of branch destination */
+ unsigned unknown : 2;
+ signed offset : 23;
+ unsigned cond : 16;
+}
+midgard_branch_extended;
+
+typedef struct
+__attribute__((__packed__))
+{
+ midgard_jmp_writeout_op op : 3; /* == writeout */
+ unsigned unknown : 13;
+}
+midgard_writeout;
+
+/*
+ * Load/store words
+ */
+
+typedef enum {
+ midgard_op_ld_st_noop = 0x03,
+
+ /* Unclear why this is on the L/S unit, but (with an address of 0,
+ * appropriate swizzle, magic constant 0x24, and xy mask?) moves fp32 cube
+ * map coordinates in r27 to its cube map texture coordinate
+ * destination (e.g. r29). Use the 0x4 magic constant for loading from fp16 instead */
+
+ midgard_op_st_cubemap_coords = 0x0E,
+
+ /* Used in OpenCL. Probably can ld other things as well */
+ midgard_op_ld_global_id = 0x10,
+
+ /* The L/S unit can do perspective division a clock faster than the ALU
+ * if you're lucky. Put the vec4 in r27, and call with 0x24 as the
+ * unknown state; the output will be <x/w, y/w, z/w, 1>. Replace w with
+ * z for the z version */
+ midgard_op_ldst_perspective_division_z = 0x12,
+ midgard_op_ldst_perspective_division_w = 0x13,
+
+ /* val in r27.y, address embedded, outputs result to argument. Invert val for sub. Let val = +-1 for inc/dec. */
+ midgard_op_atomic_add = 0x40,
+ midgard_op_atomic_and = 0x44,
+ midgard_op_atomic_or = 0x48,
+ midgard_op_atomic_xor = 0x4C,
+
+ midgard_op_atomic_imin = 0x50,
+ midgard_op_atomic_umin = 0x54,
+ midgard_op_atomic_imax = 0x58,
+ midgard_op_atomic_umax = 0x5C,
+
+ midgard_op_atomic_xchg = 0x60,
+
+ /* Used for a compute shader's __global arguments and __local variables
+ * (or for register spilling) */
+
+ midgard_op_ld_char = 0x81,
+ midgard_op_ld_char2 = 0x84,
+ midgard_op_ld_short = 0x85,
+ midgard_op_ld_char4 = 0x88, /* short2, int, float */
+ midgard_op_ld_short4 = 0x8C, /* int2, float2, long */
+ midgard_op_ld_int4 = 0x90, /* float4, long2 */
+
+ midgard_op_ld_attr_32 = 0x94,
+ midgard_op_ld_attr_16 = 0x95,
+ midgard_op_ld_attr_32u = 0x96,
+ midgard_op_ld_attr_32i = 0x97,
+ midgard_op_ld_vary_32 = 0x98,
+ midgard_op_ld_vary_16 = 0x99,
+ midgard_op_ld_vary_32u = 0x9A,
+ midgard_op_ld_vary_32i = 0x9B,
+ midgard_op_ld_color_buffer_16 = 0x9D,
+
+ midgard_op_ld_uniform_16 = 0xAC,
+ midgard_op_ld_uniform_32i = 0xA8,
+
+ midgard_op_ld_uniform_32 = 0xB0,
+ midgard_op_ld_color_buffer_8 = 0xBA,
+
+ midgard_op_st_char = 0xC0,
+ midgard_op_st_char2 = 0xC4, /* short */
+ midgard_op_st_char4 = 0xC8, /* short2, int, float */
+ midgard_op_st_short4 = 0xCC, /* int2, float2, long */
+ midgard_op_st_int4 = 0xD0, /* float4, long2 */
+
+ midgard_op_st_vary_32 = 0xD4,
+ midgard_op_st_vary_16 = 0xD5,
+ midgard_op_st_vary_32u = 0xD6,
+ midgard_op_st_vary_32i = 0xD7,
+
+ /* Value to st in r27, location r26.w as short2 */
+ midgard_op_st_image_f = 0xD8,
+ midgard_op_st_image_ui = 0xDA,
+ midgard_op_st_image_i = 0xDB,
+} midgard_load_store_op;
+
+typedef enum {
+ midgard_interp_centroid = 1,
+ midgard_interp_default = 2
+} midgard_interpolation;
+
+typedef enum {
+ midgard_varying_mod_none = 0,
+
+ /* Other values unknown */
+
+ /* Take the would-be result and divide all components by its z/w
+ * (perspective division baked in with the load) */
+ midgard_varying_mod_perspective_z = 2,
+ midgard_varying_mod_perspective_w = 3,
+} midgard_varying_modifier;
+
+typedef struct
+__attribute__((__packed__))
+{
+ unsigned zero0 : 1; /* Always zero */
+
+ midgard_varying_modifier modifier : 2;
+
+ unsigned zero1: 1; /* Always zero */
+
+ /* Varying qualifiers, zero if not a varying */
+ unsigned flat : 1;
+ unsigned is_varying : 1; /* Always one for varying, but maybe something else? */
+ midgard_interpolation interpolation : 2;
+
+ unsigned zero2 : 2; /* Always zero */
+}
+midgard_varying_parameter;
+
+typedef struct
+__attribute__((__packed__))
+{
+ midgard_load_store_op op : 8;
+ unsigned reg : 5;
+ unsigned mask : 4;
+ unsigned swizzle : 8;
+ unsigned unknown : 16;
+
+ unsigned varying_parameters : 10;
+
+ unsigned address : 9;
+}
+midgard_load_store_word;
+
+typedef struct
+__attribute__((__packed__))
+{
+ unsigned type : 4;
+ unsigned next_type : 4;
+ uint64_t word1 : 60;
+ uint64_t word2 : 60;
+}
+midgard_load_store;
+
+/* 8-bit register selector used in texture ops to select a bias/LOD/gradient
+ * register, shoved into the `bias` field */
+
+typedef struct
+__attribute__((__packed__))
+{
+ /* Combines with component_hi to form 2-bit component select out of
+ * xyzw, as the component for bias/LOD and the starting component of a
+ * gradient vector */
+
+ unsigned component_lo : 1;
+
+ /* Register select between r28/r29 */
+ unsigned select : 1;
+
+ /* For a half-register, selects the upper half */
+ unsigned upper : 1;
+
+ /* Specifies a full-register, clear for a half-register. Mutually
+ * exclusive with upper. */
+ unsigned full : 1;
+
+ /* Upper bit of the 2-bit component select (combines with
+ * component_lo). Always seen to be set for LOD/bias and clear
+ * for processed gradients, but I'm not sure if that's a
+ * hardware requirement. */
+ unsigned component_hi : 1;
+
+ /* Padding to make this 8-bit */
+ unsigned zero : 3;
+}
+midgard_tex_register_select;
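+
+/* Worked example (a sketch, assuming the usual low-bit-first bitfield
+ * packing): selecting r29.w as a full-width LOD register would be
+ * { .component_lo = 1, .select = 1, .upper = 0, .full = 1,
+ *   .component_hi = 1 }, which packs to 0x1B when memcpy'd into the
+ * texture word's 8-bit `bias` field. */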
+
+/* Texture pipeline results are in r28-r29 */
+#define REG_TEX_BASE 28
+
+/* Texture opcodes... maybe? */
+#define TEXTURE_OP_NORMAL 0x11 /* texture */
+#define TEXTURE_OP_LOD 0x12 /* textureLod */
+#define TEXTURE_OP_TEXEL_FETCH 0x14 /* texelFetch */
+
+enum mali_sampler_type {
+ MALI_SAMPLER_UNK = 0x0,
+ MALI_SAMPLER_FLOAT = 0x1, /* sampler */
+ MALI_SAMPLER_UNSIGNED = 0x2, /* usampler */
+ MALI_SAMPLER_SIGNED = 0x3, /* isampler */
+};
+
+typedef struct
+__attribute__((__packed__))
+{
+ unsigned type : 4;
+ unsigned next_type : 4;
+
+ unsigned op : 6;
+ unsigned shadow : 1;
+ unsigned is_gather : 1;
+
+ /* A little obscure, but last is set for the last texture operation in
+ * a shader. cont appears to just be last's opposite (?). Yeah, I know,
+ * kind of funky.. BiOpen thinks it could do with memory hinting, or
+ * tile locking? */
+
+ unsigned cont : 1;
+ unsigned last : 1;
+
+ enum mali_texture_type format : 2;
+ unsigned zero : 2;
+
+ /* Is a register used to specify the
+ * LOD/bias/offset? If set, use the `bias` field as
+ * a register index. If clear, use the `bias` field
+ * as an immediate. */
+ unsigned lod_register : 1;
+
+ /* Is a register used to specify an offset? If set, use the
+ * offset_reg_* fields to encode this, duplicated for each of the
+ * components. If clear, there is implicitly always an immediate offset
+ * specified in offset_imm_* */
+ unsigned offset_register : 1;
+
+ unsigned in_reg_full : 1;
+ unsigned in_reg_select : 1;
+ unsigned in_reg_upper : 1;
+ unsigned in_reg_swizzle : 8;
+
+ unsigned unknown8 : 2;
+
+ unsigned out_full : 1;
+
+ enum mali_sampler_type sampler_type : 2;
+
+ unsigned out_reg_select : 1;
+ unsigned out_upper : 1;
+
+ unsigned mask : 4;
+
+ unsigned unknown2 : 2;
+
+ unsigned swizzle : 8;
+ unsigned unknown4 : 8;
+
+ unsigned unknownA : 4;
+
+ /* In immediate mode, each offset field is an immediate range [0, 7].
+ *
+ * In register mode, offset_x becomes a register full / select / upper
+ * triplet and a vec3 swizzle is splattered across offset_y/offset_z in
+ * a genuinely bizarre way.
+ *
+ * For texel fetches in immediate mode, the range is the full [-8, 7],
+ * but for normal texturing the top bit must be zero and a register
+ * used instead. It's not clear where this limitation is from. */
+
+ signed offset_x : 4;
+ signed offset_y : 4;
+ signed offset_z : 4;
+
+ /* In immediate bias mode, for a normal texture op, this is
+ * texture bias, computed as int(2^8 * frac(biasf)), with
+ * bias_int = floor(bias). For a textureLod, it's that, but
+ * s/bias/lod. For a texel fetch, this is the LOD as-is.
+ *
+ * In register mode, this is a midgard_tex_register_select
+ * structure and bias_int is zero */
+
+ unsigned bias : 8;
+ signed bias_int : 8;
+
+ unsigned texture_handle : 16;
+ unsigned sampler_handle : 16;
+}
+midgard_texture_word;
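+
+/* Worked example of the immediate bias encoding above (an illustration,
+ * assuming the int(2^8 * frac(biasf)) formula is right): a bias of 1.5 on a
+ * normal texture op would be split as bias_int = 1 and
+ * bias = int(256 * 0.5) = 128 (0x80). */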
+
+#endif
--- /dev/null
+/*
+ * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <err.h>
+
+#include "main/mtypes.h"
+#include "compiler/glsl/glsl_to_nir.h"
+#include "compiler/nir_types.h"
+#include "main/imports.h"
+#include "compiler/nir/nir_builder.h"
+#include "util/half_float.h"
+#include "util/u_math.h"
+#include "util/u_debug.h"
+#include "util/u_dynarray.h"
+#include "util/list.h"
+#include "main/mtypes.h"
+
+#include "midgard.h"
+#include "midgard_nir.h"
+#include "midgard_compile.h"
+#include "midgard_ops.h"
+#include "helpers.h"
+#include "compiler.h"
+
+#include "disassemble.h"
+
+static const struct debug_named_value debug_options[] = {
+ {"msgs", MIDGARD_DBG_MSGS, "Print debug messages"},
+ {"shaders", MIDGARD_DBG_SHADERS, "Dump shaders in NIR and MIR"},
+ {"shaderdb", MIDGARD_DBG_SHADERDB, "Prints shader-db statistics"},
+ DEBUG_NAMED_VALUE_END
+};
+
+DEBUG_GET_ONCE_FLAGS_OPTION(midgard_debug, "MIDGARD_MESA_DEBUG", debug_options, 0)
+
+unsigned SHADER_DB_COUNT = 0;
+
+int midgard_debug = 0;
+
+#define DBG(fmt, ...) \
+ do { if (midgard_debug & MIDGARD_DBG_MSGS) \
+ fprintf(stderr, "%s:%d: "fmt, \
+ __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
+
+static bool
+midgard_is_branch_unit(unsigned unit)
+{
+ return (unit == ALU_ENAB_BRANCH) || (unit == ALU_ENAB_BR_COMPACT);
+}
+
+static void
+midgard_block_add_successor(midgard_block *block, midgard_block *successor)
+{
+ block->successors[block->nr_successors++] = successor;
+ assert(block->nr_successors <= ARRAY_SIZE(block->successors));
+}
+
+/* Helpers to generate midgard_instructions using macro magic, since every
+ * driver seems to do it that way */
+
+#define EMIT(op, ...) emit_mir_instruction(ctx, v_##op(__VA_ARGS__));
+
+#define M_LOAD_STORE(name, rname, uname) \
+ static midgard_instruction m_##name(unsigned ssa, unsigned address) { \
+ midgard_instruction i = { \
+ .type = TAG_LOAD_STORE_4, \
+ .mask = 0xF, \
+ .ssa_args = { \
+ .rname = ssa, \
+ .uname = -1, \
+ .src1 = -1 \
+ }, \
+ .load_store = { \
+ .op = midgard_op_##name, \
+ .swizzle = SWIZZLE_XYZW, \
+ .address = address \
+ } \
+ }; \
+ \
+ return i; \
+ }
+
+#define M_LOAD(name) M_LOAD_STORE(name, dest, src0)
+#define M_STORE(name) M_LOAD_STORE(name, src0, dest)
+
+/* Inputs a NIR ALU source, with modifiers attached if necessary, and outputs
+ * the corresponding Midgard source */
+
+static midgard_vector_alu_src
+vector_alu_modifiers(nir_alu_src *src, bool is_int, unsigned broadcast_count,
+ bool half, bool sext)
+{
+ if (!src) return blank_alu_src;
+
+ /* Figure out how many components there are so we can adjust the
+ * swizzle. Specifically we want to broadcast the last channel so
+ * things like ball2/3 work
+ */
+
+ if (broadcast_count) {
+ uint8_t last_component = src->swizzle[broadcast_count - 1];
+
+ for (unsigned c = broadcast_count; c < NIR_MAX_VEC_COMPONENTS; ++c) {
+ src->swizzle[c] = last_component;
+ }
+ }
+
+ midgard_vector_alu_src alu_src = {
+ .rep_low = 0,
+ .rep_high = 0,
+ .half = half,
+ .swizzle = SWIZZLE_FROM_ARRAY(src->swizzle)
+ };
+
+ if (is_int) {
+ alu_src.mod = midgard_int_normal;
+
+ /* Sign/zero-extend if needed */
+
+ if (half) {
+ alu_src.mod = sext ?
+ midgard_int_sign_extend
+ : midgard_int_zero_extend;
+ }
+
+ /* These should have been lowered away */
+ assert(!(src->abs || src->negate));
+ } else {
+ alu_src.mod = (src->abs << 0) | (src->negate << 1);
+ }
+
+ return alu_src;
+}
+
+/* load/store instructions have both 32-bit and 16-bit variants, depending on
+ * whether we are using vectors composed of highp or mediump. At the moment, we
+ * don't support half-floats -- this requires changes in other parts of the
+ * compiler -- therefore the 16-bit versions are commented out. */
+
+//M_LOAD(ld_attr_16);
+M_LOAD(ld_attr_32);
+//M_LOAD(ld_vary_16);
+M_LOAD(ld_vary_32);
+//M_LOAD(ld_uniform_16);
+M_LOAD(ld_uniform_32);
+M_LOAD(ld_color_buffer_8);
+//M_STORE(st_vary_16);
+M_STORE(st_vary_32);
+M_STORE(st_cubemap_coords);
+
+static midgard_instruction
+v_alu_br_compact_cond(midgard_jmp_writeout_op op, unsigned tag, signed offset, unsigned cond)
+{
+ midgard_branch_cond branch = {
+ .op = op,
+ .dest_tag = tag,
+ .offset = offset,
+ .cond = cond
+ };
+
+ uint16_t compact;
+ memcpy(&compact, &branch, sizeof(branch));
+
+ midgard_instruction ins = {
+ .type = TAG_ALU_4,
+ .unit = ALU_ENAB_BR_COMPACT,
+ .prepacked_branch = true,
+ .compact_branch = true,
+ .br_compact = compact
+ };
+
+ if (op == midgard_jmp_writeout_op_writeout)
+ ins.writeout = true;
+
+ return ins;
+}
+
+static midgard_instruction
+v_branch(bool conditional, bool invert)
+{
+ midgard_instruction ins = {
+ .type = TAG_ALU_4,
+ .unit = ALU_ENAB_BRANCH,
+ .compact_branch = true,
+ .branch = {
+ .conditional = conditional,
+ .invert_conditional = invert
+ }
+ };
+
+ return ins;
+}
+
+static midgard_branch_extended
+midgard_create_branch_extended( midgard_condition cond,
+ midgard_jmp_writeout_op op,
+ unsigned dest_tag,
+ signed quadword_offset)
+{
+ /* For unclear reasons, the condition code is repeated 8 times */
+ uint16_t duplicated_cond =
+ (cond << 14) |
+ (cond << 12) |
+ (cond << 10) |
+ (cond << 8) |
+ (cond << 6) |
+ (cond << 4) |
+ (cond << 2) |
+ (cond << 0);
+
+ midgard_branch_extended branch = {
+ .op = op,
+ .dest_tag = dest_tag,
+ .offset = quadword_offset,
+ .cond = duplicated_cond
+ };
+
+ return branch;
+}
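+
+/* For illustration: with cond = midgard_condition_true (0b10), the
+ * duplication above yields 0b1010101010101010 = 0xAAAA; with
+ * midgard_condition_always (0b11) it yields 0xFFFF. */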
+
+static void
+attach_constants(compiler_context *ctx, midgard_instruction *ins, void *constants, int name)
+{
+ ins->has_constants = true;
+ memcpy(&ins->constants, constants, 16);
+}
+
+static int
+glsl_type_size(const struct glsl_type *type, bool bindless)
+{
+ return glsl_count_attribute_slots(type, false);
+}
+
+/* Lower fdot2 to a vector multiplication followed by channel addition */
+static void
+midgard_nir_lower_fdot2_body(nir_builder *b, nir_alu_instr *alu)
+{
+ if (alu->op != nir_op_fdot2)
+ return;
+
+ b->cursor = nir_before_instr(&alu->instr);
+
+ nir_ssa_def *src0 = nir_ssa_for_alu_src(b, alu, 0);
+ nir_ssa_def *src1 = nir_ssa_for_alu_src(b, alu, 1);
+
+ nir_ssa_def *product = nir_fmul(b, src0, src1);
+
+ nir_ssa_def *sum = nir_fadd(b,
+ nir_channel(b, product, 0),
+ nir_channel(b, product, 1));
+
+ /* Replace the fdot2 with this sum */
+ nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(sum));
+}
+
+static int
+midgard_nir_sysval_for_intrinsic(nir_intrinsic_instr *instr)
+{
+ switch (instr->intrinsic) {
+ case nir_intrinsic_load_viewport_scale:
+ return PAN_SYSVAL_VIEWPORT_SCALE;
+ case nir_intrinsic_load_viewport_offset:
+ return PAN_SYSVAL_VIEWPORT_OFFSET;
+ default:
+ return -1;
+ }
+}
+
+static unsigned
+nir_dest_index(compiler_context *ctx, nir_dest *dst)
+{
+ if (dst->is_ssa)
+ return dst->ssa.index;
+ else {
+ assert(!dst->reg.indirect);
+ return ctx->func->impl->ssa_alloc + dst->reg.reg->index;
+ }
+}
+
+static int sysval_for_instr(compiler_context *ctx, nir_instr *instr,
+ unsigned *dest)
+{
+ nir_intrinsic_instr *intr;
+ nir_dest *dst = NULL;
+ nir_tex_instr *tex;
+ int sysval = -1;
+
+ switch (instr->type) {
+ case nir_instr_type_intrinsic:
+ intr = nir_instr_as_intrinsic(instr);
+ sysval = midgard_nir_sysval_for_intrinsic(intr);
+ dst = &intr->dest;
+ break;
+ case nir_instr_type_tex:
+ tex = nir_instr_as_tex(instr);
+ if (tex->op != nir_texop_txs)
+ break;
+
+ sysval = PAN_SYSVAL(TEXTURE_SIZE,
+ PAN_TXS_SYSVAL_ID(tex->texture_index,
+ nir_tex_instr_dest_size(tex) -
+ (tex->is_array ? 1 : 0),
+ tex->is_array));
+ dst = &tex->dest;
+ break;
+ default:
+ break;
+ }
+
+ if (dest && dst)
+ *dest = nir_dest_index(ctx, dst);
+
+ return sysval;
+}
+
+static void
+midgard_nir_assign_sysval_body(compiler_context *ctx, nir_instr *instr)
+{
+ int sysval;
+
+ sysval = sysval_for_instr(ctx, instr, NULL);
+ if (sysval < 0)
+ return;
+
+ /* We have a sysval load; check if it's already been assigned */
+
+ if (_mesa_hash_table_u64_search(ctx->sysval_to_id, sysval))
+ return;
+
+ /* It hasn't -- so assign it now! */
+
+ unsigned id = ctx->sysval_count++;
+ _mesa_hash_table_u64_insert(ctx->sysval_to_id, sysval, (void *) ((uintptr_t) id + 1));
+ ctx->sysvals[id] = sysval;
+}
+
+static void
+midgard_nir_assign_sysvals(compiler_context *ctx, nir_shader *shader)
+{
+ ctx->sysval_count = 0;
+
+ nir_foreach_function(function, shader) {
+ if (!function->impl) continue;
+
+ nir_foreach_block(block, function->impl) {
+ nir_foreach_instr_safe(instr, block) {
+ midgard_nir_assign_sysval_body(ctx, instr);
+ }
+ }
+ }
+}
+
+static bool
+midgard_nir_lower_fdot2(nir_shader *shader)
+{
+ bool progress = false;
+
+ nir_foreach_function(function, shader) {
+ if (!function->impl) continue;
+
+ nir_builder _b;
+ nir_builder *b = &_b;
+ nir_builder_init(b, function->impl);
+
+ nir_foreach_block(block, function->impl) {
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_alu) continue;
+
+ nir_alu_instr *alu = nir_instr_as_alu(instr);
+ midgard_nir_lower_fdot2_body(b, alu);
+
+ progress |= true;
+ }
+ }
+
+ nir_metadata_preserve(function->impl, nir_metadata_block_index | nir_metadata_dominance);
+
+ }
+
+ return progress;
+}
+
+/* Flushes undefined values to zero */
+
+static void
+optimise_nir(nir_shader *nir)
+{
+ bool progress;
+ unsigned lower_flrp =
+ (nir->options->lower_flrp16 ? 16 : 0) |
+ (nir->options->lower_flrp32 ? 32 : 0) |
+ (nir->options->lower_flrp64 ? 64 : 0);
+
+ NIR_PASS(progress, nir, nir_lower_regs_to_ssa);
+ NIR_PASS(progress, nir, midgard_nir_lower_fdot2);
+ NIR_PASS(progress, nir, nir_lower_idiv);
+
+ nir_lower_tex_options lower_tex_1st_pass_options = {
+ .lower_rect = true,
+ .lower_txp = ~0
+ };
+
+ nir_lower_tex_options lower_tex_2nd_pass_options = {
+ .lower_txs_lod = true,
+ };
+
+ NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_1st_pass_options);
+ NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_2nd_pass_options);
+
+ do {
+ progress = false;
+
+ NIR_PASS(progress, nir, nir_lower_var_copies);
+ NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
+
+ NIR_PASS(progress, nir, nir_copy_prop);
+ NIR_PASS(progress, nir, nir_opt_dce);
+ NIR_PASS(progress, nir, nir_opt_dead_cf);
+ NIR_PASS(progress, nir, nir_opt_cse);
+ NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
+ NIR_PASS(progress, nir, nir_opt_algebraic);
+ NIR_PASS(progress, nir, nir_opt_constant_folding);
+
+ if (lower_flrp != 0) {
+ bool lower_flrp_progress = false;
+ NIR_PASS(lower_flrp_progress,
+ nir,
+ nir_lower_flrp,
+ lower_flrp,
+ false /* always_precise */,
+ nir->options->lower_ffma);
+ if (lower_flrp_progress) {
+ NIR_PASS(progress, nir,
+ nir_opt_constant_folding);
+ progress = true;
+ }
+
+ /* Nothing should rematerialize any flrps, so we only
+ * need to do this lowering once.
+ */
+ lower_flrp = 0;
+ }
+
+ NIR_PASS(progress, nir, nir_opt_undef);
+ NIR_PASS(progress, nir, nir_undef_to_zero);
+
+ NIR_PASS(progress, nir, nir_opt_loop_unroll,
+ nir_var_shader_in |
+ nir_var_shader_out |
+ nir_var_function_temp);
+
+ NIR_PASS(progress, nir, nir_opt_vectorize);
+ } while (progress);
+
+ /* Must be run at the end to prevent creation of fsin/fcos ops */
+ NIR_PASS(progress, nir, midgard_nir_scale_trig);
+
+ do {
+ progress = false;
+
+ NIR_PASS(progress, nir, nir_opt_dce);
+ NIR_PASS(progress, nir, nir_opt_algebraic);
+ NIR_PASS(progress, nir, nir_opt_constant_folding);
+ NIR_PASS(progress, nir, nir_copy_prop);
+ } while (progress);
+
+ NIR_PASS(progress, nir, nir_opt_algebraic_late);
+
+ /* We implement booleans as 32-bit 0/~0 */
+ NIR_PASS(progress, nir, nir_lower_bool_to_int32);
+
+ /* Now that booleans are lowered, we can run out late opts */
+ NIR_PASS(progress, nir, midgard_nir_lower_algebraic_late);
+
+ /* Lower mods for float ops only. Integer ops don't support modifiers
+ * (saturate doesn't make sense on integers, neg/abs require dedicated
+ * instructions) */
+
+ NIR_PASS(progress, nir, nir_lower_to_source_mods, nir_lower_float_source_mods);
+ NIR_PASS(progress, nir, nir_copy_prop);
+ NIR_PASS(progress, nir, nir_opt_dce);
+
+ /* Take us out of SSA */
+ NIR_PASS(progress, nir, nir_lower_locals_to_regs);
+ NIR_PASS(progress, nir, nir_convert_from_ssa, true);
+
+ /* We are a vector architecture; write combine where possible */
+ NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest);
+ NIR_PASS(progress, nir, nir_lower_vec_to_movs);
+
+ NIR_PASS(progress, nir, nir_opt_dce);
+}
+
+/* Front-half of aliasing the SSA slots, merely by inserting the flag in the
+ * appropriate hash table. Intentional off-by-one to avoid confusing NULL with
+ * r0. See the comments in compiler_context */
+
+static void
+alias_ssa(compiler_context *ctx, int dest, int src)
+{
+ _mesa_hash_table_u64_insert(ctx->ssa_to_alias, dest + 1, (void *) ((uintptr_t) src + 1));
+ _mesa_set_add(ctx->leftover_ssa_to_alias, (void *) (uintptr_t) (dest + 1));
+}
+
+/* ...or undo it, after which the original index will be used (dummy move should be emitted alongside this) */
+
+static void
+unalias_ssa(compiler_context *ctx, int dest)
+{
+ _mesa_hash_table_u64_remove(ctx->ssa_to_alias, dest + 1);
+ /* TODO: Remove from leftover or no? */
+}
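+
+/* For example, alias_ssa(ctx, 0, SSA_FIXED_REGISTER(23)) stores the alias
+ * under key 1; the +1 keeps a legitimate SSA index 0 from being confused
+ * with the NULL (0) returned by a failed hash table search. */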
+
+/* Do not actually emit a load; instead, cache the constant for inlining */
+
+static void
+emit_load_const(compiler_context *ctx, nir_load_const_instr *instr)
+{
+ nir_ssa_def def = instr->def;
+
+ float *v = rzalloc_array(NULL, float, 4);
+ nir_const_load_to_arr(v, instr, f32);
+ _mesa_hash_table_u64_insert(ctx->ssa_constants, def.index + 1, v);
+}
+
+static unsigned
+nir_src_index(compiler_context *ctx, nir_src *src)
+{
+ if (src->is_ssa)
+ return src->ssa->index;
+ else {
+ assert(!src->reg.indirect);
+ return ctx->func->impl->ssa_alloc + src->reg.reg->index;
+ }
+}
+
+static unsigned
+nir_alu_src_index(compiler_context *ctx, nir_alu_src *src)
+{
+ return nir_src_index(ctx, &src->src);
+}
+
+static bool
+nir_is_non_scalar_swizzle(nir_alu_src *src, unsigned nr_components)
+{
+ unsigned comp = src->swizzle[0];
+
+ for (unsigned c = 1; c < nr_components; ++c) {
+ if (src->swizzle[c] != comp)
+ return true;
+ }
+
+ return false;
+}
+
+/* Midgard puts scalar conditionals in r31.w; move an arbitrary source (the
+ * output of a conditional test) into that register */
+
+static void
+emit_condition(compiler_context *ctx, nir_src *src, bool for_branch, unsigned component)
+{
+ int condition = nir_src_index(ctx, src);
+
+ /* Source to swizzle the desired component into w */
+
+ const midgard_vector_alu_src alu_src = {
+ .swizzle = SWIZZLE(component, component, component, component),
+ };
+
+ /* There is no boolean move instruction. Instead, we simulate a move by
+ * ANDing the condition with itself to get it into r31.w */
+
+ midgard_instruction ins = {
+ .type = TAG_ALU_4,
+
+ /* We need to set the conditional as close as possible */
+ .precede_break = true,
+ .unit = for_branch ? UNIT_SMUL : UNIT_SADD,
+ .mask = 1 << COMPONENT_W,
+
+ .ssa_args = {
+ .src0 = condition,
+ .src1 = condition,
+ .dest = SSA_FIXED_REGISTER(31),
+ },
+
+ .alu = {
+ .op = midgard_alu_op_iand,
+ .outmod = midgard_outmod_int_wrap,
+ .reg_mode = midgard_reg_mode_32,
+ .dest_override = midgard_dest_override_none,
+ .src1 = vector_alu_srco_unsigned(alu_src),
+ .src2 = vector_alu_srco_unsigned(alu_src)
+ },
+ };
+
+ emit_mir_instruction(ctx, ins);
+}
+
+/* Or, for mixed conditions (with csel_v), here's a vector version using all of
+ * r31 instead */
+
+static void
+emit_condition_mixed(compiler_context *ctx, nir_alu_src *src, unsigned nr_comp)
+{
+ int condition = nir_src_index(ctx, &src->src);
+
+ /* Source to swizzle the desired component into w */
+
+ const midgard_vector_alu_src alu_src = {
+ .swizzle = SWIZZLE_FROM_ARRAY(src->swizzle),
+ };
+
+ /* There is no boolean move instruction. Instead, we simulate a move by
+ * ANDing the condition with itself to get it into r31.w */
+
+ midgard_instruction ins = {
+ .type = TAG_ALU_4,
+ .precede_break = true,
+ .mask = mask_of(nr_comp),
+ .ssa_args = {
+ .src0 = condition,
+ .src1 = condition,
+ .dest = SSA_FIXED_REGISTER(31),
+ },
+ .alu = {
+ .op = midgard_alu_op_iand,
+ .outmod = midgard_outmod_int_wrap,
+ .reg_mode = midgard_reg_mode_32,
+ .dest_override = midgard_dest_override_none,
+ .src1 = vector_alu_srco_unsigned(alu_src),
+ .src2 = vector_alu_srco_unsigned(alu_src)
+ },
+ };
+
+ emit_mir_instruction(ctx, ins);
+}
+
+
+
+/* Likewise, indirect offsets are put in r27.w. TODO: Allow componentwise
+ * pinning to eliminate this move in all known cases */
+
+static void
+emit_indirect_offset(compiler_context *ctx, nir_src *src)
+{
+ int offset = nir_src_index(ctx, src);
+
+ midgard_instruction ins = {
+ .type = TAG_ALU_4,
+ .mask = 1 << COMPONENT_W,
+ .ssa_args = {
+ .src0 = SSA_UNUSED_1,
+ .src1 = offset,
+ .dest = SSA_FIXED_REGISTER(REGISTER_OFFSET),
+ },
+ .alu = {
+ .op = midgard_alu_op_imov,
+ .outmod = midgard_outmod_int_wrap,
+ .reg_mode = midgard_reg_mode_32,
+ .dest_override = midgard_dest_override_none,
+ .src1 = vector_alu_srco_unsigned(zero_alu_src),
+ .src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx)
+ },
+ };
+
+ emit_mir_instruction(ctx, ins);
+}
+
+#define ALU_CASE(nir, _op) \
+ case nir_op_##nir: \
+ op = midgard_alu_op_##_op; \
+ assert(src_bitsize == dst_bitsize); \
+ break;
+
+#define ALU_CASE_BCAST(nir, _op, count) \
+ case nir_op_##nir: \
+ op = midgard_alu_op_##_op; \
+ broadcast_swizzle = count; \
+ assert(src_bitsize == dst_bitsize); \
+ break;
+static bool
+nir_is_fzero_constant(nir_src src)
+{
+ if (!nir_src_is_const(src))
+ return false;
+
+ for (unsigned c = 0; c < nir_src_num_components(src); ++c) {
+ if (nir_src_comp_as_float(src, c) != 0.0)
+ return false;
+ }
+
+ return true;
+}
+
+/* Analyze the sizes of the inputs to determine which reg mode. Ops needed
+ * special treatment override this anyway. */
+
+static midgard_reg_mode
+reg_mode_for_nir(nir_alu_instr *instr)
+{
+ unsigned src_bitsize = nir_src_bit_size(instr->src[0].src);
+
+ switch (src_bitsize) {
+ case 8:
+ return midgard_reg_mode_8;
+ case 16:
+ return midgard_reg_mode_16;
+ case 32:
+ return midgard_reg_mode_32;
+ case 64:
+ return midgard_reg_mode_64;
+ default:
+ unreachable("Invalid bit size");
+ }
+}
+
+static void
+emit_alu(compiler_context *ctx, nir_alu_instr *instr)
+{
+ bool is_ssa = instr->dest.dest.is_ssa;
+
+ unsigned dest = nir_dest_index(ctx, &instr->dest.dest);
+ unsigned nr_components = nir_dest_num_components(instr->dest.dest);
+ unsigned nr_inputs = nir_op_infos[instr->op].num_inputs;
+
+ /* Most Midgard ALU ops have a 1:1 correspondence to NIR ops; these are
+ * supported. A few do not and are commented out for now. There are also a
+ * number of NIR ops which Midgard does not support and which need to be
+ * lowered (also TODO). This switch block picks the opcode and calling
+ * convention of the Midgard instruction; the actual packing happens
+ * further down in this function */
+
+ unsigned op;
+
+ /* Number of components valid to check for the instruction (the rest
+ * will be forced to the last), or 0 to use as-is. Relevant as
+ * ball-type instructions have a channel count in NIR but are all vec4
+ * in Midgard */
+
+ unsigned broadcast_swizzle = 0;
+
+ /* What register mode should we operate in? */
+ midgard_reg_mode reg_mode =
+ reg_mode_for_nir(instr);
+
+ /* Do we need a destination override? Used for inline
+ * type conversion */
+
+ midgard_dest_override dest_override =
+ midgard_dest_override_none;
+
+ /* Should we use a smaller respective source and sign-extend? */
+
+ bool half_1 = false, sext_1 = false;
+ bool half_2 = false, sext_2 = false;
+
+ unsigned src_bitsize = nir_src_bit_size(instr->src[0].src);
+ unsigned dst_bitsize = nir_dest_bit_size(instr->dest.dest);
+
+ switch (instr->op) {
+ ALU_CASE(fadd, fadd);
+ ALU_CASE(fmul, fmul);
+ ALU_CASE(fmin, fmin);
+ ALU_CASE(fmax, fmax);
+ ALU_CASE(imin, imin);
+ ALU_CASE(imax, imax);
+ ALU_CASE(umin, umin);
+ ALU_CASE(umax, umax);
+ ALU_CASE(ffloor, ffloor);
+ ALU_CASE(fround_even, froundeven);
+ ALU_CASE(ftrunc, ftrunc);
+ ALU_CASE(fceil, fceil);
+ ALU_CASE(fdot3, fdot3);
+ ALU_CASE(fdot4, fdot4);
+ ALU_CASE(iadd, iadd);
+ ALU_CASE(isub, isub);
+ ALU_CASE(imul, imul);
+
+ /* Zero shoved as second-arg */
+ ALU_CASE(iabs, iabsdiff);
+
+ ALU_CASE(mov, imov);
+
+ ALU_CASE(feq32, feq);
+ ALU_CASE(fne32, fne);
+ ALU_CASE(flt32, flt);
+ ALU_CASE(ieq32, ieq);
+ ALU_CASE(ine32, ine);
+ ALU_CASE(ilt32, ilt);
+ ALU_CASE(ult32, ult);
+
+ /* We don't have a native b2f32 instruction. Instead, like many
+ * GPUs, we exploit booleans as 0/~0 for false/true, and
+ * correspondingly AND
+ * by 1.0 to do the type conversion. For the moment, prime us
+ * to emit:
+ *
+ * iand [whatever], #0
+ *
+ * At the end of emit_alu (as MIR), we'll fix-up the constant
+ */
+
+ ALU_CASE(b2f32, iand);
+ ALU_CASE(b2i32, iand);
+
+ /* Likewise, we don't have a dedicated f2b32 instruction, but
+ * we can do a "not equal to 0.0" test. */
+
+ ALU_CASE(f2b32, fne);
+ ALU_CASE(i2b32, ine);
+
+ ALU_CASE(frcp, frcp);
+ ALU_CASE(frsq, frsqrt);
+ ALU_CASE(fsqrt, fsqrt);
+ ALU_CASE(fexp2, fexp2);
+ ALU_CASE(flog2, flog2);
+
+ ALU_CASE(f2i32, f2i_rtz);
+ ALU_CASE(f2u32, f2u_rtz);
+ ALU_CASE(i2f32, i2f_rtz);
+ ALU_CASE(u2f32, u2f_rtz);
+
+ ALU_CASE(f2i16, f2i_rtz);
+ ALU_CASE(f2u16, f2u_rtz);
+ ALU_CASE(i2f16, i2f_rtz);
+ ALU_CASE(u2f16, u2f_rtz);
+
+ ALU_CASE(fsin, fsin);
+ ALU_CASE(fcos, fcos);
+
+ /* Second op implicit #0 */
+ ALU_CASE(inot, inor);
+ ALU_CASE(iand, iand);
+ ALU_CASE(ior, ior);
+ ALU_CASE(ixor, ixor);
+ ALU_CASE(ishl, ishl);
+ ALU_CASE(ishr, iasr);
+ ALU_CASE(ushr, ilsr);
+
+ ALU_CASE_BCAST(b32all_fequal2, fball_eq, 2);
+ ALU_CASE_BCAST(b32all_fequal3, fball_eq, 3);
+ ALU_CASE(b32all_fequal4, fball_eq);
+
+ ALU_CASE_BCAST(b32any_fnequal2, fbany_neq, 2);
+ ALU_CASE_BCAST(b32any_fnequal3, fbany_neq, 3);
+ ALU_CASE(b32any_fnequal4, fbany_neq);
+
+ ALU_CASE_BCAST(b32all_iequal2, iball_eq, 2);
+ ALU_CASE_BCAST(b32all_iequal3, iball_eq, 3);
+ ALU_CASE(b32all_iequal4, iball_eq);
+
+ ALU_CASE_BCAST(b32any_inequal2, ibany_neq, 2);
+ ALU_CASE_BCAST(b32any_inequal3, ibany_neq, 3);
+ ALU_CASE(b32any_inequal4, ibany_neq);
+
+ /* Source mods will be shoved in later */
+ ALU_CASE(fabs, fmov);
+ ALU_CASE(fneg, fmov);
+ ALU_CASE(fsat, fmov);
+
+ /* For size conversion, we use a move. Ideally though we would squash
+ * these ops together; maybe that has to happen after in NIR as part of
+ * propagation...? An earlier algebraic pass ensured we step down by
+ * only / exactly one size. If stepping down, we use a dest override to
+ * reduce the size; if stepping up, we use a larger-sized move with a
+ * half source and a sign/zero-extension modifier */
+
+ case nir_op_i2i8:
+ case nir_op_i2i16:
+ case nir_op_i2i32:
+ /* If we end up upscaling, we'll need a sign-extend on the
+ * operand (the second argument) */
+
+ sext_2 = true;
+ /* fallthrough */
+ case nir_op_u2u8:
+ case nir_op_u2u16:
+ case nir_op_u2u32: {
+ op = midgard_alu_op_imov;
+
+ if (dst_bitsize == (src_bitsize * 2)) {
+ /* Converting up */
+ half_2 = true;
+
+ /* Use a greater register mode */
+ reg_mode++;
+ } else if (src_bitsize == (dst_bitsize * 2)) {
+ /* Converting down */
+ dest_override = midgard_dest_override_lower;
+ }
+
+ break;
+ }
+
+ case nir_op_f2f16: {
+ assert(src_bitsize == 32);
+
+ op = midgard_alu_op_fmov;
+ dest_override = midgard_dest_override_lower;
+ break;
+ }
+
+ case nir_op_f2f32: {
+ assert(src_bitsize == 16);
+
+ op = midgard_alu_op_fmov;
+ half_2 = true;
+ reg_mode++;
+ break;
+ }
+
+
+ /* For greater-or-equal, we lower to less-or-equal and flip the
+ * arguments */
+
+ case nir_op_fge:
+ case nir_op_fge32:
+ case nir_op_ige32:
+ case nir_op_uge32: {
+ op =
+ instr->op == nir_op_fge ? midgard_alu_op_fle :
+ instr->op == nir_op_fge32 ? midgard_alu_op_fle :
+ instr->op == nir_op_ige32 ? midgard_alu_op_ile :
+ instr->op == nir_op_uge32 ? midgard_alu_op_ule :
+ 0;
+
+ /* Swap via temporary */
+ nir_alu_src temp = instr->src[1];
+ instr->src[1] = instr->src[0];
+ instr->src[0] = temp;
+
+ break;
+ }
+
+ case nir_op_b32csel: {
+ /* Midgard features both fcsel and icsel, depending on
+ * the type of the arguments/output. However, as long
+ * as we're careful we can _always_ use icsel and
+ * _never_ need fcsel, since the latter does additional
+ * floating-point-specific processing whereas the
+ * former just moves bits on the wire. It's not obvious
+ * why these are separate opcodes, save for the ability
+ * to do things like sat/pos/abs/neg for free */
+
+ bool mixed = nir_is_non_scalar_swizzle(&instr->src[0], nr_components);
+ op = mixed ? midgard_alu_op_icsel_v : midgard_alu_op_icsel;
+
+ /* csel works as a two-arg in Midgard, since the condition is hardcoded in r31.w */
+ nr_inputs = 2;
+
+ /* Emit the condition into r31 */
+
+ if (mixed)
+ emit_condition_mixed(ctx, &instr->src[0], nr_components);
+ else
+ emit_condition(ctx, &instr->src[0].src, false, instr->src[0].swizzle[0]);
+
+ /* The condition is the first argument; move the other
+ * arguments up one to be a binary instruction for
+ * Midgard */
+
+ memmove(instr->src, instr->src + 1, 2 * sizeof(nir_alu_src));
+ break;
+ }
+
+ default:
+ DBG("Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
+ assert(0);
+ return;
+ }
+
+ /* Midgard can perform certain modifiers on output of an ALU op */
+ unsigned outmod;
+
+ if (midgard_is_integer_out_op(op)) {
+ outmod = midgard_outmod_int_wrap;
+ } else {
+ bool sat = instr->dest.saturate || instr->op == nir_op_fsat;
+ outmod = sat ? midgard_outmod_sat : midgard_outmod_none;
+ }
+
+ /* fmax(a, 0.0) can turn into a .pos modifier as an optimization */
+
+ if (instr->op == nir_op_fmax) {
+ if (nir_is_fzero_constant(instr->src[0].src)) {
+ op = midgard_alu_op_fmov;
+ nr_inputs = 1;
+ outmod = midgard_outmod_pos;
+ instr->src[0] = instr->src[1];
+ } else if (nir_is_fzero_constant(instr->src[1].src)) {
+ op = midgard_alu_op_fmov;
+ nr_inputs = 1;
+ outmod = midgard_outmod_pos;
+ }
+ }
+
+ /* Fetch unit, quirks, etc information */
+ unsigned opcode_props = alu_opcode_props[op].props;
+ bool quirk_flipped_r24 = opcode_props & QUIRK_FLIPPED_R24;
+
+ /* src0 will always exist afaik, but src1 will not for 1-argument
+ * instructions. The latter can only be fetched if the instruction
+ * needs it, or else we may segfault. */
+
+ unsigned src0 = nir_alu_src_index(ctx, &instr->src[0]);
+ unsigned src1 = nr_inputs == 2 ? nir_alu_src_index(ctx, &instr->src[1]) : SSA_UNUSED_0;
+
+ /* Rather than use the instruction generation helpers, we do it
+ * ourselves here to avoid the mess */
+
+ midgard_instruction ins = {
+ .type = TAG_ALU_4,
+ .ssa_args = {
+ .src0 = quirk_flipped_r24 ? SSA_UNUSED_1 : src0,
+ .src1 = quirk_flipped_r24 ? src0 : src1,
+ .dest = dest,
+ }
+ };
+
+ nir_alu_src *nirmods[2] = { NULL };
+
+ if (nr_inputs == 2) {
+ nirmods[0] = &instr->src[0];
+ nirmods[1] = &instr->src[1];
+ } else if (nr_inputs == 1) {
+ nirmods[quirk_flipped_r24] = &instr->src[0];
+ } else {
+ assert(0);
+ }
+
+ /* These were lowered to a move, so apply the corresponding mod */
+
+ if (instr->op == nir_op_fneg || instr->op == nir_op_fabs) {
+ nir_alu_src *s = nirmods[quirk_flipped_r24];
+
+ if (instr->op == nir_op_fneg)
+ s->negate = !s->negate;
+
+ if (instr->op == nir_op_fabs)
+ s->abs = !s->abs;
+ }
+
+ bool is_int = midgard_is_integer_op(op);
+
+ ins.mask = mask_of(nr_components);
+
+ midgard_vector_alu alu = {
+ .op = op,
+ .reg_mode = reg_mode,
+ .dest_override = dest_override,
+ .outmod = outmod,
+
+ .src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[0], is_int, broadcast_swizzle, half_1, sext_1)),
+ .src2 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[1], is_int, broadcast_swizzle, half_2, sext_2)),
+ };
+
+ /* Apply writemask if non-SSA, keeping in mind that we can't write to components that don't exist */
+
+ if (!is_ssa)
+ ins.mask &= instr->dest.write_mask;
+
+ ins.alu = alu;
+
+ /* Late fixup for emulated instructions */
+
+ if (instr->op == nir_op_b2f32 || instr->op == nir_op_b2i32) {
+ /* Presently, our second argument is an inline #0 constant.
+ * Switch over to an embedded 1.0 constant (that can't fit
+ * inline, since we're 32-bit, not 16-bit like the inline
+ * constants) */
+
+ ins.ssa_args.inline_constant = false;
+ ins.ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
+ ins.has_constants = true;
+
+ if (instr->op == nir_op_b2f32) {
+ ins.constants[0] = 1.0f;
+ } else {
+ /* Type pun it into place */
+ uint32_t one = 0x1;
+ memcpy(&ins.constants[0], &one, sizeof(uint32_t));
+ }
+
+ ins.alu.src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx);
+ } else if (nr_inputs == 1 && !quirk_flipped_r24) {
+ /* Lots of instructions need a 0 plonked in */
+ ins.ssa_args.inline_constant = false;
+ ins.ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
+ ins.has_constants = true;
+ ins.constants[0] = 0.0f;
+ ins.alu.src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx);
+ } else if (instr->op == nir_op_inot) {
+ /* ~b = ~(b & b), so duplicate the source */
+ ins.ssa_args.src1 = ins.ssa_args.src0;
+ ins.alu.src2 = ins.alu.src1;
+ }
+
+ if ((opcode_props & UNITS_ALL) == UNIT_VLUT) {
+ /* To avoid duplicating the lookup tables (probably), true LUT
+ * instructions can only operate as if they were scalars. Lower
+ * them here by changing the component. */
+
+ uint8_t original_swizzle[4];
+ memcpy(original_swizzle, nirmods[0]->swizzle, sizeof(nirmods[0]->swizzle));
+ unsigned orig_mask = ins.mask;
+
+ for (int i = 0; i < nr_components; ++i) {
+ /* Mask the associated component, dropping the
+ * instruction if needed */
+
+ ins.mask = 1 << i;
+ ins.mask &= orig_mask;
+
+ if (!ins.mask)
+ continue;
+
+ for (int j = 0; j < 4; ++j)
+ nirmods[0]->swizzle[j] = original_swizzle[i]; /* Pull from the correct component */
+
+ ins.alu.src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[0], is_int, broadcast_swizzle, half_1, false));
+ emit_mir_instruction(ctx, ins);
+ }
+ } else {
+ emit_mir_instruction(ctx, ins);
+ }
+}
+
+#undef ALU_CASE
+
+/* Uniforms and UBOs use a shared code path, as uniforms are just (slightly
+ * optimized) versions of UBO #0 */
+
+static void
+emit_ubo_read(
+ compiler_context *ctx,
+ unsigned dest,
+ unsigned offset,
+ nir_src *indirect_offset,
+ unsigned index)
+{
+ /* TODO: half-floats */
+
+ if (!indirect_offset && offset < ctx->uniform_cutoff && index == 0) {
+ /* Fast path: For the first 16 uniforms, direct accesses are
+ * 0-cycle, since they're just a register fetch in the usual
+ * case. So, we alias the registers while we're still in
+ * SSA-space */
+
+ int reg_slot = 23 - offset;
+ alias_ssa(ctx, dest, SSA_FIXED_REGISTER(reg_slot));
+ } else {
+ /* Otherwise, read from the 'special' UBO to access
+ * higher-indexed uniforms, at a performance cost. More
+ * generally, we're emitting a UBO read instruction. */
+
+ midgard_instruction ins = m_ld_uniform_32(dest, offset);
+
+ /* TODO: Don't split */
+ ins.load_store.varying_parameters = (offset & 7) << 7;
+ ins.load_store.address = offset >> 3;
+
+ if (indirect_offset) {
+ emit_indirect_offset(ctx, indirect_offset);
+ ins.load_store.unknown = 0x8700 | index; /* xxx: what is this? */
+ } else {
+ ins.load_store.unknown = 0x1E00 | index; /* xxx: what is this? */
+ }
+
+ /* TODO respect index */
+
+ emit_mir_instruction(ctx, ins);
+ }
+}
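+
+/* A quick example of the split above (a sketch of my understanding): for a
+ * direct read at offset 19 from UBO #0, varying_parameters gets
+ * (19 & 7) << 7 = 0x180 and address gets 19 >> 3 = 2, i.e. the low three
+ * bits of the uniform index ride in varying_parameters while the rest go
+ * in the 9-bit address field. */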
+
+static void
+emit_varying_read(
+ compiler_context *ctx,
+ unsigned dest, unsigned offset,
+ unsigned nr_comp, unsigned component,
+ nir_src *indirect_offset, nir_alu_type type)
+{
+ /* XXX: Half-floats? */
+ /* TODO: swizzle, mask */
+
+ midgard_instruction ins = m_ld_vary_32(dest, offset);
+ ins.mask = mask_of(nr_comp);
+ ins.load_store.swizzle = SWIZZLE_XYZW >> (2 * component);
+
+ midgard_varying_parameter p = {
+ .is_varying = 1,
+ .interpolation = midgard_interp_default,
+ .flat = /*var->data.interpolation == INTERP_MODE_FLAT*/ 0
+ };
+
+ unsigned u;
+ memcpy(&u, &p, sizeof(p));
+ ins.load_store.varying_parameters = u;
+
+ if (indirect_offset) {
+ /* We need to add in the dynamic index, moved to r27.w */
+ emit_indirect_offset(ctx, indirect_offset);
+ ins.load_store.unknown = 0x79e; /* xxx: what is this? */
+ } else {
+ /* Just a direct load */
+ ins.load_store.unknown = 0x1e9e; /* xxx: what is this? */
+ }
+
+ /* Use the type appropriate load */
+ switch (type) {
+ case nir_type_uint:
+ case nir_type_bool:
+ ins.load_store.op = midgard_op_ld_vary_32u;
+ break;
+ case nir_type_int:
+ ins.load_store.op = midgard_op_ld_vary_32i;
+ break;
+ case nir_type_float:
+ ins.load_store.op = midgard_op_ld_vary_32;
+ break;
+ default:
+ unreachable("Attempted to load unknown type");
+ break;
+ }
+
+ emit_mir_instruction(ctx, ins);
+}
+
+static void
+emit_sysval_read(compiler_context *ctx, nir_instr *instr)
+{
+ unsigned dest;
+ /* Figure out which uniform this is */
+ int sysval = sysval_for_instr(ctx, instr, &dest);
+ void *val = _mesa_hash_table_u64_search(ctx->sysval_to_id, sysval);
+
+ /* Sysvals are prefix uniforms */
+ unsigned uniform = ((uintptr_t) val) - 1;
+
+ /* Emit the read itself -- this is never indirect */
+ emit_ubo_read(ctx, dest, uniform, NULL, 0);
+}
+
+static void
+emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
+{
+ unsigned offset = 0, reg;
+
+ switch (instr->intrinsic) {
+ case nir_intrinsic_discard_if:
+ emit_condition(ctx, &instr->src[0], true, COMPONENT_X);
+
+ /* fallthrough */
+
+ case nir_intrinsic_discard: {
+ bool conditional = instr->intrinsic == nir_intrinsic_discard_if;
+ struct midgard_instruction discard = v_branch(conditional, false);
+ discard.branch.target_type = TARGET_DISCARD;
+ emit_mir_instruction(ctx, discard);
+
+ ctx->can_discard = true;
+ break;
+ }
+
+ case nir_intrinsic_load_uniform:
+ case nir_intrinsic_load_ubo:
+ case nir_intrinsic_load_input: {
+ bool is_uniform = instr->intrinsic == nir_intrinsic_load_uniform;
+ bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo;
+
+ /* Get the base type of the intrinsic */
+ /* TODO: Infer type? Does it matter? */
+ nir_alu_type t =
+ is_ubo ? nir_type_uint : nir_intrinsic_type(instr);
+ t = nir_alu_type_get_base_type(t);
+
+ if (!is_ubo) {
+ offset = nir_intrinsic_base(instr);
+ }
+
+ unsigned nr_comp = nir_intrinsic_dest_components(instr);
+
+ nir_src *src_offset = nir_get_io_offset_src(instr);
+
+ bool direct = nir_src_is_const(*src_offset);
+
+ if (direct)
+ offset += nir_src_as_uint(*src_offset);
+
+ /* We may need to apply a fractional offset */
+ int component = instr->intrinsic == nir_intrinsic_load_input ?
+ nir_intrinsic_component(instr) : 0;
+ reg = nir_dest_index(ctx, &instr->dest);
+
+ if (is_uniform && !ctx->is_blend) {
+ emit_ubo_read(ctx, reg, ctx->sysval_count + offset, !direct ? &instr->src[0] : NULL, 0);
+ } else if (is_ubo) {
+ nir_src index = instr->src[0];
+
+ /* We don't yet support indirect UBOs. For indirect
+ * block numbers (if that's even possible), we don't
+ * know enough about the hardware yet. For indirect
+ * sources, we know what is needed, but we still need
+ * NIR support for lowering correctly with respect to
+ * 128-bit reads */
+
+ assert(nir_src_is_const(index));
+ assert(nir_src_is_const(*src_offset));
+
+ /* TODO: Alignment */
+ assert((offset & 0xF) == 0);
+
+ uint32_t uindex = nir_src_as_uint(index) + 1;
+ emit_ubo_read(ctx, reg, offset / 16, NULL, uindex);
+ } else if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->is_blend) {
+ emit_varying_read(ctx, reg, offset, nr_comp, component, !direct ? &instr->src[0] : NULL, t);
+ } else if (ctx->is_blend) {
+ /* For blend shaders, load the input color, which is
+ * preloaded to r0 */
+
+ midgard_instruction move = v_mov(reg, blank_alu_src, SSA_FIXED_REGISTER(0));
+ emit_mir_instruction(ctx, move);
+ } else if (ctx->stage == MESA_SHADER_VERTEX) {
+ midgard_instruction ins = m_ld_attr_32(reg, offset);
+ ins.load_store.unknown = 0x1E1E; /* XXX: What is this? */
+ ins.mask = mask_of(nr_comp);
+
+ /* Use the type appropriate load */
+ switch (t) {
+ case nir_type_uint:
+ case nir_type_bool:
+ ins.load_store.op = midgard_op_ld_attr_32u;
+ break;
+ case nir_type_int:
+ ins.load_store.op = midgard_op_ld_attr_32i;
+ break;
+ case nir_type_float:
+ ins.load_store.op = midgard_op_ld_attr_32;
+ break;
+ default:
+ unreachable("Attempted to load unknown type");
+ break;
+ }
+
+ emit_mir_instruction(ctx, ins);
+ } else {
+ DBG("Unknown load\n");
+ assert(0);
+ }
+
+ break;
+ }
+
+ /* Reads 128-bit value raw off the tilebuffer during blending, tasty */
+
+ case nir_intrinsic_load_raw_output_pan:
+ reg = nir_dest_index(ctx, &instr->dest);
+ assert(ctx->is_blend);
+
+ midgard_instruction ins = m_ld_color_buffer_8(reg, 0);
+ emit_mir_instruction(ctx, ins);
+ break;
+
+ case nir_intrinsic_load_blend_const_color_rgba: {
+ assert(ctx->is_blend);
+ reg = nir_dest_index(ctx, &instr->dest);
+
+ /* Blend constants are embedded directly in the shader and
+ * patched in, so we use some magic routing */
+
+ midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, reg);
+ ins.has_constants = true;
+ ins.has_blend_constant = true;
+ emit_mir_instruction(ctx, ins);
+ break;
+ }
+
+ case nir_intrinsic_store_output:
+ assert(nir_src_is_const(instr->src[1]) && "no indirect outputs");
+
+ offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[1]);
+
+ reg = nir_src_index(ctx, &instr->src[0]);
+
+ if (ctx->stage == MESA_SHADER_FRAGMENT) {
+ /* gl_FragColor is not emitted with load/store
+ * instructions. Instead, it gets plonked into
+ * r0 at the end of the shader and we do the
+ * framebuffer writeout dance. TODO: Defer
+ * writes */
+
+ midgard_instruction move = v_mov(reg, blank_alu_src, SSA_FIXED_REGISTER(0));
+ emit_mir_instruction(ctx, move);
+
+ /* Save the index we're writing to for later reference
+ * in the epilogue */
+
+ ctx->fragment_output = reg;
+ } else if (ctx->stage == MESA_SHADER_VERTEX) {
+ /* Varyings are written into one of two special
+ * varying registers, r26 or r27. The register itself is
+ * selected as the register in the st_vary instruction,
+ * minus the base of 26. E.g. write into r27 and then
+ * call st_vary(1) */
+
+ midgard_instruction ins = v_mov(reg, blank_alu_src, SSA_FIXED_REGISTER(26));
+ emit_mir_instruction(ctx, ins);
+
+ /* We should have been vectorized, though we don't
+ * currently check that st_vary is emitted only once
+ * per slot (this is relevant, since there's not a mask
+ * parameter available on the store [set to 0 by the
+ * blob]). We do respect the component by adjusting the
+ * swizzle. */
+
+ unsigned component = nir_intrinsic_component(instr);
+
+ midgard_instruction st = m_st_vary_32(SSA_FIXED_REGISTER(0), offset);
+ st.load_store.unknown = 0x1E9E; /* XXX: What is this? */
+ st.load_store.swizzle = SWIZZLE_XYZW << (2*component);
+ emit_mir_instruction(ctx, st);
+ } else {
+ DBG("Unknown store\n");
+ assert(0);
+ }
+
+ break;
+
+ /* Special case of store_output for lowered blend shaders */
+ case nir_intrinsic_store_raw_output_pan:
+ assert (ctx->stage == MESA_SHADER_FRAGMENT);
+ reg = nir_src_index(ctx, &instr->src[0]);
+
+ midgard_instruction move = v_mov(reg, blank_alu_src, SSA_FIXED_REGISTER(0));
+ emit_mir_instruction(ctx, move);
+ ctx->fragment_output = reg;
+
+ break;
+
+ case nir_intrinsic_load_alpha_ref_float:
+ assert(instr->dest.is_ssa);
+
+ float ref_value = ctx->alpha_ref;
+
+ float *v = ralloc_array(NULL, float, 4);
+ memcpy(v, &ref_value, sizeof(float));
+ _mesa_hash_table_u64_insert(ctx->ssa_constants, instr->dest.ssa.index + 1, v);
+ break;
+
+ case nir_intrinsic_load_viewport_scale:
+ case nir_intrinsic_load_viewport_offset:
+ emit_sysval_read(ctx, &instr->instr);
+ break;
+
+ default:
+ printf ("Unhandled intrinsic\n");
+ assert(0);
+ break;
+ }
+}
+
+static unsigned
+midgard_tex_format(enum glsl_sampler_dim dim)
+{
+ switch (dim) {
+ case GLSL_SAMPLER_DIM_1D:
+ case GLSL_SAMPLER_DIM_BUF:
+ return MALI_TEX_1D;
+
+ case GLSL_SAMPLER_DIM_2D:
+ case GLSL_SAMPLER_DIM_EXTERNAL:
+ return MALI_TEX_2D;
+
+ case GLSL_SAMPLER_DIM_3D:
+ return MALI_TEX_3D;
+
+ case GLSL_SAMPLER_DIM_CUBE:
+ return MALI_TEX_CUBE;
+
+ default:
+ DBG("Unknown sampler dim type\n");
+ assert(0);
+ return 0;
+ }
+}
+
+/* Tries to attach an explicit LOD / bias as a constant. Returns whether this
+ * was successful */
+
+static bool
+pan_attach_constant_bias(
+ compiler_context *ctx,
+ nir_src lod,
+ midgard_texture_word *word)
+{
+ /* To attach as constant, it has to *be* constant */
+
+ if (!nir_src_is_const(lod))
+ return false;
+
+ float f = nir_src_as_float(lod);
+
+ /* Break into fixed-point */
+ signed lod_int = f;
+ float lod_frac = f - lod_int;
+
+ /* Carry over negative fractions */
+ if (lod_frac < 0.0) {
+ lod_int--;
+ lod_frac += 1.0;
+ }
+
+ /* Encode */
+ word->bias = float_to_ubyte(lod_frac);
+ word->bias_int = lod_int;
+
+ return true;
+}
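+
+/* Worked examples (illustrative, given float_to_ubyte's 0..255 scale):
+ * lod = 2.5 encodes as bias_int = 2, bias ~= 128; lod = -0.25 first
+ * carries over to lod_int = -1, lod_frac = 0.75, encoding as
+ * bias_int = -1, bias ~= 191. */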
+
+static enum mali_sampler_type
+midgard_sampler_type(nir_alu_type t) {
+ switch (nir_alu_type_get_base_type(t))
+ {
+ case nir_type_float:
+ return MALI_SAMPLER_FLOAT;
+ case nir_type_int:
+ return MALI_SAMPLER_SIGNED;
+ case nir_type_uint:
+ return MALI_SAMPLER_UNSIGNED;
+ default:
+ unreachable("Unknown sampler type");
+ }
+}
+
+static void
+emit_texop_native(compiler_context *ctx, nir_tex_instr *instr,
+ unsigned midgard_texop)
+{
+ /* TODO */
+ //assert (!instr->sampler);
+ //assert (!instr->texture_array_size);
+
+ /* Allocate registers via a round robin scheme to alternate between the two registers */
+ int reg = ctx->texture_op_count & 1;
+ int in_reg = reg, out_reg = reg;
+
+ /* Make room for the reg */
+
+ if (ctx->texture_index[reg] > -1)
+ unalias_ssa(ctx, ctx->texture_index[reg]);
+
+ int texture_index = instr->texture_index;
+ int sampler_index = texture_index;
+
+ /* No helper to build texture words -- we do it all here */
+ midgard_instruction ins = {
+ .type = TAG_TEXTURE_4,
+ .mask = 0xF,
+ .texture = {
+ .op = midgard_texop,
+ .format = midgard_tex_format(instr->sampler_dim),
+ .texture_handle = texture_index,
+ .sampler_handle = sampler_index,
+
+ /* TODO: Regalloc it in */
+ .swizzle = SWIZZLE_XYZW,
+
+ /* TODO: half */
+ .in_reg_full = 1,
+ .out_full = 1,
+
+ .sampler_type = midgard_sampler_type(instr->dest_type),
+ }
+ };
+
+ for (unsigned i = 0; i < instr->num_srcs; ++i) {
+ int reg = SSA_FIXED_REGISTER(REGISTER_TEXTURE_BASE + in_reg);
+ int index = nir_src_index(ctx, &instr->src[i].src);
+ int nr_comp = nir_src_num_components(instr->src[i].src);
+ midgard_vector_alu_src alu_src = blank_alu_src;
+
+ switch (instr->src[i].src_type) {
+ case nir_tex_src_coord: {
+ if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
+ /* texelFetch is undefined on samplerCube */
+ assert(midgard_texop != TEXTURE_OP_TEXEL_FETCH);
+
+ /* For cubemaps, we need to load coords into
+ * special r27, and then use a special ld/st op
+ * to select the face and copy the xy into the
+ * texture register */
+
+ alu_src.swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_X);
+
+ midgard_instruction move = v_mov(index, alu_src, SSA_FIXED_REGISTER(27));
+ emit_mir_instruction(ctx, move);
+
+ midgard_instruction st = m_st_cubemap_coords(reg, 0);
+ st.load_store.unknown = 0x24; /* XXX: What is this? */
+ st.mask = 0x3; /* xy */
+ st.load_store.swizzle = alu_src.swizzle;
+ emit_mir_instruction(ctx, st);
+
+ ins.texture.in_reg_swizzle = swizzle_of(2);
+ } else {
+ ins.texture.in_reg_swizzle = alu_src.swizzle = swizzle_of(nr_comp);
+
+ midgard_instruction mov = v_mov(index, alu_src, reg);
+ mov.mask = mask_of(nr_comp);
+ emit_mir_instruction(ctx, mov);
+
+ if (midgard_texop == TEXTURE_OP_TEXEL_FETCH) {
+ /* Texel fetch opcodes care about the
+ * values of z and w, so we actually
+ * need to spill into a second register
+ * for a texel fetch with register bias
+ * (for non-2D). TODO: Implement that
+ */
+
+ assert(instr->sampler_dim == GLSL_SAMPLER_DIM_2D);
+
+ midgard_instruction zero = v_mov(index, alu_src, reg);
+ zero.ssa_args.inline_constant = true;
+ zero.ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
+ zero.has_constants = true;
+ zero.mask = ~mov.mask;
+ emit_mir_instruction(ctx, zero);
+
+ ins.texture.in_reg_swizzle = SWIZZLE_XYZZ;
+ } else {
+ /* Non-texel fetch doesn't need that
+ * nonsense. However we do use the Z
+ * for array indexing */
+ bool is_3d = instr->sampler_dim == GLSL_SAMPLER_DIM_3D;
+ ins.texture.in_reg_swizzle = is_3d ? SWIZZLE_XYZZ : SWIZZLE_XYXZ;
+ }
+ }
+
+ break;
+ }
+
+ case nir_tex_src_bias:
+ case nir_tex_src_lod: {
+ /* Try as a constant if we can */
+
+ bool is_txf = midgard_texop == TEXTURE_OP_TEXEL_FETCH;
+ if (!is_txf && pan_attach_constant_bias(ctx, instr->src[i].src, &ins.texture))
+ break;
+
+ /* Otherwise we use a register. To keep RA simple, we
+ * put the bias/LOD into the w component of the input
+ * source, which is otherwise in xy */
+
+ alu_src.swizzle = SWIZZLE_XXXX;
+
+ midgard_instruction mov = v_mov(index, alu_src, reg);
+ mov.mask = 1 << COMPONENT_W;
+ emit_mir_instruction(ctx, mov);
+
+ ins.texture.lod_register = true;
+
+ midgard_tex_register_select sel = {
+ .select = in_reg,
+ .full = 1,
+
+ /* w */
+ .component_lo = 1,
+ .component_hi = 1
+ };
+
+ uint8_t packed;
+ memcpy(&packed, &sel, sizeof(packed));
+ ins.texture.bias = packed;
+
+ break;
+ }
+
+ default:
+ unreachable("Unknown texture source type\n");
+ }
+ }
+
+ /* Set registers to read and write from the same place */
+ ins.texture.in_reg_select = in_reg;
+ ins.texture.out_reg_select = out_reg;
+
+ emit_mir_instruction(ctx, ins);
+
+ int o_reg = REGISTER_TEXTURE_BASE + out_reg, o_index = nir_dest_index(ctx, &instr->dest);
+ midgard_instruction ins2 = v_mov(SSA_FIXED_REGISTER(o_reg), blank_alu_src, o_index);
+ emit_mir_instruction(ctx, ins2);
+
+ /* Used for .cont and .last hinting */
+ ctx->texture_op_count++;
+}
+
+static void
+emit_tex(compiler_context *ctx, nir_tex_instr *instr)
+{
+ /* Fixup op, since only textureLod is permitted in VS but NIR can give
+ * generic tex in some cases (which confuses the hardware) */
+
+ bool is_vertex = ctx->stage == MESA_SHADER_VERTEX;
+
+ if (is_vertex && instr->op == nir_texop_tex)
+ instr->op = nir_texop_txl;
+
+ switch (instr->op) {
+ case nir_texop_tex:
+ case nir_texop_txb:
+ emit_texop_native(ctx, instr, TEXTURE_OP_NORMAL);
+ break;
+ case nir_texop_txl:
+ emit_texop_native(ctx, instr, TEXTURE_OP_LOD);
+ break;
+ case nir_texop_txf:
+ emit_texop_native(ctx, instr, TEXTURE_OP_TEXEL_FETCH);
+ break;
+ case nir_texop_txs:
+ emit_sysval_read(ctx, &instr->instr);
+ break;
+ default:
+ unreachable("Unhanlded texture op");
+ }
+}
+
+static void
+emit_jump(compiler_context *ctx, nir_jump_instr *instr)
+{
+ switch (instr->type) {
+ case nir_jump_break: {
+ /* Emit a branch out of the loop */
+ struct midgard_instruction br = v_branch(false, false);
+ br.branch.target_type = TARGET_BREAK;
+ br.branch.target_break = ctx->current_loop_depth;
+ emit_mir_instruction(ctx, br);
+
+ DBG("break..\n");
+ break;
+ }
+
+ default:
+ DBG("Unknown jump type %d\n", instr->type);
+ break;
+ }
+}
+
+static void
+emit_instr(compiler_context *ctx, struct nir_instr *instr)
+{
+ switch (instr->type) {
+ case nir_instr_type_load_const:
+ emit_load_const(ctx, nir_instr_as_load_const(instr));
+ break;
+
+ case nir_instr_type_intrinsic:
+ emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
+ break;
+
+ case nir_instr_type_alu:
+ emit_alu(ctx, nir_instr_as_alu(instr));
+ break;
+
+ case nir_instr_type_tex:
+ emit_tex(ctx, nir_instr_as_tex(instr));
+ break;
+
+ case nir_instr_type_jump:
+ emit_jump(ctx, nir_instr_as_jump(instr));
+ break;
+
+ case nir_instr_type_ssa_undef:
+ /* Spurious */
+ break;
+
+ default:
+ DBG("Unhandled instruction type\n");
+ break;
+ }
+}
+
+
+/* ALU instructions can inline or embed constants, which decreases register
+ * pressure and saves space. */
+
+#define CONDITIONAL_ATTACH(src) { \
+ void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->ssa_args.src + 1); \
+\
+ if (entry) { \
+ attach_constants(ctx, alu, entry, alu->ssa_args.src + 1); \
+ alu->ssa_args.src = SSA_FIXED_REGISTER(REGISTER_CONSTANT); \
+ } \
+}
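+
+/* For instance, CONDITIONAL_ATTACH(src0) checks whether src0 refers to a known
+ * SSA constant; if it does, the constant is attached to the instruction and
+ * src0 is rewritten to read the embedded constant register instead. */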
+
+static void
+inline_alu_constants(compiler_context *ctx)
+{
+ mir_foreach_instr(ctx, alu) {
+ /* Other instructions cannot inline constants */
+ if (alu->type != TAG_ALU_4) continue;
+
+ /* If there is already a constant here, we can do nothing */
+ if (alu->has_constants) continue;
+
+ /* It makes no sense to inline constants on a branch */
+ if (alu->compact_branch || alu->prepacked_branch) continue;
+
+ CONDITIONAL_ATTACH(src0);
+
+ if (!alu->has_constants) {
+ CONDITIONAL_ATTACH(src1)
+ } else if (!alu->inline_constant) {
+ /* Corner case: _two_ vec4 constants, for instance with a
+ * csel. For this case, we can only use a constant
+ * register for one, we'll have to emit a move for the
+ * other. Note, if both arguments are constants, then
+ * necessarily neither argument depends on the value of
+ * any particular register. As the destination register
+ * will be wiped, that means we can spill the constant
+ * to the destination register.
+ */
+
+ void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->ssa_args.src1 + 1);
+ unsigned scratch = alu->ssa_args.dest;
+
+ if (entry) {
+ midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, scratch);
+ attach_constants(ctx, &ins, entry, alu->ssa_args.src1 + 1);
+
+ /* Force a break XXX Defer r31 writes */
+ ins.unit = UNIT_VLUT;
+
+ /* Set the source */
+ alu->ssa_args.src1 = scratch;
+
+ /* Inject us -before- the last instruction which set r31 */
+ mir_insert_instruction_before(mir_prev_op(alu), ins);
+ }
+ }
+ }
+}
+
+/* Midgard supports two types of constants, embedded constants (128-bit) and
+ * inline constants (16-bit). Sometimes, especially with scalar ops, embedded
+ * constants can be demoted to inline constants, for space savings and
+ * sometimes a performance boost */
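+
+/* As an illustrative example: a 32-bit op multiplying by 2.0 can have the 2.0
+ * demoted to an inline half-float, since 2.0 survives the float->half->float
+ * round trip below exactly, whereas a constant like 0.1 fails that check and
+ * stays embedded. Integer and 16-bit cases instead just check that the value
+ * fits in 16 bits. */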
+
+static void
+embedded_to_inline_constant(compiler_context *ctx)
+{
+ mir_foreach_instr(ctx, ins) {
+ if (!ins->has_constants) continue;
+
+ if (ins->ssa_args.inline_constant) continue;
+
+ /* Blend constants must not be inlined by definition */
+ if (ins->has_blend_constant) continue;
+
+ /* We can inline 32-bit (sometimes) or 16-bit (usually) */
+ bool is_16 = ins->alu.reg_mode == midgard_reg_mode_16;
+ bool is_32 = ins->alu.reg_mode == midgard_reg_mode_32;
+
+ if (!(is_16 || is_32))
+ continue;
+
+ /* src1 cannot be an inline constant due to encoding
+ * restrictions. So, if possible we try to flip the arguments
+ * in that case */
+
+ int op = ins->alu.op;
+
+ if (ins->ssa_args.src0 == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) {
+ switch (op) {
+ /* These ops require an opcode change, not just an
+ * argument swap, to flip their operands -- TODO */
+ case midgard_alu_op_flt:
+ case midgard_alu_op_fle:
+ case midgard_alu_op_ilt:
+ case midgard_alu_op_ile:
+ case midgard_alu_op_fcsel:
+ case midgard_alu_op_icsel:
+ DBG("Missed non-commutative flip (%s)\n", alu_opcode_props[op].name);
+ default:
+ break;
+ }
+
+ if (alu_opcode_props[op].props & OP_COMMUTES) {
+ /* Flip the SSA numbers */
+ ins->ssa_args.src0 = ins->ssa_args.src1;
+ ins->ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
+
+ /* And flip the modifiers */
+
+ unsigned src_temp;
+
+ src_temp = ins->alu.src2;
+ ins->alu.src2 = ins->alu.src1;
+ ins->alu.src1 = src_temp;
+ }
+ }
+
+ if (ins->ssa_args.src1 == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) {
+ /* Extract the source information */
+
+ midgard_vector_alu_src *src;
+ int q = ins->alu.src2;
+ midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;
+ src = m;
+
+ /* Component is from the swizzle, e.g. r26.w -> w component. TODO: What if x is masked out? */
+ int component = src->swizzle & 3;
+
+ /* Scale constant appropriately, if we can legally */
+ uint16_t scaled_constant = 0;
+
+ if (midgard_is_integer_op(op) || is_16) {
+ unsigned int *iconstants = (unsigned int *) ins->constants;
+ scaled_constant = (uint16_t) iconstants[component];
+
+ /* Constant overflow after resize */
+ if (scaled_constant != iconstants[component])
+ continue;
+ } else {
+ float original = (float) ins->constants[component];
+ scaled_constant = _mesa_float_to_half(original);
+
+ /* Check for loss of precision. If this is
+ * mediump, we don't care, but for a highp
+ * shader, we need to pay attention. NIR
+ * doesn't yet tell us which mode we're in!
+ * Practically this prevents most constants
+ * from being inlined, sadly. */
+
+ float fp32 = _mesa_half_to_float(scaled_constant);
+
+ if (fp32 != original)
+ continue;
+ }
+
+ /* We don't know how to handle these with a constant */
+
+ if (src->mod || src->half || src->rep_low || src->rep_high) {
+ DBG("Bailing inline constant...\n");
+ continue;
+ }
+
+ /* Make sure that the constant is not itself a
+ * vector by checking if all accessed values
+ * (by the swizzle) are the same. */
+
+ uint32_t *cons = (uint32_t *) ins->constants;
+ uint32_t value = cons[component];
+
+ bool is_vector = false;
+ unsigned mask = effective_writemask(&ins->alu, ins->mask);
+
+ for (int c = 1; c < 4; ++c) {
+ /* We only care if this component is actually used */
+ if (!(mask & (1 << c)))
+ continue;
+
+ uint32_t test = cons[(src->swizzle >> (2 * c)) & 3];
+
+ if (test != value) {
+ is_vector = true;
+ break;
+ }
+ }
+
+ if (is_vector)
+ continue;
+
+ /* Get rid of the embedded constant */
+ ins->has_constants = false;
+ ins->ssa_args.src1 = SSA_UNUSED_0;
+ ins->ssa_args.inline_constant = true;
+ ins->inline_constant = scaled_constant;
+ }
+ }
+}
+
+/* Map normal SSA sources to other SSA sources / fixed registers (like
+ * uniforms) */
+
+static void
+map_ssa_to_alias(compiler_context *ctx, int *ref)
+{
+ /* Sign is used quite deliberately for unused */
+ if (*ref < 0)
+ return;
+
+ unsigned int alias = (uintptr_t) _mesa_hash_table_u64_search(ctx->ssa_to_alias, *ref + 1);
+
+ if (alias) {
+ /* Remove entry in leftovers to avoid a redundant fmov */
+
+ struct set_entry *leftover = _mesa_set_search(ctx->leftover_ssa_to_alias, ((void *) (uintptr_t) (*ref + 1)));
+
+ if (leftover)
+ _mesa_set_remove(ctx->leftover_ssa_to_alias, leftover);
+
+ /* Assign the alias map */
+ *ref = alias - 1;
+ return;
+ }
+}
+
+/* Basic dead code elimination on the MIR itself, which cleans up e.g. the
+ * texture pipeline */
+
+static bool
+midgard_opt_dead_code_eliminate(compiler_context *ctx, midgard_block *block)
+{
+ bool progress = false;
+
+ mir_foreach_instr_in_block_safe(block, ins) {
+ if (ins->type != TAG_ALU_4) continue;
+ if (ins->compact_branch) continue;
+
+ if (ins->ssa_args.dest >= SSA_FIXED_MINIMUM) continue;
+ if (mir_is_live_after(ctx, block, ins, ins->ssa_args.dest)) continue;
+
+ mir_remove_instruction(ins);
+ progress = true;
+ }
+
+ return progress;
+}
+
+/* Dead code elimination for branches at the end of a block - only one branch
+ * per block is legal semantically */
+
+static void
+midgard_opt_cull_dead_branch(compiler_context *ctx, midgard_block *block)
+{
+ bool branched = false;
+
+ mir_foreach_instr_in_block_safe(block, ins) {
+ if (!midgard_is_branch_unit(ins->unit)) continue;
+
+ /* We ignore prepacked branches since the fragment epilogue is
+ * just generally special */
+ if (ins->prepacked_branch) continue;
+
+ /* Discards are similarly special and may not correspond to the
+ * end of a block */
+
+ if (ins->branch.target_type == TARGET_DISCARD) continue;
+
+ if (branched) {
+ /* We already branched, so this is dead */
+ mir_remove_instruction(ins);
+ }
+
+ branched = true;
+ }
+}
+
+static bool
+mir_nontrivial_mod(midgard_vector_alu_src src, bool is_int, unsigned mask)
+{
+ /* abs or neg */
+ if (!is_int && src.mod) return true;
+
+ /* Of the int mods, only a shift matters in isolation */
+ if (is_int && src.mod == midgard_int_shift) return true;
+
+ /* size-conversion */
+ if (src.half) return true;
+
+ /* swizzle */
+ for (unsigned c = 0; c < 4; ++c) {
+ if (!(mask & (1 << c))) continue;
+ if (((src.swizzle >> (2*c)) & 3) != c) return true;
+ }
+
+ return false;
+}
+
+static bool
+mir_nontrivial_source2_mod(midgard_instruction *ins)
+{
+ bool is_int = midgard_is_integer_op(ins->alu.op);
+
+ midgard_vector_alu_src src2 =
+ vector_alu_from_unsigned(ins->alu.src2);
+
+ return mir_nontrivial_mod(src2, is_int, ins->mask);
+}
+
+static bool
+mir_nontrivial_outmod(midgard_instruction *ins)
+{
+ bool is_int = midgard_is_integer_op(ins->alu.op);
+ unsigned mod = ins->alu.outmod;
+
+ /* Type conversion is a sort of outmod */
+ if (ins->alu.dest_override != midgard_dest_override_none)
+ return true;
+
+ if (is_int)
+ return mod != midgard_outmod_int_wrap;
+ else
+ return mod != midgard_outmod_none;
+}
+
+static bool
+midgard_opt_copy_prop(compiler_context *ctx, midgard_block *block)
+{
+ bool progress = false;
+
+ mir_foreach_instr_in_block_safe(block, ins) {
+ if (ins->type != TAG_ALU_4) continue;
+ if (!OP_IS_MOVE(ins->alu.op)) continue;
+
+ unsigned from = ins->ssa_args.src1;
+ unsigned to = ins->ssa_args.dest;
+
+ /* We only work on pure SSA */
+
+ if (to >= SSA_FIXED_MINIMUM) continue;
+ if (from >= SSA_FIXED_MINIMUM) continue;
+ if (to >= ctx->func->impl->ssa_alloc) continue;
+ if (from >= ctx->func->impl->ssa_alloc) continue;
+
+ /* Constant propagation is not handled here, either */
+ if (ins->ssa_args.inline_constant) continue;
+ if (ins->has_constants) continue;
+
+ if (mir_nontrivial_source2_mod(ins)) continue;
+ if (mir_nontrivial_outmod(ins)) continue;
+
+ /* We're clear -- rewrite */
+ mir_rewrite_index_src(ctx, to, from);
+ mir_remove_instruction(ins);
+ progress |= true;
+ }
+
+ return progress;
+}
+
+/* fmov.pos is an idiom for fpos. Propagate the .pos up to the source, so that
+ * the move can then be propagated away entirely */
+
+static bool
+mir_compose_float_outmod(midgard_outmod_float *outmod, midgard_outmod_float comp)
+{
+ /* Nothing to do */
+ if (comp == midgard_outmod_none)
+ return true;
+
+ if (*outmod == midgard_outmod_none) {
+ *outmod = comp;
+ return true;
+ }
+
+ /* TODO: Compose rules */
+ return false;
+}
+
+static bool
+midgard_opt_pos_propagate(compiler_context *ctx, midgard_block *block)
+{
+ bool progress = false;
+
+ mir_foreach_instr_in_block_safe(block, ins) {
+ if (ins->type != TAG_ALU_4) continue;
+ if (ins->alu.op != midgard_alu_op_fmov) continue;
+ if (ins->alu.outmod != midgard_outmod_pos) continue;
+
+ /* TODO: Registers? */
+ unsigned src = ins->ssa_args.src1;
+ if (src >= ctx->func->impl->ssa_alloc) continue;
+ assert(!mir_has_multiple_writes(ctx, src));
+
+ /* There might be a source modifier, too */
+ if (mir_nontrivial_source2_mod(ins)) continue;
+
+ /* Backpropagate the modifier */
+ mir_foreach_instr_in_block_from_rev(block, v, mir_prev_op(ins)) {
+ if (v->type != TAG_ALU_4) continue;
+ if (v->ssa_args.dest != src) continue;
+
+ /* Can we even take a float outmod? */
+ if (midgard_is_integer_out_op(v->alu.op)) continue;
+
+ midgard_outmod_float temp = v->alu.outmod;
+ progress |= mir_compose_float_outmod(&temp, ins->alu.outmod);
+
+ /* Throw in the towel.. */
+ if (!progress) break;
+
+ /* Otherwise, transfer the modifier */
+ v->alu.outmod = temp;
+ ins->alu.outmod = midgard_outmod_none;
+
+ break;
+ }
+ }
+
+ return progress;
+}
+
+/* The following passes reorder MIR instructions to enable better scheduling */
+
+static void
+midgard_pair_load_store(compiler_context *ctx, midgard_block *block)
+{
+ mir_foreach_instr_in_block_safe(block, ins) {
+ if (ins->type != TAG_LOAD_STORE_4) continue;
+
+ /* We've found a load/store op. Check if next is also load/store. */
+ midgard_instruction *next_op = mir_next_op(ins);
+ if (&next_op->link != &block->instructions) {
+ if (next_op->type == TAG_LOAD_STORE_4) {
+ /* If so, we're done since we're a pair */
+ ins = mir_next_op(ins);
+ continue;
+ }
+
+ /* Maximum search distance to pair, to avoid register pressure disasters */
+ int search_distance = 8;
+
+ /* Otherwise, we have an orphaned load/store -- search for another load */
+ mir_foreach_instr_in_block_from(block, c, mir_next_op(ins)) {
+ /* Terminate search if necessary */
+ if (!(search_distance--)) break;
+
+ if (c->type != TAG_LOAD_STORE_4) continue;
+
+ /* Stores cannot be reordered, since they have
+ * dependencies. For the same reason, indirect
+ * loads cannot be reordered as their index is
+ * loaded in r27.w */
+
+ if (OP_IS_STORE(c->load_store.op)) continue;
+
+ /* It appears the 0x800 bit is set whenever a
+ * load is direct, unset when it is indirect.
+ * Skip indirect loads. */
+
+ if (!(c->load_store.unknown & 0x800)) continue;
+
+ /* We found one! Move it up to pair and remove it from the old location */
+
+ mir_insert_instruction_before(ins, *c);
+ mir_remove_instruction(c);
+
+ break;
+ }
+ }
+ }
+}
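+
+/* For example, given "load; fmul; load" where the second load is direct, the
+ * second load is hoisted up next to the orphaned first one so the scheduler
+ * can pack the two into a single load/store bundle. Stores and indirect loads
+ * are left where they are, for the reasons noted in the loop above. */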
+
+/* If there are leftovers after the below pass, emit actual fmov
+ * instructions for the slow-but-correct path */
+
+static void
+emit_leftover_move(compiler_context *ctx)
+{
+ set_foreach(ctx->leftover_ssa_to_alias, leftover) {
+ int base = ((uintptr_t) leftover->key) - 1;
+ int mapped = base;
+
+ map_ssa_to_alias(ctx, &mapped);
+ EMIT(mov, mapped, blank_alu_src, base);
+ }
+}
+
+static void
+actualise_ssa_to_alias(compiler_context *ctx)
+{
+ mir_foreach_instr(ctx, ins) {
+ map_ssa_to_alias(ctx, &ins->ssa_args.src0);
+ map_ssa_to_alias(ctx, &ins->ssa_args.src1);
+ }
+
+ emit_leftover_move(ctx);
+}
+
+static void
+emit_fragment_epilogue(compiler_context *ctx)
+{
+ /* Special case: writing out constants requires us to include the move
+ * explicitly now, so shove it into r0 */
+
+ void *constant_value = _mesa_hash_table_u64_search(ctx->ssa_constants, ctx->fragment_output + 1);
+
+ if (constant_value) {
+ midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, SSA_FIXED_REGISTER(0));
+ attach_constants(ctx, &ins, constant_value, ctx->fragment_output + 1);
+ emit_mir_instruction(ctx, ins);
+ }
+
+ /* Perform the actual fragment writeout. We have two writeout/branch
+ * instructions, forming a loop until writeout is successful as per the
+ * docs. TODO: gl_FragDepth */
+
+ EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, 0, midgard_condition_always);
+ EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, -1, midgard_condition_always);
+}
+
+static midgard_block *
+emit_block(compiler_context *ctx, nir_block *block)
+{
+ midgard_block *this_block = calloc(1, sizeof(midgard_block));
+ list_addtail(&this_block->link, &ctx->blocks);
+
+ this_block->is_scheduled = false;
+ ++ctx->block_count;
+
+ ctx->texture_index[0] = -1;
+ ctx->texture_index[1] = -1;
+
+ /* Add us as a successor to the block we are following */
+ if (ctx->current_block)
+ midgard_block_add_successor(ctx->current_block, this_block);
+
+ /* Set up current block */
+ list_inithead(&this_block->instructions);
+ ctx->current_block = this_block;
+
+ nir_foreach_instr(instr, block) {
+ emit_instr(ctx, instr);
+ ++ctx->instruction_count;
+ }
+
+ inline_alu_constants(ctx);
+ embedded_to_inline_constant(ctx);
+
+ /* Perform heavylifting for aliasing */
+ actualise_ssa_to_alias(ctx);
+
+ midgard_pair_load_store(ctx, this_block);
+
+ /* Append fragment shader epilogue (value writeout) */
+ if (ctx->stage == MESA_SHADER_FRAGMENT) {
+ if (block == nir_impl_last_block(ctx->func->impl)) {
+ emit_fragment_epilogue(ctx);
+ }
+ }
+
+ if (block == nir_start_block(ctx->func->impl))
+ ctx->initial_block = this_block;
+
+ if (block == nir_impl_last_block(ctx->func->impl))
+ ctx->final_block = this_block;
+
+ /* Allow the next control flow to access us retroactively, for
+ * branching etc */
+ ctx->current_block = this_block;
+
+ /* Document the fallthrough chain */
+ ctx->previous_source_block = this_block;
+
+ return this_block;
+}
+
+static midgard_block *emit_cf_list(struct compiler_context *ctx, struct exec_list *list);
+
+static void
+emit_if(struct compiler_context *ctx, nir_if *nif)
+{
+ /* Conditional branches expect the condition in r31.w; emit a move for
+ * that in the _previous_ block (which is the current block). */
+ emit_condition(ctx, &nif->condition, true, COMPONENT_X);
+
+ /* Speculatively emit the branch, but we can't fill it in until later */
+ EMIT(branch, true, true);
+ midgard_instruction *then_branch = mir_last_in_block(ctx->current_block);
+
+ /* Emit the two subblocks */
+ midgard_block *then_block = emit_cf_list(ctx, &nif->then_list);
+
+ /* Emit a jump from the end of the then block to the end of the else */
+ EMIT(branch, false, false);
+ midgard_instruction *then_exit = mir_last_in_block(ctx->current_block);
+
+ /* Emit second block, and check if it's empty */
+
+ int else_idx = ctx->block_count;
+ int count_in = ctx->instruction_count;
+ midgard_block *else_block = emit_cf_list(ctx, &nif->else_list);
+ int after_else_idx = ctx->block_count;
+
+ /* Now that we have the subblocks emitted, fix up the branches */
+
+ assert(then_block);
+ assert(else_block);
+
+ if (ctx->instruction_count == count_in) {
+ /* The else block is empty, so don't emit an exit jump */
+ mir_remove_instruction(then_exit);
+ then_branch->branch.target_block = after_else_idx;
+ } else {
+ then_branch->branch.target_block = else_idx;
+ then_exit->branch.target_block = after_else_idx;
+ }
+}
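+
+/* Roughly, "if (c) { T } else { E }" is emitted as:
+ *
+ *   (move condition into r31.w)
+ *   branch.cond -> else_idx (taken when the condition fails)
+ *   T ...
+ *   branch -> after_else_idx (removed again if E turns out to be empty)
+ *   E ...
+ * after_else_idx:
+ */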
+
+static void
+emit_loop(struct compiler_context *ctx, nir_loop *nloop)
+{
+ /* Remember where we are */
+ midgard_block *start_block = ctx->current_block;
+
+ /* Allocate a loop number, growing the current inner loop depth */
+ int loop_idx = ++ctx->current_loop_depth;
+
+ /* Get index from before the body so we can loop back later */
+ int start_idx = ctx->block_count;
+
+ /* Emit the body itself */
+ emit_cf_list(ctx, &nloop->body);
+
+ /* Branch back to loop back */
+ struct midgard_instruction br_back = v_branch(false, false);
+ br_back.branch.target_block = start_idx;
+ emit_mir_instruction(ctx, br_back);
+
+ /* Mark down that branch in the graph. Note that we're really branching
+ * to the block *after* we started in. TODO: Why doesn't the branch
+ * itself have an off-by-one then...? */
+ midgard_block_add_successor(ctx->current_block, start_block->successors[0]);
+
+ /* Find the index of the block about to follow us (note: we don't add
+ * one; blocks are 0-indexed so we get a fencepost problem) */
+ int break_block_idx = ctx->block_count;
+
+ /* Fix up the break statements we emitted to point to the right place,
+ * now that we can allocate a block number for them */
+
+ list_for_each_entry_from(struct midgard_block, block, start_block, &ctx->blocks, link) {
+ mir_foreach_instr_in_block(block, ins) {
+ if (ins->type != TAG_ALU_4) continue;
+ if (!ins->compact_branch) continue;
+ if (ins->prepacked_branch) continue;
+
+ /* We found a branch -- check the type to see if we need to do anything */
+ if (ins->branch.target_type != TARGET_BREAK) continue;
+
+ /* It's a break! Check if it's our break */
+ if (ins->branch.target_break != loop_idx) continue;
+
+ /* Okay, cool, we're breaking out of this loop.
+ * Rewrite from a break to a goto */
+
+ ins->branch.target_type = TARGET_GOTO;
+ ins->branch.target_block = break_block_idx;
+ }
+ }
+
+ /* Now that we've finished emitting the loop, free up the depth again
+ * so we play nice with recursion amid nested loops */
+ --ctx->current_loop_depth;
+
+ /* Dump loop stats */
+ ++ctx->loop_count;
+}
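+
+/* Roughly, "loop { body }" is emitted as:
+ *
+ * start_idx:
+ *   body ... (breaks emit TARGET_BREAK placeholders tagged with loop_idx)
+ *   branch -> start_idx
+ * break_block_idx:
+ *   (the placeholders above are rewritten to goto this block)
+ */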
+
+static midgard_block *
+emit_cf_list(struct compiler_context *ctx, struct exec_list *list)
+{
+ midgard_block *start_block = NULL;
+
+ foreach_list_typed(nir_cf_node, node, node, list) {
+ switch (node->type) {
+ case nir_cf_node_block: {
+ midgard_block *block = emit_block(ctx, nir_cf_node_as_block(node));
+
+ if (!start_block)
+ start_block = block;
+
+ break;
+ }
+
+ case nir_cf_node_if:
+ emit_if(ctx, nir_cf_node_as_if(node));
+ break;
+
+ case nir_cf_node_loop:
+ emit_loop(ctx, nir_cf_node_as_loop(node));
+ break;
+
+ case nir_cf_node_function:
+ assert(0);
+ break;
+ }
+ }
+
+ return start_block;
+}
+
+/* Due to lookahead, we need to report the first tag executed in the command
+ * stream and in branch targets. An initial block might be empty, so iterate
+ * until we find one that 'works' */
+
+static unsigned
+midgard_get_first_tag_from_block(compiler_context *ctx, unsigned block_idx)
+{
+ midgard_block *initial_block = mir_get_block(ctx, block_idx);
+
+ unsigned first_tag = 0;
+
+ do {
+ midgard_bundle *initial_bundle = util_dynarray_element(&initial_block->bundles, midgard_bundle, 0);
+
+ if (initial_bundle) {
+ first_tag = initial_bundle->tag;
+ break;
+ }
+
+ /* Initial block is empty, try the next block */
+ initial_block = list_first_entry(&(initial_block->link), midgard_block, link);
+ } while(initial_block != NULL);
+
+ assert(first_tag);
+ return first_tag;
+}
+
+int
+midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_blend)
+{
+ struct util_dynarray *compiled = &program->compiled;
+
+ midgard_debug = debug_get_option_midgard_debug();
+
+ compiler_context ictx = {
+ .nir = nir,
+ .stage = nir->info.stage,
+
+ .is_blend = is_blend,
+ .blend_constant_offset = 0,
+
+ .alpha_ref = program->alpha_ref
+ };
+
+ compiler_context *ctx = &ictx;
+
+ /* TODO: Decide this at runtime */
+ ctx->uniform_cutoff = 8;
+
+ /* Initialize at a global (not block) level hash tables */
+
+ ctx->ssa_constants = _mesa_hash_table_u64_create(NULL);
+ ctx->ssa_to_alias = _mesa_hash_table_u64_create(NULL);
+ ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);
+ ctx->sysval_to_id = _mesa_hash_table_u64_create(NULL);
+ ctx->leftover_ssa_to_alias = _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
+
+ /* Record the varying mapping for the command stream's bookkeeping */
+
+ struct exec_list *varyings =
+ ctx->stage == MESA_SHADER_VERTEX ? &nir->outputs : &nir->inputs;
+
+ unsigned max_varying = 0;
+ nir_foreach_variable(var, varyings) {
+ unsigned loc = var->data.driver_location;
+ unsigned sz = glsl_type_size(var->type, FALSE);
+
+ for (int c = 0; c < sz; ++c) {
+ program->varyings[loc + c] = var->data.location + c;
+ max_varying = MAX2(max_varying, loc + c);
+ }
+ }
+
+ /* Lower gl_Position pre-optimisation, but after lowering vars to ssa
+ * (so we don't accidentally duplicate the epilogue since mesa/st has
+ * messed with our I/O quite a bit already) */
+
+ NIR_PASS_V(nir, nir_lower_vars_to_ssa);
+
+ if (ctx->stage == MESA_SHADER_VERTEX)
+ NIR_PASS_V(nir, nir_lower_viewport_transform);
+
+ NIR_PASS_V(nir, nir_lower_var_copies);
+ NIR_PASS_V(nir, nir_lower_vars_to_ssa);
+ NIR_PASS_V(nir, nir_split_var_copies);
+ NIR_PASS_V(nir, nir_lower_var_copies);
+ NIR_PASS_V(nir, nir_lower_global_vars_to_local);
+ NIR_PASS_V(nir, nir_lower_var_copies);
+ NIR_PASS_V(nir, nir_lower_vars_to_ssa);
+
+ NIR_PASS_V(nir, nir_lower_io, nir_var_all, glsl_type_size, 0);
+
+ /* Optimisation passes */
+
+ optimise_nir(nir);
+
+ if (midgard_debug & MIDGARD_DBG_SHADERS) {
+ nir_print_shader(nir, stdout);
+ }
+
+ /* Assign sysvals and counts, now that we're sure
+ * (post-optimisation) */
+
+ midgard_nir_assign_sysvals(ctx, nir);
+
+ program->uniform_count = nir->num_uniforms;
+ program->sysval_count = ctx->sysval_count;
+ memcpy(program->sysvals, ctx->sysvals, sizeof(ctx->sysvals[0]) * ctx->sysval_count);
+
+ program->attribute_count = (ctx->stage == MESA_SHADER_VERTEX) ? nir->num_inputs : 0;
+ program->varying_count = max_varying + 1; /* Fencepost off-by-one */
+
+ nir_foreach_function(func, nir) {
+ if (!func->impl)
+ continue;
+
+ list_inithead(&ctx->blocks);
+ ctx->block_count = 0;
+ ctx->func = func;
+
+ emit_cf_list(ctx, &func->impl->body);
+ emit_block(ctx, func->impl->end_block);
+
+ break; /* TODO: Multi-function shaders */
+ }
+
+ util_dynarray_init(compiled, NULL);
+
+ /* MIR-level optimizations */
+
+ bool progress = false;
+
+ do {
+ progress = false;
+
+ mir_foreach_block(ctx, block) {
+ progress |= midgard_opt_pos_propagate(ctx, block);
+ progress |= midgard_opt_copy_prop(ctx, block);
+ progress |= midgard_opt_dead_code_eliminate(ctx, block);
+ }
+ } while (progress);
+
+ /* Nested control-flow can result in dead branches at the end of the
+ * block. This messes with our analysis and is just dead code, so cull
+ * them */
+ mir_foreach_block(ctx, block) {
+ midgard_opt_cull_dead_branch(ctx, block);
+ }
+
+ /* Schedule! */
+ schedule_program(ctx);
+
+ /* Now that all the bundles are scheduled and we can calculate block
+ * sizes, emit actual branch instructions rather than placeholders */
+
+ int br_block_idx = 0;
+
+ mir_foreach_block(ctx, block) {
+ util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {
+ for (int c = 0; c < bundle->instruction_count; ++c) {
+ midgard_instruction *ins = bundle->instructions[c];
+
+ if (!midgard_is_branch_unit(ins->unit)) continue;
+
+ if (ins->prepacked_branch) continue;
+
+ /* Parse some basic branch info */
+ bool is_compact = ins->unit == ALU_ENAB_BR_COMPACT;
+ bool is_conditional = ins->branch.conditional;
+ bool is_inverted = ins->branch.invert_conditional;
+ bool is_discard = ins->branch.target_type == TARGET_DISCARD;
+
+ /* Determine the block we're jumping to */
+ int target_number = ins->branch.target_block;
+
+ /* Report the destination tag */
+ int dest_tag = is_discard ? 0 : midgard_get_first_tag_from_block(ctx, target_number);
+
+ /* Count up the number of quadwords we're
+ * jumping over = number of quadwords until
+ * (br_block_idx, target_number) */
+
+ int quadword_offset = 0;
+
+ if (is_discard) {
+ /* Jump to the end of the shader. We
+ * need to include not only the
+ * following blocks, but also the
+ * contents of our current block (since
+ * discard can come in the middle of
+ * the block) */
+
+ midgard_block *blk = mir_get_block(ctx, br_block_idx + 1);
+
+ for (midgard_bundle *bun = bundle + 1; bun < (midgard_bundle *)((char*) block->bundles.data + block->bundles.size); ++bun) {
+ quadword_offset += quadword_size(bun->tag);
+ }
+
+ mir_foreach_block_from(ctx, blk, b) {
+ quadword_offset += b->quadword_count;
+ }
+
+ } else if (target_number > br_block_idx) {
+ /* Jump forward */
+
+ for (int idx = br_block_idx + 1; idx < target_number; ++idx) {
+ midgard_block *blk = mir_get_block(ctx, idx);
+ assert(blk);
+
+ quadword_offset += blk->quadword_count;
+ }
+ } else {
+ /* Jump backwards */
+
+ for (int idx = br_block_idx; idx >= target_number; --idx) {
+ midgard_block *blk = mir_get_block(ctx, idx);
+ assert(blk);
+
+ quadword_offset -= blk->quadword_count;
+ }
+ }
+
+ /* Unconditional extended branches (far jumps)
+ * have issues, so we always use a conditional
+ * branch, setting the condition to always for
+ * unconditional. For compact unconditional
+ * branches, cond isn't used so it doesn't
+ * matter what we pick. */
+
+ midgard_condition cond =
+ !is_conditional ? midgard_condition_always :
+ is_inverted ? midgard_condition_false :
+ midgard_condition_true;
+
+ midgard_jmp_writeout_op op =
+ is_discard ? midgard_jmp_writeout_op_discard :
+ (is_compact && !is_conditional) ? midgard_jmp_writeout_op_branch_uncond :
+ midgard_jmp_writeout_op_branch_cond;
+
+ if (!is_compact) {
+ midgard_branch_extended branch =
+ midgard_create_branch_extended(
+ cond, op,
+ dest_tag,
+ quadword_offset);
+
+ memcpy(&ins->branch_extended, &branch, sizeof(branch));
+ } else if (is_conditional || is_discard) {
+ midgard_branch_cond branch = {
+ .op = op,
+ .dest_tag = dest_tag,
+ .offset = quadword_offset,
+ .cond = cond
+ };
+
+ assert(branch.offset == quadword_offset);
+
+ memcpy(&ins->br_compact, &branch, sizeof(branch));
+ } else {
+ assert(op == midgard_jmp_writeout_op_branch_uncond);
+
+ midgard_branch_uncond branch = {
+ .op = op,
+ .dest_tag = dest_tag,
+ .offset = quadword_offset,
+ .unknown = 1
+ };
+
+ assert(branch.offset == quadword_offset);
+
+ memcpy(&ins->br_compact, &branch, sizeof(branch));
+ }
+ }
+ }
+
+ ++br_block_idx;
+ }
+
+ /* Emit flat binary from the instruction arrays. Iterate each block in
+ * sequence. Save instruction boundaries such that lookahead tags can
+ * be assigned easily */
+
+ /* Cache _all_ bundles in source order for lookahead across failed branches */
+
+ int bundle_count = 0;
+ mir_foreach_block(ctx, block) {
+ bundle_count += block->bundles.size / sizeof(midgard_bundle);
+ }
+ midgard_bundle **source_order_bundles = malloc(sizeof(midgard_bundle *) * bundle_count);
+ int bundle_idx = 0;
+ mir_foreach_block(ctx, block) {
+ util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {
+ source_order_bundles[bundle_idx++] = bundle;
+ }
+ }
+
+ int current_bundle = 0;
+
+ /* Midgard prefetches instruction types, so during emission we
+ * need to look ahead. Unless this is the last instruction, in
+ * which case we return 1. Or if this is the second to last and
+ * the last is an ALU, then it's also 1... */
+
+ mir_foreach_block(ctx, block) {
+ mir_foreach_bundle_in_block(block, bundle) {
+ int lookahead = 1;
+
+ if (current_bundle + 1 < bundle_count) {
+ uint8_t next = source_order_bundles[current_bundle + 1]->tag;
+
+ if (!(current_bundle + 2 < bundle_count) && IS_ALU(next)) {
+ lookahead = 1;
+ } else {
+ lookahead = next;
+ }
+ }
+
+ emit_binary_bundle(ctx, bundle, compiled, lookahead);
+ ++current_bundle;
+ }
+
+ /* TODO: Free deeper */
+ //util_dynarray_fini(&block->instructions);
+ }
+
+ free(source_order_bundles);
+
+ /* Report the very first tag executed */
+ program->first_tag = midgard_get_first_tag_from_block(ctx, 0);
+
+ /* Deal with off-by-one related to the fencepost problem */
+ program->work_register_count = ctx->work_registers + 1;
+
+ program->can_discard = ctx->can_discard;
+ program->uniform_cutoff = ctx->uniform_cutoff;
+
+ program->blend_patch_offset = ctx->blend_constant_offset;
+
+ if (midgard_debug & MIDGARD_DBG_SHADERS)
+ disassemble_midgard(program->compiled.data, program->compiled.size);
+
+ if (midgard_debug & MIDGARD_DBG_SHADERDB) {
+ unsigned nr_bundles = 0, nr_ins = 0, nr_quadwords = 0;
+
+ /* Count instructions and bundles */
+
+ mir_foreach_instr_global(ctx, ins) {
+ nr_ins++;
+ }
+
+ mir_foreach_block(ctx, block) {
+ nr_bundles += util_dynarray_num_elements(
+ &block->bundles, midgard_bundle);
+
+ nr_quadwords += block->quadword_count;
+ }
+
+ /* Calculate thread count: fixed cutoffs on the work
+ * register count determine the thread count */
+
+ unsigned nr_registers = program->work_register_count;
+
+ unsigned nr_threads =
+ (nr_registers <= 4) ? 4 :
+ (nr_registers <= 8) ? 2 :
+ 1;
+
+ /* Dump stats */
+
+ fprintf(stderr, "shader%d - %s shader: "
+ "%u inst, %u bundles, %u quadwords, "
+ "%u registers, %u threads, %u loops\n",
+ SHADER_DB_COUNT++,
+ gl_shader_stage_name(ctx->stage),
+ nr_ins, nr_bundles, nr_quadwords,
+ nr_registers, nr_threads,
+ ctx->loop_count);
+ }
+
+
+ return 0;
+}
--- /dev/null
+/*
+ * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MIDGARD_H_
+#define __MIDGARD_H_
+
+#include "compiler/nir/nir.h"
+#include "util/u_dynarray.h"
+
+/* Define the general compiler entry point */
+
+#define MAX_SYSVAL_COUNT 32
+
+/* Sysval IDs are two-dimensional: a type in the low bits and a per-type number
+ * in the high bits. Nonparametric sysvals (number 0) thus compare equal to
+ * their type, so plain equality checks still work for them. */
+
+#define PAN_SYSVAL(type, no) (((no) << 16) | PAN_SYSVAL_##type)
+#define PAN_SYSVAL_TYPE(sysval) ((sysval) & 0xffff)
+#define PAN_SYSVAL_ID(sysval) ((sysval) >> 16)
+
+/* Define some common types. We start at one for easy indexing of hash
+ * tables internal to the compiler */
+
+enum {
+ PAN_SYSVAL_VIEWPORT_SCALE = 1,
+ PAN_SYSVAL_VIEWPORT_OFFSET = 2,
+ PAN_SYSVAL_TEXTURE_SIZE = 3,
+} pan_sysval;
+
+#define PAN_TXS_SYSVAL_ID(texidx, dim, is_array) \
+ ((texidx) | ((dim) << 7) | ((is_array) ? (1 << 9) : 0))
+
+#define PAN_SYSVAL_ID_TO_TXS_TEX_IDX(id) ((id) & 0x7f)
+#define PAN_SYSVAL_ID_TO_TXS_DIM(id) (((id) >> 7) & 0x3)
+#define PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(id) !!((id) & (1 << 9))
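+
+/* Worked example of the packing: texidx = 1, dim = 2, is_array = false gives
+ * PAN_SYSVAL(TEXTURE_SIZE, PAN_TXS_SYSVAL_ID(1, 2, false))
+ *   = ((1 | (2 << 7)) << 16) | PAN_SYSVAL_TEXTURE_SIZE
+ *   = 0x01010003,
+ * from which PAN_SYSVAL_TYPE() recovers 3 and PAN_SYSVAL_ID() recovers 0x101. */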
+
+typedef struct {
+ int work_register_count;
+ int uniform_count;
+ int uniform_cutoff;
+
+ int attribute_count;
+ int varying_count;
+
+ /* Prepended before uniforms, mapping to SYSVAL_ names for the
+ * sysval */
+
+ unsigned sysval_count;
+ unsigned sysvals[MAX_SYSVAL_COUNT];
+
+ unsigned varyings[32];
+
+ /* Boolean properties of the program */
+ bool can_discard;
+ bool writes_point_size;
+
+ int first_tag;
+
+ struct util_dynarray compiled;
+
+ /* For a blend shader using a constant color -- patch point. If
+ * negative, there's no constant. */
+
+ int blend_patch_offset;
+
+ /* IN: For a fragment shader with a lowered alpha test, the ref value */
+ float alpha_ref;
+} midgard_program;
+
+int
+midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_blend);
+
+/* NIR options are shared between the standalone compiler and the online
+ * compiler. Defining it here is the simplest, though maybe not the Right
+ * solution. */
+
+static const nir_shader_compiler_options midgard_nir_options = {
+ .lower_ffma = true,
+ .lower_sub = true,
+ .lower_scmp = true,
+ .lower_flrp32 = true,
+ .lower_flrp64 = true,
+ .lower_ffract = true,
+ .lower_fmod = true,
+ .lower_fdiv = true,
+ .lower_idiv = true,
+ .lower_isign = true,
+ .lower_fpow = true,
+ .lower_find_lsb = true,
+
+ .lower_wpos_pntc = true,
+
+ /* TODO: We have native ops to help here, which we'll want to look into
+ * eventually */
+ .lower_fsign = true,
+
+ .vertex_id_zero_based = true,
+ .lower_extract_byte = true,
+ .lower_extract_word = true,
+ .lower_rotate = true,
+
+ .lower_doubles_options = nir_lower_dmod,
+
+ .vectorize_io = true,
+};
+
+#endif
--- /dev/null
+/*
+ * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "compiler.h"
+#include "midgard_ops.h"
+
+/* Midgard IR only knows vector ALU types, but we sometimes need to actually
+ * use scalar ALU instructions, for functional or performance reasons. To do
+ * this, we just demote vector ALU payloads to scalar. */
+
+static int
+component_from_mask(unsigned mask)
+{
+ for (int c = 0; c < 8; ++c) {
+ if (mask & (1 << c))
+ return c;
+ }
+
+ assert(0);
+ return 0;
+}
+
+static unsigned
+vector_to_scalar_source(unsigned u, bool is_int, bool is_full)
+{
+ midgard_vector_alu_src v;
+ memcpy(&v, &u, sizeof(v));
+
+ /* TODO: Integers */
+
+ unsigned component = v.swizzle & 3;
+ bool upper = false; /* TODO */
+
+ midgard_scalar_alu_src s = { 0 };
+
+ if (is_full) {
+ /* For a 32-bit op, just check the source half flag */
+ s.full = !v.half;
+ } else if (!v.half) {
+ /* For a 16-bit op that's not subdivided, never full */
+ s.full = false;
+ } else {
+ /* We can't do 8-bit scalar, abort! */
+ assert(0);
+ }
+
+ /* Component indexing takes size into account */
+
+ if (s.full)
+ s.component = component << 1;
+ else
+ s.component = component + (upper << 2);
+
+ if (is_int) {
+ /* TODO */
+ } else {
+ s.abs = v.mod & MIDGARD_FLOAT_MOD_ABS;
+ s.negate = v.mod & MIDGARD_FLOAT_MOD_NEG;
+ }
+
+ unsigned o;
+ memcpy(&o, &s, sizeof(s));
+
+ return o & ((1 << 6) - 1);
+}
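+
+/* For instance, a full (32-bit) scalar source reading component .y has
+ * component = 1 and therefore s.component = 2, since full components appear to
+ * be addressed in 16-bit halves; a 16-bit source in the lower half would keep
+ * s.component = 1. */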
+
+static midgard_scalar_alu
+vector_to_scalar_alu(midgard_vector_alu v, midgard_instruction *ins)
+{
+ bool is_int = midgard_is_integer_op(v.op);
+ bool is_full = v.reg_mode == midgard_reg_mode_32;
+ bool is_inline_constant = ins->ssa_args.inline_constant;
+
+ /* The output component is from the mask */
+ midgard_scalar_alu s = {
+ .op = v.op,
+ .src1 = vector_to_scalar_source(v.src1, is_int, is_full),
+ .src2 = !is_inline_constant ? vector_to_scalar_source(v.src2, is_int, is_full) : 0,
+ .unknown = 0,
+ .outmod = v.outmod,
+ .output_full = is_full,
+ .output_component = component_from_mask(ins->mask),
+ };
+
+ /* Full components are physically spaced out */
+ if (is_full) {
+ assert(s.output_component < 4);
+ s.output_component <<= 1;
+ }
+
+ /* Inline constant is passed along rather than trying to extract it
+ * from v */
+
+ if (ins->ssa_args.inline_constant) {
+ uint16_t imm = 0;
+ int lower_11 = ins->inline_constant & ((1 << 12) - 1);
+ imm |= (lower_11 >> 9) & 3;
+ imm |= (lower_11 >> 6) & 4;
+ imm |= (lower_11 >> 2) & 0x38;
+ imm |= (lower_11 & 63) << 6;
+
+ s.src2 = imm;
+ }
+
+ return s;
+}
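+
+/* As written, the immediate scramble above rearranges the low 11 bits of the
+ * constant c as: imm[11:6] = c[5:0], imm[5:3] = c[7:5], imm[2] = c[8],
+ * imm[1:0] = c[10:9]. So, for example, an inline constant of 1 packs to 0x040
+ * and 0x7ff packs to 0xfff. (This only describes the code; the hardware-side
+ * meaning of the layout is not documented here.) */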
+
+static void
+emit_alu_bundle(compiler_context *ctx,
+ midgard_bundle *bundle,
+ struct util_dynarray *emission,
+ unsigned lookahead)
+{
+ /* Emit the control word */
+ util_dynarray_append(emission, uint32_t, bundle->control | lookahead);
+
+ /* Next up, emit register words */
+ for (unsigned i = 0; i < bundle->instruction_count; ++i) {
+ midgard_instruction *ins = bundle->instructions[i];
+
+ /* Check if this instruction has registers */
+ if (ins->compact_branch || ins->prepacked_branch) continue;
+
+ /* Otherwise, just emit the registers */
+ uint16_t reg_word = 0;
+ memcpy(&reg_word, &ins->registers, sizeof(uint16_t));
+ util_dynarray_append(emission, uint16_t, reg_word);
+ }
+
+ /* Now, we emit the body itself */
+ for (unsigned i = 0; i < bundle->instruction_count; ++i) {
+ midgard_instruction *ins = bundle->instructions[i];
+
+ /* Where is this body */
+ unsigned size = 0;
+ void *source = NULL;
+
+ /* In case we demote to a scalar */
+ midgard_scalar_alu scalarized;
+
+ if (ins->unit & UNITS_ANY_VECTOR) {
+ if (ins->alu.reg_mode == midgard_reg_mode_32)
+ ins->alu.mask = expand_writemask_32(ins->mask);
+ else
+ ins->alu.mask = ins->mask;
+
+ size = sizeof(midgard_vector_alu);
+ source = &ins->alu;
+ } else if (ins->unit == ALU_ENAB_BR_COMPACT) {
+ size = sizeof(midgard_branch_cond);
+ source = &ins->br_compact;
+ } else if (ins->compact_branch) { /* misnomer */
+ size = sizeof(midgard_branch_extended);
+ source = &ins->branch_extended;
+ } else {
+ size = sizeof(midgard_scalar_alu);
+ scalarized = vector_to_scalar_alu(ins->alu, ins);
+ source = &scalarized;
+ }
+
+ memcpy(util_dynarray_grow_bytes(emission, 1, size), source, size);
+ }
+
+ /* Emit padding (all zero) */
+ memset(util_dynarray_grow_bytes(emission, 1, bundle->padding), 0, bundle->padding);
+
+ /* Tack on constants */
+
+ if (bundle->has_embedded_constants) {
+ util_dynarray_append(emission, float, bundle->constants[0]);
+ util_dynarray_append(emission, float, bundle->constants[1]);
+ util_dynarray_append(emission, float, bundle->constants[2]);
+ util_dynarray_append(emission, float, bundle->constants[3]);
+ }
+}
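+
+/* So the emitted ALU bundle is laid out as: one 32-bit control word (with the
+ * lookahead tag OR'd in), then a 16-bit register word per non-branch
+ * instruction, then the packed vector/scalar/branch bodies, then padding, and
+ * finally the four 32-bit embedded constants, when present. */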
+
+/* After everything is scheduled, emit whole bundles at a time */
+
+void
+emit_binary_bundle(compiler_context *ctx,
+ midgard_bundle *bundle,
+ struct util_dynarray *emission,
+ int next_tag)
+{
+ int lookahead = next_tag << 4;
+
+ switch (bundle->tag) {
+ case TAG_ALU_4:
+ case TAG_ALU_8:
+ case TAG_ALU_12:
+ case TAG_ALU_16:
+ emit_alu_bundle(ctx, bundle, emission, lookahead);
+ break;
+
+ case TAG_LOAD_STORE_4: {
+ /* One or two composing instructions */
+
+ uint64_t current64, next64 = LDST_NOP;
+
+ /* Copy masks */
+
+ for (unsigned i = 0; i < bundle->instruction_count; ++i) {
+ bundle->instructions[i]->load_store.mask =
+ bundle->instructions[i]->mask;
+ }
+
+ memcpy(&current64, &bundle->instructions[0]->load_store, sizeof(current64));
+
+ if (bundle->instruction_count == 2)
+ memcpy(&next64, &bundle->instructions[1]->load_store, sizeof(next64));
+
+ midgard_load_store instruction = {
+ .type = bundle->tag,
+ .next_type = next_tag,
+ .word1 = current64,
+ .word2 = next64
+ };
+
+ util_dynarray_append(emission, midgard_load_store, instruction);
+
+ break;
+ }
+
+ case TAG_TEXTURE_4:
+ case TAG_TEXTURE_4_VTX: {
+ /* Texture instructions are easy, since there is no pipelining
+ * nor VLIW to worry about. We may need to set .cont/.last
+ * flags. */
+
+ midgard_instruction *ins = bundle->instructions[0];
+
+ ins->texture.type = bundle->tag;
+ ins->texture.next_type = next_tag;
+ ins->texture.mask = ins->mask;
+
+ ctx->texture_op_count--;
+
+ if (ins->texture.op == TEXTURE_OP_NORMAL) {
+ bool continues = ctx->texture_op_count > 0;
+ ins->texture.cont = continues;
+ ins->texture.last = !continues;
+ } else {
+ ins->texture.cont = ins->texture.last = 1;
+ }
+
+ util_dynarray_append(emission, midgard_texture_word, ins->texture);
+ break;
+ }
+
+ default:
+ unreachable("Unknown midgard instruction type\n");
+ }
+}
--- /dev/null
+/*
+ * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* mir_is_live_after performs liveness analysis on the MIR, used primarily
+ * as part of register allocation. TODO: Algorithmic improvements for
+ * compiler performance (this is the worst algorithm possible -- see
+ * backlog with Connor on IRC) */
+
+#include "compiler.h"
+
+static bool
+midgard_is_live_in_instr(midgard_instruction *ins, int src)
+{
+ if (ins->compact_branch)
+ return false;
+
+ if (ins->ssa_args.src0 == src)
+ return true;
+
+ if (!ins->ssa_args.inline_constant && ins->ssa_args.src1 == src)
+ return true;
+
+ return false;
+}
+
+/* Determine if a variable is live in the successors of a block */
+static bool
+is_live_after_successors(compiler_context *ctx, midgard_block *bl, int src)
+{
+ for (unsigned i = 0; i < bl->nr_successors; ++i) {
+ midgard_block *succ = bl->successors[i];
+
+ /* If we already visited, the value we're seeking
+ * isn't down this path (or we would have
+ * short-circuited) */
+
+ if (succ->visited) continue;
+
+ /* Otherwise (it's visited *now*), check the block */
+
+ succ->visited = true;
+
+ mir_foreach_instr_in_block(succ, ins) {
+ if (midgard_is_live_in_instr(ins, src))
+ return true;
+ }
+
+ /* ...and also, check *its* successors */
+ if (is_live_after_successors(ctx, succ, src))
+ return true;
+
+ }
+
+ /* Welp. We're really not live. */
+
+ return false;
+}
+
+bool
+mir_is_live_after(compiler_context *ctx, midgard_block *block, midgard_instruction *start, int src)
+{
+ /* Check the rest of the block for liveness */
+
+ mir_foreach_instr_in_block_from(block, ins, mir_next_op(start)) {
+ if (midgard_is_live_in_instr(ins, src))
+ return true;
+ }
+
+ /* Check the rest of the blocks for liveness recursively */
+
+ bool succ = is_live_after_successors(ctx, block, src);
+
+ mir_foreach_block(ctx, block) {
+ block->visited = false;
+ }
+
+ return succ;
+}
+
+/* Just a quick check -- is it written more than once? (I.e. are we definitely
+ * not SSA?) */
+
+bool
+mir_has_multiple_writes(compiler_context *ctx, int dest)
+{
+ unsigned write_count = 0;
+
+ mir_foreach_instr_global(ctx, ins) {
+ if (ins->ssa_args.dest == dest)
+ write_count++;
+ }
+
+ return write_count > 1;
+}
--- /dev/null
+#include <stdbool.h>
+#include "nir.h"
+
+bool midgard_nir_lower_algebraic_late(nir_shader *shader);
+bool midgard_nir_scale_trig(nir_shader *shader);
--- /dev/null
+#
+# Copyright (C) 2018 Alyssa Rosenzweig
+#
+# Copyright (C) 2016 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+import argparse
+import sys
+import math
+
+a = 'a'
+b = 'b'
+c = 'c'
+
+algebraic_late = [
+ # ineg must be lowered late, but only for integers; floats will try to
+ # have modifiers attached... hence why this has to be here rather than
+ # a more standard lower_negate approach
+
+ (('ineg', a), ('isub', 0, a)),
+
+ # These two special-cases save space/an op than the actual csel op +
+ # scheduler flexibility
+
+ (('b32csel', a, 'b@32', 0), ('iand', a, b)),
+ (('b32csel', a, 0, 'b@32'), ('iand', ('inot', a), b)),
+]
+
+
+# Midgard can only type-convert one width "step" at a time; if NIR wants more
+# than one step, we need to break the conversion into multiple instructions
+
+converts = [
+ (('i2i8', 'a@32'), ('i2i8', ('i2i16', a))),
+ (('u2u8', 'a@32'), ('u2u8', ('u2u16', a))),
+
+ (('i2i32', 'a@8'), ('i2i32', ('i2i16', a))),
+ (('u2u32', 'a@8'), ('u2u32', ('u2u16', a))),
+
+ (('f2i32', 'a@16'), ('f2i32', ('f2f32', a))),
+ (('f2u32', 'a@16'), ('f2u32', ('f2f32', a))),
+
+ # Totally redundant
+ (('~f2f16', ('f2f32', 'a@16')), a),
+
+ (('pack_half_2x16_split', 'a@32', 'b@32'), ('ior', ('ishl', ('i2i32', ('f2f16', b)), 16), ('i2i32', ('f2f16', a)))),
+]
+
+# Midgard scales fsin/fcos arguments by pi.
+# Pass must be run only once, after the main loop
+
+scale_trig = [
+ (('fsin', a), ('fsin', ('fdiv', a, math.pi))),
+ (('fcos', a), ('fcos', ('fdiv', a, math.pi))),
+]
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-p', '--import-path', required=True)
+ args = parser.parse_args()
+ sys.path.insert(0, args.import_path)
+ run()
+
+
+def run():
+ import nir_algebraic # pylint: disable=import-error
+
+ print('#include "midgard_nir.h"')
+
+ print(nir_algebraic.AlgebraicPass("midgard_nir_lower_algebraic_late",
+ algebraic_late + converts).render())
+
+ print(nir_algebraic.AlgebraicPass("midgard_nir_scale_trig",
+ scale_trig).render())
+
+
+if __name__ == '__main__':
+ main()
--- /dev/null
+/* Copyright (c) 2018-2019 Alyssa Rosenzweig (alyssa@rosenzweig.io)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "midgard.h"
+
+/* Include the definitions of the macros and such */
+
+#define MIDGARD_OPS_TABLE
+#include "helpers.h"
+#undef MIDGARD_OPS_TABLE
+
+/* Table of mapping opcodes to accompanying properties. This is used for both
+ * the disassembler and the compiler. It is placed in a .c file like this to
+ * avoid duplications in the binary */
+
+struct mir_op_props alu_opcode_props[256] = {
+ [midgard_alu_op_fadd] = {"fadd", UNITS_ADD | OP_COMMUTES},
+ [midgard_alu_op_fmul] = {"fmul", UNITS_MUL | UNIT_VLUT | OP_COMMUTES},
+ [midgard_alu_op_fmin] = {"fmin", UNITS_MUL | UNITS_ADD | OP_COMMUTES},
+ [midgard_alu_op_fmax] = {"fmax", UNITS_MUL | UNITS_ADD | OP_COMMUTES},
+ [midgard_alu_op_imin] = {"imin", UNITS_MOST | OP_COMMUTES},
+ [midgard_alu_op_imax] = {"imax", UNITS_MOST | OP_COMMUTES},
+ [midgard_alu_op_umin] = {"umin", UNITS_MOST | OP_COMMUTES},
+ [midgard_alu_op_umax] = {"umax", UNITS_MOST | OP_COMMUTES},
+ [midgard_alu_op_ihadd] = {"ihadd", UNITS_ADD | OP_COMMUTES},
+ [midgard_alu_op_uhadd] = {"uhadd", UNITS_ADD | OP_COMMUTES},
+ [midgard_alu_op_irhadd] = {"irhadd", UNITS_ADD | OP_COMMUTES},
+ [midgard_alu_op_urhadd] = {"urhadd", UNITS_ADD | OP_COMMUTES},
+
+ [midgard_alu_op_fmov] = {"fmov", UNITS_ALL | QUIRK_FLIPPED_R24},
+ [midgard_alu_op_fmov_rtz] = {"fmov_rtz", UNITS_ALL | QUIRK_FLIPPED_R24},
+ [midgard_alu_op_fmov_rtn] = {"fmov_rtn", UNITS_ALL | QUIRK_FLIPPED_R24},
+ [midgard_alu_op_fmov_rtp] = {"fmov_rtp", UNITS_ALL | QUIRK_FLIPPED_R24},
+ [midgard_alu_op_fround] = {"fround", UNITS_ADD},
+ [midgard_alu_op_froundeven] = {"froundeven", UNITS_ADD},
+ [midgard_alu_op_ftrunc] = {"ftrunc", UNITS_ADD},
+ [midgard_alu_op_ffloor] = {"ffloor", UNITS_ADD},
+ [midgard_alu_op_fceil] = {"fceil", UNITS_ADD},
+ [midgard_alu_op_ffma] = {"ffma", UNIT_VLUT},
+
+ /* Though they output a scalar, they need to run on a vector unit
+ * since they process vectors */
+ [midgard_alu_op_fdot3] = {"fdot3", UNIT_VMUL | OP_CHANNEL_COUNT(3) | OP_COMMUTES},
+ [midgard_alu_op_fdot3r] = {"fdot3r", UNIT_VMUL | OP_CHANNEL_COUNT(3) | OP_COMMUTES},
+ [midgard_alu_op_fdot4] = {"fdot4", UNIT_VMUL | OP_CHANNEL_COUNT(4) | OP_COMMUTES},
+
+ /* Incredibly, iadd can run on vmul, etc */
+ [midgard_alu_op_iadd] = {"iadd", UNITS_MOST | OP_COMMUTES},
+ [midgard_alu_op_iaddsat] = {"iaddsat", UNITS_ADD | OP_COMMUTES},
+ [midgard_alu_op_uaddsat] = {"uaddsat", UNITS_ADD | OP_COMMUTES},
+ [midgard_alu_op_iabsdiff] = {"iabsdiff", UNITS_ADD},
+ [midgard_alu_op_uabsdiff] = {"uabsdiff", UNITS_ADD},
+ [midgard_alu_op_ichoose] = {"ichoose", UNITS_ADD},
+ [midgard_alu_op_isub] = {"isub", UNITS_MOST},
+ [midgard_alu_op_isubsat] = {"isubsat", UNITS_MOST},
+ [midgard_alu_op_usubsat] = {"usubsat", UNITS_MOST},
+ [midgard_alu_op_imul] = {"imul", UNITS_MUL | OP_COMMUTES},
+ [midgard_alu_op_imov] = {"imov", UNITS_MOST | QUIRK_FLIPPED_R24},
+
+ /* For vector comparisons, use ball etc */
+ [midgard_alu_op_feq] = {"feq", UNITS_MOST | OP_TYPE_CONVERT | OP_COMMUTES},
+ [midgard_alu_op_fne] = {"fne", UNITS_MOST | OP_TYPE_CONVERT | OP_COMMUTES},
+ [midgard_alu_op_fle] = {"fle", UNITS_MOST | OP_TYPE_CONVERT},
+ [midgard_alu_op_flt] = {"flt", UNITS_MOST | OP_TYPE_CONVERT},
+ [midgard_alu_op_ieq] = {"ieq", UNITS_MOST | OP_COMMUTES},
+ [midgard_alu_op_ine] = {"ine", UNITS_MOST | OP_COMMUTES},
+ [midgard_alu_op_ilt] = {"ilt", UNITS_MOST},
+ [midgard_alu_op_ile] = {"ile", UNITS_MOST},
+ [midgard_alu_op_ult] = {"ult", UNITS_MOST},
+ [midgard_alu_op_ule] = {"ule", UNITS_MOST},
+
+ [midgard_alu_op_icsel] = {"icsel", UNITS_ADD},
+ [midgard_alu_op_icsel_v] = {"icsel_v", UNITS_ADD}, /* Acts as bitselect() */
+ [midgard_alu_op_fcsel_v] = {"fcsel_v", UNITS_ADD},
+ [midgard_alu_op_fcsel] = {"fcsel", UNITS_ADD | UNIT_SMUL},
+
+ [midgard_alu_op_frcp] = {"frcp", UNIT_VLUT},
+ [midgard_alu_op_frsqrt] = {"frsqrt", UNIT_VLUT},
+ [midgard_alu_op_fsqrt] = {"fsqrt", UNIT_VLUT},
+ [midgard_alu_op_fpow_pt1] = {"fpow_pt1", UNIT_VLUT},
+ [midgard_alu_op_fpown_pt1] = {"fpown_pt1", UNIT_VLUT},
+ [midgard_alu_op_fpowr_pt1] = {"fpowr_pt1", UNIT_VLUT},
+ [midgard_alu_op_fexp2] = {"fexp2", UNIT_VLUT},
+ [midgard_alu_op_flog2] = {"flog2", UNIT_VLUT},
+
+ [midgard_alu_op_f2i_rte] = {"f2i_rte", UNITS_ADD | OP_TYPE_CONVERT},
+ [midgard_alu_op_f2i_rtz] = {"f2i_rtz", UNITS_ADD | OP_TYPE_CONVERT},
+ [midgard_alu_op_f2i_rtn] = {"f2i_rtn", UNITS_ADD | OP_TYPE_CONVERT},
+ [midgard_alu_op_f2i_rtp] = {"f2i_rtp", UNITS_ADD | OP_TYPE_CONVERT},
+ [midgard_alu_op_f2u_rte] = {"f2u_rte", UNITS_ADD | OP_TYPE_CONVERT},
+ [midgard_alu_op_f2u_rtz] = {"f2u_rtz", UNITS_ADD | OP_TYPE_CONVERT},
+ [midgard_alu_op_f2u_rtn] = {"f2u_rtn", UNITS_ADD | OP_TYPE_CONVERT},
+ [midgard_alu_op_f2u_rtp] = {"f2u_rtp", UNITS_ADD | OP_TYPE_CONVERT},
+ [midgard_alu_op_i2f_rte] = {"i2f", UNITS_ADD | OP_TYPE_CONVERT},
+ [midgard_alu_op_i2f_rtz] = {"i2f_rtz", UNITS_ADD | OP_TYPE_CONVERT},
+ [midgard_alu_op_i2f_rtn] = {"i2f_rtn", UNITS_ADD | OP_TYPE_CONVERT},
+ [midgard_alu_op_i2f_rtp] = {"i2f_rtp", UNITS_ADD | OP_TYPE_CONVERT},
+ [midgard_alu_op_u2f_rte] = {"u2f", UNITS_ADD | OP_TYPE_CONVERT},
+ [midgard_alu_op_u2f_rtz] = {"u2f_rtz", UNITS_ADD | OP_TYPE_CONVERT},
+ [midgard_alu_op_u2f_rtn] = {"u2f_rtn", UNITS_ADD | OP_TYPE_CONVERT},
+ [midgard_alu_op_u2f_rtp] = {"u2f_rtp", UNITS_ADD | OP_TYPE_CONVERT},
+
+ [midgard_alu_op_fsin] = {"fsin", UNIT_VLUT},
+ [midgard_alu_op_fcos] = {"fcos", UNIT_VLUT},
+
+ /* XXX: Test case where it's right on smul but not sadd */
+ [midgard_alu_op_iand] = {"iand", UNITS_MOST | OP_COMMUTES},
+ [midgard_alu_op_iandnot] = {"iandnot", UNITS_MOST},
+
+ [midgard_alu_op_ior] = {"ior", UNITS_MOST | OP_COMMUTES},
+ [midgard_alu_op_iornot] = {"iornot", UNITS_MOST | OP_COMMUTES},
+ [midgard_alu_op_inor] = {"inor", UNITS_MOST | OP_COMMUTES},
+ [midgard_alu_op_ixor] = {"ixor", UNITS_MOST | OP_COMMUTES},
+ [midgard_alu_op_inxor] = {"inxor", UNITS_MOST | OP_COMMUTES},
+ [midgard_alu_op_iclz] = {"iclz", UNITS_ADD},
+ [midgard_alu_op_ibitcount8] = {"ibitcount8", UNITS_ADD},
+ [midgard_alu_op_inand] = {"inand", UNITS_MOST},
+ [midgard_alu_op_ishl] = {"ishl", UNITS_ADD},
+ [midgard_alu_op_iasr] = {"iasr", UNITS_ADD},
+ [midgard_alu_op_ilsr] = {"ilsr", UNITS_ADD},
+
+ [midgard_alu_op_fball_eq] = {"fball_eq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES},
+ [midgard_alu_op_fbany_neq] = {"fbany_neq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES},
+ [midgard_alu_op_iball_eq] = {"iball_eq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES},
+ [midgard_alu_op_iball_neq] = {"iball_neq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES},
+ [midgard_alu_op_ibany_eq] = {"ibany_eq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES},
+ [midgard_alu_op_ibany_neq] = {"ibany_neq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES},
+
+ /* These instructions are not yet emitted by the compiler, so
+ * don't speculate about units yet */
+ [midgard_alu_op_ishladd] = {"ishladd", 0},
+
+ [midgard_alu_op_uball_lt] = {"uball_lt", 0},
+ [midgard_alu_op_uball_lte] = {"uball_lte", 0},
+ [midgard_alu_op_iball_lt] = {"iball_lt", 0},
+ [midgard_alu_op_iball_lte] = {"iball_lte", 0},
+ [midgard_alu_op_ubany_lt] = {"ubany_lt", 0},
+ [midgard_alu_op_ubany_lte] = {"ubany_lte", 0},
+ [midgard_alu_op_ibany_lt] = {"ibany_lt", 0},
+ [midgard_alu_op_ibany_lte] = {"ibany_lte", 0},
+
+ [midgard_alu_op_freduce] = {"freduce", 0},
+ [midgard_alu_op_bball_eq] = {"bball_eq", 0 | OP_CHANNEL_COUNT(4) | OP_COMMUTES},
+ [midgard_alu_op_bbany_neq] = {"bbany_neq", 0 | OP_CHANNEL_COUNT(4) | OP_COMMUTES},
+ [midgard_alu_op_fatan2_pt1] = {"fatan2_pt1", 0},
+ [midgard_alu_op_fatan_pt2] = {"fatan_pt2", 0},
+};
+
+const char *load_store_opcode_names[256] = {
+ [midgard_op_st_cubemap_coords] = "st_cubemap_coords",
+ [midgard_op_ld_global_id] = "ld_global_id",
+ [midgard_op_ldst_perspective_division_z] = "ldst_perspective_division_z",
+ [midgard_op_ldst_perspective_division_w] = "ldst_perspective_division_w",
+
+ [midgard_op_atomic_add] = "atomic_add",
+ [midgard_op_atomic_and] = "atomic_and",
+ [midgard_op_atomic_or] = "atomic_or",
+ [midgard_op_atomic_xor] = "atomic_xor",
+ [midgard_op_atomic_imin] = "atomic_imin",
+ [midgard_op_atomic_umin] = "atomic_umin",
+ [midgard_op_atomic_imax] = "atomic_imax",
+ [midgard_op_atomic_umax] = "atomic_umax",
+ [midgard_op_atomic_xchg] = "atomic_xchg",
+
+ [midgard_op_ld_char] = "ld_char",
+ [midgard_op_ld_char2] = "ld_char2",
+ [midgard_op_ld_short] = "ld_short",
+ [midgard_op_ld_char4] = "ld_char4",
+ [midgard_op_ld_short4] = "ld_short4",
+ [midgard_op_ld_int4] = "ld_int4",
+
+ [midgard_op_ld_attr_32] = "ld_attr_32",
+ [midgard_op_ld_attr_16] = "ld_attr_16",
+ [midgard_op_ld_attr_32i] = "ld_attr_32i",
+ [midgard_op_ld_attr_32u] = "ld_attr_32u",
+
+ [midgard_op_ld_vary_32] = "ld_vary_32",
+ [midgard_op_ld_vary_16] = "ld_vary_16",
+ [midgard_op_ld_vary_32i] = "ld_vary_32i",
+ [midgard_op_ld_vary_32u] = "ld_vary_32u",
+
+ [midgard_op_ld_color_buffer_16] = "ld_color_buffer_16",
+
+ [midgard_op_ld_uniform_16] = "ld_uniform_16",
+ [midgard_op_ld_uniform_32] = "ld_uniform_32",
+ [midgard_op_ld_uniform_32i] = "ld_uniform_32i",
+ [midgard_op_ld_color_buffer_8] = "ld_color_buffer_8",
+
+ [midgard_op_st_char] = "st_char",
+ [midgard_op_st_char2] = "st_char2",
+ [midgard_op_st_char4] = "st_char4",
+ [midgard_op_st_short4] = "st_short4",
+ [midgard_op_st_int4] = "st_int4",
+
+ [midgard_op_st_vary_32] = "st_vary_32",
+ [midgard_op_st_vary_16] = "st_vary_16",
+ [midgard_op_st_vary_32i] = "st_vary_32i",
+ [midgard_op_st_vary_32u] = "st_vary_32u",
+
+ [midgard_op_st_image_f] = "st_image_f",
+ [midgard_op_st_image_ui] = "st_image_ui",
+ [midgard_op_st_image_i] = "st_image_i",
+};
--- /dev/null
+/* Copyright (c) 2018-2019 Alyssa Rosenzweig (alyssa@rosenzweig.io)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "helpers.h"
+
+/* Forward declare */
+
+extern struct mir_op_props alu_opcode_props[256];
+extern const char *load_store_opcode_names[256];
+
+/* Is this the opcode of an integer operation (regardless of signedness)?
+ * Instruction names authoritatively determine types */
+
+static inline bool
+midgard_is_integer_op(int op)
+{
+ const char *name = alu_opcode_props[op].name;
+
+ if (!name)
+ return false;
+
+ return (name[0] == 'i') || (name[0] == 'u');
+}
+
+/* Does this opcode *write* an integer? Same as is_integer_op, unless it's a
+ * conversion between int<->float in which case we do the opposite */
+
+static inline bool
+midgard_is_integer_out_op(int op)
+{
+ bool is_int = midgard_is_integer_op(op);
+ bool is_conversion = alu_opcode_props[op].props & OP_TYPE_CONVERT;
+
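+ /* For example, f2i reads a float but writes an integer: is_int is false
+ * and is_conversion is true, so the XOR yields true. Conversely, i2f
+ * writes a float, so the XOR yields false */
+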
+ return is_int ^ is_conversion;
+}
+
+/* Determines effective writemask, taking quirks and expansion into account */
+
+static inline unsigned
+effective_writemask(midgard_vector_alu *alu, unsigned existing_mask)
+{
+ /* Channel count is stored off-by-one so it fits in two bits (a channel
+ * count of zero makes no sense) */
+
+ unsigned channel_count = GET_CHANNEL_COUNT(alu_opcode_props[alu->op].props);
+
+ /* If there is a fixed channel count, construct the appropriate mask */
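+ /* For example, fdot3 is tagged OP_CHANNEL_COUNT(3); assuming
+ * GET_CHANNEL_COUNT recovers the true count, the effective writemask is
+ * 0x7 (xyz) regardless of the existing mask */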
+
+ if (channel_count)
+ return (1 << channel_count) - 1;
+
+ return existing_mask;
+}
+
+
--- /dev/null
+/*
+ * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "compiler.h"
+#include "helpers.h"
+#include "midgard_ops.h"
+
+/* Pretty printer for Midgard IR, for use debugging compiler-internal
+ * passes like register allocation. The output superficially resembles
+ * Midgard assembly, with the exception that unit information and such is
+ * (normally) omitted, and generic indices are usually used instead of
+ * registers */
+
+static void
+mir_print_source(int source)
+{
+ if (source >= SSA_FIXED_MINIMUM) {
+ /* Specific register */
+ int reg = SSA_REG_FROM_FIXED(source);
+
+ /* TODO: Moving threshold */
+ if (reg > 16 && reg < 24)
+ printf("u%d", 23 - reg);
+ else
+ printf("r%d", reg);
+ } else {
+ printf("%d", source);
+ }
+}
+
+void
+mir_print_instruction(midgard_instruction *ins)
+{
+ printf("\t");
+
+ switch (ins->type) {
+ case TAG_ALU_4: {
+ midgard_alu_op op = ins->alu.op;
+ const char *name = alu_opcode_props[op].name;
+
+ if (ins->unit)
+ printf("%d.", ins->unit);
+
+ printf("%s", name ? name : "??");
+ break;
+ }
+
+ case TAG_LOAD_STORE_4: {
+ midgard_load_store_op op = ins->load_store.op;
+ const char *name = load_store_opcode_names[op];
+
+ assert(name);
+ printf("%s", name);
+ break;
+ }
+
+ case TAG_TEXTURE_4: {
+ printf("texture");
+ break;
+ }
+
+ default:
+ assert(0);
+ }
+
+ ssa_args *args = &ins->ssa_args;
+
+ printf(" %d, ", args->dest);
+
+ mir_print_source(args->src0);
+ printf(", ");
+
+ if (args->inline_constant)
+ printf("#%d", ins->inline_constant);
+ else
+ mir_print_source(args->src1);
+
+ if (ins->has_constants)
+ printf(" <%f, %f, %f, %f>", ins->constants[0], ins->constants[1], ins->constants[2], ins->constants[3]);
+
+ printf("\n");
+}
+
+/* Dumps MIR for a block or the entire shader, respectively */
+
+void
+mir_print_block(midgard_block *block)
+{
+ printf("{\n");
+
+ mir_foreach_instr_in_block(block, ins) {
+ mir_print_instruction(ins);
+ }
+
+ printf("}\n");
+}
+
+void
+mir_print_shader(compiler_context *ctx)
+{
+ mir_foreach_block(ctx, block) {
+ mir_print_block(block);
+ }
+}
+
+void
+mir_print_bundle(midgard_bundle *bundle)
+{
+ printf("[\n");
+
+ for (unsigned i = 0; i < bundle->instruction_count; ++i) {
+ midgard_instruction *ins = bundle->instructions[i];
+ mir_print_instruction(ins);
+ }
+
+ printf("]\n");
+}
--- /dev/null
+/*
+ * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
+ * Copyright (C) 2019 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "compiler.h"
+#include "midgard_ops.h"
+#include "util/register_allocate.h"
+#include "util/u_math.h"
+
+/* For work registers, we can subdivide in various ways. So we create
+ * classes for the various sizes and conflict accordingly, keeping in
+ * mind that physical registers are divided along 128-bit boundaries.
+ * The important constraint is that a single allocation never crosses a
+ * 128-bit boundary.
+ *
+ * Each 128-bit register can be subdivided into 32-bit slices in 10 ways:
+ *
+ * vec4: xyzw
+ * vec3: xyz, yzw
+ * vec2: xy, yz, zw
+ * vec1: x, y, z, w
+ *
+ * Each 64-bit half could be subdivided similarly for 16-bit values
+ * (TODO: half-float RA; not that we support fp16 yet)
+ */
+
+#define WORK_STRIDE 10
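+
+/* 10 = one vec4 + two vec3 + three vec2 + four vec1 slices per 128-bit
+ * register, per the table above */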
+
+/* Prepacked masks/swizzles for virtual register types */
+static unsigned reg_type_to_mask[WORK_STRIDE] = {
+ 0xF, /* xyzw */
+ 0x7, 0x7 << 1, /* xyz */
+ 0x3, 0x3 << 1, 0x3 << 2, /* xy */
+ 0x1, 0x1 << 1, 0x1 << 2, 0x1 << 3 /* x */
+};
+
+static unsigned reg_type_to_swizzle[WORK_STRIDE] = {
+ SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
+
+ SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
+ SWIZZLE(COMPONENT_Y, COMPONENT_Z, COMPONENT_W, COMPONENT_W),
+
+ SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
+ SWIZZLE(COMPONENT_Y, COMPONENT_Z, COMPONENT_Z, COMPONENT_W),
+ SWIZZLE(COMPONENT_Z, COMPONENT_W, COMPONENT_Z, COMPONENT_W),
+
+ SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
+ SWIZZLE(COMPONENT_Y, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
+ SWIZZLE(COMPONENT_Z, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
+ SWIZZLE(COMPONENT_W, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
+};
+
+struct phys_reg {
+ unsigned reg;
+ unsigned mask;
+ unsigned swizzle;
+};
+
+/* Given the mask/swizzle of both the register and the original source,
+ * compose to find the actual mask/swizzle to give the hardware */
+
+static unsigned
+compose_writemask(unsigned mask, struct phys_reg reg)
+{
+ /* Note: the reg mask is guaranteed to be contiguous. So we shift
+ * into the X place, compose via a simple AND, and shift back */
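+
+ /* For example, with reg.mask = 0x6 (yz) and mask = 0x1 (x), shift is 1,
+ * so the result is 0x2 -- the write lands in the y slot */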
+
+ unsigned shift = __builtin_ctz(reg.mask);
+ return ((reg.mask >> shift) & mask) << shift;
+}
+
+static unsigned
+compose_swizzle(unsigned swizzle, unsigned mask,
+ struct phys_reg reg, struct phys_reg dst)
+{
+ unsigned out = pan_compose_swizzle(swizzle, reg.swizzle);
+
+ /* Based on the register mask, we need to adjust over. E.g if we're
+ * writing to yz, a base swizzle of xy__ becomes _xy_. Save the
+ * original first component (x). But to prevent duplicate shifting
+ * (only applies to ALU -- mask param is set to xyzw out on L/S to
+ * prevent changes), we have to account for the shift inherent to the
+ * original writemask */
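+ /* For example (illustrative): writing yz (dst.mask = 0x6) from an
+ * original writemask of xy (mask = 0x3) gives shift = 1, so a composed
+ * swizzle of xy__ becomes xxy_, with the gap filled by replicating the
+ * first component */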
+
+ unsigned rep = out & 0x3;
+ unsigned shift = __builtin_ctz(dst.mask) - __builtin_ctz(mask);
+ unsigned shifted = out << (2*shift);
+
+ /* ..but we fill in the gaps so it appears to replicate */
+
+ for (unsigned s = 0; s < shift; ++s)
+ shifted |= rep << (2*s);
+
+ return shifted;
+}
+
+/* When we're 'squeezing down' the values in the IR, we maintain a hash
+ * as such */
+
+static unsigned
+find_or_allocate_temp(compiler_context *ctx, unsigned hash)
+{
+ if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM))
+ return hash;
+
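+ /* Values are stored with a +1 bias so that a NULL search result
+ * unambiguously means the hash has not been seen yet */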
+ unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
+ ctx->hash_to_temp, hash + 1);
+
+ if (temp)
+ return temp - 1;
+
+ /* If no temp is found, allocate one */
+ temp = ctx->temp_count++;
+ ctx->max_hash = MAX2(ctx->max_hash, hash);
+
+ _mesa_hash_table_u64_insert(ctx->hash_to_temp,
+ hash + 1, (void *) ((uintptr_t) temp + 1));
+
+ return temp;
+}
+
+/* Callback for register allocation selection, trivial default for now */
+
+static unsigned int
+midgard_ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data)
+{
+ /* Choose the first available register to minimise register pressure */
+
+ for (int i = 0; i < (16 * WORK_STRIDE); ++i) {
+ if (BITSET_TEST(regs, i)) {
+ return i;
+ }
+ }
+
+ assert(0);
+ return 0;
+}
+
+/* Helper to return the default phys_reg for a given register */
+
+static struct phys_reg
+default_phys_reg(int reg)
+{
+ struct phys_reg r = {
+ .reg = reg,
+ .mask = 0xF, /* xyzw */
+ .swizzle = 0xE4 /* xyzw */
+ };
+
+ return r;
+}
+
+/* Determine which physical register, swizzle, and mask a virtual
+ * register corresponds to */
+
+static struct phys_reg
+index_to_reg(compiler_context *ctx, struct ra_graph *g, int reg)
+{
+ /* Check for special cases */
+ if (reg >= SSA_FIXED_MINIMUM)
+ return default_phys_reg(SSA_REG_FROM_FIXED(reg));
+ else if ((reg < 0) || !g)
+ return default_phys_reg(REGISTER_UNUSED);
+
+ /* Special cases aside, we pick the underlying register */
+ int virt = ra_get_node_reg(g, reg);
+
+ /* Divide out the register and classification */
+ int phys = virt / WORK_STRIDE;
+ int type = virt % WORK_STRIDE;
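+
+ /* For instance, with WORK_STRIDE = 10, virtual register 23 maps to
+ * physical register 2 with type 3, i.e. the xy (vec2) slice */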
+
+ struct phys_reg r = {
+ .reg = phys,
+ .mask = reg_type_to_mask[type],
+ .swizzle = reg_type_to_swizzle[type]
+ };
+
+ /* Report that we actually use this register, and return it */
+ ctx->work_registers = MAX2(ctx->work_registers, phys);
+ return r;
+}
+
+/* This routine performs the actual register allocation. It should be followed
+ * by install_registers */
+
+struct ra_graph *
+allocate_registers(compiler_context *ctx)
+{
+ /* The number of vec4 work registers available depends on when the
+ * uniforms start, so compute that first */
+
+ int work_count = 16 - MAX2((ctx->uniform_cutoff - 8), 0);
+
+ int virtual_count = work_count * WORK_STRIDE;
+
+ /* First, initialize the RA */
+ struct ra_regs *regs = ra_alloc_reg_set(NULL, virtual_count, true);
+
+ int work_vec4 = ra_alloc_reg_class(regs);
+ int work_vec3 = ra_alloc_reg_class(regs);
+ int work_vec2 = ra_alloc_reg_class(regs);
+ int work_vec1 = ra_alloc_reg_class(regs);
+
+ unsigned classes[4] = {
+ work_vec1,
+ work_vec2,
+ work_vec3,
+ work_vec4
+ };
+
+ /* Add the full set of work registers */
+ for (unsigned i = 0; i < work_count; ++i) {
+ int base = WORK_STRIDE * i;
+
+ /* Build a full set of subdivisions */
+ ra_class_add_reg(regs, work_vec4, base);
+ ra_class_add_reg(regs, work_vec3, base + 1);
+ ra_class_add_reg(regs, work_vec3, base + 2);
+ ra_class_add_reg(regs, work_vec2, base + 3);
+ ra_class_add_reg(regs, work_vec2, base + 4);
+ ra_class_add_reg(regs, work_vec2, base + 5);
+ ra_class_add_reg(regs, work_vec1, base + 6);
+ ra_class_add_reg(regs, work_vec1, base + 7);
+ ra_class_add_reg(regs, work_vec1, base + 8);
+ ra_class_add_reg(regs, work_vec1, base + 9);
+
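+ /* Mark overlapping slices of the same 128-bit register as
+ * conflicting. For example, xy (base + 3) conflicts with x (base + 6)
+ * and y (base + 7), but not with z (base + 8) or w (base + 9) */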
+ for (unsigned a = 0; a < 10; ++a) {
+ unsigned mask1 = reg_type_to_mask[a];
+
+ for (unsigned b = 0; b < 10; ++b) {
+ unsigned mask2 = reg_type_to_mask[b];
+
+ if (mask1 & mask2)
+ ra_add_reg_conflict(regs,
+ base + a, base + b);
+ }
+ }
+ }
+
+ /* We're done setting up */
+ ra_set_finalize(regs, NULL);
+
+ /* Transform the MIR into squeezed index form */
+ mir_foreach_block(ctx, block) {
+ mir_foreach_instr_in_block(block, ins) {
+ if (ins->compact_branch) continue;
+
+ ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest);
+ ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0);
+
+ if (!ins->ssa_args.inline_constant)
+ ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1);
+
+ }
+ }
+
+ /* No register allocation to do with no SSA */
+
+ if (!ctx->temp_count)
+ return NULL;
+
+ /* Let's actually do register allocation */
+ int nodes = ctx->temp_count;
+ struct ra_graph *g = ra_alloc_interference_graph(regs, nodes);
+
+ /* Determine minimum size needed to hold values, to indirectly
+ * determine class */
+
+ unsigned *found_class = calloc(sizeof(unsigned), ctx->temp_count);
+
+ mir_foreach_block(ctx, block) {
+ mir_foreach_instr_in_block(block, ins) {
+ if (ins->compact_branch) continue;
+ if (ins->ssa_args.dest < 0) continue;
+ if (ins->ssa_args.dest >= SSA_FIXED_MINIMUM) continue;
+
+ int class = util_logbase2(ins->mask) + 1;
+
+ /* Use the largest class if there's ambiguity, this
+ * handles partial writes */
+
+ int dest = ins->ssa_args.dest;
+ found_class[dest] = MAX2(found_class[dest], class);
+ }
+ }
+
+ for (unsigned i = 0; i < ctx->temp_count; ++i) {
+ unsigned class = found_class[i];
+ if (!class) continue;
+ ra_set_node_class(g, i, classes[class - 1]);
+ }
+
+ /* Determine liveness */
+
+ int *live_start = malloc(nodes * sizeof(int));
+ int *live_end = malloc(nodes * sizeof(int));
+
+ /* Initialize as non-existent */
+
+ for (int i = 0; i < nodes; ++i) {
+ live_start[i] = live_end[i] = -1;
+ }
+
+ int d = 0;
+
+ mir_foreach_block(ctx, block) {
+ mir_foreach_instr_in_block(block, ins) {
+ if (ins->compact_branch) continue;
+
+ /* Dest is < 0 for st_vary instructions, which break
+ * the usual SSA conventions. Liveness analysis doesn't
+ * make sense on these instructions, so skip them to
+ * avoid memory corruption */
+
+ if (ins->ssa_args.dest < 0) continue;
+
+ if (ins->ssa_args.dest < SSA_FIXED_MINIMUM) {
+ /* If this destination is not yet live, it is
+ * now since we just wrote it */
+
+ int dest = ins->ssa_args.dest;
+
+ if (live_start[dest] == -1)
+ live_start[dest] = d;
+ }
+
+ /* Since we just used a source, the source might be
+ * dead now. Scan the rest of the block for
+ * invocations, and if there are none, the source dies
+ * */
+
+ int sources[2] = {
+ ins->ssa_args.src0, ins->ssa_args.src1
+ };
+
+ for (int src = 0; src < 2; ++src) {
+ int s = sources[src];
+
+ if (s < 0) continue;
+
+ if (s >= SSA_FIXED_MINIMUM) continue;
+
+ if (!mir_is_live_after(ctx, block, ins, s)) {
+ live_end[s] = d;
+ }
+ }
+
+ ++d;
+ }
+ }
+
+ /* If a node still hasn't been killed, kill it now */
+
+ for (int i = 0; i < nodes; ++i) {
+ /* live_start == -1 most likely indicates a pinned output */
+
+ if (live_end[i] == -1)
+ live_end[i] = d;
+ }
+
+ /* Setup interference between nodes that are live at the same time */
+
+ for (int i = 0; i < nodes; ++i) {
+ for (int j = i + 1; j < nodes; ++j) {
+ bool j_overlaps_i = live_start[j] < live_end[i];
+ bool i_overlaps_j = live_end[j] < live_start[i];
+
+ if (i_overlaps_j || j_overlaps_i)
+ ra_add_node_interference(g, i, j);
+ }
+ }
+
+ ra_set_select_reg_callback(g, midgard_ra_select_callback, NULL);
+
+ if (!ra_allocate(g)) {
+ unreachable("Error allocating registers\n");
+ }
+
+ /* Cleanup */
+ free(live_start);
+ free(live_end);
+
+ return g;
+}
+
+/* Once registers have been decided via register allocation
+ * (allocate_registers), we need to rewrite the MIR to use registers instead of
+ * indices */
+
+static void
+install_registers_instr(
+ compiler_context *ctx,
+ struct ra_graph *g,
+ midgard_instruction *ins)
+{
+ ssa_args args = ins->ssa_args;
+
+ switch (ins->type) {
+ case TAG_ALU_4: {
+ int adjusted_src = args.inline_constant ? -1 : args.src1;
+ struct phys_reg src1 = index_to_reg(ctx, g, args.src0);
+ struct phys_reg src2 = index_to_reg(ctx, g, adjusted_src);
+ struct phys_reg dest = index_to_reg(ctx, g, args.dest);
+
+ unsigned uncomposed_mask = ins->mask;
+ ins->mask = compose_writemask(uncomposed_mask, dest);
+
+ /* Adjust the dest mask if necessary. Mostly this is a no-op
+ * but it matters for dot products */
+ dest.mask = effective_writemask(&ins->alu, ins->mask);
+
+ midgard_vector_alu_src mod1 =
+ vector_alu_from_unsigned(ins->alu.src1);
+ mod1.swizzle = compose_swizzle(mod1.swizzle, uncomposed_mask, src1, dest);
+ ins->alu.src1 = vector_alu_srco_unsigned(mod1);
+
+ ins->registers.src1_reg = src1.reg;
+
+ ins->registers.src2_imm = args.inline_constant;
+
+ if (args.inline_constant) {
+ /* Encode inline 16-bit constant. See disassembler for
+ * where the algorithm is from */
+
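+ /* For example (purely illustrative), an inline constant of
+ * 0x1234 yields src2_reg = 0x2 and a repacked immediate of
+ * 0x1A2, so alu.src2 = 0x688 */
+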
+ ins->registers.src2_reg = ins->inline_constant >> 11;
+
+ int lower_11 = ins->inline_constant & ((1 << 12) - 1);
+ uint16_t imm = ((lower_11 >> 8) & 0x7) |
+ ((lower_11 & 0xFF) << 3);
+
+ ins->alu.src2 = imm << 2;
+ } else {
+ midgard_vector_alu_src mod2 =
+ vector_alu_from_unsigned(ins->alu.src2);
+ mod2.swizzle = compose_swizzle(
+ mod2.swizzle, uncomposed_mask, src2, dest);
+ ins->alu.src2 = vector_alu_srco_unsigned(mod2);
+
+ ins->registers.src2_reg = src2.reg;
+ }
+
+ ins->registers.out_reg = dest.reg;
+ break;
+ }
+
+ case TAG_LOAD_STORE_4: {
+ if (OP_IS_STORE_VARY(ins->load_store.op)) {
+ /* TODO: use ssa_args for st_vary */
+ ins->load_store.reg = 0;
+ } else {
+ /* Which physical register we read off depends on
+ * whether we are loading or storing -- think about the
+ * logical dataflow */
+
+ unsigned r = OP_IS_STORE(ins->load_store.op) ?
+ args.src0 : args.dest;
+ struct phys_reg src = index_to_reg(ctx, g, r);
+
+ ins->load_store.reg = src.reg;
+
+ ins->load_store.swizzle = compose_swizzle(
+ ins->load_store.swizzle, 0xF,
+ default_phys_reg(0), src);
+
+ ins->mask = compose_writemask(
+ ins->mask, src);
+ }
+
+ break;
+ }
+
+ default:
+ break;
+ }
+}
+
+void
+install_registers(compiler_context *ctx, struct ra_graph *g)
+{
+ mir_foreach_block(ctx, block) {
+ mir_foreach_instr_in_block(block, ins) {
+ if (ins->compact_branch) continue;
+ install_registers_instr(ctx, g, ins);
+ }
+ }
+
+}
--- /dev/null
+/*
+ * Copyright (C) 2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
+ * Copyright (C) 2019 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "compiler.h"
+
+/* Creates pipeline registers. This is a prepass run before the main register
+ * allocator but after scheduling, once bundles are created. It works by
+ * iterating the scheduled IR, checking if a value is ever used after the end
+ * of the current bundle. If it is not, it is promoted to a bundle-specific
+ * pipeline register.
+ *
+ * Pipeline registers are only written by the first two stages of the
+ * pipeline (vmul/sadd) and last only for the duration of the bundle. There are
+ * two 128-bit pipeline registers available (r24/r25). The upshot is that no
+ * actual register allocation is needed; we can _always_ promote a value to a
+ * pipeline register, liveness permitting. This greatly simplifies the logic of
+ * this pass, removing the need for the proper RA that work registers require.
+ */
+
+static bool
+mir_pipeline_ins(
+ compiler_context *ctx,
+ midgard_block *block,
+ midgard_bundle *bundle, unsigned i,
+ unsigned pipeline_count)
+{
+ midgard_instruction *ins = bundle->instructions[i];
+ unsigned dest = ins->ssa_args.dest;
+
+ /* Check to make sure we're legal */
+
+ if (ins->compact_branch)
+ return false;
+
+ /* Don't allow non-SSA. Pipelining registers is theoretically possible,
+ * but the analysis is much hairier, so don't bother quite yet */
+ if ((dest < 0) || (dest >= ctx->func->impl->ssa_alloc))
+ return false;
+
+ /* Make sure they're not lying to us. Blend shaders lie. TODO: Fix your
+ * bad code Alyssa */
+
+ if (mir_has_multiple_writes(ctx, dest))
+ return false;
+
+ /* We want to know if we live after this bundle, so check if
+ * we're live after the last instruction of the bundle */
+
+ midgard_instruction *end = bundle->instructions[
+ bundle->instruction_count - 1];
+
+ if (mir_is_live_after(ctx, block, end, ins->ssa_args.dest))
+ return false;
+
+ /* We're only live in this bundle -- pipeline! */
+
+ mir_rewrite_index(ctx, dest, SSA_FIXED_REGISTER(24 + pipeline_count));
+
+ return true;
+}
+
+void
+mir_create_pipeline_registers(compiler_context *ctx)
+{
+ mir_foreach_block(ctx, block) {
+ mir_foreach_bundle_in_block(block, bundle) {
+ if (!mir_is_alu_bundle(bundle)) continue;
+ if (bundle->instruction_count < 2) continue;
+
+ /* Only first 2 instructions could pipeline */
+ bool succ = mir_pipeline_ins(ctx, block, bundle, 0, 0);
+ mir_pipeline_ins(ctx, block, bundle, 1, succ);
+ }
+ }
+}
--- /dev/null
+/*
+ * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "compiler.h"
+#include "midgard_ops.h"
+#include "util/u_memory.h"
+
+/* Create a mask of accessed components from a swizzle to figure out vector
+ * dependencies */
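+/* For example, a swizzle of .yyyy (0x55) accesses only y, giving a mask of
+ * 0x2, while the identity swizzle .xyzw (0xE4) gives 0xF */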
+
+static unsigned
+swizzle_to_access_mask(unsigned swizzle)
+{
+ unsigned component_mask = 0;
+
+ for (int i = 0; i < 4; ++i) {
+ unsigned c = (swizzle >> (2 * i)) & 3;
+ component_mask |= (1 << c);
+ }
+
+ return component_mask;
+}
+
+/* Does the mask cover exactly one component, i.e. a scalar? */
+
+static bool
+is_single_component_mask(unsigned mask)
+{
+ int components = 0;
+
+ for (int c = 0; c < 8; ++c) {
+ if (mask & (1 << c))
+ components++;
+ }
+
+ return components == 1;
+}
+
+/* Checks for an SSA data hazard between two adjacent instructions, keeping in
+ * mind that we are a vector architecture and we can write to different
+ * components simultaneously */
+
+static bool
+can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second)
+{
+ /* Each instruction reads some registers and writes to a register. See
+ * where the first writes */
+
+ /* Figure out where exactly we wrote to */
+ int source = first->ssa_args.dest;
+ int source_mask = first->mask;
+
+ /* As long as the second doesn't read from the first, we're okay */
+ if (second->ssa_args.src0 == source) {
+ if (first->type == TAG_ALU_4) {
+ /* Figure out which components we just read from */
+
+ int q = second->alu.src1;
+ midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;
+
+ /* Check if there are components in common, and fail if so */
+ if (swizzle_to_access_mask(m->swizzle) & source_mask)
+ return false;
+ } else
+ return false;
+
+ }
+
+ if (second->ssa_args.src1 == source)
+ return false;
+
+ /* Otherwise, it's safe in that regard. Another data hazard is both
+ * writing to the same place, of course */
+
+ if (second->ssa_args.dest == source) {
+ /* ...but only if the components overlap */
+
+ if (second->mask & source_mask)
+ return false;
+ }
+
+ /* ...That's it */
+ return true;
+}
+
+static bool
+midgard_has_hazard(
+ midgard_instruction **segment, unsigned segment_size,
+ midgard_instruction *ains)
+{
+ for (int s = 0; s < segment_size; ++s)
+ if (!can_run_concurrent_ssa(segment[s], ains))
+ return true;
+
+ return false;
+}
+
+/* Schedules, but does not emit, a single bundle within a basic block. After
+ * scheduling, the final tag and size of the bundle are known, which are
+ * necessary for branching */
+
+static midgard_bundle
+schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip)
+{
+ int instructions_emitted = 0, packed_idx = 0;
+ midgard_bundle bundle = { 0 };
+
+ uint8_t tag = ins->type;
+
+ /* Default to the instruction's tag */
+ bundle.tag = tag;
+
+ switch (ins->type) {
+ case TAG_ALU_4: {
+ uint32_t control = 0;
+ size_t bytes_emitted = sizeof(control);
+
+ /* TODO: Constant combining */
+ int index = 0, last_unit = 0;
+
+ /* Previous instructions, for the purpose of parallelism */
+ midgard_instruction *segment[4] = {0};
+ int segment_size = 0;
+
+ instructions_emitted = -1;
+ midgard_instruction *pins = ins;
+
+ unsigned constant_count = 0;
+
+ for (;;) {
+ midgard_instruction *ains = pins;
+
+ /* Advance instruction pointer */
+ if (index) {
+ ains = mir_next_op(pins);
+ pins = ains;
+ }
+
+ /* Out-of-work condition */
+ if ((struct list_head *) ains == &block->instructions)
+ break;
+
+ /* Ensure that the chain can continue */
+ if (ains->type != TAG_ALU_4) break;
+
+ /* If there's already something in the bundle and we
+ * have weird scheduler constraints, break now */
+ if (ains->precede_break && index) break;
+
+ /* According to the presentation "The ARM
+ * Mali-T880 Mobile GPU" from HotChips 27,
+ * there are two pipeline stages. Branching
+ * position determined experimentally. Lines
+ * are executed in parallel:
+ *
+ * [ VMUL ] [ SADD ]
+ * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ]
+ *
+ * Verify that there are no ordering dependencies here.
+ *
+ * TODO: Allow for parallelism!!!
+ */
+
+ /* Pick a unit for it if it doesn't force a particular unit */
+
+ int unit = ains->unit;
+
+ if (!unit) {
+ int op = ains->alu.op;
+ int units = alu_opcode_props[op].props;
+
+ bool scalarable = units & UNITS_SCALAR;
+ bool could_scalar = is_single_component_mask(ains->mask);
+
+ /* Only 16/32-bit can run on a scalar unit */
+ could_scalar &= ains->alu.reg_mode != midgard_reg_mode_8;
+ could_scalar &= ains->alu.reg_mode != midgard_reg_mode_64;
+ could_scalar &= ains->alu.dest_override == midgard_dest_override_none;
+
+ if (ains->alu.reg_mode == midgard_reg_mode_16) {
+ /* If we're running in 16-bit mode, we
+ * can't have any 8-bit sources on the
+ * scalar unit (since the scalar unit
+ * doesn't understand 8-bit) */
+
+ midgard_vector_alu_src s1 =
+ vector_alu_from_unsigned(ains->alu.src1);
+
+ could_scalar &= !s1.half;
+
+ if (!ains->ssa_args.inline_constant) {
+ midgard_vector_alu_src s2 =
+ vector_alu_from_unsigned(ains->alu.src2);
+
+ could_scalar &= !s2.half;
+ }
+
+ }
+
+ bool scalar = could_scalar && scalarable;
+
+ /* TODO: Check ahead-of-time for other scalar
+ * hazards that otherwise get aborted out */
+
+ if (scalar)
+ assert(units & UNITS_SCALAR);
+
+ if (!scalar) {
+ if (last_unit >= UNIT_VADD) {
+ if (units & UNIT_VLUT)
+ unit = UNIT_VLUT;
+ else
+ break;
+ } else {
+ if ((units & UNIT_VMUL) && last_unit < UNIT_VMUL)
+ unit = UNIT_VMUL;
+ else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
+ unit = UNIT_VADD;
+ else if (units & UNIT_VLUT)
+ unit = UNIT_VLUT;
+ else
+ break;
+ }
+ } else {
+ if (last_unit >= UNIT_VADD) {
+ if ((units & UNIT_SMUL) && !(control & UNIT_SMUL))
+ unit = UNIT_SMUL;
+ else if (units & UNIT_VLUT)
+ unit = UNIT_VLUT;
+ else
+ break;
+ } else {
+ if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains))
+ unit = UNIT_SADD;
+ else if (units & UNIT_SMUL)
+ unit = ((units & UNIT_VMUL) && !(control & UNIT_VMUL)) ? UNIT_VMUL : UNIT_SMUL;
+ else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
+ unit = UNIT_VADD;
+ else
+ break;
+ }
+ }
+
+ assert(unit & units);
+ }
+
+ /* Late unit check, this time for encoding (not parallelism) */
+ if (unit <= last_unit) break;
+
+ /* Clear the segment */
+ if (last_unit < UNIT_VADD && unit >= UNIT_VADD)
+ segment_size = 0;
+
+ if (midgard_has_hazard(segment, segment_size, ains))
+ break;
+
+ /* We're good to go -- emit the instruction */
+ ains->unit = unit;
+
+ segment[segment_size++] = ains;
+
+ /* We try to reuse constants if possible, by adjusting
+ * the swizzle */
+
+ if (ains->has_blend_constant) {
+ /* Everything conflicts with the blend constant */
+ if (bundle.has_embedded_constants)
+ break;
+
+ bundle.has_blend_constant = 1;
+ bundle.has_embedded_constants = 1;
+ } else if (ains->has_constants && ains->alu.reg_mode == midgard_reg_mode_16) {
+ /* TODO: DRY with the analysis pass */
+
+ if (bundle.has_blend_constant)
+ break;
+
+ if (constant_count)
+ break;
+
+ /* TODO: Fix packing XXX */
+ uint16_t *bundles = (uint16_t *) bundle.constants;
+ uint32_t *constants = (uint32_t *) ains->constants;
+
+ /* Copy them wholesale */
+ for (unsigned i = 0; i < 4; ++i)
+ bundles[i] = constants[i];
+
+ bundle.has_embedded_constants = true;
+ constant_count = 4;
+ } else if (ains->has_constants) {
+ /* By definition, blend constants conflict with
+ * everything, so if there are already
+ * constants we break the bundle *now* */
+
+ if (bundle.has_blend_constant)
+ break;
+
+ /* For anything but blend constants, we can do
+ * proper analysis, however */
+
+ /* TODO: Mask by which are used */
+ uint32_t *constants = (uint32_t *) ains->constants;
+ uint32_t *bundles = (uint32_t *) bundle.constants;
+
+ uint32_t indices[4] = { 0 };
+ bool break_bundle = false;
+
+ for (unsigned i = 0; i < 4; ++i) {
+ uint32_t cons = constants[i];
+ bool constant_found = false;
+
+ /* Search for the constant */
+ for (unsigned j = 0; j < constant_count; ++j) {
+ if (bundles[j] != cons)
+ continue;
+
+ /* We found it, reuse */
+ indices[i] = j;
+ constant_found = true;
+ break;
+ }
+
+ if (constant_found)
+ continue;
+
+ /* We didn't find it, so allocate it */
+ unsigned idx = constant_count++;
+
+ if (idx >= 4) {
+ /* Uh-oh, out of space */
+ break_bundle = true;
+ break;
+ }
+
+ /* We have space, copy it in! */
+ bundles[idx] = cons;
+ indices[i] = idx;
+ }
+
+ if (break_bundle)
+ break;
+
+ /* Cool, we have it in. So use indices as a
+ * swizzle */
+
+ unsigned swizzle = SWIZZLE_FROM_ARRAY(indices);
+ unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
+
+ if (ains->ssa_args.src0 == r_constant)
+ ains->alu.src1 = vector_alu_apply_swizzle(ains->alu.src1, swizzle);
+
+ if (ains->ssa_args.src1 == r_constant)
+ ains->alu.src2 = vector_alu_apply_swizzle(ains->alu.src2, swizzle);
+
+ bundle.has_embedded_constants = true;
+ }
+
+ if (ains->unit & UNITS_ANY_VECTOR) {
+ bytes_emitted += sizeof(midgard_reg_info);
+ bytes_emitted += sizeof(midgard_vector_alu);
+ } else if (ains->compact_branch) {
+ /* All of r0 has to be written out along with
+ * the branch writeout */
+
+ if (ains->writeout) {
+ /* The rules for when "bare" writeout
+ * is safe are when all components are
+ * r0 are written out in the final
+ * bundle, earlier than VLUT, where any
+ * register dependencies of r0 are from
+ * an earlier bundle. We can't verify
+ * this before RA, so we don't try. */
+
+ if (index != 0)
+ break;
+
+ /* Inject a move */
+ midgard_instruction ins = v_mov(0, blank_alu_src, SSA_FIXED_REGISTER(0));
+ ins.unit = UNIT_VMUL;
+ control |= ins.unit;
+
+ /* TODO don't leak */
+ midgard_instruction *move =
+ mem_dup(&ins, sizeof(midgard_instruction));
+ bytes_emitted += sizeof(midgard_reg_info);
+ bytes_emitted += sizeof(midgard_vector_alu);
+ bundle.instructions[packed_idx++] = move;
+ }
+
+ if (ains->unit == ALU_ENAB_BRANCH) {
+ bytes_emitted += sizeof(midgard_branch_extended);
+ } else {
+ bytes_emitted += sizeof(ains->br_compact);
+ }
+ } else {
+ bytes_emitted += sizeof(midgard_reg_info);
+ bytes_emitted += sizeof(midgard_scalar_alu);
+ }
+
+ /* Defer marking until after writing to allow for break */
+ control |= ains->unit;
+ last_unit = ains->unit;
+ ++instructions_emitted;
+ ++index;
+ }
+
+ int padding = 0;
+
+ /* Pad ALU op to nearest word */
+
+ if (bytes_emitted & 15) {
+ padding = 16 - (bytes_emitted & 15);
+ bytes_emitted += padding;
+ }
+
+ /* Constants must always be quadwords */
+ if (bundle.has_embedded_constants)
+ bytes_emitted += 16;
+
+ /* Size ALU instruction for tag */
+ bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1;
+ bundle.padding = padding;
+ bundle.control = bundle.tag | control;
+
+ break;
+ }
+
+ case TAG_LOAD_STORE_4: {
+ /* A load/store bundle packs two instruction words at once.
+ * If we only have one queued up, we need to NOP pad.
+ * Otherwise, we store both in succession to save space
+ * and cycles -- letting them go in parallel -- and skip
+ * the next instruction. The usefulness of this optimisation
+ * is greatly dependent on the quality of the instruction
+ * scheduler.
+ */
+
+ midgard_instruction *next_op = mir_next_op(ins);
+
+ if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) {
+ /* TODO: Concurrency check */
+ instructions_emitted++;
+ }
+
+ break;
+ }
+
+ case TAG_TEXTURE_4: {
+ /* Which tag we use depends on the shader stage */
+ bool in_frag = ctx->stage == MESA_SHADER_FRAGMENT;
+ bundle.tag = in_frag ? TAG_TEXTURE_4 : TAG_TEXTURE_4_VTX;
+ break;
+ }
+
+ default:
+ unreachable("Unknown tag");
+ break;
+ }
+
+ /* Copy the instructions into the bundle */
+ bundle.instruction_count = instructions_emitted + 1 + packed_idx;
+
+ midgard_instruction *uins = ins;
+ for (; packed_idx < bundle.instruction_count; ++packed_idx) {
+ bundle.instructions[packed_idx] = uins;
+ uins = mir_next_op(uins);
+ }
+
+ *skip = instructions_emitted;
+
+ return bundle;
+}
+
+/* Schedule a single block by iterating its instructions to create bundles.
+ * As we go, tally the bundle sizes to compute the block size. */
+
+static void
+schedule_block(compiler_context *ctx, midgard_block *block)
+{
+ util_dynarray_init(&block->bundles, NULL);
+
+ block->quadword_count = 0;
+
+ mir_foreach_instr_in_block(block, ins) {
+ int skip;
+ midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip);
+ util_dynarray_append(&block->bundles, midgard_bundle, bundle);
+
+ if (bundle.has_blend_constant) {
+ /* TODO: Multiblock? */
+ int quadwords_within_block = block->quadword_count + quadword_size(bundle.tag) - 1;
+ ctx->blend_constant_offset = quadwords_within_block * 0x10;
+ }
+
+ while (skip--)
+ ins = mir_next_op(ins);
+
+ block->quadword_count += quadword_size(bundle.tag);
+ }
+
+ block->is_scheduled = true;
+}
+
+void
+schedule_program(compiler_context *ctx)
+{
+ /* We run RA prior to scheduling */
+
+ mir_foreach_block(ctx, block) {
+ schedule_block(ctx, block);
+ }
+
+ /* Pipeline registers creation is a prepass before RA */
+ mir_create_pipeline_registers(ctx);
+
+ struct ra_graph *g = allocate_registers(ctx);
+ install_registers(ctx, g);
+}
--- /dev/null
+/*
+ * Copyright (C) 2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "compiler.h"
+
+void
+mir_rewrite_index_src(compiler_context *ctx, unsigned old, unsigned new)
+{
+ mir_foreach_instr_global(ctx, ins) {
+ if (ins->ssa_args.src0 == old)
+ ins->ssa_args.src0 = new;
+
+ if (ins->ssa_args.src1 == old &&
+ !ins->ssa_args.inline_constant)
+ ins->ssa_args.src1 = new;
+ }
+}
+
+void
+mir_rewrite_index_dst(compiler_context *ctx, unsigned old, unsigned new)
+{
+ mir_foreach_instr_global(ctx, ins) {
+ if (ins->ssa_args.dest == old)
+ ins->ssa_args.dest = new;
+ }
+}
+
+void
+mir_rewrite_index(compiler_context *ctx, unsigned old, unsigned new)
+{
+ mir_rewrite_index_src(ctx, old, new);
+ mir_rewrite_index_dst(ctx, old, new);
+}
--- /dev/null
+/*
+ * Copyright (C) 2019 Alyssa Rosenzweig
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "decode.h"
+
+/* Parsing */
+
+static FILE *
+pandecode_read_filename(const char *base, const char *name)
+{
+ char *fn = NULL;
+ asprintf(&fn, "%s/%s", base, name);
+
+ FILE *fp = fopen(fn, "rb");
+ free(fn);
+
+ return fp;
+}
+
+static void
+pandecode_read_memory(const char *base, const char *name, mali_ptr gpu_va)
+{
+ FILE *fp = pandecode_read_filename(base, name);
+
+ if (!fp) {
+ fprintf(stderr, "Warning: missing %s\n", name);
+ return;
+ }
+
+ fseek(fp, 0, SEEK_END);
+ long sz = ftell(fp);
+ fseek(fp, 0, SEEK_SET);
+
+ char *buf = malloc(sz);
+ assert(buf);
+ fread(buf, 1, sz, fp);
+ fclose(fp);
+
+ pandecode_inject_mmap(gpu_va, buf, sz, name);
+}
+
+static void
+pandecode_read_mmap(const char *base, const char *line)
+{
+ assert(strlen(line) < 500);
+
+ mali_ptr addr;
+ char name[512];
+
+ sscanf(line, "MMAP %" PRIx64 " %s", &addr, name);
+ pandecode_read_memory(base, name, addr);
+}
+
+static void
+pandecode_read_job_submit(const char *base, const char *line)
+{
+ mali_ptr addr;
+ unsigned core_req;
+ unsigned is_bifrost;
+
+ sscanf(line, "JS %" PRIx64 " %x %x", &addr, &core_req, &is_bifrost);
+ pandecode_replay_jc(addr, is_bifrost);
+}
+
+/* Reads the control file, processing as it goes. */
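+/* Lines take one of two forms (the addresses and names here are illustrative):
+ *
+ *   MMAP 5f0000000 memory_0
+ *   JS 5f0000100 0 0
+ */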
+
+static void
+pandecode_read_control(const char *base)
+{
+ FILE *fp = pandecode_read_filename(base, "control.log");
+
+ if (!fp) {
+ fprintf(stderr, "Invalid directory path\n");
+ return;
+ }
+
+ char *line = NULL;
+ size_t len = 0;
+
+ while (getline(&line, &len, fp) != -1) {
+ switch (line[0]) {
+ case 'M':
+ pandecode_read_mmap(base, line);
+ break;
+
+ case 'J':
+ pandecode_read_job_submit(base, line);
+ break;
+
+ default:
+ assert(0);
+ break;
+ }
+ }
+}
+
+int
+main(int argc, char **argv)
+{
+ if (argc < 2) {
+ fprintf(stderr, "Usage: pandecode [directory]\n");
+ exit(1);
+ }
+
+ pandecode_initialize();
+ pandecode_read_control(argv[1]);
+
+ return 0;
+}
--- /dev/null
+/*
+ * Copyright (C) 2019 Alyssa Rosenzweig
+ * Copyright (C) 2017-2018 Lyude Paul
+ * Copyright (C) 2019 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "decode.h"
+#include "util/macros.h"
+
+/* Memory handling */
+
+static struct pandecode_mapped_memory mmaps;
+
+struct pandecode_mapped_memory *
+pandecode_find_mapped_gpu_mem_containing(mali_ptr addr)
+{
+ list_for_each_entry(struct pandecode_mapped_memory, pos, &mmaps.node, node) {
+ if (addr >= pos->gpu_va && addr < pos->gpu_va + pos->length)
+ return pos;
+ }
+
+ return NULL;
+}
+
+void
+pandecode_inject_mmap(mali_ptr gpu_va, void *cpu, unsigned sz, const char *name)
+{
+ struct pandecode_mapped_memory *mapped_mem = NULL;
+
+ mapped_mem = malloc(sizeof(*mapped_mem));
+ list_inithead(&mapped_mem->node);
+
+ mapped_mem->gpu_va = gpu_va;
+ mapped_mem->length = sz;
+ mapped_mem->addr = cpu;
+
+ if (!name) {
+ /* If we don't have a name, assign one */
+
+ snprintf(mapped_mem->name, ARRAY_SIZE(mapped_mem->name) - 1,
+ "memory_%" PRIx64, gpu_va);
+ } else {
+ assert(strlen(name) < ARRAY_SIZE(mapped_mem->name));
+ /* Copy the NUL terminator too; the assert above guarantees it fits */
+ memcpy(mapped_mem->name, name, strlen(name) + 1);
+ }
+
+ list_add(&mapped_mem->node, &mmaps.node);
+}
+
+char *
+pointer_as_memory_reference(mali_ptr ptr)
+{
+ struct pandecode_mapped_memory *mapped;
+ char *out = malloc(128);
+
+ /* Try to find the corresponding mapped zone */
+
+ mapped = pandecode_find_mapped_gpu_mem_containing(ptr);
+
+ if (mapped) {
+ snprintf(out, 128, "%s + %d", mapped->name, (int) (ptr - mapped->gpu_va));
+ return out;
+ }
+
+ /* Just use the raw address if other options are exhausted */
+
+ snprintf(out, 128, MALI_PTR_FMT, ptr);
+ return out;
+}
+
+void
+pandecode_initialize(void)
+{
+ list_inithead(&mmaps.node);
+}
--- /dev/null
+/*
+ * Copyright (C) 2017-2019 Alyssa Rosenzweig
+ * Copyright (C) 2017-2019 Connor Abbott
+ * Copyright (C) 2019 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <panfrost-job.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <stdbool.h>
+#include <stdarg.h>
+#include "decode.h"
+#include "util/u_math.h"
+
+#include "pan_pretty_print.h"
+#include "midgard/disassemble.h"
+#include "bifrost/disassemble.h"
+
+int pandecode_replay_jc(mali_ptr jc_gpu_va, bool bifrost);
+
+#define MEMORY_PROP(obj, p) {\
+ if (obj->p) { \
+ char *a = pointer_as_memory_reference(obj->p); \
+ pandecode_prop("%s = %s", #p, a); \
+ free(a); \
+ } \
+}
+
+#define DYN_MEMORY_PROP(obj, no, p) { \
+ if (obj->p) \
+ pandecode_prop("%s = %s_%d_p", #p, #p, no); \
+}
+
+/* Semantic logging type.
+ *
+ * Raw: for raw messages to be printed as is.
+ * Message: for helpful information to be commented out in replays.
+ * Property: for properties of a struct
+ *
+ * Use one of pandecode_log, pandecode_msg, or pandecode_prop as syntax sugar.
+ */
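+/* For instance, pandecode_prop("unknown = 0x%x", v) would emit a line of the
+ * form ".unknown = 0x0," (the name and value here are illustrative) */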
+
+enum pandecode_log_type {
+ PANDECODE_RAW,
+ PANDECODE_MESSAGE,
+ PANDECODE_PROPERTY
+};
+
+#define pandecode_log(...) pandecode_log_typed(PANDECODE_RAW, __VA_ARGS__)
+#define pandecode_msg(...) pandecode_log_typed(PANDECODE_MESSAGE, __VA_ARGS__)
+#define pandecode_prop(...) pandecode_log_typed(PANDECODE_PROPERTY, __VA_ARGS__)
+
+unsigned pandecode_indent = 0;
+
+static void
+pandecode_make_indent(void)
+{
+ for (unsigned i = 0; i < pandecode_indent; ++i)
+ printf(" ");
+}
+
+static void
+pandecode_log_typed(enum pandecode_log_type type, const char *format, ...)
+{
+ va_list ap;
+
+ pandecode_make_indent();
+
+ if (type == PANDECODE_MESSAGE)
+ printf("// ");
+ else if (type == PANDECODE_PROPERTY)
+ printf(".");
+
+ va_start(ap, format);
+ vprintf(format, ap);
+ va_end(ap);
+
+ if (type == PANDECODE_PROPERTY)
+ printf(",\n");
+}
+
+static void
+pandecode_log_cont(const char *format, ...)
+{
+ va_list ap;
+
+ va_start(ap, format);
+ vprintf(format, ap);
+ va_end(ap);
+}
+
+struct pandecode_flag_info {
+ u64 flag;
+ const char *name;
+};
+
+static void
+pandecode_log_decoded_flags(const struct pandecode_flag_info *flag_info,
+ u64 flags)
+{
+ bool decodable_flags_found = false;
+
+ for (int i = 0; flag_info[i].name; i++) {
+ if ((flags & flag_info[i].flag) != flag_info[i].flag)
+ continue;
+
+ if (!decodable_flags_found) {
+ decodable_flags_found = true;
+ } else {
+ pandecode_log_cont(" | ");
+ }
+
+ pandecode_log_cont("%s", flag_info[i].name);
+
+ flags &= ~flag_info[i].flag;
+ }
+
+ if (decodable_flags_found) {
+ if (flags)
+ pandecode_log_cont(" | 0x%" PRIx64, flags);
+ } else {
+ pandecode_log_cont("0x%" PRIx64, flags);
+ }
+}
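+
+/* For example (hypothetical input): a flags word made of MALI_MASK_R,
+ * MALI_MASK_B plus an unknown bit would be printed against the mask table
+ * below as "MALI_MASK_R | MALI_MASK_B | 0x<residue>", with any unrecognised
+ * bits kept as raw hex. */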
+
+#define FLAG_INFO(flag) { MALI_##flag, "MALI_" #flag }
+static const struct pandecode_flag_info gl_enable_flag_info[] = {
+ FLAG_INFO(OCCLUSION_QUERY),
+ FLAG_INFO(OCCLUSION_PRECISE),
+ FLAG_INFO(FRONT_CCW_TOP),
+ FLAG_INFO(CULL_FACE_FRONT),
+ FLAG_INFO(CULL_FACE_BACK),
+ {}
+};
+#undef FLAG_INFO
+
+#define FLAG_INFO(flag) { MALI_CLEAR_##flag, "MALI_CLEAR_" #flag }
+static const struct pandecode_flag_info clear_flag_info[] = {
+ FLAG_INFO(FAST),
+ FLAG_INFO(SLOW),
+ FLAG_INFO(SLOW_STENCIL),
+ {}
+};
+#undef FLAG_INFO
+
+#define FLAG_INFO(flag) { MALI_MASK_##flag, "MALI_MASK_" #flag }
+static const struct pandecode_flag_info mask_flag_info[] = {
+ FLAG_INFO(R),
+ FLAG_INFO(G),
+ FLAG_INFO(B),
+ FLAG_INFO(A),
+ {}
+};
+#undef FLAG_INFO
+
+#define FLAG_INFO(flag) { MALI_##flag, "MALI_" #flag }
+static const struct pandecode_flag_info u3_flag_info[] = {
+ FLAG_INFO(HAS_MSAA),
+ FLAG_INFO(CAN_DISCARD),
+ FLAG_INFO(HAS_BLEND_SHADER),
+ FLAG_INFO(DEPTH_TEST),
+ {}
+};
+
+static const struct pandecode_flag_info u4_flag_info[] = {
+ FLAG_INFO(NO_MSAA),
+ FLAG_INFO(NO_DITHER),
+ FLAG_INFO(DEPTH_RANGE_A),
+ FLAG_INFO(DEPTH_RANGE_B),
+ FLAG_INFO(STENCIL_TEST),
+ FLAG_INFO(SAMPLE_ALPHA_TO_COVERAGE_NO_BLEND_SHADER),
+ {}
+};
+#undef FLAG_INFO
+
+#define FLAG_INFO(flag) { MALI_FRAMEBUFFER_##flag, "MALI_FRAMEBUFFER_" #flag }
+static const struct pandecode_flag_info fb_fmt_flag_info[] = {
+ FLAG_INFO(MSAA_A),
+ FLAG_INFO(MSAA_B),
+ FLAG_INFO(MSAA_8),
+ {}
+};
+#undef FLAG_INFO
+
+#define FLAG_INFO(flag) { MALI_MFBD_FORMAT_##flag, "MALI_MFBD_FORMAT_" #flag }
+static const struct pandecode_flag_info mfbd_fmt_flag_info[] = {
+ FLAG_INFO(MSAA),
+ FLAG_INFO(SRGB),
+ {}
+};
+#undef FLAG_INFO
+
+#define FLAG_INFO(flag) { MALI_EXTRA_##flag, "MALI_EXTRA_" #flag }
+static const struct pandecode_flag_info mfbd_extra_flag_info[] = {
+ FLAG_INFO(PRESENT),
+ FLAG_INFO(AFBC),
+ FLAG_INFO(ZS),
+ {}
+};
+#undef FLAG_INFO
+
+#define FLAG_INFO(flag) { MALI_##flag, "MALI_" #flag }
+static const struct pandecode_flag_info shader_midgard1_flag_info [] = {
+ FLAG_INFO(EARLY_Z),
+ FLAG_INFO(HELPER_INVOCATIONS),
+ FLAG_INFO(READS_TILEBUFFER),
+ FLAG_INFO(READS_ZS),
+ {}
+};
+#undef FLAG_INFO
+
+#define FLAG_INFO(flag) { MALI_MFBD_##flag, "MALI_MFBD_" #flag }
+static const struct pandecode_flag_info mfbd_flag_info [] = {
+ FLAG_INFO(DEPTH_WRITE),
+ FLAG_INFO(EXTRA),
+ {}
+};
+#undef FLAG_INFO
+
+
+extern char *replace_fragment;
+extern char *replace_vertex;
+
+static char *
+pandecode_job_type_name(enum mali_job_type type)
+{
+#define DEFINE_CASE(name) case JOB_TYPE_ ## name: return "JOB_TYPE_" #name
+
+ switch (type) {
+ DEFINE_CASE(NULL);
+ DEFINE_CASE(SET_VALUE);
+ DEFINE_CASE(CACHE_FLUSH);
+ DEFINE_CASE(COMPUTE);
+ DEFINE_CASE(VERTEX);
+ DEFINE_CASE(TILER);
+ DEFINE_CASE(FUSED);
+ DEFINE_CASE(FRAGMENT);
+
+ case JOB_NOT_STARTED:
+ return "NOT_STARTED";
+
+ default:
+ pandecode_log("Warning! Unknown job type %x\n", type);
+ return "!?!?!?";
+ }
+
+#undef DEFINE_CASE
+}
+
+static char *
+pandecode_draw_mode_name(enum mali_draw_mode mode)
+{
+#define DEFINE_CASE(name) case MALI_ ## name: return "MALI_" #name
+
+ switch (mode) {
+ DEFINE_CASE(DRAW_NONE);
+ DEFINE_CASE(POINTS);
+ DEFINE_CASE(LINES);
+ DEFINE_CASE(TRIANGLES);
+ DEFINE_CASE(TRIANGLE_STRIP);
+ DEFINE_CASE(TRIANGLE_FAN);
+ DEFINE_CASE(LINE_STRIP);
+ DEFINE_CASE(LINE_LOOP);
+ DEFINE_CASE(POLYGON);
+ DEFINE_CASE(QUADS);
+ DEFINE_CASE(QUAD_STRIP);
+
+ default:
+ return "MALI_TRIANGLES /* XXX: Unknown GL mode, check dump */";
+ }
+
+#undef DEFINE_CASE
+}
+
+#define DEFINE_CASE(name) case MALI_FUNC_ ## name: return "MALI_FUNC_" #name
+static char *
+pandecode_func_name(enum mali_func mode)
+{
+ switch (mode) {
+ DEFINE_CASE(NEVER);
+ DEFINE_CASE(LESS);
+ DEFINE_CASE(EQUAL);
+ DEFINE_CASE(LEQUAL);
+ DEFINE_CASE(GREATER);
+ DEFINE_CASE(NOTEQUAL);
+ DEFINE_CASE(GEQUAL);
+ DEFINE_CASE(ALWAYS);
+
+ default:
+ return "MALI_FUNC_NEVER /* XXX: Unknown function, check dump */";
+ }
+}
+#undef DEFINE_CASE
+
+/* Why is this duplicated? Who knows... */
+#define DEFINE_CASE(name) case MALI_ALT_FUNC_ ## name: return "MALI_ALT_FUNC_" #name
+static char *
+pandecode_alt_func_name(enum mali_alt_func mode)
+{
+ switch (mode) {
+ DEFINE_CASE(NEVER);
+ DEFINE_CASE(LESS);
+ DEFINE_CASE(EQUAL);
+ DEFINE_CASE(LEQUAL);
+ DEFINE_CASE(GREATER);
+ DEFINE_CASE(NOTEQUAL);
+ DEFINE_CASE(GEQUAL);
+ DEFINE_CASE(ALWAYS);
+
+ default:
+ return "MALI_FUNC_NEVER /* XXX: Unknown function, check dump */";
+ }
+}
+#undef DEFINE_CASE
+
+#define DEFINE_CASE(name) case MALI_STENCIL_ ## name: return "MALI_STENCIL_" #name
+static char *
+pandecode_stencil_op_name(enum mali_stencil_op op)
+{
+ switch (op) {
+ DEFINE_CASE(KEEP);
+ DEFINE_CASE(REPLACE);
+ DEFINE_CASE(ZERO);
+ DEFINE_CASE(INVERT);
+ DEFINE_CASE(INCR_WRAP);
+ DEFINE_CASE(DECR_WRAP);
+ DEFINE_CASE(INCR);
+ DEFINE_CASE(DECR);
+
+ default:
+ return "MALI_STENCIL_KEEP /* XXX: Unknown stencil op, check dump */";
+ }
+}
+
+#undef DEFINE_CASE
+
+#define DEFINE_CASE(name) case MALI_ATTR_ ## name: return "MALI_ATTR_" #name
+static char *pandecode_attr_mode_name(enum mali_attr_mode mode)
+{
+ switch(mode) {
+ DEFINE_CASE(UNUSED);
+ DEFINE_CASE(LINEAR);
+ DEFINE_CASE(POT_DIVIDE);
+ DEFINE_CASE(MODULO);
+ DEFINE_CASE(NPOT_DIVIDE);
+ default: return "MALI_ATTR_UNUSED /* XXX: Unknown stencil op, check dump */";
+ }
+}
+
+#undef DEFINE_CASE
+
+#define DEFINE_CASE(name) case MALI_CHANNEL_## name: return "MALI_CHANNEL_" #name
+static char *
+pandecode_channel_name(enum mali_channel channel)
+{
+ switch (channel) {
+ DEFINE_CASE(RED);
+ DEFINE_CASE(GREEN);
+ DEFINE_CASE(BLUE);
+ DEFINE_CASE(ALPHA);
+ DEFINE_CASE(ZERO);
+ DEFINE_CASE(ONE);
+ DEFINE_CASE(RESERVED_0);
+ DEFINE_CASE(RESERVED_1);
+
+ default:
+ return "MALI_CHANNEL_ZERO /* XXX: Unknown channel, check dump */";
+ }
+}
+#undef DEFINE_CASE
+
+#define DEFINE_CASE(name) case MALI_WRAP_## name: return "MALI_WRAP_" #name
+static char *
+pandecode_wrap_mode_name(enum mali_wrap_mode op)
+{
+ switch (op) {
+ DEFINE_CASE(REPEAT);
+ DEFINE_CASE(CLAMP_TO_EDGE);
+ DEFINE_CASE(CLAMP_TO_BORDER);
+ DEFINE_CASE(MIRRORED_REPEAT);
+
+ default:
+ return "MALI_WRAP_REPEAT /* XXX: Unknown wrap mode, check dump */";
+ }
+}
+#undef DEFINE_CASE
+
+#define DEFINE_CASE(name) case MALI_TEX_## name: return "MALI_TEX_" #name
+static char *
+pandecode_texture_type(enum mali_texture_type type)
+{
+ switch (type) {
+ DEFINE_CASE(1D);
+ DEFINE_CASE(2D);
+ DEFINE_CASE(3D);
+ DEFINE_CASE(CUBE);
+
+ default:
+ unreachable("Unknown case");
+ }
+}
+#undef DEFINE_CASE
+
+#define DEFINE_CASE(name) case MALI_MFBD_BLOCK_## name: return "MALI_MFBD_BLOCK_" #name
+static char *
+pandecode_mfbd_block_format(enum mali_mfbd_block_format fmt)
+{
+ switch (fmt) {
+ DEFINE_CASE(TILED);
+ DEFINE_CASE(UNKNOWN);
+ DEFINE_CASE(LINEAR);
+ DEFINE_CASE(AFBC);
+
+ default:
+ unreachable("Invalid case");
+ }
+}
+#undef DEFINE_CASE
+
+static inline char *
+pandecode_decode_fbd_type(enum mali_fbd_type type)
+{
+ if (type == MALI_SFBD) return "SFBD";
+ else if (type == MALI_MFBD) return "MFBD";
+ else return "WATFBD /* XXX */";
+}
+
+/* Midgard's tiler descriptor is embedded within the
+ * larger FBD */
+
+static void
+pandecode_midgard_tiler_descriptor(const struct midgard_tiler_descriptor *t)
+{
+ pandecode_log(".tiler = {\n");
+ pandecode_indent++;
+
+ pandecode_prop("hierarchy_mask = 0x%" PRIx16, t->hierarchy_mask);
+ pandecode_prop("flags = 0x%" PRIx16, t->flags);
+ pandecode_prop("polygon_list_size = 0x%x", t->polygon_list_size);
+
+ MEMORY_PROP(t, polygon_list);
+ MEMORY_PROP(t, polygon_list_body);
+
+ MEMORY_PROP(t, heap_start);
+
+ {
+ /* Points to the end of a buffer */
+ char *a = pointer_as_memory_reference(t->heap_end - 1);
+ pandecode_prop("heap_end = %s + 1", a);
+ free(a);
+ }
+
+ bool nonzero_weights = false;
+
+ for (unsigned w = 0; w < ARRAY_SIZE(t->weights); ++w) {
+ nonzero_weights |= t->weights[w] != 0x0;
+ }
+
+ if (nonzero_weights) {
+ pandecode_log(".weights = {");
+
+ for (unsigned w = 0; w < ARRAY_SIZE(t->weights); ++w) {
+ pandecode_log("%d, ", t->weights[w]);
+ }
+
+ pandecode_log("},");
+ }
+
+ pandecode_indent--;
+ pandecode_log("}\n");
+}
+
+static void
+pandecode_replay_sfbd(uint64_t gpu_va, int job_no)
+{
+ struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(gpu_va);
+ const struct mali_single_framebuffer *PANDECODE_PTR_VAR(s, mem, (mali_ptr) gpu_va);
+
+ pandecode_log("struct mali_single_framebuffer framebuffer_%"PRIx64"_%d = {\n", gpu_va, job_no);
+ pandecode_indent++;
+
+ pandecode_prop("unknown1 = 0x%" PRIx32, s->unknown1);
+ pandecode_prop("unknown2 = 0x%" PRIx32, s->unknown2);
+
+ pandecode_log(".format = ");
+ pandecode_log_decoded_flags(fb_fmt_flag_info, s->format);
+ pandecode_log_cont(",\n");
+
+ pandecode_prop("width = MALI_POSITIVE(%" PRId16 ")", s->width + 1);
+ pandecode_prop("height = MALI_POSITIVE(%" PRId16 ")", s->height + 1);
+
+ MEMORY_PROP(s, framebuffer);
+ pandecode_prop("stride = %d", s->stride);
+
+ /* This appears earlier in the actual command stream -- right before width --
+ * but we delay printing it here so the output flows more naturally */
+
+ pandecode_log(".clear_flags = ");
+ pandecode_log_decoded_flags(clear_flag_info, s->clear_flags);
+ pandecode_log_cont(",\n");
+
+ if (s->depth_buffer | s->depth_buffer_enable) {
+ MEMORY_PROP(s, depth_buffer);
+ pandecode_prop("depth_buffer_enable = %s", DS_ENABLE(s->depth_buffer_enable));
+ }
+
+ if (s->stencil_buffer | s->stencil_buffer_enable) {
+ MEMORY_PROP(s, stencil_buffer);
+ pandecode_prop("stencil_buffer_enable = %s", DS_ENABLE(s->stencil_buffer_enable));
+ }
+
+ if (s->clear_color_1 | s->clear_color_2 | s->clear_color_3 | s->clear_color_4) {
+ pandecode_prop("clear_color_1 = 0x%" PRIx32, s->clear_color_1);
+ pandecode_prop("clear_color_2 = 0x%" PRIx32, s->clear_color_2);
+ pandecode_prop("clear_color_3 = 0x%" PRIx32, s->clear_color_3);
+ pandecode_prop("clear_color_4 = 0x%" PRIx32, s->clear_color_4);
+ }
+
+ if (s->clear_depth_1 != 0 || s->clear_depth_2 != 0 || s->clear_depth_3 != 0 || s->clear_depth_4 != 0) {
+ pandecode_prop("clear_depth_1 = %f", s->clear_depth_1);
+ pandecode_prop("clear_depth_2 = %f", s->clear_depth_2);
+ pandecode_prop("clear_depth_3 = %f", s->clear_depth_3);
+ pandecode_prop("clear_depth_4 = %f", s->clear_depth_4);
+ }
+
+ if (s->clear_stencil) {
+ pandecode_prop("clear_stencil = 0x%x", s->clear_stencil);
+ }
+
+ MEMORY_PROP(s, unknown_address_0);
+ pandecode_midgard_tiler_descriptor(&s->tiler);
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+
+ pandecode_prop("zero0 = 0x%" PRIx64, s->zero0);
+ pandecode_prop("zero1 = 0x%" PRIx64, s->zero1);
+ pandecode_prop("zero2 = 0x%" PRIx32, s->zero2);
+ pandecode_prop("zero4 = 0x%" PRIx32, s->zero4);
+
+ printf(".zero3 = {");
+
+ for (int i = 0; i < sizeof(s->zero3) / sizeof(s->zero3[0]); ++i)
+ printf("%X, ", s->zero3[i]);
+
+ printf("},\n");
+
+ printf(".zero6 = {");
+
+ for (int i = 0; i < sizeof(s->zero6) / sizeof(s->zero6[0]); ++i)
+ printf("%X, ", s->zero6[i]);
+
+ printf("},\n");
+}
+
+static void
+pandecode_u32_slide(unsigned name, const u32 *slide, unsigned count)
+{
+ pandecode_log(".unknown%d = {", name);
+
+ for (int i = 0; i < count; ++i)
+ printf("%X, ", slide[i]);
+
+ pandecode_log("},\n");
+}
+
+#define SHORT_SLIDE(num) \
+ pandecode_u32_slide(num, s->unknown ## num, ARRAY_SIZE(s->unknown ## num))
+
+static void
+pandecode_compute_fbd(uint64_t gpu_va, int job_no)
+{
+ struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(gpu_va);
+ const struct mali_compute_fbd *PANDECODE_PTR_VAR(s, mem, (mali_ptr) gpu_va);
+
+ pandecode_log("struct mali_compute_fbd framebuffer_%"PRIx64"_%d = {\n", gpu_va, job_no);
+ pandecode_indent++;
+
+ SHORT_SLIDE(1);
+
+ pandecode_indent--;
+ printf("},\n");
+}
+
+static void
+pandecode_replay_swizzle(unsigned swizzle)
+{
+ pandecode_prop("swizzle = %s | (%s << 3) | (%s << 6) | (%s << 9)",
+ pandecode_channel_name((swizzle >> 0) & 0x7),
+ pandecode_channel_name((swizzle >> 3) & 0x7),
+ pandecode_channel_name((swizzle >> 6) & 0x7),
+ pandecode_channel_name((swizzle >> 9) & 0x7));
+}
+
+static void
+pandecode_rt_format(struct mali_rt_format format)
+{
+ pandecode_log(".format = {\n");
+ pandecode_indent++;
+
+ pandecode_prop("unk1 = 0x%" PRIx32, format.unk1);
+ pandecode_prop("unk2 = 0x%" PRIx32, format.unk2);
+ pandecode_prop("unk3 = 0x%" PRIx32, format.unk3);
+
+ pandecode_prop("block = %s",
+ pandecode_mfbd_block_format(format.block));
+
+ pandecode_prop("nr_channels = MALI_POSITIVE(%d)",
+ MALI_NEGATIVE(format.nr_channels));
+
+ pandecode_log(".flags = ");
+ pandecode_log_decoded_flags(mfbd_fmt_flag_info, format.flags);
+ pandecode_log_cont(",\n");
+
+ pandecode_replay_swizzle(format.swizzle);
+
+ pandecode_prop("unk4 = 0x%" PRIx32, format.unk4);
+
+ pandecode_indent--;
+ pandecode_log("},\n");
+}
+
+static void
+pandecode_render_target(uint64_t gpu_va, unsigned job_no, const struct bifrost_framebuffer *fb)
+{
+ pandecode_log("struct bifrost_render_target rts_list_%"PRIx64"_%d[] = {\n", gpu_va, job_no);
+ pandecode_indent++;
+
+ for (int i = 0; i < MALI_NEGATIVE(fb->rt_count_1); i++) {
+ mali_ptr rt_va = gpu_va + i * sizeof(struct bifrost_render_target);
+ struct pandecode_mapped_memory *mem =
+ pandecode_find_mapped_gpu_mem_containing(rt_va);
+ const struct bifrost_render_target *PANDECODE_PTR_VAR(rt, mem, (mali_ptr) rt_va);
+
+ pandecode_log("{\n");
+ pandecode_indent++;
+
+ pandecode_rt_format(rt->format);
+
+ if (rt->format.block == MALI_MFBD_BLOCK_AFBC) {
+ pandecode_log(".afbc = {\n");
+ pandecode_indent++;
+
+ char *a = pointer_as_memory_reference(rt->afbc.metadata);
+ pandecode_prop("metadata = %s", a);
+ free(a);
+
+ pandecode_prop("stride = %d", rt->afbc.stride);
+ pandecode_prop("unk = 0x%" PRIx32, rt->afbc.unk);
+
+ pandecode_indent--;
+ pandecode_log("},\n");
+ } else {
+ pandecode_log(".chunknown = {\n");
+ pandecode_indent++;
+
+ pandecode_prop("unk = 0x%" PRIx64, rt->chunknown.unk);
+
+ char *a = pointer_as_memory_reference(rt->chunknown.pointer);
+ pandecode_prop("pointer = %s", a);
+ free(a);
+
+ pandecode_indent--;
+ pandecode_log("},\n");
+ }
+
+ MEMORY_PROP(rt, framebuffer);
+ pandecode_prop("framebuffer_stride = %d", rt->framebuffer_stride);
+
+ if (rt->clear_color_1 | rt->clear_color_2 | rt->clear_color_3 | rt->clear_color_4) {
+ pandecode_prop("clear_color_1 = 0x%" PRIx32, rt->clear_color_1);
+ pandecode_prop("clear_color_2 = 0x%" PRIx32, rt->clear_color_2);
+ pandecode_prop("clear_color_3 = 0x%" PRIx32, rt->clear_color_3);
+ pandecode_prop("clear_color_4 = 0x%" PRIx32, rt->clear_color_4);
+ }
+
+ if (rt->zero1 || rt->zero2 || rt->zero3) {
+ pandecode_msg("render target zeros tripped\n");
+ pandecode_prop("zero1 = 0x%" PRIx64, rt->zero1);
+ pandecode_prop("zero2 = 0x%" PRIx32, rt->zero2);
+ pandecode_prop("zero3 = 0x%" PRIx32, rt->zero3);
+ }
+
+ pandecode_indent--;
+ pandecode_log("},\n");
+ }
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+}
+
+static unsigned
+pandecode_replay_mfbd_bfr(uint64_t gpu_va, int job_no, bool with_render_targets)
+{
+ struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(gpu_va);
+ const struct bifrost_framebuffer *PANDECODE_PTR_VAR(fb, mem, (mali_ptr) gpu_va);
+
+ if (fb->sample_locations) {
+ /* The blob stores all possible sample locations in a single buffer
+ * allocated on startup, and just switches the pointer when switching
+ * MSAA state. For now, we just put the data into the cmdstream, but we
+ * should do something like what the blob does with a real driver.
+ *
+ * There seem to be 32 slots for sample locations, followed by another
+ * 16. The second 16 is just the center location followed by 15 zeros
+ * in all the cases I've identified (maybe shader vs. depth/color
+ * samples?).
+ */
+
+ struct pandecode_mapped_memory *smem = pandecode_find_mapped_gpu_mem_containing(fb->sample_locations);
+
+ const u16 *PANDECODE_PTR_VAR(samples, smem, fb->sample_locations);
+
+ pandecode_log("uint16_t sample_locations_%d[] = {\n", job_no);
+ pandecode_indent++;
+
+ for (int i = 0; i < 32 + 16; i++) {
+ pandecode_log("%d, %d,\n", samples[2 * i], samples[2 * i + 1]);
+ }
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+ }
+
+ pandecode_log("struct bifrost_framebuffer framebuffer_%"PRIx64"_%d = {\n", gpu_va, job_no);
+ pandecode_indent++;
+
+ pandecode_prop("unk0 = 0x%x", fb->unk0);
+
+ if (fb->sample_locations)
+ pandecode_prop("sample_locations = sample_locations_%d", job_no);
+
+ /* Assume that unknown1 was emitted in the last job for
+ * now */
+ MEMORY_PROP(fb, unknown1);
+
+ pandecode_prop("width1 = MALI_POSITIVE(%d)", fb->width1 + 1);
+ pandecode_prop("height1 = MALI_POSITIVE(%d)", fb->height1 + 1);
+ pandecode_prop("width2 = MALI_POSITIVE(%d)", fb->width2 + 1);
+ pandecode_prop("height2 = MALI_POSITIVE(%d)", fb->height2 + 1);
+
+ pandecode_prop("unk1 = 0x%x", fb->unk1);
+ pandecode_prop("unk2 = 0x%x", fb->unk2);
+ pandecode_prop("rt_count_1 = MALI_POSITIVE(%d)", fb->rt_count_1 + 1);
+ pandecode_prop("rt_count_2 = %d", fb->rt_count_2);
+
+ pandecode_log(".mfbd_flags = ");
+ pandecode_log_decoded_flags(mfbd_flag_info, fb->mfbd_flags);
+ pandecode_log_cont(",\n");
+
+ pandecode_prop("clear_stencil = 0x%x", fb->clear_stencil);
+ pandecode_prop("clear_depth = %f", fb->clear_depth);
+
+ pandecode_prop("unknown2 = 0x%x", fb->unknown2);
+ MEMORY_PROP(fb, scratchpad);
+ pandecode_midgard_tiler_descriptor(&fb->tiler);
+
+ if (fb->zero3 || fb->zero4) {
+ pandecode_msg("framebuffer zeros tripped\n");
+ pandecode_prop("zero3 = 0x%" PRIx32, fb->zero3);
+ pandecode_prop("zero4 = 0x%" PRIx32, fb->zero4);
+ }
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+
+ gpu_va += sizeof(struct bifrost_framebuffer);
+
+ if ((fb->mfbd_flags & MALI_MFBD_EXTRA) && with_render_targets) {
+ mem = pandecode_find_mapped_gpu_mem_containing(gpu_va);
+ const struct bifrost_fb_extra *PANDECODE_PTR_VAR(fbx, mem, (mali_ptr) gpu_va);
+
+ pandecode_log("struct bifrost_fb_extra fb_extra_%"PRIx64"_%d = {\n", gpu_va, job_no);
+ pandecode_indent++;
+
+ MEMORY_PROP(fbx, checksum);
+
+ if (fbx->checksum_stride)
+ pandecode_prop("checksum_stride = %d", fbx->checksum_stride);
+
+ pandecode_log(".flags = ");
+ pandecode_log_decoded_flags(mfbd_extra_flag_info, fbx->flags);
+ pandecode_log_cont(",\n");
+
+ if (fbx->flags & MALI_EXTRA_AFBC_ZS) {
+ pandecode_log(".ds_afbc = {\n");
+ pandecode_indent++;
+
+ MEMORY_PROP((&fbx->ds_afbc), depth_stencil_afbc_metadata);
+ pandecode_prop("depth_stencil_afbc_stride = %d",
+ fbx->ds_afbc.depth_stencil_afbc_stride);
+ MEMORY_PROP((&fbx->ds_afbc), depth_stencil);
+
+ if (fbx->ds_afbc.zero1 || fbx->ds_afbc.padding) {
+ pandecode_msg("Depth/stencil AFBC zeros tripped\n");
+ pandecode_prop("zero1 = 0x%" PRIx32,
+ fbx->ds_afbc.zero1);
+ pandecode_prop("padding = 0x%" PRIx64,
+ fbx->ds_afbc.padding);
+ }
+
+ pandecode_indent--;
+ pandecode_log("},\n");
+ } else {
+ pandecode_log(".ds_linear = {\n");
+ pandecode_indent++;
+
+ if (fbx->ds_linear.depth) {
+ MEMORY_PROP((&fbx->ds_linear), depth);
+ pandecode_prop("depth_stride = %d",
+ fbx->ds_linear.depth_stride);
+ }
+
+ if (fbx->ds_linear.stencil) {
+ MEMORY_PROP((&fbx->ds_linear), stencil);
+ pandecode_prop("stencil_stride = %d",
+ fbx->ds_linear.stencil_stride);
+ }
+
+ if (fbx->ds_linear.depth_stride_zero ||
+ fbx->ds_linear.stencil_stride_zero ||
+ fbx->ds_linear.zero1 || fbx->ds_linear.zero2) {
+ pandecode_msg("Depth/stencil zeros tripped\n");
+ pandecode_prop("depth_stride_zero = 0x%x",
+ fbx->ds_linear.depth_stride_zero);
+ pandecode_prop("stencil_stride_zero = 0x%x",
+ fbx->ds_linear.stencil_stride_zero);
+ pandecode_prop("zero1 = 0x%" PRIx32,
+ fbx->ds_linear.zero1);
+ pandecode_prop("zero2 = 0x%" PRIx32,
+ fbx->ds_linear.zero2);
+ }
+
+ pandecode_indent--;
+ pandecode_log("},\n");
+ }
+
+ if (fbx->zero3 || fbx->zero4) {
+ pandecode_msg("fb_extra zeros tripped\n");
+ pandecode_prop("zero3 = 0x%" PRIx64, fbx->zero3);
+ pandecode_prop("zero4 = 0x%" PRIx64, fbx->zero4);
+ }
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+
+ gpu_va += sizeof(struct bifrost_fb_extra);
+ }
+
+ if (with_render_targets)
+ pandecode_render_target(gpu_va, job_no, fb);
+
+ /* Pass back the render target count */
+ return MALI_NEGATIVE(fb->rt_count_1);
+}
+
+/* Just add a comment decoding the shift/odd fields forming the padded vertices
+ * count */
+
+static void
+pandecode_padded_vertices(unsigned shift, unsigned k)
+{
+ unsigned odd = 2*k + 1;
+ unsigned pot = 1 << shift;
+ pandecode_msg("padded_num_vertices = %d\n", odd * pot);
+}
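+
+/* Worked example (field values invented for illustration): shift = 2 and
+ * k = 1 decode to odd = 3 and pot = 4, i.e. padded_num_vertices = 12. */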
+
+/* Given a magic divisor, recover what we were trying to divide by.
+ *
+ * Let m represent the magic divisor. By definition, m is an element of Z, where
+ * 0 <= m < 2^N, for N bits in m.
+ *
+ * Let q represent the number we would like to divide by.
+ *
+ * By definition of a magic divisor for N-bit unsigned integers (a number you
+ * multiply by to magically get division), m is a number such that:
+ *
+ * (m * x) & (2^N - 1) = floor(x/q)
+ * for all x in Z where 0 <= x < 2^N.
+ *
+ * Ignore the case where any of the above values equals zero; it is irrelevant
+ * for our purposes (instanced arrays).
+ *
+ * Choose x = q. Then:
+ *
+ * (m * x) & (2^N - 1) = floor(x/q).
+ * (m * q) & (2^N - 1) = floor(q/q).
+ *
+ * floor(q/q) = floor(1) = 1, therefore:
+ *
+ * (m * q) & (2^N - 1) = 1
+ *
+ * Recall the identity that ANDing with one less than a power of two is the
+ * same as reducing modulo that power of two, i.e. for all x:
+ *
+ * x & (2^N - 1) = x % 2^N
+ *
+ * Therefore:
+ *
+ * mq % (2^N) = 1
+ *
+ * By definition, the modular multiplicative inverse of a number m is the number
+ * q such that, with respect to a modulus M:
+ *
+ * mq % M = 1
+ *
+ * Therefore, q is the modular multiplicative inverse of m with modulus 2^N.
+ *
+ */
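+
+/* A small sanity check with N = 4 bits (numbers invented for illustration):
+ * to divide by q = 3, the derivation above needs a magic m with
+ * (m * q) & 15 = 1. m = 11 works, since 33 & 15 = 1, and indeed 3 is the
+ * modular inverse of 11 modulo 16. */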
+
+static void
+pandecode_magic_divisor(uint32_t magic, unsigned shift, unsigned orig_divisor, unsigned extra)
+{
+ /* Compute the modular inverse of `magic` with respect to 2^(32 -
+ * shift) the most lame way possible... just repeatedly add.
+ * Asymptotically slow but nobody cares in practice, unless you have
+ * massive numbers of vertices or high divisors. */
+
+ unsigned inverse = 0;
+
+ /* Magic implicitly has the highest bit set */
+ magic |= (1 << 31);
+
+ /* Depending on rounding direction */
+ if (extra)
+ magic++;
+
+ for (;;) {
+ uint32_t product = magic * inverse;
+
+ if (shift) {
+ product >>= shift;
+ }
+
+ if (product == 1)
+ break;
+
+ ++inverse;
+ }
+
+ pandecode_msg("dividing by %d (maybe off by two)\n", inverse);
+
+ /* Recall we're supposed to divide by (gl_level_divisor *
+ * padded_num_vertices) */
+
+ unsigned padded_num_vertices = inverse / orig_divisor;
+
+ pandecode_msg("padded_num_vertices = %d\n", padded_num_vertices);
+}
+
+static void
+pandecode_replay_attributes(const struct pandecode_mapped_memory *mem,
+ mali_ptr addr, int job_no, char *suffix,
+ int count, bool varying)
+{
+ char *prefix = varying ? "varyings" : "attributes";
+
+ union mali_attr *attr = pandecode_fetch_gpu_mem(mem, addr, sizeof(union mali_attr) * count);
+
+ char base[128];
+ snprintf(base, sizeof(base), "%s_data_%d%s", prefix, job_no, suffix);
+
+ for (int i = 0; i < count; ++i) {
+ enum mali_attr_mode mode = attr[i].elements & 7;
+
+ if (mode == MALI_ATTR_UNUSED)
+ continue;
+
+ mali_ptr raw_elements = attr[i].elements & ~7;
+
+ /* TODO: Do we maybe want to dump the attribute values
+ * themselves given the specified format? Or is that too hard?
+ */
+
+ char *a = pointer_as_memory_reference(raw_elements);
+ pandecode_log("mali_ptr %s_%d_p = %s;\n", base, i, a);
+ free(a);
+ }
+
+ pandecode_log("union mali_attr %s_%d[] = {\n", prefix, job_no);
+ pandecode_indent++;
+
+ for (int i = 0; i < count; ++i) {
+ pandecode_log("{\n");
+ pandecode_indent++;
+
+ unsigned mode = attr[i].elements & 7;
+ pandecode_prop("elements = (%s_%d_p) | %s", base, i, pandecode_attr_mode_name(mode));
+ pandecode_prop("shift = %d", attr[i].shift);
+ pandecode_prop("extra_flags = %d", attr[i].extra_flags);
+ pandecode_prop("stride = 0x%" PRIx32, attr[i].stride);
+ pandecode_prop("size = 0x%" PRIx32, attr[i].size);
+
+ /* Decode further where possible */
+
+ if (mode == MALI_ATTR_MODULO) {
+ pandecode_padded_vertices(
+ attr[i].shift,
+ attr[i].extra_flags);
+ }
+
+ pandecode_indent--;
+ pandecode_log("}, \n");
+
+ if (mode == MALI_ATTR_NPOT_DIVIDE) {
+ i++;
+ pandecode_log("{\n");
+ pandecode_indent++;
+ pandecode_prop("unk = 0x%x", attr[i].unk);
+ pandecode_prop("magic_divisor = 0x%08x", attr[i].magic_divisor);
+ if (attr[i].zero != 0)
+ pandecode_prop("zero = 0x%x /* XXX zero tripped */", attr[i].zero);
+ pandecode_prop("divisor = %d", attr[i].divisor);
+ pandecode_magic_divisor(attr[i].magic_divisor, attr[i - 1].shift, attr[i].divisor, attr[i - 1].extra_flags);
+ pandecode_indent--;
+ pandecode_log("}, \n");
+ }
+
+ }
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+}
+
+static mali_ptr
+pandecode_replay_shader_address(const char *name, mali_ptr ptr)
+{
+ /* TODO: Decode flags */
+ mali_ptr shader_ptr = ptr & ~15;
+
+ char *a = pointer_as_memory_reference(shader_ptr);
+ pandecode_prop("%s = (%s) | %d", name, a, (int) (ptr & 15));
+ free(a);
+
+ return shader_ptr;
+}
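+
+/* e.g. a descriptor word of 0x10005 (value invented for illustration) splits
+ * into the 16-byte-aligned code pointer 0x10000 and flag bits 5, printed as
+ * ".shader = (<memory reference>) | 5,". */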
+
+static bool
+all_zero(unsigned *buffer, unsigned count)
+{
+ for (unsigned i = 0; i < count; ++i) {
+ if (buffer[i])
+ return false;
+ }
+
+ return true;
+}
+
+static void
+pandecode_replay_stencil(const char *name, const struct mali_stencil_test *stencil)
+{
+ if (all_zero((unsigned *) stencil, sizeof(*stencil) / sizeof(unsigned)))
+ return;
+
+ const char *func = pandecode_func_name(stencil->func);
+ const char *sfail = pandecode_stencil_op_name(stencil->sfail);
+ const char *dpfail = pandecode_stencil_op_name(stencil->dpfail);
+ const char *dppass = pandecode_stencil_op_name(stencil->dppass);
+
+ if (stencil->zero)
+ pandecode_msg("Stencil zero tripped: %X\n", stencil->zero);
+
+ pandecode_log(".stencil_%s = {\n", name);
+ pandecode_indent++;
+ pandecode_prop("ref = %d", stencil->ref);
+ pandecode_prop("mask = 0x%02X", stencil->mask);
+ pandecode_prop("func = %s", func);
+ pandecode_prop("sfail = %s", sfail);
+ pandecode_prop("dpfail = %s", dpfail);
+ pandecode_prop("dppass = %s", dppass);
+ pandecode_indent--;
+ pandecode_log("},\n");
+}
+
+static void
+pandecode_replay_blend_equation(const struct mali_blend_equation *blend)
+{
+ if (blend->zero1)
+ pandecode_msg("Blend zero tripped: %X\n", blend->zero1);
+
+ pandecode_log(".equation = {\n");
+ pandecode_indent++;
+
+ pandecode_prop("rgb_mode = 0x%X", blend->rgb_mode);
+ pandecode_prop("alpha_mode = 0x%X", blend->alpha_mode);
+
+ pandecode_log(".color_mask = ");
+ pandecode_log_decoded_flags(mask_flag_info, blend->color_mask);
+ pandecode_log_cont(",\n");
+
+ pandecode_indent--;
+ pandecode_log("},\n");
+}
+
+/* Decodes a Bifrost blend constant. See the notes in bifrost_blend_rt */
+
+static float
+decode_bifrost_constant(u16 constant)
+{
+ float lo = (float) (constant & 0xFF);
+ float hi = (float) (constant >> 8);
+
+ return (hi / 255.0) + (lo / 65535.0);
+}
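+
+/* For instance (values picked for illustration): constant = 0xFF00 decodes to
+ * 255/255 + 0/65535 = 1.0, and constant = 0x8000 decodes to 128/255, which is
+ * roughly 0.502. */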
+
+static mali_ptr
+pandecode_bifrost_blend(void *descs, int job_no, int rt_no)
+{
+ struct bifrost_blend_rt *b =
+ ((struct bifrost_blend_rt *) descs) + rt_no;
+
+ pandecode_log("struct bifrost_blend_rt blend_rt_%d_%d = {\n", job_no, rt_no);
+ pandecode_indent++;
+
+ pandecode_prop("flags = 0x%" PRIx16, b->flags);
+ pandecode_prop("constant = 0x%" PRIx8 " /* %f */",
+ b->constant, decode_bifrost_constant(b->constant));
+
+ /* TODO figure out blend shader enable bit */
+ pandecode_replay_blend_equation(&b->equation);
+ pandecode_prop("unk2 = 0x%" PRIx16, b->unk2);
+ pandecode_prop("index = 0x%" PRIx16, b->index);
+ pandecode_prop("shader = 0x%" PRIx32, b->shader);
+
+ pandecode_indent--;
+ pandecode_log("},\n");
+
+ return 0;
+}
+
+static mali_ptr
+pandecode_midgard_blend(union midgard_blend *blend, bool is_shader)
+{
+ if (all_zero((unsigned *) blend, sizeof(*blend) / sizeof(unsigned)))
+ return 0;
+
+ pandecode_log(".blend = {\n");
+ pandecode_indent++;
+
+ if (is_shader) {
+ pandecode_replay_shader_address("shader", blend->shader);
+ } else {
+ pandecode_replay_blend_equation(&blend->equation);
+ pandecode_prop("constant = %f", blend->constant);
+ }
+
+ pandecode_indent--;
+ pandecode_log("},\n");
+
+ /* Return blend shader to disassemble if present */
+ return is_shader ? (blend->shader & ~0xF) : 0;
+}
+
+static mali_ptr
+pandecode_midgard_blend_mrt(void *descs, int job_no, int rt_no)
+{
+ struct midgard_blend_rt *b =
+ ((struct midgard_blend_rt *) descs) + rt_no;
+
+ /* Flags determine presence of blend shader */
+ bool is_shader = (b->flags & 0xF) >= 0x2;
+
+ pandecode_log("struct midgard_blend_rt blend_rt_%d_%d = {\n", job_no, rt_no);
+ pandecode_indent++;
+
+ pandecode_prop("flags = 0x%" PRIx64, b->flags);
+
+ mali_ptr shader = pandecode_midgard_blend(&b->blend, is_shader);
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+
+ return shader;
+}
+
+static int
+pandecode_replay_attribute_meta(int job_no, int count, const struct mali_vertex_tiler_postfix *v, bool varying, char *suffix)
+{
+ char base[128];
+ char *prefix = varying ? "varying" : "attribute";
+ unsigned max_index = 0;
+ snprintf(base, sizeof(base), "%s_meta", prefix);
+
+ pandecode_log("struct mali_attr_meta %s_%d%s[] = {\n", base, job_no, suffix);
+ pandecode_indent++;
+
+ struct mali_attr_meta *attr_meta;
+ mali_ptr p = varying ? (v->varying_meta & ~0xF) : v->attribute_meta;
+
+ struct pandecode_mapped_memory *attr_mem = pandecode_find_mapped_gpu_mem_containing(p);
+
+ for (int i = 0; i < count; ++i, p += sizeof(struct mali_attr_meta)) {
+ attr_meta = pandecode_fetch_gpu_mem(attr_mem, p,
+ sizeof(*attr_meta));
+
+ pandecode_log("{\n");
+ pandecode_indent++;
+ pandecode_prop("index = %d", attr_meta->index);
+
+ if (attr_meta->index > max_index)
+ max_index = attr_meta->index;
+ pandecode_replay_swizzle(attr_meta->swizzle);
+ pandecode_prop("format = %s", pandecode_format_name(attr_meta->format));
+
+ pandecode_prop("unknown1 = 0x%" PRIx64, (u64) attr_meta->unknown1);
+ pandecode_prop("unknown3 = 0x%" PRIx64, (u64) attr_meta->unknown3);
+ pandecode_prop("src_offset = %d", attr_meta->src_offset);
+ pandecode_indent--;
+ pandecode_log("},\n");
+
+ }
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+
+ return max_index;
+}
+
+static void
+pandecode_replay_indices(uintptr_t pindices, uint32_t index_count, int job_no)
+{
+ struct pandecode_mapped_memory *imem = pandecode_find_mapped_gpu_mem_containing(pindices);
+
+ if (imem) {
+ /* Indices are literally just a u32 array :) */
+
+ uint32_t *PANDECODE_PTR_VAR(indices, imem, pindices);
+
+ pandecode_log("uint32_t indices_%d[] = {\n", job_no);
+ pandecode_indent++;
+
+ for (unsigned i = 0; i < (index_count + 1); i += 3)
+ pandecode_log("%d, %d, %d,\n",
+ indices[i],
+ indices[i + 1],
+ indices[i + 2]);
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+ }
+}
+
+/* return bits [lo, hi) of word */
+static u32
+bits(u32 word, u32 lo, u32 hi)
+{
+ if (hi - lo >= 32)
+ return word; // avoid undefined behavior with the shift
+
+ return (word >> lo) & ((1 << (hi - lo)) - 1);
+}
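+
+/* e.g. bits(0x34, 2, 5) extracts bits 2..4 of 0b110100, giving 0b101 = 5. */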
+
+static void
+pandecode_replay_vertex_tiler_prefix(struct mali_vertex_tiler_prefix *p, int job_no)
+{
+ pandecode_log_cont("{\n");
+ pandecode_indent++;
+
+ pandecode_prop("invocation_count = 0x%" PRIx32, p->invocation_count);
+ pandecode_prop("size_y_shift = %d", p->size_y_shift);
+ pandecode_prop("size_z_shift = %d", p->size_z_shift);
+ pandecode_prop("workgroups_x_shift = %d", p->workgroups_x_shift);
+ pandecode_prop("workgroups_y_shift = %d", p->workgroups_y_shift);
+ pandecode_prop("workgroups_z_shift = %d", p->workgroups_z_shift);
+ pandecode_prop("workgroups_x_shift_2 = 0x%" PRIx32, p->workgroups_x_shift_2);
+
+ /* Decode invocation_count. See the comment before the definition of
+ * invocation_count for an explanation.
+ */
+ pandecode_msg("size: (%d, %d, %d)\n",
+ bits(p->invocation_count, 0, p->size_y_shift) + 1,
+ bits(p->invocation_count, p->size_y_shift, p->size_z_shift) + 1,
+ bits(p->invocation_count, p->size_z_shift,
+ p->workgroups_x_shift) + 1);
+ pandecode_msg("workgroups: (%d, %d, %d)\n",
+ bits(p->invocation_count, p->workgroups_x_shift,
+ p->workgroups_y_shift) + 1,
+ bits(p->invocation_count, p->workgroups_y_shift,
+ p->workgroups_z_shift) + 1,
+ bits(p->invocation_count, p->workgroups_z_shift,
+ 32) + 1);
+
+ /* TODO: Decode */
+ if (p->unknown_draw)
+ pandecode_prop("unknown_draw = 0x%" PRIx32, p->unknown_draw);
+
+ pandecode_prop("workgroups_x_shift_3 = 0x%" PRIx32, p->workgroups_x_shift_3);
+
+ pandecode_prop("draw_mode = %s", pandecode_draw_mode_name(p->draw_mode));
+
+ /* Index count only exists for tiler jobs anyway */
+
+ if (p->index_count)
+ pandecode_prop("index_count = MALI_POSITIVE(%" PRId32 ")", p->index_count + 1);
+
+ if (p->negative_start)
+ pandecode_prop("negative_start = %d", p->negative_start);
+
+ DYN_MEMORY_PROP(p, job_no, indices);
+
+ if (p->zero1) {
+ pandecode_msg("Zero tripped\n");
+ pandecode_prop("zero1 = 0x%" PRIx32, p->zero1);
+ }
+
+ pandecode_indent--;
+ pandecode_log("},\n");
+}
+
+static void
+pandecode_replay_uniform_buffers(mali_ptr pubufs, int ubufs_count, int job_no)
+{
+ struct pandecode_mapped_memory *umem = pandecode_find_mapped_gpu_mem_containing(pubufs);
+
+ struct mali_uniform_buffer_meta *PANDECODE_PTR_VAR(ubufs, umem, pubufs);
+
+ for (int i = 0; i < ubufs_count; i++) {
+ mali_ptr ptr = ubufs[i].ptr << 2;
+ struct pandecode_mapped_memory *umem2 = pandecode_find_mapped_gpu_mem_containing(ptr);
+ uint32_t *PANDECODE_PTR_VAR(ubuf, umem2, ptr);
+ char name[50];
+ snprintf(name, sizeof(name), "ubuf_%d", i);
+ /* The blob uses ubuf 0 to upload internal state and uniforms
+ * that won't fit elsewhere or are accessed indirectly, so it
+ * places that buffer in the batchbuffer.
+ */
+ pandecode_log("uint32_t %s_%d[] = {\n", name, job_no);
+ pandecode_indent++;
+
+ for (int j = 0; j <= ubufs[i].size; j++) {
+ for (int k = 0; k < 4; k++) {
+ if (k == 0)
+ pandecode_log("0x%"PRIx32", ", ubuf[4 * j + k]);
+ else
+ pandecode_log_cont("0x%"PRIx32", ", ubuf[4 * j + k]);
+
+ }
+
+ pandecode_log_cont("\n");
+ }
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+ }
+
+ pandecode_log("struct mali_uniform_buffer_meta uniform_buffers_%d[] = {\n",
+ job_no);
+ pandecode_indent++;
+
+ for (int i = 0; i < ubufs_count; i++) {
+ pandecode_log("{\n");
+ pandecode_indent++;
+ pandecode_prop("size = MALI_POSITIVE(%d)", ubufs[i].size + 1);
+ pandecode_prop("ptr = ubuf_%d_%d_p >> 2", i, job_no);
+ pandecode_indent--;
+ pandecode_log("},\n");
+ }
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+}
+
+static void
+pandecode_replay_scratchpad(uintptr_t pscratchpad, int job_no, char *suffix)
+{
+
+ struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(pscratchpad);
+
+ struct bifrost_scratchpad *PANDECODE_PTR_VAR(scratchpad, mem, pscratchpad);
+
+ if (scratchpad->zero)
+ pandecode_msg("XXX scratchpad zero tripped");
+
+ pandecode_log("struct bifrost_scratchpad scratchpad_%"PRIx64"_%d%s = {\n", pscratchpad, job_no, suffix);
+ pandecode_indent++;
+
+ pandecode_prop("flags = 0x%x", scratchpad->flags);
+ MEMORY_PROP(scratchpad, gpu_scratchpad);
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+}
+
+static void
+pandecode_shader_disassemble(mali_ptr shader_ptr, int shader_no, int type,
+ bool is_bifrost)
+{
+ struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(shader_ptr);
+ uint8_t *PANDECODE_PTR_VAR(code, mem, shader_ptr);
+
+ /* Compute maximum possible size */
+ size_t sz = mem->length - (shader_ptr - mem->gpu_va);
+
+ /* Print some boilerplate to clearly denote the assembly (which doesn't
+ * obey indentation rules), and actually do the disassembly! */
+
+ printf("\n\n");
+
+ if (is_bifrost) {
+ disassemble_bifrost(code, sz, false);
+ } else {
+ disassemble_midgard(code, sz);
+ }
+
+ printf("\n\n");
+}
+
+static void
+pandecode_replay_vertex_tiler_postfix_pre(const struct mali_vertex_tiler_postfix *p,
+ int job_no, enum mali_job_type job_type,
+ char *suffix, bool is_bifrost)
+{
+ mali_ptr shader_meta_ptr = (u64) (uintptr_t) (p->_shader_upper << 4);
+ struct pandecode_mapped_memory *attr_mem;
+
+ unsigned rt_count = 1;
+
+ /* On Bifrost, since the tiler heap (for tiler jobs) and the scratchpad
+ * are the only things actually needed from the FBD, vertex/tiler jobs
+ * no longer reference the FBD -- instead, this field points to some
+ * info about the scratchpad.
+ */
+ if (is_bifrost)
+ pandecode_replay_scratchpad(p->framebuffer & ~FBD_TYPE, job_no, suffix);
+ else if (p->framebuffer & MALI_MFBD)
+ rt_count = pandecode_replay_mfbd_bfr((u64) ((uintptr_t) p->framebuffer) & FBD_MASK, job_no, false);
+ else if (job_type == JOB_TYPE_COMPUTE)
+ pandecode_compute_fbd((u64) (uintptr_t) p->framebuffer, job_no);
+ else
+ pandecode_replay_sfbd((u64) (uintptr_t) p->framebuffer, job_no);
+
+ int varying_count = 0, attribute_count = 0, uniform_count = 0, uniform_buffer_count = 0;
+ int texture_count = 0, sampler_count = 0;
+
+ if (shader_meta_ptr) {
+ struct pandecode_mapped_memory *smem = pandecode_find_mapped_gpu_mem_containing(shader_meta_ptr);
+ struct mali_shader_meta *PANDECODE_PTR_VAR(s, smem, shader_meta_ptr);
+
+ pandecode_log("struct mali_shader_meta shader_meta_%"PRIx64"_%d%s = {\n", shader_meta_ptr, job_no, suffix);
+ pandecode_indent++;
+
+ /* Save for dumps */
+ attribute_count = s->attribute_count;
+ varying_count = s->varying_count;
+ texture_count = s->texture_count;
+ sampler_count = s->sampler_count;
+
+ if (is_bifrost) {
+ uniform_count = s->bifrost2.uniform_count;
+ uniform_buffer_count = s->bifrost1.uniform_buffer_count;
+ } else {
+ uniform_count = s->midgard1.uniform_count;
+ uniform_buffer_count = s->midgard1.uniform_buffer_count;
+ }
+
+ mali_ptr shader_ptr = pandecode_replay_shader_address("shader", s->shader);
+
+ pandecode_prop("texture_count = %" PRId16, s->texture_count);
+ pandecode_prop("sampler_count = %" PRId16, s->sampler_count);
+ pandecode_prop("attribute_count = %" PRId16, s->attribute_count);
+ pandecode_prop("varying_count = %" PRId16, s->varying_count);
+
+ if (is_bifrost) {
+ pandecode_log(".bifrost1 = {\n");
+ pandecode_indent++;
+
+ pandecode_prop("uniform_buffer_count = %" PRId32, s->bifrost1.uniform_buffer_count);
+ pandecode_prop("unk1 = 0x%" PRIx32, s->bifrost1.unk1);
+
+ pandecode_indent--;
+ pandecode_log("},\n");
+ } else {
+ pandecode_log(".midgard1 = {\n");
+ pandecode_indent++;
+
+ pandecode_prop("uniform_count = %" PRId16, s->midgard1.uniform_count);
+ pandecode_prop("uniform_buffer_count = %" PRId16, s->midgard1.uniform_buffer_count);
+ pandecode_prop("work_count = %" PRId16, s->midgard1.work_count);
+
+ pandecode_log(".flags = ");
+ pandecode_log_decoded_flags(shader_midgard1_flag_info, s->midgard1.flags);
+ pandecode_log_cont(",\n");
+
+ pandecode_prop("unknown2 = 0x%" PRIx32, s->midgard1.unknown2);
+
+ pandecode_indent--;
+ pandecode_log("},\n");
+ }
+
+ if (s->depth_units || s->depth_factor) {
+ if (is_bifrost)
+ pandecode_prop("depth_units = %f", s->depth_units);
+ else
+ pandecode_prop("depth_units = MALI_NEGATIVE(%f)", s->depth_units - 1.0f);
+
+ pandecode_prop("depth_factor = %f", s->depth_factor);
+ }
+
+ if (s->alpha_coverage) {
+ bool invert_alpha_coverage = s->alpha_coverage & 0xFFF0;
+ uint16_t inverted_coverage = invert_alpha_coverage ? ~s->alpha_coverage : s->alpha_coverage;
+
+ pandecode_prop("alpha_coverage = %sMALI_ALPHA_COVERAGE(%f)",
+ invert_alpha_coverage ? "~" : "",
+ MALI_GET_ALPHA_COVERAGE(inverted_coverage));
+ }
+
+ if (s->unknown2_3 || s->unknown2_4) {
+ pandecode_log(".unknown2_3 = ");
+
+ int unknown2_3 = s->unknown2_3;
+ int unknown2_4 = s->unknown2_4;
+
+ /* We're not quite sure what these flags mean without the depth test, if anything */
+
+ if (unknown2_3 & (MALI_DEPTH_TEST | MALI_DEPTH_FUNC_MASK)) {
+ const char *func = pandecode_func_name(MALI_GET_DEPTH_FUNC(unknown2_3));
+ unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
+
+ pandecode_log_cont("MALI_DEPTH_FUNC(%s) | ", func);
+ }
+
+ pandecode_log_decoded_flags(u3_flag_info, unknown2_3);
+ pandecode_log_cont(",\n");
+
+ pandecode_log(".unknown2_4 = ");
+ pandecode_log_decoded_flags(u4_flag_info, unknown2_4);
+ pandecode_log_cont(",\n");
+ }
+
+ if (s->stencil_mask_front || s->stencil_mask_back) {
+ pandecode_prop("stencil_mask_front = 0x%02X", s->stencil_mask_front);
+ pandecode_prop("stencil_mask_back = 0x%02X", s->stencil_mask_back);
+ }
+
+ pandecode_replay_stencil("front", &s->stencil_front);
+ pandecode_replay_stencil("back", &s->stencil_back);
+
+ if (is_bifrost) {
+ pandecode_log(".bifrost2 = {\n");
+ pandecode_indent++;
+
+ pandecode_prop("unk3 = 0x%" PRIx32, s->bifrost2.unk3);
+ pandecode_prop("preload_regs = 0x%" PRIx32, s->bifrost2.preload_regs);
+ pandecode_prop("uniform_count = %" PRId32, s->bifrost2.uniform_count);
+ pandecode_prop("unk4 = 0x%" PRIx32, s->bifrost2.unk4);
+
+ pandecode_indent--;
+ pandecode_log("},\n");
+ } else if (s->midgard2.unknown2_7) {
+ pandecode_log(".midgard2 = {\n");
+ pandecode_indent++;
+
+ pandecode_prop("unknown2_7 = 0x%" PRIx32, s->midgard2.unknown2_7);
+ pandecode_indent--;
+ pandecode_log("},\n");
+ }
+
+ if (s->unknown2_8)
+ pandecode_prop("unknown2_8 = 0x%" PRIx32, s->unknown2_8);
+
+ if (!is_bifrost) {
+ /* TODO: Blend shaders routing/disasm */
+
+ pandecode_midgard_blend(&s->blend, false);
+ }
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+
+ /* MRT blend fields are used whenever MFBD is used, with
+ * per-RT descriptors */
+
+ if (job_type == JOB_TYPE_TILER) {
+ void* blend_base = (void *) (s + 1);
+
+ for (unsigned i = 0; i < rt_count; i++) {
+ mali_ptr shader = 0;
+
+ if (is_bifrost)
+ shader = pandecode_bifrost_blend(blend_base, job_no, i);
+ else
+ shader = pandecode_midgard_blend_mrt(blend_base, job_no, i);
+
+ if (shader)
+ pandecode_shader_disassemble(shader, job_no, job_type, false);
+ }
+ }
+
+ pandecode_shader_disassemble(shader_ptr, job_no, job_type, is_bifrost);
+ } else
+ pandecode_msg("<no shader>\n");
+
+ if (p->viewport) {
+ struct pandecode_mapped_memory *fmem = pandecode_find_mapped_gpu_mem_containing(p->viewport);
+ struct mali_viewport *PANDECODE_PTR_VAR(f, fmem, p->viewport);
+
+ pandecode_log("struct mali_viewport viewport_%d%s = {\n", job_no, suffix);
+ pandecode_indent++;
+
+ pandecode_prop("clip_minx = %f", f->clip_minx);
+ pandecode_prop("clip_miny = %f", f->clip_miny);
+ pandecode_prop("clip_minz = %f", f->clip_minz);
+ pandecode_prop("clip_maxx = %f", f->clip_maxx);
+ pandecode_prop("clip_maxy = %f", f->clip_maxy);
+ pandecode_prop("clip_maxz = %f", f->clip_maxz);
+
+ /* Only the higher coordinates are MALI_POSITIVE scaled */
+
+ pandecode_prop("viewport0 = { %d, %d }",
+ f->viewport0[0], f->viewport0[1]);
+
+ pandecode_prop("viewport1 = { MALI_POSITIVE(%d), MALI_POSITIVE(%d) }",
+ f->viewport1[0] + 1, f->viewport1[1] + 1);
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+ }
+
+ if (p->attribute_meta) {
+ unsigned max_attr_index = pandecode_replay_attribute_meta(job_no, attribute_count, p, false, suffix);
+
+ attr_mem = pandecode_find_mapped_gpu_mem_containing(p->attributes);
+ pandecode_replay_attributes(attr_mem, p->attributes, job_no, suffix, max_attr_index + 1, false);
+ }
+
+ /* Varyings are encoded like attributes but not actually sent; we just
+ * pass a zero-filled buffer with the right stride/size set (or whatever),
+ * since the GPU will write to it itself */
+
+ if (p->varyings) {
+ attr_mem = pandecode_find_mapped_gpu_mem_containing(p->varyings);
+
+ /* Number of descriptors depends on whether there are
+ * non-internal varyings */
+
+ pandecode_replay_attributes(attr_mem, p->varyings, job_no, suffix, varying_count > 1 ? 4 : 1, true);
+ }
+
+ if (p->varying_meta) {
+ pandecode_replay_attribute_meta(job_no, varying_count, p, true, suffix);
+ }
+
+ bool is_compute = job_type == JOB_TYPE_COMPUTE;
+
+ if (p->uniforms && !is_compute) {
+ int rows = uniform_count, width = 4;
+ size_t sz = rows * width * sizeof(float);
+
+ struct pandecode_mapped_memory *uniform_mem = pandecode_find_mapped_gpu_mem_containing(p->uniforms);
+ pandecode_fetch_gpu_mem(uniform_mem, p->uniforms, sz);
+ u32 *PANDECODE_PTR_VAR(uniforms, uniform_mem, p->uniforms);
+
+ pandecode_log("u32 uniforms_%d%s[] = {\n", job_no, suffix);
+
+ pandecode_indent++;
+
+ for (int row = 0; row < rows; row++) {
+ for (int i = 0; i < width; i++) {
+ u32 v = uniforms[i];
+ float f;
+ memcpy(&f, &v, sizeof(v));
+ pandecode_log_cont("%X /* %f */, ", v, f);
+ }
+
+ pandecode_log_cont("\n");
+
+ uniforms += width;
+ }
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+ } else if (p->uniforms) {
+ int rows = uniform_count * 2;
+ size_t sz = rows * sizeof(mali_ptr);
+
+ struct pandecode_mapped_memory *uniform_mem = pandecode_find_mapped_gpu_mem_containing(p->uniforms);
+ pandecode_fetch_gpu_mem(uniform_mem, p->uniforms, sz);
+ mali_ptr *PANDECODE_PTR_VAR(uniforms, uniform_mem, p->uniforms);
+
+ pandecode_log("mali_ptr uniforms_%d%s[] = {\n", job_no, suffix);
+
+ pandecode_indent++;
+
+ for (int row = 0; row < rows; row++) {
+ char *a = pointer_as_memory_reference(uniforms[row]);
+ pandecode_log("%s,\n", a);
+ free(a);
+ }
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+
+ }
+
+ if (p->uniform_buffers) {
+ pandecode_replay_uniform_buffers(p->uniform_buffers, uniform_buffer_count, job_no);
+ }
+
+ if (p->texture_trampoline) {
+ struct pandecode_mapped_memory *mmem = pandecode_find_mapped_gpu_mem_containing(p->texture_trampoline);
+
+ if (mmem) {
+ mali_ptr *PANDECODE_PTR_VAR(u, mmem, p->texture_trampoline);
+
+ pandecode_log("uint64_t texture_trampoline_%d[] = {\n", job_no);
+ pandecode_indent++;
+
+ for (int tex = 0; tex < texture_count; ++tex) {
+ mali_ptr *PANDECODE_PTR_VAR(u, mmem, p->texture_trampoline + tex * sizeof(mali_ptr));
+ char *a = pointer_as_memory_reference(*u);
+ pandecode_log("%s,\n", a);
+ free(a);
+ }
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+
+ /* Now, finally, descend down into the texture descriptor */
+ for (int tex = 0; tex < texture_count; ++tex) {
+ mali_ptr *PANDECODE_PTR_VAR(u, mmem, p->texture_trampoline + tex * sizeof(mali_ptr));
+ struct pandecode_mapped_memory *tmem = pandecode_find_mapped_gpu_mem_containing(*u);
+
+ if (tmem) {
+ struct mali_texture_descriptor *PANDECODE_PTR_VAR(t, tmem, *u);
+
+ pandecode_log("struct mali_texture_descriptor texture_descriptor_%"PRIx64"_%d_%d = {\n", *u, job_no, tex);
+ pandecode_indent++;
+
+ pandecode_prop("width = MALI_POSITIVE(%" PRId16 ")", t->width + 1);
+ pandecode_prop("height = MALI_POSITIVE(%" PRId16 ")", t->height + 1);
+ pandecode_prop("depth = MALI_POSITIVE(%" PRId16 ")", t->depth + 1);
+ pandecode_prop("array_size = MALI_POSITIVE(%" PRId16 ")", t->array_size + 1);
+ pandecode_prop("unknown3 = %" PRId16, t->unknown3);
+ pandecode_prop("unknown3A = %" PRId8, t->unknown3A);
+ pandecode_prop("nr_mipmap_levels = %" PRId8, t->nr_mipmap_levels);
+
+ struct mali_texture_format f = t->format;
+
+ pandecode_log(".format = {\n");
+ pandecode_indent++;
+
+ pandecode_replay_swizzle(f.swizzle);
+ pandecode_prop("format = %s", pandecode_format_name(f.format));
+ pandecode_prop("type = %s", pandecode_texture_type(f.type));
+ pandecode_prop("srgb = %" PRId32, f.srgb);
+ pandecode_prop("unknown1 = %" PRId32, f.unknown1);
+ pandecode_prop("usage2 = 0x%" PRIx32, f.usage2);
+
+ pandecode_indent--;
+ pandecode_log("},\n");
+
+ pandecode_replay_swizzle(t->swizzle);
+
+ if (t->swizzle_zero) {
+ /* Shouldn't happen */
+ pandecode_msg("Swizzle zero tripped but replay will be fine anyway");
+ pandecode_prop("swizzle_zero = %d", t->swizzle_zero);
+ }
+
+ pandecode_prop("unknown3 = 0x%" PRIx32, t->unknown3);
+
+ pandecode_prop("unknown5 = 0x%" PRIx32, t->unknown5);
+ pandecode_prop("unknown6 = 0x%" PRIx32, t->unknown6);
+ pandecode_prop("unknown7 = 0x%" PRIx32, t->unknown7);
+
+ pandecode_log(".payload = {\n");
+ pandecode_indent++;
+
+ /* A bunch of bitmap pointers follow.
+ * We work out the correct number,
+ * based on the mipmap/cubemap
+ * properties, but dump extra
+ * possibilities to futureproof */
+
+ int bitmap_count = MALI_NEGATIVE(t->nr_mipmap_levels);
+ bool manual_stride = f.usage2 & MALI_TEX_MANUAL_STRIDE;
+
+ /* Miptree for each face */
+ if (f.type == MALI_TEX_CUBE)
+ bitmap_count *= 6;
+
+ /* Array of textures */
+ bitmap_count *= MALI_NEGATIVE(t->array_size);
+
+ /* Stride for each element */
+ if (manual_stride)
+ bitmap_count *= 2;
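+
+ /* Worked example (descriptor values invented for illustration):
+ * 3 mip levels on a cube map with a decoded array_size of 1 and
+ * manual strides gives 3 * 6 * 1 * 2 = 36 payload entries. */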
+
+ /* Sanity check the size */
+ int max_count = sizeof(t->payload) / sizeof(t->payload[0]);
+ assert (bitmap_count <= max_count);
+
+ /* Dump more to be safe, but not _that_ much more */
+ int safe_count = MIN2(bitmap_count * 2, max_count);
+
+ for (int i = 0; i < safe_count; ++i) {
+ char *prefix = (i >= bitmap_count) ? "// " : "";
+
+ /* How we dump depends if this is a stride or a pointer */
+
+ if ((f.usage2 & MALI_TEX_MANUAL_STRIDE) && (i & 1)) {
+ /* signed 32-bit snuck in as a 64-bit pointer */
+ uint64_t stride_set = t->payload[i];
+ uint32_t clamped_stride = stride_set;
+ int32_t stride = clamped_stride;
+ assert(stride_set == clamped_stride);
+ pandecode_log("%s(mali_ptr) %d /* stride */, \n", prefix, stride);
+ } else {
+ char *a = pointer_as_memory_reference(t->payload[i]);
+ pandecode_log("%s%s, \n", prefix, a);
+ free(a);
+ }
+ }
+
+ pandecode_indent--;
+ pandecode_log("},\n");
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+ }
+ }
+ }
+ }
+
+ if (p->sampler_descriptor) {
+ struct pandecode_mapped_memory *smem = pandecode_find_mapped_gpu_mem_containing(p->sampler_descriptor);
+
+ if (smem) {
+ struct mali_sampler_descriptor *s;
+
+ mali_ptr d = p->sampler_descriptor;
+
+ for (int i = 0; i < sampler_count; ++i) {
+ s = pandecode_fetch_gpu_mem(smem, d + sizeof(*s) * i, sizeof(*s));
+
+ pandecode_log("struct mali_sampler_descriptor sampler_descriptor_%d_%d = {\n", job_no, i);
+ pandecode_indent++;
+
+ /* Only the lower two bits are understood right now; the rest we display as hex */
+ pandecode_log(".filter_mode = MALI_TEX_MIN(%s) | MALI_TEX_MAG(%s) | 0x%" PRIx32",\n",
+ MALI_FILTER_NAME(s->filter_mode & MALI_TEX_MIN_MASK),
+ MALI_FILTER_NAME(s->filter_mode & MALI_TEX_MAG_MASK),
+ s->filter_mode & ~3);
+
+ pandecode_prop("min_lod = FIXED_16(%f)", DECODE_FIXED_16(s->min_lod));
+ pandecode_prop("max_lod = FIXED_16(%f)", DECODE_FIXED_16(s->max_lod));
+
+ pandecode_prop("wrap_s = %s", pandecode_wrap_mode_name(s->wrap_s));
+ pandecode_prop("wrap_t = %s", pandecode_wrap_mode_name(s->wrap_t));
+ pandecode_prop("wrap_r = %s", pandecode_wrap_mode_name(s->wrap_r));
+
+ pandecode_prop("compare_func = %s", pandecode_alt_func_name(s->compare_func));
+
+ if (s->zero || s->zero2) {
+ pandecode_msg("Zero tripped\n");
+ pandecode_prop("zero = 0x%X, 0x%X\n", s->zero, s->zero2);
+ }
+
+ pandecode_prop("seamless_cube_map = %d", s->seamless_cube_map);
+
+ pandecode_prop("border_color = { %f, %f, %f, %f }",
+ s->border_color[0],
+ s->border_color[1],
+ s->border_color[2],
+ s->border_color[3]);
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+ }
+ }
+ }
+}
+
+static void
+pandecode_replay_vertex_tiler_postfix(const struct mali_vertex_tiler_postfix *p, int job_no, bool is_bifrost)
+{
+ pandecode_log_cont("{\n");
+ pandecode_indent++;
+
+ MEMORY_PROP(p, position_varying);
+ DYN_MEMORY_PROP(p, job_no, uniform_buffers);
+ DYN_MEMORY_PROP(p, job_no, texture_trampoline);
+ DYN_MEMORY_PROP(p, job_no, sampler_descriptor);
+ DYN_MEMORY_PROP(p, job_no, uniforms);
+ DYN_MEMORY_PROP(p, job_no, attributes);
+ DYN_MEMORY_PROP(p, job_no, attribute_meta);
+ DYN_MEMORY_PROP(p, job_no, varyings);
+ DYN_MEMORY_PROP(p, job_no, varying_meta);
+ DYN_MEMORY_PROP(p, job_no, viewport);
+ DYN_MEMORY_PROP(p, job_no, occlusion_counter);
+
+ if (is_bifrost)
+ pandecode_prop("framebuffer = scratchpad_%d_p", job_no);
+ else
+ pandecode_prop("framebuffer = framebuffer_%d_p | %s", job_no, p->framebuffer & MALI_MFBD ? "MALI_MFBD" : "0");
+
+ pandecode_prop("_shader_upper = (shader_meta_%d_p) >> 4", job_no);
+ pandecode_prop("flags = %d", p->flags);
+
+ pandecode_indent--;
+ pandecode_log("},\n");
+}
+
+static void
+pandecode_replay_vertex_only_bfr(struct bifrost_vertex_only *v)
+{
+ pandecode_log_cont("{\n");
+ pandecode_indent++;
+
+ pandecode_prop("unk2 = 0x%x", v->unk2);
+
+ if (v->zero0 || v->zero1) {
+ pandecode_msg("vertex only zero tripped");
+ pandecode_prop("zero0 = 0x%" PRIx32, v->zero0);
+ pandecode_prop("zero1 = 0x%" PRIx64, v->zero1);
+ }
+
+ pandecode_indent--;
+ pandecode_log("}\n");
+}
+
+static void
+pandecode_replay_tiler_heap_meta(mali_ptr gpu_va, int job_no)
+{
+
+ struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(gpu_va);
+ const struct bifrost_tiler_heap_meta *PANDECODE_PTR_VAR(h, mem, gpu_va);
+
+ pandecode_log("struct mali_tiler_heap_meta tiler_heap_meta_%d = {\n", job_no);
+ pandecode_indent++;
+
+ if (h->zero) {
+ pandecode_msg("tiler heap zero tripped\n");
+ pandecode_prop("zero = 0x%x", h->zero);
+ }
+
+ for (int i = 0; i < 12; i++) {
+ if (h->zeros[i] != 0) {
+ pandecode_msg("tiler heap zero %d tripped, value %x\n",
+ i, h->zeros[i]);
+ }
+ }
+
+ pandecode_prop("heap_size = 0x%x", h->heap_size);
+ MEMORY_PROP(h, tiler_heap_start);
+ MEMORY_PROP(h, tiler_heap_free);
+
+ /* tiler_heap_end points one past the end of the tiler heap buffer, which
+ * may coincide with the beginning of another mapped buffer, so look up
+ * (end - 1) to resolve the correct mapping.
+ */
+ char *a = pointer_as_memory_reference(h->tiler_heap_end - 1);
+ pandecode_prop("tiler_heap_end = %s + 1", a);
+ free(a);
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+}
+
+static void
+pandecode_replay_tiler_meta(mali_ptr gpu_va, int job_no)
+{
+ struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(gpu_va);
+ const struct bifrost_tiler_meta *PANDECODE_PTR_VAR(t, mem, gpu_va);
+
+ pandecode_replay_tiler_heap_meta(t->tiler_heap_meta, job_no);
+
+ pandecode_log("struct bifrost_tiler_meta tiler_meta_%d = {\n", job_no);
+ pandecode_indent++;
+
+ if (t->zero0 || t->zero1) {
+ pandecode_msg("tiler meta zero tripped");
+ pandecode_prop("zero0 = 0x%" PRIx64, t->zero0);
+ pandecode_prop("zero1 = 0x%" PRIx64, t->zero1);
+ }
+
+ pandecode_prop("hierarchy_mask = 0x%" PRIx16, t->hierarchy_mask);
+ pandecode_prop("flags = 0x%" PRIx16, t->flags);
+
+ pandecode_prop("width = MALI_POSITIVE(%d)", t->width + 1);
+ pandecode_prop("height = MALI_POSITIVE(%d)", t->height + 1);
+ DYN_MEMORY_PROP(t, job_no, tiler_heap_meta);
+
+ for (int i = 0; i < 12; i++) {
+ if (t->zeros[i] != 0) {
+ pandecode_msg("tiler heap zero %d tripped, value %" PRIx64 "\n",
+ i, t->zeros[i]);
+ }
+ }
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+}
+
+static void
+pandecode_replay_gl_enables(uint32_t gl_enables, int job_type)
+{
+ pandecode_log(".gl_enables = ");
+
+ pandecode_log_decoded_flags(gl_enable_flag_info, gl_enables);
+
+ pandecode_log_cont(",\n");
+}
+
+static void
+pandecode_replay_primitive_size(union midgard_primitive_size u, bool constant)
+{
+ if (u.pointer == 0x0)
+ return;
+
+ pandecode_log(".primitive_size = {\n");
+ pandecode_indent++;
+
+ if (constant) {
+ pandecode_prop("constant = %f", u.constant);
+ } else {
+ MEMORY_PROP((&u), pointer);
+ }
+
+ pandecode_indent--;
+ pandecode_log("},\n");
+}
+
+static void
+pandecode_replay_tiler_only_bfr(const struct bifrost_tiler_only *t, int job_no)
+{
+ pandecode_log_cont("{\n");
+ pandecode_indent++;
+
+ /* TODO: gl_PointSize on Bifrost */
+ pandecode_replay_primitive_size(t->primitive_size, true);
+
+ DYN_MEMORY_PROP(t, job_no, tiler_meta);
+ pandecode_replay_gl_enables(t->gl_enables, JOB_TYPE_TILER);
+
+ if (t->zero1 || t->zero2 || t->zero3 || t->zero4 || t->zero5
+ || t->zero6 || t->zero7 || t->zero8) {
+ pandecode_msg("tiler only zero tripped");
+ pandecode_prop("zero1 = 0x%" PRIx64, t->zero1);
+ pandecode_prop("zero2 = 0x%" PRIx64, t->zero2);
+ pandecode_prop("zero3 = 0x%" PRIx64, t->zero3);
+ pandecode_prop("zero4 = 0x%" PRIx64, t->zero4);
+ pandecode_prop("zero5 = 0x%" PRIx64, t->zero5);
+ pandecode_prop("zero6 = 0x%" PRIx64, t->zero6);
+ pandecode_prop("zero7 = 0x%" PRIx32, t->zero7);
+ pandecode_prop("zero8 = 0x%" PRIx64, t->zero8);
+ }
+
+ pandecode_indent--;
+ pandecode_log("},\n");
+}
+
+static int
+pandecode_replay_vertex_job_bfr(const struct mali_job_descriptor_header *h,
+ const struct pandecode_mapped_memory *mem,
+ mali_ptr payload, int job_no)
+{
+ struct bifrost_payload_vertex *PANDECODE_PTR_VAR(v, mem, payload);
+
+ pandecode_replay_vertex_tiler_postfix_pre(&v->postfix, job_no, h->job_type, "", true);
+
+ pandecode_log("struct bifrost_payload_vertex payload_%d = {\n", job_no);
+ pandecode_indent++;
+
+ pandecode_log(".prefix = ");
+ pandecode_replay_vertex_tiler_prefix(&v->prefix, job_no);
+
+ pandecode_log(".vertex = ");
+ pandecode_replay_vertex_only_bfr(&v->vertex);
+
+ pandecode_log(".postfix = ");
+ pandecode_replay_vertex_tiler_postfix(&v->postfix, job_no, true);
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+
+ return sizeof(*v);
+}
+
+static int
+pandecode_replay_tiler_job_bfr(const struct mali_job_descriptor_header *h,
+ const struct pandecode_mapped_memory *mem,
+ mali_ptr payload, int job_no)
+{
+ struct bifrost_payload_tiler *PANDECODE_PTR_VAR(t, mem, payload);
+
+ pandecode_replay_vertex_tiler_postfix_pre(&t->postfix, job_no, h->job_type, "", true);
+
+ pandecode_replay_indices(t->prefix.indices, t->prefix.index_count, job_no);
+ pandecode_replay_tiler_meta(t->tiler.tiler_meta, job_no);
+
+ pandecode_log("struct bifrost_payload_tiler payload_%d = {\n", job_no);
+ pandecode_indent++;
+
+ pandecode_log(".prefix = ");
+ pandecode_replay_vertex_tiler_prefix(&t->prefix, job_no);
+
+ pandecode_log(".tiler = ");
+ pandecode_replay_tiler_only_bfr(&t->tiler, job_no);
+
+ pandecode_log(".postfix = ");
+ pandecode_replay_vertex_tiler_postfix(&t->postfix, job_no, true);
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+
+ return sizeof(*t);
+}
+
+static int
+pandecode_replay_vertex_or_tiler_job_mdg(const struct mali_job_descriptor_header *h,
+ const struct pandecode_mapped_memory *mem,
+ mali_ptr payload, int job_no)
+{
+ struct midgard_payload_vertex_tiler *PANDECODE_PTR_VAR(v, mem, payload);
+
+ pandecode_replay_vertex_tiler_postfix_pre(&v->postfix, job_no, h->job_type, "", false);
+
+ pandecode_replay_indices(v->prefix.indices, v->prefix.index_count, job_no);
+
+ pandecode_log("struct midgard_payload_vertex_tiler payload_%d = {\n", job_no);
+ pandecode_indent++;
+
+ bool has_primitive_pointer = v->prefix.unknown_draw & MALI_DRAW_VARYING_SIZE;
+ pandecode_replay_primitive_size(v->primitive_size, !has_primitive_pointer);
+
+ pandecode_log(".prefix = ");
+ pandecode_replay_vertex_tiler_prefix(&v->prefix, job_no);
+
+ pandecode_replay_gl_enables(v->gl_enables, h->job_type);
+
+ if (v->instance_shift || v->instance_odd) {
+                pandecode_prop("instance_shift = 0x%X /* %d */",
+ v->instance_shift, 1 << v->instance_shift);
+ pandecode_prop("instance_odd = 0x%X /* %d */",
+ v->instance_odd, (2 * v->instance_odd) + 1);
+
+ pandecode_padded_vertices(v->instance_shift, v->instance_odd);
+ }
+
+ if (v->draw_start)
+ pandecode_prop("draw_start = %d", v->draw_start);
+
+#ifndef __LP64__
+
+ if (v->zero3) {
+ pandecode_msg("Zero tripped\n");
+ pandecode_prop("zero3 = 0x%" PRIx32, v->zero3);
+ }
+
+#endif
+
+ if (v->zero5) {
+ pandecode_msg("Zero tripped\n");
+ pandecode_prop("zero5 = 0x%" PRIx64, v->zero5);
+ }
+
+ pandecode_log(".postfix = ");
+ pandecode_replay_vertex_tiler_postfix(&v->postfix, job_no, false);
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+
+ return sizeof(*v);
+}
+
+static int
+pandecode_replay_fragment_job(const struct pandecode_mapped_memory *mem,
+ mali_ptr payload, int job_no,
+ bool is_bifrost)
+{
+ const struct mali_payload_fragment *PANDECODE_PTR_VAR(s, mem, payload);
+
+ bool fbd_dumped = false;
+
+ if (!is_bifrost && (s->framebuffer & FBD_TYPE) == MALI_SFBD) {
+ /* Only SFBDs are understood, not MFBDs. We're speculating,
+                 * based on the versioning, kernel code, etc., that the
+                 * difference is between Single FrameBuffer Descriptor and
+                 * Multiple FrameBuffer Descriptor; the change apparently lines
+ * up with multi-framebuffer support being added (T7xx onwards,
+ * including Gxx). In any event, there's some field shuffling
+ * that we haven't looked into yet. */
+
+ pandecode_replay_sfbd(s->framebuffer & FBD_MASK, job_no);
+ fbd_dumped = true;
+ } else if ((s->framebuffer & FBD_TYPE) == MALI_MFBD) {
+                /* We don't know if Bifrost supports SFBDs at all, since the
+                 * driver never uses them. The format also differs from
+                 * Midgard's anyway, due to the tiler heap and scratchpad being
+ * moved out into separate structures, so it's not clear what a
+ * Bifrost SFBD would even look like without getting an actual
+ * trace, which appears impossible.
+ */
+
+ pandecode_replay_mfbd_bfr(s->framebuffer & FBD_MASK, job_no, true);
+ fbd_dumped = true;
+ }
+
+ uintptr_t p = (uintptr_t) s->framebuffer & FBD_MASK;
+ pandecode_log("struct mali_payload_fragment payload_%"PRIx64"_%d = {\n", payload, job_no);
+ pandecode_indent++;
+
+ /* See the comments by the macro definitions for mathematical context
+ * on why this is so weird */
+
+ if (MALI_TILE_COORD_FLAGS(s->max_tile_coord) || MALI_TILE_COORD_FLAGS(s->min_tile_coord))
+ pandecode_msg("Tile coordinate flag missed, replay wrong\n");
+
+ pandecode_prop("min_tile_coord = MALI_COORDINATE_TO_TILE_MIN(%d, %d)",
+ MALI_TILE_COORD_X(s->min_tile_coord) << MALI_TILE_SHIFT,
+ MALI_TILE_COORD_Y(s->min_tile_coord) << MALI_TILE_SHIFT);
+
+ pandecode_prop("max_tile_coord = MALI_COORDINATE_TO_TILE_MAX(%d, %d)",
+ (MALI_TILE_COORD_X(s->max_tile_coord) + 1) << MALI_TILE_SHIFT,
+ (MALI_TILE_COORD_Y(s->max_tile_coord) + 1) << MALI_TILE_SHIFT);
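+
+        /* As a rough worked example (assuming MALI_TILE_SHIFT is the log2 of
+         * the screen-space tile size, e.g. 4 for 16x16 tiles): a
+         * max_tile_coord naming tile (4, 3) would be printed as
+         * MALI_COORDINATE_TO_TILE_MAX(80, 64), since (4 + 1) << 4 == 80 and
+         * (3 + 1) << 4 == 64. */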
+
+ /* If the FBD was just decoded, we can refer to it by pointer. If not,
+ * we have to fallback on offsets. */
+
+ const char *fbd_type = s->framebuffer & MALI_MFBD ? "MALI_MFBD" : "MALI_SFBD";
+
+        if (fbd_dumped)
+                pandecode_prop("framebuffer = framebuffer_%d_p | %s", job_no, fbd_type);
+        else {
+                char *a = pointer_as_memory_reference(p);
+                pandecode_prop("framebuffer = %s | %s", a, fbd_type);
+                free(a);
+        }
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+
+ return sizeof(*s);
+}
+
+static int job_descriptor_number = 0;
+
+int
+pandecode_replay_jc(mali_ptr jc_gpu_va, bool bifrost)
+{
+ struct mali_job_descriptor_header *h;
+
+ int start_number = 0;
+
+ bool first = true;
+ bool last_size;
+
+ do {
+ struct pandecode_mapped_memory *mem =
+ pandecode_find_mapped_gpu_mem_containing(jc_gpu_va);
+
+ void *payload;
+
+ h = PANDECODE_PTR(mem, jc_gpu_va, struct mali_job_descriptor_header);
+
+ /* On Midgard, for 32-bit jobs except for fragment jobs, the
+ * high 32-bits of the 64-bit pointer are reused to store
+ * something else.
+ */
+ int offset = h->job_descriptor_size == MALI_JOB_32 &&
+ h->job_type != JOB_TYPE_FRAGMENT ? 4 : 0;
+ mali_ptr payload_ptr = jc_gpu_va + sizeof(*h) - offset;
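+
+                /* Concretely: for such a 32-bit job the payload begins 4 bytes
+                 * before the end of the header, overlapping what would
+                 * otherwise be the high half of that 64-bit pointer. */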
+
+ payload = pandecode_fetch_gpu_mem(mem, payload_ptr,
+ MALI_PAYLOAD_SIZE);
+
+ int job_no = job_descriptor_number++;
+
+ if (first)
+ start_number = job_no;
+
+ pandecode_log("struct mali_job_descriptor_header job_%"PRIx64"_%d = {\n", jc_gpu_va, job_no);
+ pandecode_indent++;
+
+ pandecode_prop("job_type = %s", pandecode_job_type_name(h->job_type));
+
+ /* Save for next job fixing */
+ last_size = h->job_descriptor_size;
+
+ if (h->job_descriptor_size)
+ pandecode_prop("job_descriptor_size = %d", h->job_descriptor_size);
+
+ if (h->exception_status != 0x1)
+ pandecode_prop("exception_status = %x (source ID: 0x%x access: 0x%x exception: 0x%x)",
+ h->exception_status,
+ (h->exception_status >> 16) & 0xFFFF,
+ (h->exception_status >> 8) & 0x3,
+ h->exception_status & 0xFF);
+
+ if (h->first_incomplete_task)
+ pandecode_prop("first_incomplete_task = %d", h->first_incomplete_task);
+
+ if (h->fault_pointer)
+ pandecode_prop("fault_pointer = 0x%" PRIx64, h->fault_pointer);
+
+ if (h->job_barrier)
+ pandecode_prop("job_barrier = %d", h->job_barrier);
+
+ pandecode_prop("job_index = %d", h->job_index);
+
+ if (h->unknown_flags)
+ pandecode_prop("unknown_flags = %d", h->unknown_flags);
+
+ if (h->job_dependency_index_1)
+ pandecode_prop("job_dependency_index_1 = %d", h->job_dependency_index_1);
+
+ if (h->job_dependency_index_2)
+ pandecode_prop("job_dependency_index_2 = %d", h->job_dependency_index_2);
+
+ pandecode_indent--;
+ pandecode_log("};\n");
+
+                /* Don't patch the previous job's next pointer yet -- decode
+                 * the payload first, and don't touch that either. This
+                 * ordering is essential for the replay's uploads to occur in
+                 * sequence and therefore be dynamically allocated correctly.
+                 * The descriptor size was noted above, however, since the
+                 * linkage needs it. */
+
+ switch (h->job_type) {
+ case JOB_TYPE_SET_VALUE: {
+ struct mali_payload_set_value *s = payload;
+ pandecode_log("struct mali_payload_set_value payload_%"PRIx64"_%d = {\n", payload_ptr, job_no);
+ pandecode_indent++;
+ MEMORY_PROP(s, out);
+ pandecode_prop("unknown = 0x%" PRIX64, s->unknown);
+ pandecode_indent--;
+ pandecode_log("};\n");
+
+ break;
+ }
+
+ case JOB_TYPE_TILER:
+ case JOB_TYPE_VERTEX:
+ case JOB_TYPE_COMPUTE:
+ if (bifrost) {
+ if (h->job_type == JOB_TYPE_TILER)
+ pandecode_replay_tiler_job_bfr(h, mem, payload_ptr, job_no);
+ else
+ pandecode_replay_vertex_job_bfr(h, mem, payload_ptr, job_no);
+ } else
+ pandecode_replay_vertex_or_tiler_job_mdg(h, mem, payload_ptr, job_no);
+
+ break;
+
+ case JOB_TYPE_FRAGMENT:
+ pandecode_replay_fragment_job(mem, payload_ptr, job_no, bifrost);
+ break;
+
+ default:
+ break;
+ }
+
+ /* Handle linkage */
+
+ if (!first) {
+ pandecode_log("((struct mali_job_descriptor_header *) (uintptr_t) job_%d_p)->", job_no - 1);
+
+ if (last_size)
+ pandecode_log_cont("next_job_64 = job_%d_p;\n\n", job_no);
+ else
+ pandecode_log_cont("next_job_32 = (u32) (uintptr_t) job_%d_p;\n\n", job_no);
+ }
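+
+                /* For 64-bit descriptors, the replay source emitted above ends
+                 * up looking like (job numbers illustrative):
+                 *
+                 *   ((struct mali_job_descriptor_header *) (uintptr_t) job_0_p)->next_job_64 = job_1_p;
+                 */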
+
+ first = false;
+
+ } while ((jc_gpu_va = h->job_descriptor_size ? h->next_job_64 : h->next_job_32));
+
+ return start_number;
+}
--- /dev/null
+/*
+ * Copyright (C) 2017-2019 Lyude Paul
+ * Copyright (C) 2017-2019 Alyssa Rosenzweig
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __PAN_DECODE_H__
+#define __PAN_DECODE_H__
+
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <assert.h>
+#include <panfrost-job.h>
+#include "util/list.h"
+
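+/* One traced GPU mapping: the GPU range [gpu_va, gpu_va + length) is backed
+ * by the CPU pointer addr, with a short human-readable name attached. */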
+struct pandecode_mapped_memory {
+ struct list_head node;
+
+ size_t length;
+
+ void *addr;
+ mali_ptr gpu_va;
+
+ char name[32];
+};
+
+void pandecode_initialize(void);
+
+char *pointer_as_memory_reference(mali_ptr ptr);
+
+struct pandecode_mapped_memory *pandecode_find_mapped_gpu_mem_containing(mali_ptr addr);
+
+void
+pandecode_inject_mmap(mali_ptr gpu_va, void *cpu, unsigned sz, const char *name);
+
+static inline void *
+__pandecode_fetch_gpu_mem(const struct pandecode_mapped_memory *mem,
+ mali_ptr gpu_va, size_t size,
+ int line, const char *filename)
+{
+ if (!mem)
+ mem = pandecode_find_mapped_gpu_mem_containing(gpu_va);
+
+ if (!mem) {
+                fprintf(stderr, "Access to unknown memory %" PRIx64 " in %s:%d\n",
+                        gpu_va, filename, line);
+ assert(0);
+ }
+
+ assert(mem);
+ assert(size + (gpu_va - mem->gpu_va) <= mem->length);
+
+ return mem->addr + gpu_va - mem->gpu_va;
+}
+
+#define pandecode_fetch_gpu_mem(mem, gpu_va, size) \
+ __pandecode_fetch_gpu_mem(mem, gpu_va, size, __LINE__, __FILE__)
+
+/* Returns a validated pointer to mapped GPU memory with the given pointer type,
+ * size automatically determined from the pointer type
+ */
+#define PANDECODE_PTR(mem, gpu_va, type) \
+ ((type*)(__pandecode_fetch_gpu_mem(mem, gpu_va, sizeof(type), \
+ __LINE__, __FILE__)))
+
+/* Usage: <variable type> PANDECODE_PTR_VAR(name, mem, gpu_va) */
+#define PANDECODE_PTR_VAR(name, mem, gpu_va) \
+ name = __pandecode_fetch_gpu_mem(mem, gpu_va, sizeof(*name), \
+ __LINE__, __FILE__)
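+
+/* For example, decode.c validates a job header in place with either form
+ * (illustrative, mirroring the real call sites):
+ *
+ *     struct mali_job_descriptor_header *h =
+ *             PANDECODE_PTR(mem, jc_gpu_va, struct mali_job_descriptor_header);
+ *
+ *     struct bifrost_tiler_meta *PANDECODE_PTR_VAR(t, mem, gpu_va);
+ */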
+
+/* Common entrypoint */
+int pandecode_replay_jc(mali_ptr jc_gpu_va, bool bifrost);
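+
+/* A rough end-to-end sketch of how a tracer drives this (the buffer variables
+ * here are hypothetical):
+ *
+ *     pandecode_initialize();
+ *     pandecode_inject_mmap(bo_gpu_va, bo_cpu, bo_size, "some_bo");
+ *     ...
+ *     pandecode_replay_jc(job_chain_gpu_va, is_bifrost);
+ */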
+
+#endif /* __PAN_DECODE_H__ */
--- /dev/null
+# Copyright © 2018 Rob Clark
+# Copyright © 2019 Collabora
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+libpanfrost_decode_files = files(
+ 'pan_pretty_print.c',
+ 'common.c',
+ 'decode.c',
+)
+
+libpanfrost_decode = static_library(
+ 'panfrost_decode',
+ [libpanfrost_decode_files],
+ include_directories : [inc_common, inc_panfrost],
+ c_args : [c_vis_args, no_override_init_args],
+ cpp_args : [cpp_vis_args],
+ build_by_default : false,
+)
--- /dev/null
+/*
+ * © Copyright 2017-2019 The Panfrost Community
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pan_pretty_print.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+/* Some self-contained prettyprinting functions shared between pandecode and
+ * the main driver */
+
+#define DEFINE_CASE(name) case MALI_## name: return "MALI_" #name
+char *pandecode_format_name(enum mali_format format)
+{
+ static char unk_format_str[5];
+
+ switch (format) {
+ DEFINE_CASE(RGB565);
+ DEFINE_CASE(RGB5_A1_UNORM);
+ DEFINE_CASE(RGB10_A2_UNORM);
+ DEFINE_CASE(RGB10_A2_SNORM);
+ DEFINE_CASE(RGB10_A2UI);
+ DEFINE_CASE(RGB10_A2I);
+ DEFINE_CASE(NV12);
+ DEFINE_CASE(Z32_UNORM);
+ DEFINE_CASE(R32_FIXED);
+ DEFINE_CASE(RG32_FIXED);
+ DEFINE_CASE(RGB32_FIXED);
+ DEFINE_CASE(RGBA32_FIXED);
+ DEFINE_CASE(R11F_G11F_B10F);
+ DEFINE_CASE(R9F_G9F_B9F_E5F);
+ DEFINE_CASE(VARYING_POS);
+ DEFINE_CASE(VARYING_DISCARD);
+
+ DEFINE_CASE(R8_SNORM);
+ DEFINE_CASE(R16_SNORM);
+ DEFINE_CASE(R32_SNORM);
+ DEFINE_CASE(RG8_SNORM);
+ DEFINE_CASE(RG16_SNORM);
+ DEFINE_CASE(RG32_SNORM);
+ DEFINE_CASE(RGB8_SNORM);
+ DEFINE_CASE(RGB16_SNORM);
+ DEFINE_CASE(RGB32_SNORM);
+ DEFINE_CASE(RGBA8_SNORM);
+ DEFINE_CASE(RGBA16_SNORM);
+ DEFINE_CASE(RGBA32_SNORM);
+
+ DEFINE_CASE(R8UI);
+ DEFINE_CASE(R16UI);
+ DEFINE_CASE(R32UI);
+ DEFINE_CASE(RG8UI);
+ DEFINE_CASE(RG16UI);
+ DEFINE_CASE(RG32UI);
+ DEFINE_CASE(RGB8UI);
+ DEFINE_CASE(RGB16UI);
+ DEFINE_CASE(RGB32UI);
+ DEFINE_CASE(RGBA8UI);
+ DEFINE_CASE(RGBA16UI);
+ DEFINE_CASE(RGBA32UI);
+
+ DEFINE_CASE(R8_UNORM);
+ DEFINE_CASE(R16_UNORM);
+ DEFINE_CASE(R32_UNORM);
+ DEFINE_CASE(R32F);
+ DEFINE_CASE(RG8_UNORM);
+ DEFINE_CASE(RG16_UNORM);
+ DEFINE_CASE(RG32_UNORM);
+ DEFINE_CASE(RG32F);
+ DEFINE_CASE(RGB8_UNORM);
+ DEFINE_CASE(RGB16_UNORM);
+ DEFINE_CASE(RGB32_UNORM);
+ DEFINE_CASE(RGB32F);
+ DEFINE_CASE(RGBA4_UNORM);
+ DEFINE_CASE(RGBA8_UNORM);
+ DEFINE_CASE(RGBA16_UNORM);
+ DEFINE_CASE(RGBA32_UNORM);
+ DEFINE_CASE(RGBA32F);
+
+ DEFINE_CASE(R8I);
+ DEFINE_CASE(R16I);
+ DEFINE_CASE(R32I);
+ DEFINE_CASE(RG8I);
+ DEFINE_CASE(R16F);
+ DEFINE_CASE(RG16I);
+ DEFINE_CASE(RG32I);
+ DEFINE_CASE(RG16F);
+ DEFINE_CASE(RGB8I);
+ DEFINE_CASE(RGB16I);
+ DEFINE_CASE(RGB32I);
+ DEFINE_CASE(RGB16F);
+ DEFINE_CASE(RGBA8I);
+ DEFINE_CASE(RGBA16I);
+ DEFINE_CASE(RGBA32I);
+ DEFINE_CASE(RGBA16F);
+
+ DEFINE_CASE(RGBA4);
+ DEFINE_CASE(RGBA8_2);
+ DEFINE_CASE(RGB10_A2_2);
+ default:
+ snprintf(unk_format_str, sizeof(unk_format_str), "0x%02x", format);
+ return unk_format_str;
+ }
+}
+
+#undef DEFINE_CASE
+
+/* Helper to dump fixed-function blend part for debugging */
+
+static const char *
+panfrost_factor_name(enum mali_dominant_factor factor)
+{
+ switch (factor) {
+ case MALI_DOMINANT_UNK0:
+ return "unk0";
+
+ case MALI_DOMINANT_ZERO:
+ return "zero";
+
+ case MALI_DOMINANT_SRC_COLOR:
+ return "source color";
+
+ case MALI_DOMINANT_DST_COLOR:
+ return "dest color";
+
+ case MALI_DOMINANT_UNK4:
+ return "unk4";
+
+ case MALI_DOMINANT_SRC_ALPHA:
+ return "source alpha";
+
+ case MALI_DOMINANT_DST_ALPHA:
+ return "dest alpha";
+
+ case MALI_DOMINANT_CONSTANT:
+ return "constant";
+ }
+
+ return "unreachable";
+}
+
+static const char *
+panfrost_modifier_name(enum mali_blend_modifier mod)
+{
+ switch (mod) {
+ case MALI_BLEND_MOD_UNK0:
+ return "unk0";
+
+ case MALI_BLEND_MOD_NORMAL:
+ return "normal";
+
+ case MALI_BLEND_MOD_SOURCE_ONE:
+ return "source one";
+
+ case MALI_BLEND_MOD_DEST_ONE:
+ return "dest one";
+ }
+
+ return "unreachable";
+}
+
+static void
+panfrost_print_fixed_part(const char *name, unsigned u)
+{
+ struct mali_blend_mode part;
+ memcpy(&part, &u, sizeof(part));
+
+ printf("%s blend mode (%X):\n", name, u);
+
+ printf(" %s dominant:\n",
+ (part.dominant == MALI_BLEND_DOM_SOURCE) ? "source" : "destination");
+
+ printf(" %s\n", panfrost_factor_name(part.dominant_factor));
+
+ if (part.complement_dominant)
+ printf(" complement\n");
+
+
+ printf(" nondominant %s\n",
+ (part.nondominant_mode == MALI_BLEND_NON_MIRROR) ? "mirror" : "zero");
+
+
+ printf(" mode: %s\n", panfrost_modifier_name(part.clip_modifier));
+
+ if (part.negate_source) printf(" negate source\n");
+
+ if (part.negate_dest) printf(" negate dest\n");
+
+ assert(!(part.unused_0 || part.unused_1));
+}
+
+void
+panfrost_print_blend_equation(struct mali_blend_equation eq)
+{
+ printf("\n");
+ panfrost_print_fixed_part("RGB", eq.rgb_mode);
+ panfrost_print_fixed_part("Alpha", eq.alpha_mode);
+
+ assert(!eq.zero1);
+
+ printf("Mask: %s%s%s%s\n",
+ (eq.color_mask & MALI_MASK_R) ? "R" : "",
+ (eq.color_mask & MALI_MASK_G) ? "G" : "",
+ (eq.color_mask & MALI_MASK_B) ? "B" : "",
+ (eq.color_mask & MALI_MASK_A) ? "A" : "");
+}
--- /dev/null
+/*
+ * © Copyright 2017-2019 The Panfrost Community
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __PAN_PRETTY_H
+#define __PAN_PRETTY_H
+
+#include "panfrost-job.h"
+
+char *pandecode_format_name(enum mali_format format);
+void panfrost_print_blend_equation(struct mali_blend_equation eq);
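+
+/* Minimal usage sketch (the blend-equation variable is hypothetical; the
+ * format enumerant is one of the cases handled in pan_pretty_print.c):
+ *
+ *     printf("%s\n", pandecode_format_name(MALI_RGBA8_UNORM));
+ *     panfrost_print_blend_equation(blend_state.equation);
+ */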
+
+#endif