From: Rob Clark
Date: Fri, 25 Jul 2014 15:15:59 +0000 (-0400)
Subject: freedreno/ir3: split out shader compiler from a3xx
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=db193e5ad06e7a2fbcffb3bb5df85d212eb12291;p=mesa.git

freedreno/ir3: split out shader compiler from a3xx

Move the bits we want to share between generations from fd3_program to
ir3_shader.  So overall structure is:

   fdN_shader_stateobj -> ir3_shader -> ir3_shader_variant -> ir3
                                     |- ...
                                     \- ir3_shader_variant -> ir3

So the ir3_shader becomes the topmost generation-neutral object, which
manages the set of variants, each of which generates, compiles, and
assembles its own ir.

There is a bit of additional renaming to s/fd3_compiler/ir3_compiler/, etc.

Keep the split between the gallium-level stateobj and the shader helper
object, because it might be a good idea to pre-compute some
generation-specific register values (ie. anything that is independent of
linking).

Signed-off-by: Rob Clark
---
diff --git a/src/gallium/drivers/freedreno/Makefile.am b/src/gallium/drivers/freedreno/Makefile.am index 7947dd1a56e..7d9c6e4933a 100644 --- a/src/gallium/drivers/freedreno/Makefile.am +++ b/src/gallium/drivers/freedreno/Makefile.am @@ -7,6 +7,7 @@ AM_CFLAGS = \ -Wno-packed-bitfield-compat \ -I$(top_srcdir)/src/gallium/drivers/freedreno/a3xx \ -I$(top_srcdir)/src/gallium/drivers/freedreno/a2xx \ + -I$(top_srcdir)/src/gallium/drivers/freedreno/ir3 \ $(GALLIUM_DRIVER_CFLAGS) \ $(FREEDRENO_CFLAGS) @@ -15,4 +16,5 @@ noinst_LTLIBRARIES = libfreedreno.la libfreedreno_la_SOURCES = \ $(C_SOURCES) \ $(a2xx_SOURCES) \ - $(a3xx_SOURCES) + $(a3xx_SOURCES) \ + $(ir3_SOURCES) diff --git a/src/gallium/drivers/freedreno/Makefile.sources b/src/gallium/drivers/freedreno/Makefile.sources index 0dc7fc08512..85e0b7eda6f 100644 --- a/src/gallium/drivers/freedreno/Makefile.sources +++ b/src/gallium/drivers/freedreno/Makefile.sources @@ -33,8 +33,6 @@ a2xx_SOURCES := \ a3xx_SOURCES := \ a3xx/fd3_blend.c \ - a3xx/fd3_compiler.c \ - a3xx/fd3_compiler_old.c \ a3xx/fd3_context.c \ a3xx/fd3_draw.c \ a3xx/fd3_emit.c \ @@ -45,12 +43,17 @@ a3xx_SOURCES := \ a3xx/fd3_screen.c \ a3xx/fd3_texture.c \ a3xx/fd3_util.c \ - a3xx/fd3_zsa.c \ - a3xx/disasm-a3xx.c \ - a3xx/ir3_cp.c \ - a3xx/ir3_depth.c \ - a3xx/ir3_dump.c \ - a3xx/ir3_flatten.c \ - a3xx/ir3_ra.c \ - a3xx/ir3_sched.c \ - a3xx/ir3.c + a3xx/fd3_zsa.c + +ir3_SOURCES := \ + ir3/disasm-a3xx.c \ + ir3/ir3_compiler.c \ + ir3/ir3_compiler_old.c \ + ir3/ir3_shader.c \ + ir3/ir3_cp.c \ + ir3/ir3_depth.c \ + ir3/ir3_dump.c \ + ir3/ir3_flatten.c \ + ir3/ir3_ra.c \ + ir3/ir3_sched.c \ + ir3/ir3.c diff --git a/src/gallium/drivers/freedreno/a3xx/disasm-a3xx.c b/src/gallium/drivers/freedreno/a3xx/disasm-a3xx.c deleted file mode 100644 index 8c3704bf658..00000000000 --- a/src/gallium/drivers/freedreno/a3xx/disasm-a3xx.c +++ /dev/null @@ -1,805 +0,0 @@ -/* - * Copyright (c) 2013 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software.
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include -#include -#include - -#include - -#include "disasm.h" -#include "instr-a3xx.h" - -static enum debug_t debug; - -#define printf debug_printf - -static const char *levels[] = { - "", - "\t", - "\t\t", - "\t\t\t", - "\t\t\t\t", - "\t\t\t\t\t", - "\t\t\t\t\t\t", - "\t\t\t\t\t\t\t", - "\t\t\t\t\t\t\t\t", - "\t\t\t\t\t\t\t\t\t", - "x", - "x", - "x", - "x", - "x", - "x", -}; - -static const char *component = "xyzw"; - -static const char *type[] = { - [TYPE_F16] = "f16", - [TYPE_F32] = "f32", - [TYPE_U16] = "u16", - [TYPE_U32] = "u32", - [TYPE_S16] = "s16", - [TYPE_S32] = "s32", - [TYPE_U8] = "u8", - [TYPE_S8] = "s8", -}; - -static void print_reg(reg_t reg, bool full, bool r, bool c, bool im, - bool neg, bool abs, bool addr_rel) -{ - const char type = c ? 'c' : 'r'; - - // XXX I prefer - and || for neg/abs, but preserving format used - // by libllvm-a3xx for easy diffing.. - - if (abs && neg) - printf("(absneg)"); - else if (neg) - printf("(neg)"); - else if (abs) - printf("(abs)"); - - if (r) - printf("(r)"); - - if (im) { - printf("%d", reg.iim_val); - } else if (addr_rel) { - /* I would just use %+d but trying to make it diff'able with - * libllvm-a3xx... - */ - if (reg.iim_val < 0) - printf("%s%c", full ? "" : "h", type, -reg.iim_val); - else if (reg.iim_val > 0) - printf("%s%c", full ? "" : "h", type, reg.iim_val); - else - printf("%s%c", full ? "" : "h", type); - } else if ((reg.num == REG_A0) && !c) { - printf("a0.%c", component[reg.comp]); - } else if ((reg.num == REG_P0) && !c) { - printf("p0.%c", component[reg.comp]); - } else { - printf("%s%c%d.%c", full ? "" : "h", type, reg.num, component[reg.comp]); - } -} - - -/* current instruction repeat flag: */ -static unsigned repeat; - -static void print_reg_dst(reg_t reg, bool full, bool addr_rel) -{ - print_reg(reg, full, false, false, false, false, false, addr_rel); -} - -static void print_reg_src(reg_t reg, bool full, bool r, bool c, bool im, - bool neg, bool abs, bool addr_rel) -{ - print_reg(reg, full, r, c, im, neg, abs, addr_rel); -} - -static void print_instr_cat0(instr_t *instr) -{ - instr_cat0_t *cat0 = &instr->cat0; - - switch (cat0->opc) { - case OPC_KILL: - printf(" %sp0.%c", cat0->inv ? "!" : "", - component[cat0->comp]); - break; - case OPC_BR: - printf(" %sp0.%c, #%d", cat0->inv ? "!" 
: "", - component[cat0->comp], cat0->immed); - break; - case OPC_JUMP: - case OPC_CALL: - printf(" #%d", cat0->immed); - break; - } - - if ((debug & PRINT_VERBOSE) && (cat0->dummy1|cat0->dummy2|cat0->dummy3|cat0->dummy4)) - printf("\t{0: %x,%x,%x,%x}", cat0->dummy1, cat0->dummy2, cat0->dummy3, cat0->dummy4); -} - -static void print_instr_cat1(instr_t *instr) -{ - instr_cat1_t *cat1 = &instr->cat1; - - if (cat1->ul) - printf("(ul)"); - - if (cat1->src_type == cat1->dst_type) { - if ((cat1->src_type == TYPE_S16) && (((reg_t)cat1->dst).num == REG_A0)) { - /* special case (nmemonic?): */ - printf("mova"); - } else { - printf("mov.%s%s", type[cat1->src_type], type[cat1->dst_type]); - } - } else { - printf("cov.%s%s", type[cat1->src_type], type[cat1->dst_type]); - } - - printf(" "); - - if (cat1->even) - printf("(even)"); - - if (cat1->pos_inf) - printf("(pos_infinity)"); - - print_reg_dst((reg_t)(cat1->dst), type_size(cat1->dst_type) == 32, - cat1->dst_rel); - - printf(", "); - - /* ugg, have to special case this.. vs print_reg().. */ - if (cat1->src_im) { - if (type_float(cat1->src_type)) - printf("(%f)", cat1->fim_val); - else - printf("%d", cat1->iim_val); - } else if (cat1->src_rel && !cat1->src_c) { - /* I would just use %+d but trying to make it diff'able with - * libllvm-a3xx... - */ - char type = cat1->src_rel_c ? 'c' : 'r'; - if (cat1->off < 0) - printf("%c", type, -cat1->off); - else if (cat1->off > 0) - printf("%c", type, cat1->off); - else - printf("c"); - } else { - print_reg_src((reg_t)(cat1->src), type_size(cat1->src_type) == 32, - cat1->src_r, cat1->src_c, cat1->src_im, false, false, false); - } - - if ((debug & PRINT_VERBOSE) && (cat1->must_be_0)) - printf("\t{1: %x}", cat1->must_be_0); -} - -static void print_instr_cat2(instr_t *instr) -{ - instr_cat2_t *cat2 = &instr->cat2; - static const char *cond[] = { - "lt", - "le", - "gt", - "ge", - "eq", - "ne", - "?6?", - }; - - switch (cat2->opc) { - case OPC_CMPS_F: - case OPC_CMPS_U: - case OPC_CMPS_S: - case OPC_CMPV_F: - case OPC_CMPV_U: - case OPC_CMPV_S: - printf(".%s", cond[cat2->cond]); - break; - } - - printf(" "); - if (cat2->ei) - printf("(ei)"); - print_reg_dst((reg_t)(cat2->dst), cat2->full ^ cat2->dst_half, false); - printf(", "); - - if (cat2->c1.src1_c) { - print_reg_src((reg_t)(cat2->c1.src1), cat2->full, cat2->src1_r, - cat2->c1.src1_c, cat2->src1_im, cat2->src1_neg, - cat2->src1_abs, false); - } else if (cat2->rel1.src1_rel) { - print_reg_src((reg_t)(cat2->rel1.src1), cat2->full, cat2->src1_r, - cat2->rel1.src1_c, cat2->src1_im, cat2->src1_neg, - cat2->src1_abs, cat2->rel1.src1_rel); - } else { - print_reg_src((reg_t)(cat2->src1), cat2->full, cat2->src1_r, - false, cat2->src1_im, cat2->src1_neg, - cat2->src1_abs, false); - } - - switch (cat2->opc) { - case OPC_ABSNEG_F: - case OPC_ABSNEG_S: - case OPC_CLZ_B: - case OPC_CLZ_S: - case OPC_SIGN_F: - case OPC_FLOOR_F: - case OPC_CEIL_F: - case OPC_RNDNE_F: - case OPC_RNDAZ_F: - case OPC_TRUNC_F: - case OPC_NOT_B: - case OPC_BFREV_B: - case OPC_SETRM: - case OPC_CBITS_B: - /* these only have one src reg */ - break; - default: - printf(", "); - if (cat2->c2.src2_c) { - print_reg_src((reg_t)(cat2->c2.src2), cat2->full, cat2->src2_r, - cat2->c2.src2_c, cat2->src2_im, cat2->src2_neg, - cat2->src2_abs, false); - } else if (cat2->rel2.src2_rel) { - print_reg_src((reg_t)(cat2->rel2.src2), cat2->full, cat2->src2_r, - cat2->rel2.src2_c, cat2->src2_im, cat2->src2_neg, - cat2->src2_abs, cat2->rel2.src2_rel); - } else { - print_reg_src((reg_t)(cat2->src2), cat2->full, cat2->src2_r, 
- false, cat2->src2_im, cat2->src2_neg, - cat2->src2_abs, false); - } - break; - } -} - -static void print_instr_cat3(instr_t *instr) -{ - instr_cat3_t *cat3 = &instr->cat3; - bool full = instr_cat3_full(cat3); - - printf(" "); - print_reg_dst((reg_t)(cat3->dst), full ^ cat3->dst_half, false); - printf(", "); - if (cat3->c1.src1_c) { - print_reg_src((reg_t)(cat3->c1.src1), full, - cat3->src1_r, cat3->c1.src1_c, false, cat3->src1_neg, - false, false); - } else if (cat3->rel1.src1_rel) { - print_reg_src((reg_t)(cat3->rel1.src1), full, - cat3->src1_r, cat3->rel1.src1_c, false, cat3->src1_neg, - false, cat3->rel1.src1_rel); - } else { - print_reg_src((reg_t)(cat3->src1), full, - cat3->src1_r, false, false, cat3->src1_neg, - false, false); - } - printf(", "); - print_reg_src((reg_t)cat3->src2, full, - cat3->src2_r, cat3->src2_c, false, cat3->src2_neg, - false, false); - printf(", "); - if (cat3->c2.src3_c) { - print_reg_src((reg_t)(cat3->c2.src3), full, - cat3->src3_r, cat3->c2.src3_c, false, cat3->src3_neg, - false, false); - } else if (cat3->rel2.src3_rel) { - print_reg_src((reg_t)(cat3->rel2.src3), full, - cat3->src3_r, cat3->rel2.src3_c, false, cat3->src3_neg, - false, cat3->rel2.src3_rel); - } else { - print_reg_src((reg_t)(cat3->src3), full, - cat3->src3_r, false, false, cat3->src3_neg, - false, false); - } -} - -static void print_instr_cat4(instr_t *instr) -{ - instr_cat4_t *cat4 = &instr->cat4; - - printf(" "); - print_reg_dst((reg_t)(cat4->dst), cat4->full ^ cat4->dst_half, false); - printf(", "); - - if (cat4->c.src_c) { - print_reg_src((reg_t)(cat4->c.src), cat4->full, - cat4->src_r, cat4->c.src_c, cat4->src_im, - cat4->src_neg, cat4->src_abs, false); - } else if (cat4->rel.src_rel) { - print_reg_src((reg_t)(cat4->rel.src), cat4->full, - cat4->src_r, cat4->rel.src_c, cat4->src_im, - cat4->src_neg, cat4->src_abs, cat4->rel.src_rel); - } else { - print_reg_src((reg_t)(cat4->src), cat4->full, - cat4->src_r, false, cat4->src_im, - cat4->src_neg, cat4->src_abs, false); - } - - if ((debug & PRINT_VERBOSE) && (cat4->dummy1|cat4->dummy2)) - printf("\t{4: %x,%x}", cat4->dummy1, cat4->dummy2); -} - -static void print_instr_cat5(instr_t *instr) -{ - static const struct { - bool src1, src2, samp, tex; - } info[0x1f] = { - [OPC_ISAM] = { true, false, true, true, }, - [OPC_ISAML] = { true, true, true, true, }, - [OPC_ISAMM] = { true, false, true, true, }, - [OPC_SAM] = { true, false, true, true, }, - [OPC_SAMB] = { true, true, true, true, }, - [OPC_SAML] = { true, true, true, true, }, - [OPC_SAMGQ] = { true, false, true, true, }, - [OPC_GETLOD] = { true, false, true, true, }, - [OPC_CONV] = { true, true, true, true, }, - [OPC_CONVM] = { true, true, true, true, }, - [OPC_GETSIZE] = { true, false, false, true, }, - [OPC_GETBUF] = { false, false, false, true, }, - [OPC_GETPOS] = { true, false, false, true, }, - [OPC_GETINFO] = { false, false, false, true, }, - [OPC_DSX] = { true, false, false, false, }, - [OPC_DSY] = { true, false, false, false, }, - [OPC_GATHER4R] = { true, false, true, true, }, - [OPC_GATHER4G] = { true, false, true, true, }, - [OPC_GATHER4B] = { true, false, true, true, }, - [OPC_GATHER4A] = { true, false, true, true, }, - [OPC_SAMGP0] = { true, false, true, true, }, - [OPC_SAMGP1] = { true, false, true, true, }, - [OPC_SAMGP2] = { true, false, true, true, }, - [OPC_SAMGP3] = { true, false, true, true, }, - [OPC_DSXPP_1] = { true, false, false, false, }, - [OPC_DSYPP_1] = { true, false, false, false, }, - [OPC_RGETPOS] = { false, false, false, false, }, - [OPC_RGETINFO] = { false, 
false, false, false, }, - }; - instr_cat5_t *cat5 = &instr->cat5; - int i; - - if (cat5->is_3d) printf(".3d"); - if (cat5->is_a) printf(".a"); - if (cat5->is_o) printf(".o"); - if (cat5->is_p) printf(".p"); - if (cat5->is_s) printf(".s"); - if (cat5->is_s2en) printf(".s2en"); - - printf(" "); - - switch (cat5->opc) { - case OPC_DSXPP_1: - case OPC_DSYPP_1: - break; - default: - printf("(%s)", type[cat5->type]); - break; - } - - printf("("); - for (i = 0; i < 4; i++) - if (cat5->wrmask & (1 << i)) - printf("%c", "xyzw"[i]); - printf(")"); - - print_reg_dst((reg_t)(cat5->dst), type_size(cat5->type) == 32, false); - - if (info[cat5->opc].src1) { - printf(", "); - print_reg_src((reg_t)(cat5->src1), cat5->full, false, false, false, - false, false, false); - } - - if (cat5->is_s2en) { - printf(", "); - print_reg_src((reg_t)(cat5->s2en.src2), cat5->full, false, false, false, - false, false, false); - printf(", "); - print_reg_src((reg_t)(cat5->s2en.src3), false, false, false, false, - false, false, false); - } else { - if (cat5->is_o || info[cat5->opc].src2) { - printf(", "); - print_reg_src((reg_t)(cat5->norm.src2), cat5->full, - false, false, false, false, false, false); - } - if (info[cat5->opc].samp) - printf(", s#%d", cat5->norm.samp); - if (info[cat5->opc].tex) - printf(", t#%d", cat5->norm.tex); - } - - if (debug & PRINT_VERBOSE) { - if (cat5->is_s2en) { - if ((debug & PRINT_VERBOSE) && (cat5->s2en.dummy1|cat5->s2en.dummy2|cat5->dummy2)) - printf("\t{5: %x,%x,%x}", cat5->s2en.dummy1, cat5->s2en.dummy2, cat5->dummy2); - } else { - if ((debug & PRINT_VERBOSE) && (cat5->norm.dummy1|cat5->dummy2)) - printf("\t{5: %x,%x}", cat5->norm.dummy1, cat5->dummy2); - } - } -} - -static int32_t u2i(uint32_t val, int nbits) -{ - return ((val >> (nbits-1)) * ~((1 << nbits) - 1)) | val; -} - -static void print_instr_cat6(instr_t *instr) -{ - instr_cat6_t *cat6 = &instr->cat6; - - printf(".%s ", type[cat6->type]); - - switch (cat6->opc) { - case OPC_LDG: - case OPC_LDP: - case OPC_LDL: - case OPC_LDLW: - case OPC_LDLV: - /* load instructions: */ - print_reg_dst((reg_t)(cat6->a.dst), type_size(cat6->type) == 32, false); - printf(","); - switch (cat6->opc) { - case OPC_LDG: - printf("g"); - break; - case OPC_LDP: - printf("p"); - break; - case OPC_LDL: - case OPC_LDLW: - case OPC_LDLV: - printf("l"); - break; - } - printf("["); - print_reg_src((reg_t)(cat6->a.src), true, - false, false, false, false, false, false); - if (cat6->a.off) - printf("%+d", cat6->a.off); - printf("]"); - break; - case OPC_PREFETCH: - /* similar to load instructions: */ - printf("g["); - print_reg_src((reg_t)(cat6->a.src), true, - false, false, false, false, false, false); - if (cat6->a.off) - printf("%+d", cat6->a.off); - printf("]"); - break; - case OPC_STG: - case OPC_STP: - case OPC_STL: - case OPC_STLW: - /* store instructions: */ - switch (cat6->opc) { - case OPC_STG: - printf("g"); - break; - case OPC_STP: - printf("p"); - break; - case OPC_STL: - case OPC_STLW: - printf("l"); - break; - } - printf("["); - print_reg_dst((reg_t)(cat6->b.dst), true, false); - if (cat6->b.off || cat6->b.off_hi) - printf("%+d", u2i((cat6->b.off_hi << 8) | cat6->b.off, 13)); - printf("]"); - printf(","); - print_reg_src((reg_t)(cat6->b.src), type_size(cat6->type) == 32, - false, false, false, false, false, false); - - break; - case OPC_STI: - /* sti has same encoding as other store instructions, but - * slightly different syntax: - */ - print_reg_dst((reg_t)(cat6->b.dst), false /* XXX is it always half? 
*/, false); - if (cat6->b.off || cat6->b.off_hi) - printf("%+d", u2i((cat6->b.off_hi << 8) | cat6->b.off, 13)); - printf(","); - print_reg_src((reg_t)(cat6->b.src), type_size(cat6->type) == 32, - false, false, false, false, false, false); - break; - } - - printf(", %d", cat6->iim_val); - - if (debug & PRINT_VERBOSE) { - switch (cat6->opc) { - case OPC_LDG: - case OPC_LDP: - /* load instructions: */ - if (cat6->a.dummy1|cat6->a.dummy2|cat6->a.dummy3) - printf("\t{6: %x,%x,%x}", cat6->a.dummy1, cat6->a.dummy2, cat6->a.dummy3); - if ((cat6->a.must_be_one1 != 1) || (cat6->a.must_be_one2 != 1)) - printf("{?? %d,%d ??}", cat6->a.must_be_one1, cat6->a.must_be_one2); - break; - case OPC_STG: - case OPC_STP: - case OPC_STI: - /* store instructions: */ - if (cat6->b.dummy1|cat6->b.dummy2) - printf("\t{6: %x,%x}", cat6->b.dummy1, cat6->b.dummy2); - if ((cat6->b.must_be_one1 != 1) || (cat6->b.must_be_one2 != 1) || - (cat6->b.must_be_zero1 != 0)) - printf("{?? %d,%d,%d ??}", cat6->b.must_be_one1, cat6->b.must_be_one2, - cat6->b.must_be_zero1); - break; - } - } -} - -/* size of largest OPC field of all the instruction categories: */ -#define NOPC_BITS 6 - -struct opc_info { - uint16_t cat; - uint16_t opc; - const char *name; - void (*print)(instr_t *instr); -} opcs[1 << (3+NOPC_BITS)] = { -#define OPC(cat, opc, name) [((cat) << NOPC_BITS) | (opc)] = { (cat), (opc), #name, print_instr_cat##cat } - /* category 0: */ - OPC(0, OPC_NOP, nop), - OPC(0, OPC_BR, br), - OPC(0, OPC_JUMP, jump), - OPC(0, OPC_CALL, call), - OPC(0, OPC_RET, ret), - OPC(0, OPC_KILL, kill), - OPC(0, OPC_END, end), - OPC(0, OPC_EMIT, emit), - OPC(0, OPC_CUT, cut), - OPC(0, OPC_CHMASK, chmask), - OPC(0, OPC_CHSH, chsh), - OPC(0, OPC_FLOW_REV, flow_rev), - - /* category 1: */ - OPC(1, 0, ), - - /* category 2: */ - OPC(2, OPC_ADD_F, add.f), - OPC(2, OPC_MIN_F, min.f), - OPC(2, OPC_MAX_F, max.f), - OPC(2, OPC_MUL_F, mul.f), - OPC(2, OPC_SIGN_F, sign.f), - OPC(2, OPC_CMPS_F, cmps.f), - OPC(2, OPC_ABSNEG_F, absneg.f), - OPC(2, OPC_CMPV_F, cmpv.f), - OPC(2, OPC_FLOOR_F, floor.f), - OPC(2, OPC_CEIL_F, ceil.f), - OPC(2, OPC_RNDNE_F, rndne.f), - OPC(2, OPC_RNDAZ_F, rndaz.f), - OPC(2, OPC_TRUNC_F, trunc.f), - OPC(2, OPC_ADD_U, add.u), - OPC(2, OPC_ADD_S, add.s), - OPC(2, OPC_SUB_U, sub.u), - OPC(2, OPC_SUB_S, sub.s), - OPC(2, OPC_CMPS_U, cmps.u), - OPC(2, OPC_CMPS_S, cmps.s), - OPC(2, OPC_MIN_U, min.u), - OPC(2, OPC_MIN_S, min.s), - OPC(2, OPC_MAX_U, max.u), - OPC(2, OPC_MAX_S, max.s), - OPC(2, OPC_ABSNEG_S, absneg.s), - OPC(2, OPC_AND_B, and.b), - OPC(2, OPC_OR_B, or.b), - OPC(2, OPC_NOT_B, not.b), - OPC(2, OPC_XOR_B, xor.b), - OPC(2, OPC_CMPV_U, cmpv.u), - OPC(2, OPC_CMPV_S, cmpv.s), - OPC(2, OPC_MUL_U, mul.u), - OPC(2, OPC_MUL_S, mul.s), - OPC(2, OPC_MULL_U, mull.u), - OPC(2, OPC_BFREV_B, bfrev.b), - OPC(2, OPC_CLZ_S, clz.s), - OPC(2, OPC_CLZ_B, clz.b), - OPC(2, OPC_SHL_B, shl.b), - OPC(2, OPC_SHR_B, shr.b), - OPC(2, OPC_ASHR_B, ashr.b), - OPC(2, OPC_BARY_F, bary.f), - OPC(2, OPC_MGEN_B, mgen.b), - OPC(2, OPC_GETBIT_B, getbit.b), - OPC(2, OPC_SETRM, setrm), - OPC(2, OPC_CBITS_B, cbits.b), - OPC(2, OPC_SHB, shb), - OPC(2, OPC_MSAD, msad), - - /* category 3: */ - OPC(3, OPC_MAD_U16, mad.u16), - OPC(3, OPC_MADSH_U16, madsh.u16), - OPC(3, OPC_MAD_S16, mad.s16), - OPC(3, OPC_MADSH_M16, madsh.m16), - OPC(3, OPC_MAD_U24, mad.u24), - OPC(3, OPC_MAD_S24, mad.s24), - OPC(3, OPC_MAD_F16, mad.f16), - OPC(3, OPC_MAD_F32, mad.f32), - OPC(3, OPC_SEL_B16, sel.b16), - OPC(3, OPC_SEL_B32, sel.b32), - OPC(3, OPC_SEL_S16, sel.s16), - OPC(3, OPC_SEL_S32, sel.s32), 
- OPC(3, OPC_SEL_F16, sel.f16), - OPC(3, OPC_SEL_F32, sel.f32), - OPC(3, OPC_SAD_S16, sad.s16), - OPC(3, OPC_SAD_S32, sad.s32), - - /* category 4: */ - OPC(4, OPC_RCP, rcp), - OPC(4, OPC_RSQ, rsq), - OPC(4, OPC_LOG2, log2), - OPC(4, OPC_EXP2, exp2), - OPC(4, OPC_SIN, sin), - OPC(4, OPC_COS, cos), - OPC(4, OPC_SQRT, sqrt), - - /* category 5: */ - OPC(5, OPC_ISAM, isam), - OPC(5, OPC_ISAML, isaml), - OPC(5, OPC_ISAMM, isamm), - OPC(5, OPC_SAM, sam), - OPC(5, OPC_SAMB, samb), - OPC(5, OPC_SAML, saml), - OPC(5, OPC_SAMGQ, samgq), - OPC(5, OPC_GETLOD, getlod), - OPC(5, OPC_CONV, conv), - OPC(5, OPC_CONVM, convm), - OPC(5, OPC_GETSIZE, getsize), - OPC(5, OPC_GETBUF, getbuf), - OPC(5, OPC_GETPOS, getpos), - OPC(5, OPC_GETINFO, getinfo), - OPC(5, OPC_DSX, dsx), - OPC(5, OPC_DSY, dsy), - OPC(5, OPC_GATHER4R, gather4r), - OPC(5, OPC_GATHER4G, gather4g), - OPC(5, OPC_GATHER4B, gather4b), - OPC(5, OPC_GATHER4A, gather4a), - OPC(5, OPC_SAMGP0, samgp0), - OPC(5, OPC_SAMGP1, samgp1), - OPC(5, OPC_SAMGP2, samgp2), - OPC(5, OPC_SAMGP3, samgp3), - OPC(5, OPC_DSXPP_1, dsxpp.1), - OPC(5, OPC_DSYPP_1, dsypp.1), - OPC(5, OPC_RGETPOS, rgetpos), - OPC(5, OPC_RGETINFO, rgetinfo), - - - /* category 6: */ - OPC(6, OPC_LDG, ldg), - OPC(6, OPC_LDL, ldl), - OPC(6, OPC_LDP, ldp), - OPC(6, OPC_STG, stg), - OPC(6, OPC_STL, stl), - OPC(6, OPC_STP, stp), - OPC(6, OPC_STI, sti), - OPC(6, OPC_G2L, g2l), - OPC(6, OPC_L2G, l2g), - OPC(6, OPC_PREFETCH, prefetch), - OPC(6, OPC_LDLW, ldlw), - OPC(6, OPC_STLW, stlw), - OPC(6, OPC_RESFMT, resfmt), - OPC(6, OPC_RESINFO, resinf), - OPC(6, OPC_ATOMIC_ADD_L, atomic.add.l), - OPC(6, OPC_ATOMIC_SUB_L, atomic.sub.l), - OPC(6, OPC_ATOMIC_XCHG_L, atomic.xchg.l), - OPC(6, OPC_ATOMIC_INC_L, atomic.inc.l), - OPC(6, OPC_ATOMIC_DEC_L, atomic.dec.l), - OPC(6, OPC_ATOMIC_CMPXCHG_L, atomic.cmpxchg.l), - OPC(6, OPC_ATOMIC_MIN_L, atomic.min.l), - OPC(6, OPC_ATOMIC_MAX_L, atomic.max.l), - OPC(6, OPC_ATOMIC_AND_L, atomic.and.l), - OPC(6, OPC_ATOMIC_OR_L, atomic.or.l), - OPC(6, OPC_ATOMIC_XOR_L, atomic.xor.l), - OPC(6, OPC_LDGB_TYPED_4D, ldgb.typed.4d), - OPC(6, OPC_STGB_4D_4, stgb.4d.4), - OPC(6, OPC_STIB, stib), - OPC(6, OPC_LDC_4, ldc.4), - OPC(6, OPC_LDLV, ldlv), - - -#undef OPC -}; - -#define GETINFO(instr) (&(opcs[((instr)->opc_cat << NOPC_BITS) | instr_opc(instr)])) - -// XXX hack.. probably should move this table somewhere common: -#include "ir3.h" -const char *ir3_instr_name(struct ir3_instruction *instr) -{ - if (instr->category == -1) return "??meta??"; - return opcs[(instr->category << NOPC_BITS) | instr->opc].name; -} - -static void print_instr(uint32_t *dwords, int level, int n) -{ - instr_t *instr = (instr_t *)dwords; - uint32_t opc = instr_opc(instr); - const char *name; - - printf("%s%04d[%08xx_%08xx] ", levels[level], n, dwords[1], dwords[0]); - -#if 0 - /* print unknown bits: */ - if (debug & PRINT_RAW) - printf("[%08xx_%08xx] ", dwords[1] & 0x001ff800, dwords[0] & 0x00000000); - - if (debug & PRINT_VERBOSE) - printf("%d,%02d ", instr->opc_cat, opc); -#endif - - /* NOTE: order flags are printed is a bit fugly.. but for now I - * try to match the order in llvm-a3xx disassembler for easy - * diff'ing.. 
- */ - - if (instr->sync) - printf("(sy)"); - if (instr->ss && (instr->opc_cat <= 4)) - printf("(ss)"); - if (instr->jmp_tgt) - printf("(jp)"); - if (instr->repeat && (instr->opc_cat <= 4)) { - printf("(rpt%d)", instr->repeat); - repeat = instr->repeat; - } else { - repeat = 0; - } - if (instr->ul && ((2 <= instr->opc_cat) && (instr->opc_cat <= 4))) - printf("(ul)"); - - name = GETINFO(instr)->name; - - if (name) { - printf("%s", name); - GETINFO(instr)->print(instr); - } else { - printf("unknown(%d,%d)", instr->opc_cat, opc); - } - - printf("\n"); -} - -int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, enum shader_t type) -{ - int i; - - assert((sizedwords % 2) == 0); - - for (i = 0; i < sizedwords; i += 2) - print_instr(&dwords[i], level, i/2); - - return 0; -} diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c deleted file mode 100644 index 0c22e55711b..00000000000 --- a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c +++ /dev/null @@ -1,2638 +0,0 @@ -/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ - -/* - * Copyright (C) 2013 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark - */ - -#include - -#include "pipe/p_state.h" -#include "util/u_string.h" -#include "util/u_memory.h" -#include "util/u_inlines.h" -#include "tgsi/tgsi_parse.h" -#include "tgsi/tgsi_ureg.h" -#include "tgsi/tgsi_info.h" -#include "tgsi/tgsi_strings.h" -#include "tgsi/tgsi_dump.h" -#include "tgsi/tgsi_scan.h" - -#include "freedreno_lowering.h" - -#include "fd3_compiler.h" -#include "fd3_program.h" - -#include "instr-a3xx.h" -#include "ir3.h" - -struct fd3_compile_context { - const struct tgsi_token *tokens; - bool free_tokens; - struct ir3 *ir; - struct fd3_shader_variant *so; - - struct ir3_block *block; - struct ir3_instruction *current_instr; - - /* we need to defer updates to block->outputs[] until the end - * of an instruction (so we don't see new value until *after* - * the src registers are processed) - */ - struct { - struct ir3_instruction *instr, **instrp; - } output_updates[16]; - unsigned num_output_updates; - - /* are we in a sequence of "atomic" instructions? - */ - bool atomic; - - /* For fragment shaders, from the hw perspective the only - * actual input is r0.xy position register passed to bary.f. - * But TGSI doesn't know that, it still declares things as - * IN[] registers. 
So we do all the input tracking normally - * and fix things up after compile_instructions() - * - * NOTE that frag_pos is the hardware position (possibly it - * is actually an index or tag or some such.. it is *not* - * values that can be directly used for gl_FragCoord..) - */ - struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4]; - - struct tgsi_parse_context parser; - unsigned type; - - struct tgsi_shader_info info; - - /* for calculating input/output positions/linkages: */ - unsigned next_inloc; - - unsigned num_internal_temps; - struct tgsi_src_register internal_temps[6]; - - /* idx/slot for last compiler generated immediate */ - unsigned immediate_idx; - - /* stack of branch instructions that mark (potentially nested) - * branch if/else/loop/etc - */ - struct { - struct ir3_instruction *instr, *cond; - bool inv; /* true iff in else leg of branch */ - } branch[16]; - unsigned int branch_count; - - /* list of kill instructions: */ - struct ir3_instruction *kill[16]; - unsigned int kill_count; - - /* used when dst is same as one of the src, to avoid overwriting a - * src element before the remaining scalar instructions that make - * up the vector operation - */ - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register *tmp_src; -}; - - -static void vectorize(struct fd3_compile_context *ctx, - struct ir3_instruction *instr, struct tgsi_dst_register *dst, - int nsrcs, ...); -static void create_mov(struct fd3_compile_context *ctx, - struct tgsi_dst_register *dst, struct tgsi_src_register *src); -static type_t get_ftype(struct fd3_compile_context *ctx); - -static unsigned -compile_init(struct fd3_compile_context *ctx, struct fd3_shader_variant *so, - const struct tgsi_token *tokens) -{ - unsigned ret; - struct tgsi_shader_info *info = &ctx->info; - const struct fd_lowering_config lconfig = { - .color_two_side = so->key.color_two_side, - .lower_DST = true, - .lower_XPD = true, - .lower_SCS = true, - .lower_LRP = true, - .lower_FRC = true, - .lower_POW = true, - .lower_LIT = true, - .lower_EXP = true, - .lower_LOG = true, - .lower_DP4 = true, - .lower_DP3 = true, - .lower_DPH = true, - .lower_DP2 = true, - .lower_DP2A = true, - }; - - ctx->tokens = fd_transform_lowering(&lconfig, tokens, &ctx->info); - ctx->free_tokens = !!ctx->tokens; - if (!ctx->tokens) { - /* no lowering */ - ctx->tokens = tokens; - } - ctx->ir = so->ir; - ctx->so = so; - ctx->next_inloc = 8; - ctx->num_internal_temps = 0; - ctx->branch_count = 0; - ctx->kill_count = 0; - ctx->block = NULL; - ctx->current_instr = NULL; - ctx->num_output_updates = 0; - ctx->atomic = false; - ctx->frag_pos = NULL; - ctx->frag_face = NULL; - - memset(ctx->frag_coord, 0, sizeof(ctx->frag_coord)); - -#define FM(x) (1 << TGSI_FILE_##x) - /* optimize can't deal with relative addressing: */ - if (info->indirect_files & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT))) - return TGSI_PARSE_ERROR; - - /* Immediates go after constants: */ - so->first_immediate = info->file_max[TGSI_FILE_CONSTANT] + 1; - ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1); - - ret = tgsi_parse_init(&ctx->parser, ctx->tokens); - if (ret != TGSI_PARSE_OK) - return ret; - - ctx->type = ctx->parser.FullHeader.Processor.Processor; - - return ret; -} - -static void -compile_error(struct fd3_compile_context *ctx, const char *format, ...) 
-{ - va_list ap; - va_start(ap, format); - _debug_vprintf(format, ap); - va_end(ap); - tgsi_dump(ctx->tokens, 0); - debug_assert(0); -} - -#define compile_assert(ctx, cond) do { \ - if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \ - } while (0) - -static void -compile_free(struct fd3_compile_context *ctx) -{ - if (ctx->free_tokens) - free((void *)ctx->tokens); - tgsi_parse_free(&ctx->parser); -} - -struct instr_translater { - void (*fxn)(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst); - unsigned tgsi_opc; - opc_t opc; - opc_t hopc; /* opc to use for half_precision mode, if different */ - unsigned arg; -}; - -static void -instr_finish(struct fd3_compile_context *ctx) -{ - unsigned i; - - if (ctx->atomic) - return; - - for (i = 0; i < ctx->num_output_updates; i++) - *(ctx->output_updates[i].instrp) = ctx->output_updates[i].instr; - - ctx->num_output_updates = 0; -} - -/* For "atomic" groups of instructions, for example the four scalar - * instructions to perform a vec4 operation. Basically this just - * blocks out handling of output_updates so the next scalar instruction - * still sees the result from before the start of the atomic group. - * - * NOTE: when used properly, this could probably replace get/put_dst() - * stuff. - */ -static void -instr_atomic_start(struct fd3_compile_context *ctx) -{ - ctx->atomic = true; -} - -static void -instr_atomic_end(struct fd3_compile_context *ctx) -{ - ctx->atomic = false; - instr_finish(ctx); -} - -static struct ir3_instruction * -instr_create(struct fd3_compile_context *ctx, int category, opc_t opc) -{ - instr_finish(ctx); - return (ctx->current_instr = ir3_instr_create(ctx->block, category, opc)); -} - -static struct ir3_instruction * -instr_clone(struct fd3_compile_context *ctx, struct ir3_instruction *instr) -{ - instr_finish(ctx); - return (ctx->current_instr = ir3_instr_clone(instr)); -} - -static struct ir3_block * -push_block(struct fd3_compile_context *ctx) -{ - struct ir3_block *block; - unsigned ntmp, nin, nout; - -#define SCALAR_REGS(file) (4 * (ctx->info.file_max[TGSI_FILE_ ## file] + 1)) - - /* hmm, give ourselves room to create 4 extra temporaries (vec4): - */ - ntmp = SCALAR_REGS(TEMPORARY); - ntmp += 4 * 4; - - nout = SCALAR_REGS(OUTPUT); - nin = SCALAR_REGS(INPUT); - - /* for outermost block, 'inputs' are the actual shader INPUT - * register file. Reads from INPUT registers always go back to - * top block. For nested blocks, 'inputs' is used to track any - * TEMPORARY file register from one of the enclosing blocks that - * is ready in this block. 
- */ - if (!ctx->block) { - /* NOTE: fragment shaders actually have two inputs (r0.xy, the - * position) - */ - if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { - int n = 2; - if (ctx->info.reads_position) - n += 4; - if (ctx->info.uses_frontface) - n += 4; - nin = MAX2(n, nin); - nout += ARRAY_SIZE(ctx->kill); - } - } else { - nin = ntmp; - } - - block = ir3_block_create(ctx->ir, ntmp, nin, nout); - - if ((ctx->type == TGSI_PROCESSOR_FRAGMENT) && !ctx->block) - block->noutputs -= ARRAY_SIZE(ctx->kill); - - block->parent = ctx->block; - ctx->block = block; - - return block; -} - -static void -pop_block(struct fd3_compile_context *ctx) -{ - ctx->block = ctx->block->parent; - compile_assert(ctx, ctx->block); -} - -static struct ir3_instruction * -create_output(struct ir3_block *block, struct ir3_instruction *instr, - unsigned n) -{ - struct ir3_instruction *out; - - out = ir3_instr_create(block, -1, OPC_META_OUTPUT); - out->inout.block = block; - ir3_reg_create(out, n, 0); - if (instr) - ir3_reg_create(out, 0, IR3_REG_SSA)->instr = instr; - - return out; -} - -static struct ir3_instruction * -create_input(struct ir3_block *block, struct ir3_instruction *instr, - unsigned n) -{ - struct ir3_instruction *in; - - in = ir3_instr_create(block, -1, OPC_META_INPUT); - in->inout.block = block; - ir3_reg_create(in, n, 0); - if (instr) - ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr; - - return in; -} - -static struct ir3_instruction * -block_input(struct ir3_block *block, unsigned n) -{ - /* references to INPUT register file always go back up to - * top level: - */ - if (block->parent) - return block_input(block->parent, n); - return block->inputs[n]; -} - -/* return temporary in scope, creating if needed meta-input node - * to track block inputs - */ -static struct ir3_instruction * -block_temporary(struct ir3_block *block, unsigned n) -{ - /* references to TEMPORARY register file, find the nearest - * enclosing block which has already assigned this temporary, - * creating meta-input instructions along the way to keep - * track of block inputs - */ - if (block->parent && !block->temporaries[n]) { - /* if already have input for this block, reuse: */ - if (!block->inputs[n]) - block->inputs[n] = block_temporary(block->parent, n); - - /* and create new input to return: */ - return create_input(block, block->inputs[n], n); - } - return block->temporaries[n]; -} - -static struct ir3_instruction * -create_immed(struct fd3_compile_context *ctx, float val) -{ - /* NOTE: *don't* use instr_create() here! - */ - struct ir3_instruction *instr; - instr = ir3_instr_create(ctx->block, 1, 0); - instr->cat1.src_type = get_ftype(ctx); - instr->cat1.dst_type = get_ftype(ctx); - ir3_reg_create(instr, 0, 0); - ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = val; - return instr; -} - -static void -ssa_dst(struct fd3_compile_context *ctx, struct ir3_instruction *instr, - const struct tgsi_dst_register *dst, unsigned chan) -{ - unsigned n = regid(dst->Index, chan); - unsigned idx = ctx->num_output_updates; - - compile_assert(ctx, idx < ARRAY_SIZE(ctx->output_updates)); - - /* NOTE: defer update of temporaries[idx] or output[idx] - * until instr_finish(), so that if the current instruction - * reads the same TEMP/OUT[] it gets the old value: - * - * bleh.. this might be a bit easier to just figure out - * in instr_finish(). But at that point we've already - * lost information about OUTPUT vs TEMPORARY register - * file.. 
- */ - - switch (dst->File) { - case TGSI_FILE_OUTPUT: - compile_assert(ctx, n < ctx->block->noutputs); - ctx->output_updates[idx].instrp = &ctx->block->outputs[n]; - ctx->output_updates[idx].instr = instr; - ctx->num_output_updates++; - break; - case TGSI_FILE_TEMPORARY: - compile_assert(ctx, n < ctx->block->ntemporaries); - ctx->output_updates[idx].instrp = &ctx->block->temporaries[n]; - ctx->output_updates[idx].instr = instr; - ctx->num_output_updates++; - break; - case TGSI_FILE_ADDRESS: - compile_assert(ctx, n < 1); - ctx->output_updates[idx].instrp = &ctx->block->address; - ctx->output_updates[idx].instr = instr; - ctx->num_output_updates++; - break; - } -} - -static void -ssa_src(struct fd3_compile_context *ctx, struct ir3_register *reg, - const struct tgsi_src_register *src, unsigned chan) -{ - struct ir3_block *block = ctx->block; - unsigned n = regid(src->Index, chan); - - switch (src->File) { - case TGSI_FILE_INPUT: - reg->flags |= IR3_REG_SSA; - reg->instr = block_input(ctx->block, n); - break; - case TGSI_FILE_OUTPUT: - /* really this should just happen in case of 'MOV_SAT OUT[n], ..', - * for the following clamp instructions: - */ - reg->flags |= IR3_REG_SSA; - reg->instr = block->outputs[n]; - /* we don't have to worry about read from an OUTPUT that was - * assigned outside of the current block, because the _SAT - * clamp instructions will always be in the same block as - * the original instruction which wrote the OUTPUT - */ - compile_assert(ctx, reg->instr); - break; - case TGSI_FILE_TEMPORARY: - reg->flags |= IR3_REG_SSA; - reg->instr = block_temporary(ctx->block, n); - break; - } - - if ((reg->flags & IR3_REG_SSA) && !reg->instr) { - /* this can happen when registers (or components of a TGSI - * register) are used as src before they have been assigned - * (undefined contents). To avoid confusing the rest of the - * compiler, and to generally keep things peachy, substitute - * an instruction that sets the src to 0.0. Or to keep - * things undefined, I could plug in a random number? :-P - * - * NOTE: *don't* use instr_create() here! - */ - reg->instr = create_immed(ctx, 0.0); - } -} - -static struct ir3_register * -add_dst_reg_wrmask(struct fd3_compile_context *ctx, - struct ir3_instruction *instr, const struct tgsi_dst_register *dst, - unsigned chan, unsigned wrmask) -{ - unsigned flags = 0, num = 0; - struct ir3_register *reg; - - switch (dst->File) { - case TGSI_FILE_OUTPUT: - case TGSI_FILE_TEMPORARY: - /* uses SSA */ - break; - case TGSI_FILE_ADDRESS: - flags |= IR3_REG_ADDR; - /* uses SSA */ - break; - default: - compile_error(ctx, "unsupported dst register file: %s\n", - tgsi_file_name(dst->File)); - break; - } - - if (dst->Indirect) - flags |= IR3_REG_RELATIV; - - reg = ir3_reg_create(instr, regid(num, chan), flags); - - /* NOTE: do not call ssa_dst() if atomic.. vectorize() - * itself will call ssa_dst(). This is to filter out - * the (initially bogus) .x component dst which is - * created (but not necessarily used, ie. 
if the net - * result of the vector operation does not write to - * the .x component) - */ - - reg->wrmask = wrmask; - if (wrmask == 0x1) { - /* normal case */ - if (!ctx->atomic) - ssa_dst(ctx, instr, dst, chan); - } else if ((dst->File == TGSI_FILE_TEMPORARY) || - (dst->File == TGSI_FILE_OUTPUT) || - (dst->File == TGSI_FILE_ADDRESS)) { - unsigned i; - - /* if instruction writes multiple, we need to create - * some place-holder collect the registers: - */ - for (i = 0; i < 4; i++) { - if (wrmask & (1 << i)) { - struct ir3_instruction *collect = - ir3_instr_create(ctx->block, -1, OPC_META_FO); - collect->fo.off = i; - /* unused dst reg: */ - ir3_reg_create(collect, 0, 0); - /* and src reg used to hold original instr */ - ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = instr; - if (!ctx->atomic) - ssa_dst(ctx, collect, dst, chan+i); - } - } - } - - return reg; -} - -static struct ir3_register * -add_dst_reg(struct fd3_compile_context *ctx, struct ir3_instruction *instr, - const struct tgsi_dst_register *dst, unsigned chan) -{ - return add_dst_reg_wrmask(ctx, instr, dst, chan, 0x1); -} - -static struct ir3_register * -add_src_reg_wrmask(struct fd3_compile_context *ctx, - struct ir3_instruction *instr, const struct tgsi_src_register *src, - unsigned chan, unsigned wrmask) -{ - unsigned flags = 0, num = 0; - struct ir3_register *reg; - struct ir3_instruction *orig = NULL; - - /* TODO we need to use a mov to temp for const >= 64.. or maybe - * we could use relative addressing.. - */ - compile_assert(ctx, src->Index < 64); - - switch (src->File) { - case TGSI_FILE_IMMEDIATE: - /* TODO if possible, use actual immediate instead of const.. but - * TGSI has vec4 immediates, we can only embed scalar (of limited - * size, depending on instruction..) - */ - flags |= IR3_REG_CONST; - num = src->Index + ctx->so->first_immediate; - break; - case TGSI_FILE_CONSTANT: - flags |= IR3_REG_CONST; - num = src->Index; - break; - case TGSI_FILE_OUTPUT: - /* NOTE: we should only end up w/ OUTPUT file for things like - * clamp()'ing saturated dst instructions - */ - case TGSI_FILE_INPUT: - case TGSI_FILE_TEMPORARY: - /* uses SSA */ - break; - default: - compile_error(ctx, "unsupported src register file: %s\n", - tgsi_file_name(src->File)); - break; - } - - if (src->Absolute) - flags |= IR3_REG_ABS; - if (src->Negate) - flags |= IR3_REG_NEGATE; - - if (src->Indirect) { - flags |= IR3_REG_RELATIV; - - /* shouldn't happen, and we can't cope with it below: */ - compile_assert(ctx, wrmask == 0x1); - - /* wrap in a meta-deref to track both the src and address: */ - orig = instr; - - instr = ir3_instr_create(ctx->block, -1, OPC_META_DEREF); - ir3_reg_create(instr, 0, 0); - ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->block->address; - } - - reg = ir3_reg_create(instr, regid(num, chan), flags); - - reg->wrmask = wrmask; - if (wrmask == 0x1) { - /* normal case */ - ssa_src(ctx, reg, src, chan); - } else if ((src->File == TGSI_FILE_TEMPORARY) || - (src->File == TGSI_FILE_OUTPUT) || - (src->File == TGSI_FILE_INPUT)) { - struct ir3_instruction *collect; - unsigned i; - - compile_assert(ctx, !src->Indirect); - - /* if instruction reads multiple, we need to create - * some place-holder collect the registers: - */ - collect = ir3_instr_create(ctx->block, -1, OPC_META_FI); - ir3_reg_create(collect, 0, 0); /* unused dst reg */ - - for (i = 0; i < 4; i++) { - if (wrmask & (1 << i)) { - /* and src reg used point to the original instr */ - ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), - src, chan + i); - } else if 
(wrmask & ~((i << i) - 1)) { - /* if any remaining components, then dummy - * placeholder src reg to fill in the blanks: - */ - ir3_reg_create(collect, 0, 0); - } - } - - reg->flags |= IR3_REG_SSA; - reg->instr = collect; - } - - if (src->Indirect) { - reg = ir3_reg_create(orig, 0, flags | IR3_REG_SSA); - reg->instr = instr; - } - return reg; -} - -static struct ir3_register * -add_src_reg(struct fd3_compile_context *ctx, struct ir3_instruction *instr, - const struct tgsi_src_register *src, unsigned chan) -{ - return add_src_reg_wrmask(ctx, instr, src, chan, 0x1); -} - -static void -src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst) -{ - src->File = dst->File; - src->Indirect = dst->Indirect; - src->Dimension = dst->Dimension; - src->Index = dst->Index; - src->Absolute = 0; - src->Negate = 0; - src->SwizzleX = TGSI_SWIZZLE_X; - src->SwizzleY = TGSI_SWIZZLE_Y; - src->SwizzleZ = TGSI_SWIZZLE_Z; - src->SwizzleW = TGSI_SWIZZLE_W; -} - -/* Get internal-temp src/dst to use for a sequence of instructions - * generated by a single TGSI op. - */ -static struct tgsi_src_register * -get_internal_temp(struct fd3_compile_context *ctx, - struct tgsi_dst_register *tmp_dst) -{ - struct tgsi_src_register *tmp_src; - int n; - - tmp_dst->File = TGSI_FILE_TEMPORARY; - tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW; - tmp_dst->Indirect = 0; - tmp_dst->Dimension = 0; - - /* assign next temporary: */ - n = ctx->num_internal_temps++; - compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps)); - tmp_src = &ctx->internal_temps[n]; - - tmp_dst->Index = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1; - - src_from_dst(tmp_src, tmp_dst); - - return tmp_src; -} - -static inline bool -is_const(struct tgsi_src_register *src) -{ - return (src->File == TGSI_FILE_CONSTANT) || - (src->File == TGSI_FILE_IMMEDIATE); -} - -static inline bool -is_relative(struct tgsi_src_register *src) -{ - return src->Indirect; -} - -static inline bool -is_rel_or_const(struct tgsi_src_register *src) -{ - return is_relative(src) || is_const(src); -} - -static type_t -get_ftype(struct fd3_compile_context *ctx) -{ - return TYPE_F32; -} - -static type_t -get_utype(struct fd3_compile_context *ctx) -{ - return TYPE_U32; -} - -static unsigned -src_swiz(struct tgsi_src_register *src, int chan) -{ - switch (chan) { - case 0: return src->SwizzleX; - case 1: return src->SwizzleY; - case 2: return src->SwizzleZ; - case 3: return src->SwizzleW; - } - assert(0); - return 0; -} - -/* for instructions that cannot take a const register as src, if needed - * generate a move to temporary gpr: - */ -static struct tgsi_src_register * -get_unconst(struct fd3_compile_context *ctx, struct tgsi_src_register *src) -{ - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register *tmp_src; - - compile_assert(ctx, is_rel_or_const(src)); - - tmp_src = get_internal_temp(ctx, &tmp_dst); - - create_mov(ctx, &tmp_dst, src); - - return tmp_src; -} - -static void -get_immediate(struct fd3_compile_context *ctx, - struct tgsi_src_register *reg, uint32_t val) -{ - unsigned neg, swiz, idx, i; - /* actually maps 1:1 currently.. 
not sure if that is safe to rely on: */ - static const unsigned swiz2tgsi[] = { - TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W, - }; - - for (i = 0; i < ctx->immediate_idx; i++) { - swiz = i % 4; - idx = i / 4; - - if (ctx->so->immediates[idx].val[swiz] == val) { - neg = 0; - break; - } - - if (ctx->so->immediates[idx].val[swiz] == -val) { - neg = 1; - break; - } - } - - if (i == ctx->immediate_idx) { - /* need to generate a new immediate: */ - swiz = i % 4; - idx = i / 4; - neg = 0; - ctx->so->immediates[idx].val[swiz] = val; - ctx->so->immediates_count = idx + 1; - ctx->immediate_idx++; - } - - reg->File = TGSI_FILE_IMMEDIATE; - reg->Indirect = 0; - reg->Dimension = 0; - reg->Index = idx; - reg->Absolute = 0; - reg->Negate = neg; - reg->SwizzleX = swiz2tgsi[swiz]; - reg->SwizzleY = swiz2tgsi[swiz]; - reg->SwizzleZ = swiz2tgsi[swiz]; - reg->SwizzleW = swiz2tgsi[swiz]; -} - -static void -create_mov(struct fd3_compile_context *ctx, struct tgsi_dst_register *dst, - struct tgsi_src_register *src) -{ - type_t type_mov = get_ftype(ctx); - unsigned i; - - for (i = 0; i < 4; i++) { - /* move to destination: */ - if (dst->WriteMask & (1 << i)) { - struct ir3_instruction *instr; - - if (src->Absolute || src->Negate) { - /* can't have abs or neg on a mov instr, so use - * absneg.f instead to handle these cases: - */ - instr = instr_create(ctx, 2, OPC_ABSNEG_F); - } else { - instr = instr_create(ctx, 1, 0); - instr->cat1.src_type = type_mov; - instr->cat1.dst_type = type_mov; - } - - add_dst_reg(ctx, instr, dst, i); - add_src_reg(ctx, instr, src, src_swiz(src, i)); - } - } -} - -static void -create_clamp(struct fd3_compile_context *ctx, - struct tgsi_dst_register *dst, struct tgsi_src_register *val, - struct tgsi_src_register *minval, struct tgsi_src_register *maxval) -{ - struct ir3_instruction *instr; - - instr = instr_create(ctx, 2, OPC_MAX_F); - vectorize(ctx, instr, dst, 2, val, 0, minval, 0); - - instr = instr_create(ctx, 2, OPC_MIN_F); - vectorize(ctx, instr, dst, 2, val, 0, maxval, 0); -} - -static void -create_clamp_imm(struct fd3_compile_context *ctx, - struct tgsi_dst_register *dst, - uint32_t minval, uint32_t maxval) -{ - struct tgsi_src_register minconst, maxconst; - struct tgsi_src_register src; - - src_from_dst(&src, dst); - - get_immediate(ctx, &minconst, minval); - get_immediate(ctx, &maxconst, maxval); - - create_clamp(ctx, dst, &src, &minconst, &maxconst); -} - -static struct tgsi_dst_register * -get_dst(struct fd3_compile_context *ctx, struct tgsi_full_instruction *inst) -{ - struct tgsi_dst_register *dst = &inst->Dst[0].Register; - unsigned i; - for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { - struct tgsi_src_register *src = &inst->Src[i].Register; - if ((src->File == dst->File) && (src->Index == dst->Index)) { - if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) && - (src->SwizzleX == TGSI_SWIZZLE_X) && - (src->SwizzleY == TGSI_SWIZZLE_Y) && - (src->SwizzleZ == TGSI_SWIZZLE_Z) && - (src->SwizzleW == TGSI_SWIZZLE_W)) - continue; - ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst); - ctx->tmp_dst.WriteMask = dst->WriteMask; - dst = &ctx->tmp_dst; - break; - } - } - return dst; -} - -static void -put_dst(struct fd3_compile_context *ctx, struct tgsi_full_instruction *inst, - struct tgsi_dst_register *dst) -{ - /* if necessary, add mov back into original dst: */ - if (dst != &inst->Dst[0].Register) { - create_mov(ctx, &inst->Dst[0].Register, ctx->tmp_src); - } -} - -/* helper to generate the necessary repeat and/or additional instructions - * to turn a scalar 
instruction into a vector operation: - */ -static void -vectorize(struct fd3_compile_context *ctx, struct ir3_instruction *instr, - struct tgsi_dst_register *dst, int nsrcs, ...) -{ - va_list ap; - int i, j, n = 0; - - instr_atomic_start(ctx); - - add_dst_reg(ctx, instr, dst, TGSI_SWIZZLE_X); - - va_start(ap, nsrcs); - for (j = 0; j < nsrcs; j++) { - struct tgsi_src_register *src = - va_arg(ap, struct tgsi_src_register *); - unsigned flags = va_arg(ap, unsigned); - struct ir3_register *reg; - if (flags & IR3_REG_IMMED) { - reg = ir3_reg_create(instr, 0, IR3_REG_IMMED); - /* this is an ugly cast.. should have put flags first! */ - reg->iim_val = *(int *)&src; - } else { - reg = add_src_reg(ctx, instr, src, TGSI_SWIZZLE_X); - } - reg->flags |= flags & ~IR3_REG_NEGATE; - if (flags & IR3_REG_NEGATE) - reg->flags ^= IR3_REG_NEGATE; - } - va_end(ap); - - for (i = 0; i < 4; i++) { - if (dst->WriteMask & (1 << i)) { - struct ir3_instruction *cur; - - if (n++ == 0) { - cur = instr; - } else { - cur = instr_clone(ctx, instr); - } - - ssa_dst(ctx, cur, dst, i); - - /* fix-up dst register component: */ - cur->regs[0]->num = regid(cur->regs[0]->num >> 2, i); - - /* fix-up src register component: */ - va_start(ap, nsrcs); - for (j = 0; j < nsrcs; j++) { - struct ir3_register *reg = cur->regs[j+1]; - struct tgsi_src_register *src = - va_arg(ap, struct tgsi_src_register *); - unsigned flags = va_arg(ap, unsigned); - if (reg->flags & IR3_REG_SSA) { - ssa_src(ctx, reg, src, src_swiz(src, i)); - } else if (!(flags & IR3_REG_IMMED)) { - reg->num = regid(reg->num >> 2, src_swiz(src, i)); - } - } - va_end(ap); - } - } - - instr_atomic_end(ctx); -} - -/* - * Handlers for TGSI instructions which do not have a 1:1 mapping to - * native instructions: - */ - -static void -trans_clamp(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct tgsi_dst_register *dst = get_dst(ctx, inst); - struct tgsi_src_register *src0 = &inst->Src[0].Register; - struct tgsi_src_register *src1 = &inst->Src[1].Register; - struct tgsi_src_register *src2 = &inst->Src[2].Register; - - create_clamp(ctx, dst, src0, src1, src2); - - put_dst(ctx, inst, dst); -} - -/* ARL(x) = x, but mova from hrN.x to a0.. */ -static void -trans_arl(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr; - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register *tmp_src; - struct tgsi_dst_register *dst = &inst->Dst[0].Register; - struct tgsi_src_register *src = &inst->Src[0].Register; - unsigned chan = src->SwizzleX; - - compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS); - - /* NOTE: we allocate a temporary from a flat register - * namespace (ignoring half vs full). It turns out - * not to really matter since registers get reassigned - * later in ir3_ra which (hopefully!) can deal a bit - * better with mixed half and full precision. 
- */ - tmp_src = get_internal_temp(ctx, &tmp_dst); - - /* cov.f{32,16}s16 Rtmp, Rsrc */ - instr = instr_create(ctx, 1, 0); - instr->cat1.src_type = get_ftype(ctx); - instr->cat1.dst_type = TYPE_S16; - add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF; - add_src_reg(ctx, instr, src, chan); - - /* shl.b Rtmp, Rtmp, 2 */ - instr = instr_create(ctx, 2, OPC_SHL_B); - add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF; - add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF; - ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2; - - /* mova a0, Rtmp */ - instr = instr_create(ctx, 1, 0); - instr->cat1.src_type = TYPE_S16; - instr->cat1.dst_type = TYPE_S16; - add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF; - add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF; -} - -/* - * texture fetch/sample instructions: - */ - -struct tex_info { - int8_t order[4]; - unsigned src_wrmask, flags; -}; - -static const struct tex_info * -get_tex_info(struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - static const struct tex_info tex1d = { - .order = { 0, -1, -1, -1 }, /* coord.x */ - .src_wrmask = TGSI_WRITEMASK_XY, - .flags = 0, - }; - static const struct tex_info tex1ds = { - .order = { 0, -1, 2, -1 }, /* coord.xz */ - .src_wrmask = TGSI_WRITEMASK_XYZ, - .flags = IR3_INSTR_S, - }; - static const struct tex_info tex2d = { - .order = { 0, 1, -1, -1 }, /* coord.xy */ - .src_wrmask = TGSI_WRITEMASK_XY, - .flags = 0, - }; - static const struct tex_info tex2ds = { - .order = { 0, 1, 2, -1 }, /* coord.xyz */ - .src_wrmask = TGSI_WRITEMASK_XYZ, - .flags = IR3_INSTR_S, - }; - static const struct tex_info tex3d = { - .order = { 0, 1, 2, -1 }, /* coord.xyz */ - .src_wrmask = TGSI_WRITEMASK_XYZ, - .flags = IR3_INSTR_3D, - }; - static const struct tex_info tex3ds = { - .order = { 0, 1, 2, 3 }, /* coord.xyzw */ - .src_wrmask = TGSI_WRITEMASK_XYZW, - .flags = IR3_INSTR_S | IR3_INSTR_3D, - }; - static const struct tex_info txp1d = { - .order = { 0, -1, 3, -1 }, /* coord.xw */ - .src_wrmask = TGSI_WRITEMASK_XYZ, - .flags = IR3_INSTR_P, - }; - static const struct tex_info txp1ds = { - .order = { 0, -1, 2, 3 }, /* coord.xzw */ - .src_wrmask = TGSI_WRITEMASK_XYZW, - .flags = IR3_INSTR_P | IR3_INSTR_S, - }; - static const struct tex_info txp2d = { - .order = { 0, 1, 3, -1 }, /* coord.xyw */ - .src_wrmask = TGSI_WRITEMASK_XYZ, - .flags = IR3_INSTR_P, - }; - static const struct tex_info txp2ds = { - .order = { 0, 1, 2, 3 }, /* coord.xyzw */ - .src_wrmask = TGSI_WRITEMASK_XYZW, - .flags = IR3_INSTR_P | IR3_INSTR_S, - }; - static const struct tex_info txp3d = { - .order = { 0, 1, 2, 3 }, /* coord.xyzw */ - .src_wrmask = TGSI_WRITEMASK_XYZW, - .flags = IR3_INSTR_P | IR3_INSTR_3D, - }; - - unsigned tex = inst->Texture.Texture; - - switch (inst->Instruction.Opcode) { - case TGSI_OPCODE_TEX: - switch (tex) { - case TGSI_TEXTURE_1D: - return &tex1d; - case TGSI_TEXTURE_SHADOW1D: - return &tex1ds; - case TGSI_TEXTURE_2D: - case TGSI_TEXTURE_RECT: - return &tex2d; - case TGSI_TEXTURE_SHADOW2D: - case TGSI_TEXTURE_SHADOWRECT: - return &tex2ds; - case TGSI_TEXTURE_3D: - case TGSI_TEXTURE_CUBE: - return &tex3d; - case TGSI_TEXTURE_SHADOWCUBE: - return &tex3ds; - default: - compile_error(ctx, "unknown texture type: %s\n", - tgsi_texture_names[tex]); - return NULL; - } - break; - case TGSI_OPCODE_TXP: - switch (tex) { - case TGSI_TEXTURE_1D: - return &txp1d; - case TGSI_TEXTURE_SHADOW1D: - return &txp1ds; - case TGSI_TEXTURE_2D: - case TGSI_TEXTURE_RECT: - return &txp2d; - 
case TGSI_TEXTURE_SHADOW2D: - case TGSI_TEXTURE_SHADOWRECT: - return &txp2ds; - case TGSI_TEXTURE_3D: - case TGSI_TEXTURE_CUBE: - return &txp3d; - default: - compile_error(ctx, "unknown texture type: %s\n", - tgsi_texture_names[tex]); - break; - } - break; - } - compile_assert(ctx, 0); - return NULL; -} - -static struct tgsi_src_register * -get_tex_coord(struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst, - const struct tex_info *tinf) -{ - struct tgsi_src_register *coord = &inst->Src[0].Register; - struct ir3_instruction *instr; - unsigned tex = inst->Texture.Texture; - bool needs_mov = false; - unsigned i; - - /* cat5 instruction cannot seem to handle const or relative: */ - if (is_rel_or_const(coord)) - needs_mov = true; - - /* 1D textures we fix up w/ 0.0 as 2nd coord: */ - if ((tex == TGSI_TEXTURE_1D) || (tex == TGSI_TEXTURE_SHADOW1D)) - needs_mov = true; - - /* The texture sample instructions need to coord in successive - * registers/components (ie. src.xy but not src.yx). And TXP - * needs the .w component in .z for 2D.. so in some cases we - * might need to emit some mov instructions to shuffle things - * around: - */ - for (i = 1; (i < 4) && (tinf->order[i] >= 0) && !needs_mov; i++) - if (src_swiz(coord, i) != (src_swiz(coord, 0) + tinf->order[i])) - needs_mov = true; - - if (needs_mov) { - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register *tmp_src; - unsigned j; - - type_t type_mov = get_ftype(ctx); - - /* need to move things around: */ - tmp_src = get_internal_temp(ctx, &tmp_dst); - - for (j = 0; j < 4; j++) { - if (tinf->order[j] < 0) - continue; - instr = instr_create(ctx, 1, 0); /* mov */ - instr->cat1.src_type = type_mov; - instr->cat1.dst_type = type_mov; - add_dst_reg(ctx, instr, &tmp_dst, j); - add_src_reg(ctx, instr, coord, - src_swiz(coord, tinf->order[j])); - } - - /* fix up .y coord: */ - if ((tex == TGSI_TEXTURE_1D) || - (tex == TGSI_TEXTURE_SHADOW1D)) { - instr = instr_create(ctx, 1, 0); /* mov */ - instr->cat1.src_type = type_mov; - instr->cat1.dst_type = type_mov; - add_dst_reg(ctx, instr, &tmp_dst, 1); /* .y */ - ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = 0.5; - } - - coord = tmp_src; - } - - return coord; -} - -static void -trans_samp(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr; - struct tgsi_dst_register *dst = &inst->Dst[0].Register; - struct tgsi_src_register *coord; - struct tgsi_src_register *samp = &inst->Src[1].Register; - const struct tex_info *tinf; - - tinf = get_tex_info(ctx, inst); - coord = get_tex_coord(ctx, inst, tinf); - - instr = instr_create(ctx, 5, t->opc); - instr->cat5.type = get_ftype(ctx); - instr->cat5.samp = samp->Index; - instr->cat5.tex = samp->Index; - instr->flags |= tinf->flags; - - add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask); - add_src_reg_wrmask(ctx, instr, coord, coord->SwizzleX, tinf->src_wrmask); -} - -/* - * SEQ(a,b) = (a == b) ? 1.0 : 0.0 - * cmps.f.eq tmp0, a, b - * cov.u16f16 dst, tmp0 - * - * SNE(a,b) = (a != b) ? 1.0 : 0.0 - * cmps.f.ne tmp0, a, b - * cov.u16f16 dst, tmp0 - * - * SGE(a,b) = (a >= b) ? 1.0 : 0.0 - * cmps.f.ge tmp0, a, b - * cov.u16f16 dst, tmp0 - * - * SLE(a,b) = (a <= b) ? 1.0 : 0.0 - * cmps.f.le tmp0, a, b - * cov.u16f16 dst, tmp0 - * - * SGT(a,b) = (a > b) ? 1.0 : 0.0 - * cmps.f.gt tmp0, a, b - * cov.u16f16 dst, tmp0 - * - * SLT(a,b) = (a < b) ? 1.0 : 0.0 - * cmps.f.lt tmp0, a, b - * cov.u16f16 dst, tmp0 - * - * CMP(a,b,c) = (a < 0.0) ? 
b : c - * cmps.f.lt tmp0, a, {0.0} - * sel.b16 dst, b, tmp0, c - */ -static void -trans_cmp(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr; - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register *tmp_src; - struct tgsi_src_register constval0; - /* final instruction for CMP() uses orig src1 and src2: */ - struct tgsi_dst_register *dst = get_dst(ctx, inst); - struct tgsi_src_register *a0, *a1, *a2; - unsigned condition; - - tmp_src = get_internal_temp(ctx, &tmp_dst); - - a0 = &inst->Src[0].Register; /* a */ - a1 = &inst->Src[1].Register; /* b */ - - switch (t->tgsi_opc) { - case TGSI_OPCODE_SEQ: - case TGSI_OPCODE_FSEQ: - condition = IR3_COND_EQ; - break; - case TGSI_OPCODE_SNE: - case TGSI_OPCODE_FSNE: - condition = IR3_COND_NE; - break; - case TGSI_OPCODE_SGE: - case TGSI_OPCODE_FSGE: - condition = IR3_COND_GE; - break; - case TGSI_OPCODE_SLT: - case TGSI_OPCODE_FSLT: - condition = IR3_COND_LT; - break; - case TGSI_OPCODE_SLE: - condition = IR3_COND_LE; - break; - case TGSI_OPCODE_SGT: - condition = IR3_COND_GT; - break; - case TGSI_OPCODE_CMP: - get_immediate(ctx, &constval0, fui(0.0)); - a0 = &inst->Src[0].Register; /* a */ - a1 = &constval0; /* {0.0} */ - condition = IR3_COND_LT; - break; - default: - compile_assert(ctx, 0); - return; - } - - if (is_const(a0) && is_const(a1)) - a0 = get_unconst(ctx, a0); - - /* cmps.f. tmp, a0, a1 */ - instr = instr_create(ctx, 2, OPC_CMPS_F); - instr->cat2.condition = condition; - vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0); - - switch (t->tgsi_opc) { - case TGSI_OPCODE_SEQ: - case TGSI_OPCODE_FSEQ: - case TGSI_OPCODE_SGE: - case TGSI_OPCODE_FSGE: - case TGSI_OPCODE_SLE: - case TGSI_OPCODE_SNE: - case TGSI_OPCODE_FSNE: - case TGSI_OPCODE_SGT: - case TGSI_OPCODE_SLT: - case TGSI_OPCODE_FSLT: - /* cov.u16f16 dst, tmp0 */ - instr = instr_create(ctx, 1, 0); - instr->cat1.src_type = get_utype(ctx); - instr->cat1.dst_type = get_ftype(ctx); - vectorize(ctx, instr, dst, 1, tmp_src, 0); - break; - case TGSI_OPCODE_CMP: - a1 = &inst->Src[1].Register; - a2 = &inst->Src[2].Register; - /* sel.{b32,b16} dst, src2, tmp, src1 */ - instr = instr_create(ctx, 3, OPC_SEL_B32); - vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0); - - break; - } - - put_dst(ctx, inst, dst); -} - -/* - * USNE(a,b) = (a != b) ? 1 : 0 - * cmps.u32.ne dst, a, b - * - * USEQ(a,b) = (a == b) ? 1 : 0 - * cmps.u32.eq dst, a, b - * - * ISGE(a,b) = (a > b) ? 1 : 0 - * cmps.s32.ge dst, a, b - * - * USGE(a,b) = (a > b) ? 1 : 0 - * cmps.u32.ge dst, a, b - * - * ISLT(a,b) = (a < b) ? 1 : 0 - * cmps.s32.lt dst, a, b - * - * USLT(a,b) = (a < b) ? 1 : 0 - * cmps.u32.lt dst, a, b - * - * UCMP(a,b,c) = (a < 0) ? 
b : c - * cmps.u32.lt tmp0, a, {0} - * sel.b16 dst, b, tmp0, c - */ -static void -trans_icmp(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr; - struct tgsi_dst_register *dst = get_dst(ctx, inst); - struct tgsi_src_register constval0; - struct tgsi_src_register *a0, *a1, *a2; - unsigned condition; - - a0 = &inst->Src[0].Register; /* a */ - a1 = &inst->Src[1].Register; /* b */ - - switch (t->tgsi_opc) { - case TGSI_OPCODE_USNE: - condition = IR3_COND_NE; - break; - case TGSI_OPCODE_USEQ: - condition = IR3_COND_EQ; - break; - case TGSI_OPCODE_ISGE: - case TGSI_OPCODE_USGE: - condition = IR3_COND_GE; - break; - case TGSI_OPCODE_ISLT: - case TGSI_OPCODE_USLT: - condition = IR3_COND_LT; - break; - case TGSI_OPCODE_UCMP: - get_immediate(ctx, &constval0, 0); - a0 = &inst->Src[0].Register; /* a */ - a1 = &constval0; /* {0} */ - condition = IR3_COND_LT; - break; - - default: - compile_assert(ctx, 0); - return; - } - - if (is_const(a0) && is_const(a1)) - a0 = get_unconst(ctx, a0); - - if (t->tgsi_opc == TGSI_OPCODE_UCMP) { - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register *tmp_src; - tmp_src = get_internal_temp(ctx, &tmp_dst); - /* cmps.u32.lt tmp, a0, a1 */ - instr = instr_create(ctx, 2, t->opc); - instr->cat2.condition = condition; - vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0); - - a1 = &inst->Src[1].Register; - a2 = &inst->Src[2].Register; - /* sel.{b32,b16} dst, src2, tmp, src1 */ - instr = instr_create(ctx, 3, OPC_SEL_B32); - vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0); - } else { - /* cmps.{u32,s32}. dst, a0, a1 */ - instr = instr_create(ctx, 2, t->opc); - instr->cat2.condition = condition; - vectorize(ctx, instr, dst, 2, a0, 0, a1, 0); - } - put_dst(ctx, inst, dst); -} - -/* - * Conditional / Flow control - */ - -static void -push_branch(struct fd3_compile_context *ctx, bool inv, - struct ir3_instruction *instr, struct ir3_instruction *cond) -{ - unsigned int idx = ctx->branch_count++; - compile_assert(ctx, idx < ARRAY_SIZE(ctx->branch)); - ctx->branch[idx].instr = instr; - ctx->branch[idx].inv = inv; - /* else side of branch has same condition: */ - if (!inv) - ctx->branch[idx].cond = cond; -} - -static struct ir3_instruction * -pop_branch(struct fd3_compile_context *ctx) -{ - unsigned int idx = --ctx->branch_count; - return ctx->branch[idx].instr; -} - -static void -trans_if(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr, *cond; - struct tgsi_src_register *src = &inst->Src[0].Register; - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register *tmp_src; - struct tgsi_src_register constval; - - get_immediate(ctx, &constval, fui(0.0)); - tmp_src = get_internal_temp(ctx, &tmp_dst); - - if (is_const(src)) - src = get_unconst(ctx, src); - - /* cmps.f.ne tmp0, b, {0.0} */ - instr = instr_create(ctx, 2, OPC_CMPS_F); - add_dst_reg(ctx, instr, &tmp_dst, 0); - add_src_reg(ctx, instr, src, src->SwizzleX); - add_src_reg(ctx, instr, &constval, constval.SwizzleX); - instr->cat2.condition = IR3_COND_NE; - - compile_assert(ctx, instr->regs[1]->flags & IR3_REG_SSA); /* because get_unconst() */ - cond = instr->regs[1]->instr; - - /* meta:flow tmp0 */ - instr = instr_create(ctx, -1, OPC_META_FLOW); - ir3_reg_create(instr, 0, 0); /* dummy dst */ - add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X); - - push_branch(ctx, false, instr, cond); - instr->flow.if_block = push_block(ctx); -} - -static 
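/* push_branch()/pop_branch() above keep a small stack of the open meta:flow
 * instructions (plus the inverted flag) so trans_else()/trans_endif() can
 * find the IF they belong to.  A standalone sketch of the same bookkeeping
 * with an opaque payload; struct flow_stack, push_flow() and pop_flow() are
 * hypothetical names for illustration:
 */
struct flow_stack {
	struct { void *instr; int inv; } entries[16];
	unsigned count;
};

static void push_flow(struct flow_stack *s, void *instr, int inv)
{
	/* caller keeps the nesting depth within the array, as the real
	 * code asserts via compile_assert() */
	s->entries[s->count].instr = instr;
	s->entries[s->count].inv = inv;
	s->count++;
}

static void *pop_flow(struct flow_stack *s)
{
	return s->entries[--s->count].instr;
}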
void -trans_else(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr; - - pop_block(ctx); - - instr = pop_branch(ctx); - - compile_assert(ctx, (instr->category == -1) && - (instr->opc == OPC_META_FLOW)); - - push_branch(ctx, true, instr, NULL); - instr->flow.else_block = push_block(ctx); -} - -static struct ir3_instruction * -find_temporary(struct ir3_block *block, unsigned n) -{ - if (block->parent && !block->temporaries[n]) - return find_temporary(block->parent, n); - return block->temporaries[n]; -} - -static struct ir3_instruction * -find_output(struct ir3_block *block, unsigned n) -{ - if (block->parent && !block->outputs[n]) - return find_output(block->parent, n); - return block->outputs[n]; -} - -static struct ir3_instruction * -create_phi(struct fd3_compile_context *ctx, struct ir3_instruction *cond, - struct ir3_instruction *a, struct ir3_instruction *b) -{ - struct ir3_instruction *phi; - - compile_assert(ctx, cond); - - /* Either side of the condition could be null.. which - * indicates a variable written on only one side of the - * branch. Normally this should only be variables not - * used outside of that side of the branch. So we could - * just 'return a ? a : b;' in that case. But for better - * defined undefined behavior we just stick in imm{0.0}. - * In the common case of a value only used within the - * one side of the branch, the PHI instruction will not - * get scheduled - */ - if (!a) - a = create_immed(ctx, 0.0); - if (!b) - b = create_immed(ctx, 0.0); - - phi = instr_create(ctx, -1, OPC_META_PHI); - ir3_reg_create(phi, 0, 0); /* dummy dst */ - ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = cond; - ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = a; - ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = b; - - return phi; -} - -static void -trans_endif(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr; - struct ir3_block *ifb, *elseb; - struct ir3_instruction **ifout, **elseout; - unsigned i, ifnout = 0, elsenout = 0; - - pop_block(ctx); - - instr = pop_branch(ctx); - - compile_assert(ctx, (instr->category == -1) && - (instr->opc == OPC_META_FLOW)); - - ifb = instr->flow.if_block; - elseb = instr->flow.else_block; - /* if there is no else block, the parent block is used for the - * branch-not-taken src of the PHI instructions: - */ - if (!elseb) - elseb = ifb->parent; - - /* worst case sizes: */ - ifnout = ifb->ntemporaries + ifb->noutputs; - elsenout = elseb->ntemporaries + elseb->noutputs; - - ifout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * ifnout); - if (elseb != ifb->parent) - elseout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * elsenout); - - ifnout = 0; - elsenout = 0; - - /* generate PHI instructions for any temporaries written: */ - for (i = 0; i < ifb->ntemporaries; i++) { - struct ir3_instruction *a = ifb->temporaries[i]; - struct ir3_instruction *b = elseb->temporaries[i]; - - /* if temporary written in if-block, or if else block - * is present and temporary written in else-block: - */ - if (a || ((elseb != ifb->parent) && b)) { - struct ir3_instruction *phi; - - /* if only written on one side, find the closest - * enclosing update on other side: - */ - if (!a) - a = find_temporary(ifb, i); - if (!b) - b = find_temporary(elseb, i); - - ifout[ifnout] = a; - a = create_output(ifb, a, ifnout++); - - if (elseb != ifb->parent) { - elseout[elsenout] = b; - b = create_output(elseb, b, 
elsenout++); - } - - phi = create_phi(ctx, instr, a, b); - ctx->block->temporaries[i] = phi; - } - } - - compile_assert(ctx, ifb->noutputs == elseb->noutputs); - - /* .. and any outputs written: */ - for (i = 0; i < ifb->noutputs; i++) { - struct ir3_instruction *a = ifb->outputs[i]; - struct ir3_instruction *b = elseb->outputs[i]; - - /* if output written in if-block, or if else block - * is present and output written in else-block: - */ - if (a || ((elseb != ifb->parent) && b)) { - struct ir3_instruction *phi; - - /* if only written on one side, find the closest - * enclosing update on other side: - */ - if (!a) - a = find_output(ifb, i); - if (!b) - b = find_output(elseb, i); - - ifout[ifnout] = a; - a = create_output(ifb, a, ifnout++); - - if (elseb != ifb->parent) { - elseout[elsenout] = b; - b = create_output(elseb, b, elsenout++); - } - - phi = create_phi(ctx, instr, a, b); - ctx->block->outputs[i] = phi; - } - } - - ifb->noutputs = ifnout; - ifb->outputs = ifout; - - if (elseb != ifb->parent) { - elseb->noutputs = elsenout; - elseb->outputs = elseout; - } - - // TODO maybe we want to compact block->inputs? -} - -/* - * Kill - */ - -static void -trans_kill(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr, *immed, *cond = NULL; - bool inv = false; - - switch (t->tgsi_opc) { - case TGSI_OPCODE_KILL: - /* unconditional kill, use enclosing if condition: */ - if (ctx->branch_count > 0) { - unsigned int idx = ctx->branch_count - 1; - cond = ctx->branch[idx].cond; - inv = ctx->branch[idx].inv; - } else { - cond = create_immed(ctx, 1.0); - } - - break; - } - - compile_assert(ctx, cond); - - immed = create_immed(ctx, 0.0); - - /* cmps.f.ne p0.x, cond, {0.0} */ - instr = instr_create(ctx, 2, OPC_CMPS_F); - instr->cat2.condition = IR3_COND_NE; - ir3_reg_create(instr, regid(REG_P0, 0), 0); - ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond; - ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed; - cond = instr; - - /* kill p0.x */ - instr = instr_create(ctx, 0, OPC_KILL); - instr->cat0.inv = inv; - ir3_reg_create(instr, 0, 0); /* dummy dst */ - ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond; - - ctx->kill[ctx->kill_count++] = instr; -} - -/* - * Kill-If - */ - -static void -trans_killif(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct tgsi_src_register *src = &inst->Src[0].Register; - struct ir3_instruction *instr, *immed, *cond = NULL; - bool inv = false; - - immed = create_immed(ctx, 0.0); - - /* cmps.f.ne p0.x, cond, {0.0} */ - instr = instr_create(ctx, 2, OPC_CMPS_F); - instr->cat2.condition = IR3_COND_NE; - ir3_reg_create(instr, regid(REG_P0, 0), 0); - ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed; - add_src_reg(ctx, instr, src, src->SwizzleX); - - cond = instr; - - /* kill p0.x */ - instr = instr_create(ctx, 0, OPC_KILL); - instr->cat0.inv = inv; - ir3_reg_create(instr, 0, 0); /* dummy dst */ - ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond; - - ctx->kill[ctx->kill_count++] = instr; - -} -/* - * I2F / U2F / F2I / F2U - */ - -static void -trans_cov(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr; - struct tgsi_dst_register *dst = get_dst(ctx, inst); - struct tgsi_src_register *src = &inst->Src[0].Register; - - // cov.f32s32 dst, tmp0 / - instr = instr_create(ctx, 1, 0); - switch (t->tgsi_opc) { - case TGSI_OPCODE_U2F: 
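/* trans_kill()/trans_killif() above both boil down to the same pattern:
 * compare a condition against 0.0 into p0.x, then issue "kill p0.x",
 * optionally inverted.  A standalone sketch of the predicate they compute;
 * fragment_killed() is a hypothetical reference helper, not driver code:
 */
static int fragment_killed(float cond, int inv)
{
	/* cmps.f.ne p0.x, cond, {0.0} */
	int p = (cond != 0.0f);
	/* kill p0.x  (or the inverted predicate when inv is set) */
	return inv ? !p : p;
}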
- instr->cat1.src_type = TYPE_U32; - instr->cat1.dst_type = TYPE_F32; - break; - case TGSI_OPCODE_I2F: - instr->cat1.src_type = TYPE_S32; - instr->cat1.dst_type = TYPE_F32; - break; - case TGSI_OPCODE_F2U: - instr->cat1.src_type = TYPE_F32; - instr->cat1.dst_type = TYPE_U32; - break; - case TGSI_OPCODE_F2I: - instr->cat1.src_type = TYPE_F32; - instr->cat1.dst_type = TYPE_S32; - break; - - } - vectorize(ctx, instr, dst, 1, src, 0); -} - -/* - * Handlers for TGSI instructions which do have 1:1 mapping to native - * instructions: - */ - -static void -instr_cat0(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - instr_create(ctx, 0, t->opc); -} - -static void -instr_cat1(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct tgsi_dst_register *dst = get_dst(ctx, inst); - struct tgsi_src_register *src = &inst->Src[0].Register; - create_mov(ctx, dst, src); - put_dst(ctx, inst, dst); -} - -static void -instr_cat2(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct tgsi_dst_register *dst = get_dst(ctx, inst); - struct tgsi_src_register *src0 = &inst->Src[0].Register; - struct tgsi_src_register *src1 = &inst->Src[1].Register; - struct ir3_instruction *instr; - unsigned src0_flags = 0, src1_flags = 0; - - switch (t->tgsi_opc) { - case TGSI_OPCODE_ABS: - case TGSI_OPCODE_IABS: - src0_flags = IR3_REG_ABS; - break; - case TGSI_OPCODE_SUB: - case TGSI_OPCODE_INEG: - src1_flags = IR3_REG_NEGATE; - break; - } - - switch (t->opc) { - case OPC_ABSNEG_F: - case OPC_ABSNEG_S: - case OPC_CLZ_B: - case OPC_CLZ_S: - case OPC_SIGN_F: - case OPC_FLOOR_F: - case OPC_CEIL_F: - case OPC_RNDNE_F: - case OPC_RNDAZ_F: - case OPC_TRUNC_F: - case OPC_NOT_B: - case OPC_BFREV_B: - case OPC_SETRM: - case OPC_CBITS_B: - /* these only have one src reg */ - instr = instr_create(ctx, 2, t->opc); - vectorize(ctx, instr, dst, 1, src0, src0_flags); - break; - default: - if (is_const(src0) && is_const(src1)) - src0 = get_unconst(ctx, src0); - - instr = instr_create(ctx, 2, t->opc); - vectorize(ctx, instr, dst, 2, src0, src0_flags, - src1, src1_flags); - break; - } - - put_dst(ctx, inst, dst); -} - -static void -instr_cat3(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct tgsi_dst_register *dst = get_dst(ctx, inst); - struct tgsi_src_register *src0 = &inst->Src[0].Register; - struct tgsi_src_register *src1 = &inst->Src[1].Register; - struct ir3_instruction *instr; - - /* in particular, can't handle const for src1 for cat3.. - * for mad, we can swap first two src's if needed: - */ - if (is_rel_or_const(src1)) { - if (is_mad(t->opc) && !is_rel_or_const(src0)) { - struct tgsi_src_register *tmp; - tmp = src0; - src0 = src1; - src1 = tmp; - } else { - src1 = get_unconst(ctx, src1); - } - } - - instr = instr_create(ctx, 3, t->opc); - vectorize(ctx, instr, dst, 3, src0, 0, src1, 0, - &inst->Src[2].Register, 0); - put_dst(ctx, inst, dst); -} - -static void -instr_cat4(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct tgsi_dst_register *dst = get_dst(ctx, inst); - struct tgsi_src_register *src = &inst->Src[0].Register; - struct ir3_instruction *instr; - unsigned i; - - /* seems like blob compiler avoids const as src.. 
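/* instr_cat3() above works around src1 of a cat3 instruction not accepting
 * a const/relative operand: for mad, whose first two sources commute, it
 * swaps src0 and src1; otherwise it falls back to get_unconst() (a mov into
 * a temp).  A standalone sketch of that operand-ordering decision; the
 * pick_mad_srcs() helper and its flags are hypothetical:
 */
static void pick_mad_srcs(int src1_needs_gpr, int src0_needs_gpr,
		int *swap, int *need_mov)
{
	*swap = 0;
	*need_mov = 0;
	if (src1_needs_gpr) {
		if (!src0_needs_gpr)
			*swap = 1;	/* mad a,b,c == mad b,a,c */
		else
			*need_mov = 1;	/* both problematic: mov src1 to a temp */
	}
}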
*/ - if (is_const(src)) - src = get_unconst(ctx, src); - - /* we need to replicate into each component: */ - for (i = 0; i < 4; i++) { - if (dst->WriteMask & (1 << i)) { - instr = instr_create(ctx, 4, t->opc); - add_dst_reg(ctx, instr, dst, i); - add_src_reg(ctx, instr, src, src->SwizzleX); - } - } - - put_dst(ctx, inst, dst); -} - -static const struct instr_translater translaters[TGSI_OPCODE_LAST] = { -#define INSTR(n, f, ...) \ - [TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ } - - INSTR(MOV, instr_cat1), - INSTR(RCP, instr_cat4, .opc = OPC_RCP), - INSTR(RSQ, instr_cat4, .opc = OPC_RSQ), - INSTR(SQRT, instr_cat4, .opc = OPC_SQRT), - INSTR(MUL, instr_cat2, .opc = OPC_MUL_F), - INSTR(ADD, instr_cat2, .opc = OPC_ADD_F), - INSTR(SUB, instr_cat2, .opc = OPC_ADD_F), - INSTR(MIN, instr_cat2, .opc = OPC_MIN_F), - INSTR(MAX, instr_cat2, .opc = OPC_MAX_F), - INSTR(UADD, instr_cat2, .opc = OPC_ADD_U), - INSTR(IMIN, instr_cat2, .opc = OPC_MIN_S), - INSTR(UMIN, instr_cat2, .opc = OPC_MIN_U), - INSTR(IMAX, instr_cat2, .opc = OPC_MAX_S), - INSTR(UMAX, instr_cat2, .opc = OPC_MAX_U), - INSTR(AND, instr_cat2, .opc = OPC_AND_B), - INSTR(OR, instr_cat2, .opc = OPC_OR_B), - INSTR(NOT, instr_cat2, .opc = OPC_NOT_B), - INSTR(XOR, instr_cat2, .opc = OPC_XOR_B), - INSTR(UMUL, instr_cat2, .opc = OPC_MUL_U), - INSTR(SHL, instr_cat2, .opc = OPC_SHL_B), - INSTR(USHR, instr_cat2, .opc = OPC_SHR_B), - INSTR(ISHR, instr_cat2, .opc = OPC_ASHR_B), - INSTR(IABS, instr_cat2, .opc = OPC_ABSNEG_S), - INSTR(INEG, instr_cat2, .opc = OPC_ABSNEG_S), - INSTR(AND, instr_cat2, .opc = OPC_AND_B), - INSTR(MAD, instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16), - INSTR(TRUNC, instr_cat2, .opc = OPC_TRUNC_F), - INSTR(CLAMP, trans_clamp), - INSTR(FLR, instr_cat2, .opc = OPC_FLOOR_F), - INSTR(ROUND, instr_cat2, .opc = OPC_RNDNE_F), - INSTR(SSG, instr_cat2, .opc = OPC_SIGN_F), - INSTR(CEIL, instr_cat2, .opc = OPC_CEIL_F), - INSTR(ARL, trans_arl), - INSTR(EX2, instr_cat4, .opc = OPC_EXP2), - INSTR(LG2, instr_cat4, .opc = OPC_LOG2), - INSTR(ABS, instr_cat2, .opc = OPC_ABSNEG_F), - INSTR(COS, instr_cat4, .opc = OPC_COS), - INSTR(SIN, instr_cat4, .opc = OPC_SIN), - INSTR(TEX, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TEX), - INSTR(TXP, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TXP), - INSTR(SGT, trans_cmp), - INSTR(SLT, trans_cmp), - INSTR(FSLT, trans_cmp), - INSTR(SGE, trans_cmp), - INSTR(FSGE, trans_cmp), - INSTR(SLE, trans_cmp), - INSTR(SNE, trans_cmp), - INSTR(FSNE, trans_cmp), - INSTR(SEQ, trans_cmp), - INSTR(FSEQ, trans_cmp), - INSTR(CMP, trans_cmp), - INSTR(USNE, trans_icmp, .opc = OPC_CMPS_U), - INSTR(USEQ, trans_icmp, .opc = OPC_CMPS_U), - INSTR(ISGE, trans_icmp, .opc = OPC_CMPS_S), - INSTR(USGE, trans_icmp, .opc = OPC_CMPS_U), - INSTR(ISLT, trans_icmp, .opc = OPC_CMPS_S), - INSTR(USLT, trans_icmp, .opc = OPC_CMPS_U), - INSTR(UCMP, trans_icmp, .opc = OPC_CMPS_U), - INSTR(IF, trans_if), - INSTR(UIF, trans_if), - INSTR(ELSE, trans_else), - INSTR(ENDIF, trans_endif), - INSTR(END, instr_cat0, .opc = OPC_END), - INSTR(KILL, trans_kill, .opc = OPC_KILL), - INSTR(KILL_IF, trans_killif, .opc = OPC_KILL), - INSTR(I2F, trans_cov), - INSTR(U2F, trans_cov), - INSTR(F2I, trans_cov), - INSTR(F2U, trans_cov), -}; - -static fd3_semantic -decl_semantic(const struct tgsi_declaration_semantic *sem) -{ - return fd3_semantic_name(sem->Name, sem->Index); -} - -static struct ir3_instruction * -decl_in_frag_bary(struct fd3_compile_context *ctx, unsigned regid, - unsigned j, unsigned inloc) -{ - struct 
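/* instr_cat4() above replicates the scalar cat4 op (rcp, rsq, sin, cos,
 * exp2, ...) once per enabled destination component, since each instance
 * only produces a single scalar result.  A standalone sketch of walking the
 * TGSI writemask the same way; emit_scalar() is a hypothetical callback:
 */
static void replicate_per_component(unsigned writemask,
		void (*emit_scalar)(unsigned comp))
{
	unsigned i;
	for (i = 0; i < 4; i++)
		if (writemask & (1u << i))
			emit_scalar(i);
}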
ir3_instruction *instr; - struct ir3_register *src; - - /* bary.f dst, #inloc, r0.x */ - instr = instr_create(ctx, 2, OPC_BARY_F); - ir3_reg_create(instr, regid, 0); /* dummy dst */ - ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = inloc; - src = ir3_reg_create(instr, 0, IR3_REG_SSA); - src->wrmask = 0x3; - src->instr = ctx->frag_pos; - - return instr; -} - -/* TGSI_SEMANTIC_POSITION - * """""""""""""""""""""" - * - * For fragment shaders, TGSI_SEMANTIC_POSITION is used to indicate that - * fragment shader input contains the fragment's window position. The X - * component starts at zero and always increases from left to right. - * The Y component starts at zero and always increases but Y=0 may either - * indicate the top of the window or the bottom depending on the fragment - * coordinate origin convention (see TGSI_PROPERTY_FS_COORD_ORIGIN). - * The Z coordinate ranges from 0 to 1 to represent depth from the front - * to the back of the Z buffer. The W component contains the reciprocol - * of the interpolated vertex position W component. - */ -static struct ir3_instruction * -decl_in_frag_coord(struct fd3_compile_context *ctx, unsigned regid, - unsigned j) -{ - struct ir3_instruction *instr, *src; - - compile_assert(ctx, !ctx->frag_coord[j]); - - ctx->frag_coord[j] = create_input(ctx->block, NULL, 0); - - - switch (j) { - case 0: /* .x */ - case 1: /* .y */ - /* for frag_coord, we get unsigned values.. we need - * to subtract (integer) 8 and divide by 16 (right- - * shift by 4) then convert to float: - */ - - /* add.s tmp, src, -8 */ - instr = instr_create(ctx, 2, OPC_ADD_S); - ir3_reg_create(instr, regid, 0); /* dummy dst */ - ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_coord[j]; - ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = -8; - src = instr; - - /* shr.b tmp, tmp, 4 */ - instr = instr_create(ctx, 2, OPC_SHR_B); - ir3_reg_create(instr, regid, 0); /* dummy dst */ - ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; - ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 4; - src = instr; - - /* mov.u32f32 dst, tmp */ - instr = instr_create(ctx, 1, 0); - instr->cat1.src_type = TYPE_U32; - instr->cat1.dst_type = TYPE_F32; - ir3_reg_create(instr, regid, 0); /* dummy dst */ - ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; - - break; - case 2: /* .z */ - case 3: /* .w */ - /* seems that we can use these as-is: */ - instr = ctx->frag_coord[j]; - break; - default: - compile_error(ctx, "invalid channel\n"); - instr = create_immed(ctx, 0.0); - break; - } - - return instr; -} - -/* TGSI_SEMANTIC_FACE - * """""""""""""""""" - * - * This label applies to fragment shader inputs only and indicates that - * the register contains front/back-face information of the form (F, 0, - * 0, 1). The first component will be positive when the fragment belongs - * to a front-facing polygon, and negative when the fragment belongs to a - * back-facing polygon. - */ -static struct ir3_instruction * -decl_in_frag_face(struct fd3_compile_context *ctx, unsigned regid, - unsigned j) -{ - struct ir3_instruction *instr, *src; - - switch (j) { - case 0: /* .x */ - compile_assert(ctx, !ctx->frag_face); - - ctx->frag_face = create_input(ctx->block, NULL, 0); - - /* for faceness, we always get -1 or 0 (int).. but TGSI expects - * positive vs negative float.. 
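/* decl_in_frag_coord() above lowers the raw window-position input: the x/y
 * channels arrive as fixed-point unsigned values, so the compiler subtracts
 * 8, shifts right by 4 (divide by 16) and only then converts to float.  A
 * standalone sketch of that arithmetic; raw_to_window_coord() is a
 * hypothetical reference helper:
 */
static float raw_to_window_coord(unsigned raw)
{
	/* add.s tmp, src, -8 ; shr.b tmp, tmp, 4 ; mov.u32f32 dst, tmp */
	return (float)((raw - 8u) >> 4);
}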
and piglit further seems to - * expect -1.0 or 1.0: - * - * mul.s tmp, hr0.x, 2 - * add.s tmp, tmp, 1 - * mov.s16f32, dst, tmp - * - */ - - instr = instr_create(ctx, 2, OPC_MUL_S); - ir3_reg_create(instr, regid, 0); /* dummy dst */ - ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_face; - ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2; - src = instr; - - instr = instr_create(ctx, 2, OPC_ADD_S); - ir3_reg_create(instr, regid, 0); /* dummy dst */ - ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; - ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1; - src = instr; - - instr = instr_create(ctx, 1, 0); /* mov */ - instr->cat1.src_type = TYPE_S32; - instr->cat1.dst_type = TYPE_F32; - ir3_reg_create(instr, regid, 0); /* dummy dst */ - ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; - - break; - case 1: /* .y */ - case 2: /* .z */ - instr = create_immed(ctx, 0.0); - break; - case 3: /* .w */ - instr = create_immed(ctx, 1.0); - break; - default: - compile_error(ctx, "invalid channel\n"); - instr = create_immed(ctx, 0.0); - break; - } - - return instr; -} - -static void -decl_in(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl) -{ - struct fd3_shader_variant *so = ctx->so; - unsigned name = decl->Semantic.Name; - unsigned i; - - /* I don't think we should get frag shader input without - * semantic info? Otherwise how do inputs get linked to - * vert outputs? - */ - compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) || - decl->Declaration.Semantic); - - for (i = decl->Range.First; i <= decl->Range.Last; i++) { - unsigned n = so->inputs_count++; - unsigned r = regid(i, 0); - unsigned ncomp, j; - - /* we'll figure out the actual components used after scheduling */ - ncomp = 4; - - DBG("decl in -> r%d", i); - - compile_assert(ctx, n < ARRAY_SIZE(so->inputs)); - - so->inputs[n].semantic = decl_semantic(&decl->Semantic); - so->inputs[n].compmask = (1 << ncomp) - 1; - so->inputs[n].regid = r; - so->inputs[n].inloc = ctx->next_inloc; - - for (j = 0; j < ncomp; j++) { - struct ir3_instruction *instr = NULL; - - if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { - /* for fragment shaders, POSITION and FACE are handled - * specially, not using normal varying / bary.f - */ - if (name == TGSI_SEMANTIC_POSITION) { - so->inputs[n].bary = false; - so->frag_coord = true; - instr = decl_in_frag_coord(ctx, r + j, j); - } else if (name == TGSI_SEMANTIC_FACE) { - so->inputs[n].bary = false; - so->frag_face = true; - instr = decl_in_frag_face(ctx, r + j, j); - } else { - so->inputs[n].bary = true; - instr = decl_in_frag_bary(ctx, r + j, j, - so->inputs[n].inloc + j - 8); - } - } else { - instr = create_input(ctx->block, NULL, (i * 4) + j); - } - - ctx->block->inputs[(i * 4) + j] = instr; - } - - if (so->inputs[n].bary || (ctx->type == TGSI_PROCESSOR_VERTEX)) { - ctx->next_inloc += ncomp; - so->total_in += ncomp; - } - } -} - -static void -decl_out(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl) -{ - struct fd3_shader_variant *so = ctx->so; - unsigned comp = 0; - unsigned name = decl->Semantic.Name; - unsigned i; - - compile_assert(ctx, decl->Declaration.Semantic); - - DBG("decl out[%d] -> r%d", name, decl->Range.First); - - if (ctx->type == TGSI_PROCESSOR_VERTEX) { - switch (name) { - case TGSI_SEMANTIC_POSITION: - so->writes_pos = true; - break; - case TGSI_SEMANTIC_PSIZE: - so->writes_psize = true; - break; - case TGSI_SEMANTIC_COLOR: - case TGSI_SEMANTIC_BCOLOR: - case TGSI_SEMANTIC_GENERIC: - case TGSI_SEMANTIC_FOG: - case TGSI_SEMANTIC_TEXCOORD: - 
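/* decl_in_frag_face() above turns the integer faceness value (-1 or 0)
 * into the -1.0 / +1.0 float that TGSI_SEMANTIC_FACE consumers expect,
 * via "mul.s tmp, src, 2; add.s tmp, tmp, 1" and a final s32->f32 mov.
 * A standalone sketch of the mapping; face_to_float() is a hypothetical
 * reference helper:
 */
static float face_to_float(int hw_face)
{
	/* -1 -> -1.0, 0 -> +1.0 */
	return (float)(hw_face * 2 + 1);
}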
break; - default: - compile_error(ctx, "unknown VS semantic name: %s\n", - tgsi_semantic_names[name]); - } - } else { - switch (name) { - case TGSI_SEMANTIC_POSITION: - comp = 2; /* tgsi will write to .z component */ - so->writes_pos = true; - break; - case TGSI_SEMANTIC_COLOR: - break; - default: - compile_error(ctx, "unknown FS semantic name: %s\n", - tgsi_semantic_names[name]); - } - } - - for (i = decl->Range.First; i <= decl->Range.Last; i++) { - unsigned n = so->outputs_count++; - unsigned ncomp, j; - - ncomp = 4; - - compile_assert(ctx, n < ARRAY_SIZE(so->outputs)); - - so->outputs[n].semantic = decl_semantic(&decl->Semantic); - so->outputs[n].regid = regid(i, comp); - - /* avoid undefined outputs, stick a dummy mov from imm{0.0}, - * which if the output is actually assigned will be over- - * written - */ - for (j = 0; j < ncomp; j++) - ctx->block->outputs[(i * 4) + j] = create_immed(ctx, 0.0); - } -} - -/* from TGSI perspective, we actually have inputs. But most of the "inputs" - * for a fragment shader are just bary.f instructions. The *actual* inputs - * from the hw perspective are the frag_pos and optionally frag_coord and - * frag_face. - */ -static void -fixup_frag_inputs(struct fd3_compile_context *ctx) -{ - struct fd3_shader_variant *so = ctx->so; - struct ir3_block *block = ctx->block; - struct ir3_instruction **inputs; - struct ir3_instruction *instr; - int n, regid = 0; - - block->ninputs = 0; - - n = 4; /* always have frag_pos */ - n += COND(so->frag_face, 4); - n += COND(so->frag_coord, 4); - - inputs = ir3_alloc(ctx->ir, n * (sizeof(struct ir3_instruction *))); - - if (so->frag_face) { - /* this ultimately gets assigned to hr0.x so doesn't conflict - * with frag_coord/frag_pos.. - */ - inputs[block->ninputs++] = ctx->frag_face; - ctx->frag_face->regs[0]->num = 0; - - /* remaining channels not used, but let's avoid confusing - * other parts that expect inputs to come in groups of vec4 - */ - inputs[block->ninputs++] = NULL; - inputs[block->ninputs++] = NULL; - inputs[block->ninputs++] = NULL; - } - - /* since we don't know where to set the regid for frag_coord, - * we have to use r0.x for it. 
But we don't want to *always* - * use r1.x for frag_pos as that could increase the register - * footprint on simple shaders: - */ - if (so->frag_coord) { - ctx->frag_coord[0]->regs[0]->num = regid++; - ctx->frag_coord[1]->regs[0]->num = regid++; - ctx->frag_coord[2]->regs[0]->num = regid++; - ctx->frag_coord[3]->regs[0]->num = regid++; - - inputs[block->ninputs++] = ctx->frag_coord[0]; - inputs[block->ninputs++] = ctx->frag_coord[1]; - inputs[block->ninputs++] = ctx->frag_coord[2]; - inputs[block->ninputs++] = ctx->frag_coord[3]; - } - - /* we always have frag_pos: */ - so->pos_regid = regid; - - /* r0.x */ - instr = create_input(block, NULL, block->ninputs); - instr->regs[0]->num = regid++; - inputs[block->ninputs++] = instr; - ctx->frag_pos->regs[1]->instr = instr; - - /* r0.y */ - instr = create_input(block, NULL, block->ninputs); - instr->regs[0]->num = regid++; - inputs[block->ninputs++] = instr; - ctx->frag_pos->regs[2]->instr = instr; - - block->inputs = inputs; -} - -static void -compile_instructions(struct fd3_compile_context *ctx) -{ - push_block(ctx); - - /* for fragment shader, we have a single input register (usually - * r0.xy) which is used as the base for bary.f varying fetch instrs: - */ - if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { - struct ir3_instruction *instr; - instr = ir3_instr_create(ctx->block, -1, OPC_META_FI); - ir3_reg_create(instr, 0, 0); - ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.x */ - ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.y */ - ctx->frag_pos = instr; - } - - while (!tgsi_parse_end_of_tokens(&ctx->parser)) { - tgsi_parse_token(&ctx->parser); - - switch (ctx->parser.FullToken.Token.Type) { - case TGSI_TOKEN_TYPE_DECLARATION: { - struct tgsi_full_declaration *decl = - &ctx->parser.FullToken.FullDeclaration; - if (decl->Declaration.File == TGSI_FILE_OUTPUT) { - decl_out(ctx, decl); - } else if (decl->Declaration.File == TGSI_FILE_INPUT) { - decl_in(ctx, decl); - } - break; - } - case TGSI_TOKEN_TYPE_IMMEDIATE: { - /* TODO: if we know the immediate is small enough, and only - * used with instructions that can embed an immediate, we - * can skip this: - */ - struct tgsi_full_immediate *imm = - &ctx->parser.FullToken.FullImmediate; - unsigned n = ctx->so->immediates_count++; - compile_assert(ctx, n < ARRAY_SIZE(ctx->so->immediates)); - memcpy(ctx->so->immediates[n].val, imm->u, 16); - break; - } - case TGSI_TOKEN_TYPE_INSTRUCTION: { - struct tgsi_full_instruction *inst = - &ctx->parser.FullToken.FullInstruction; - unsigned opc = inst->Instruction.Opcode; - const struct instr_translater *t = &translaters[opc]; - - if (t->fxn) { - t->fxn(t, ctx, inst); - ctx->num_internal_temps = 0; - } else { - compile_error(ctx, "unknown TGSI opc: %s\n", - tgsi_get_opcode_name(opc)); - } - - switch (inst->Instruction.Saturate) { - case TGSI_SAT_ZERO_ONE: - create_clamp_imm(ctx, &inst->Dst[0].Register, - fui(0.0), fui(1.0)); - break; - case TGSI_SAT_MINUS_PLUS_ONE: - create_clamp_imm(ctx, &inst->Dst[0].Register, - fui(-1.0), fui(1.0)); - break; - } - - instr_finish(ctx); - - break; - } - default: - break; - } - } -} - -static void -compile_dump(struct fd3_compile_context *ctx) -{ - const char *name = (ctx->so->type == SHADER_VERTEX) ? 
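/* The saturate handling in compile_instructions() above appends a
 * max.f/min.f pair (via create_clamp_imm) after the translated instruction:
 * [0.0, 1.0] for TGSI_SAT_ZERO_ONE and [-1.0, 1.0] for
 * TGSI_SAT_MINUS_PLUS_ONE.  A standalone sketch of the resulting value;
 * saturate() is a hypothetical reference helper:
 */
static float saturate(float v, float minval, float maxval)
{
	/* max.f v, v, minval ; min.f v, v, maxval */
	if (v < minval)
		v = minval;
	if (v > maxval)
		v = maxval;
	return v;
}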
"vert" : "frag"; - static unsigned n = 0; - char fname[16]; - FILE *f; - snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++); - f = fopen(fname, "w"); - if (!f) - return; - ir3_block_depth(ctx->block); - ir3_dump(ctx->ir, name, ctx->block, f); - fclose(f); -} - -int -fd3_compile_shader(struct fd3_shader_variant *so, - const struct tgsi_token *tokens, struct fd3_shader_key key) -{ - struct fd3_compile_context ctx; - struct ir3_block *block; - struct ir3_instruction **inputs; - unsigned i, j, actual_in; - int ret = 0; - - assert(!so->ir); - - so->ir = ir3_create(); - - assert(so->ir); - - if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) { - ret = -1; - goto out; - } - - compile_instructions(&ctx); - - block = ctx.block; - - /* keep track of the inputs from TGSI perspective.. */ - inputs = block->inputs; - - /* but fixup actual inputs for frag shader: */ - if (ctx.type == TGSI_PROCESSOR_FRAGMENT) - fixup_frag_inputs(&ctx); - - /* at this point, for binning pass, throw away unneeded outputs: */ - if (key.binning_pass) { - for (i = 0, j = 0; i < so->outputs_count; i++) { - unsigned name = sem2name(so->outputs[i].semantic); - unsigned idx = sem2name(so->outputs[i].semantic); - - /* throw away everything but first position/psize */ - if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) || - (name == TGSI_SEMANTIC_PSIZE))) { - if (i != j) { - so->outputs[j] = so->outputs[i]; - block->outputs[(j*4)+0] = block->outputs[(i*4)+0]; - block->outputs[(j*4)+1] = block->outputs[(i*4)+1]; - block->outputs[(j*4)+2] = block->outputs[(i*4)+2]; - block->outputs[(j*4)+3] = block->outputs[(i*4)+3]; - } - j++; - } - } - so->outputs_count = j; - block->noutputs = j * 4; - } - - /* at this point, we want the kill's in the outputs array too, - * so that they get scheduled (since they have no dst).. we've - * already ensured that the array is big enough in push_block(): - */ - if (ctx.type == TGSI_PROCESSOR_FRAGMENT) { - for (i = 0; i < ctx.kill_count; i++) - block->outputs[block->noutputs++] = ctx.kill[i]; - } - - if (fd_mesa_debug & FD_DBG_OPTDUMP) - compile_dump(&ctx); - - ret = ir3_block_flatten(block); - if (ret < 0) - goto out; - if ((ret > 0) && (fd_mesa_debug & FD_DBG_OPTDUMP)) - compile_dump(&ctx); - - ir3_block_cp(block); - - if (fd_mesa_debug & FD_DBG_OPTDUMP) - compile_dump(&ctx); - - ir3_block_depth(block); - - if (fd_mesa_debug & FD_DBG_OPTMSGS) { - printf("AFTER DEPTH:\n"); - ir3_dump_instr_list(block->head); - } - - ir3_block_sched(block); - - if (fd_mesa_debug & FD_DBG_OPTMSGS) { - printf("AFTER SCHED:\n"); - ir3_dump_instr_list(block->head); - } - - ret = ir3_block_ra(block, so->type, key.half_precision, - so->frag_coord, so->frag_face, &so->has_samp); - if (ret) - goto out; - - if (fd_mesa_debug & FD_DBG_OPTMSGS) { - printf("AFTER RA:\n"); - ir3_dump_instr_list(block->head); - } - - /* fixup input/outputs: */ - for (i = 0; i < so->outputs_count; i++) { - so->outputs[i].regid = block->outputs[i*4]->regs[0]->num; - /* preserve hack for depth output.. 
tgsi writes depth to .z, - * but what we give the hw is the scalar register: - */ - if ((ctx.type == TGSI_PROCESSOR_FRAGMENT) && - (sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION)) - so->outputs[i].regid += 2; - } - /* Note that some or all channels of an input may be unused: */ - actual_in = 0; - for (i = 0; i < so->inputs_count; i++) { - unsigned j, regid = ~0, compmask = 0; - so->inputs[i].ncomp = 0; - for (j = 0; j < 4; j++) { - struct ir3_instruction *in = inputs[(i*4) + j]; - if (in) { - compmask |= (1 << j); - regid = in->regs[0]->num - j; - actual_in++; - so->inputs[i].ncomp++; - } - } - so->inputs[i].regid = regid; - so->inputs[i].compmask = compmask; - } - - /* fragment shader always gets full vec4's even if it doesn't - * fetch all components, but vertex shader we need to update - * with the actual number of components fetch, otherwise thing - * will hang due to mismaptch between VFD_DECODE's and - * TOTALATTRTOVS - */ - if (so->type == SHADER_VERTEX) - so->total_in = actual_in; - -out: - if (ret) { - ir3_destroy(so->ir); - so->ir = NULL; - } - compile_free(&ctx); - - return ret; -} diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.h b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.h deleted file mode 100644 index a53bb3ee9a5..00000000000 --- a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.h +++ /dev/null @@ -1,43 +0,0 @@ -/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ - -/* - * Copyright (C) 2013 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
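/* The input fixup loop above derives, for each TGSI input, which components
 * survived register allocation: a compmask bit per live component, the base
 * regid (component register minus its channel), and the per-input component
 * count.  A standalone sketch with the allocated component registers given
 * as an array where -1 means unused; scan_input() and struct input_info are
 * hypothetical:
 */
struct input_info {
	int regid;
	unsigned compmask;
	unsigned ncomp;
};

static void scan_input(const int comp_reg[4], struct input_info *info)
{
	unsigned j;
	info->regid = -1;
	info->compmask = 0;
	info->ncomp = 0;
	for (j = 0; j < 4; j++) {
		if (comp_reg[j] >= 0) {
			info->compmask |= (1u << j);
			info->regid = comp_reg[j] - (int)j;
			info->ncomp++;
		}
	}
}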
- * - * Authors: - * Rob Clark - */ - -#ifndef FD3_COMPILER_H_ -#define FD3_COMPILER_H_ - -#include "fd3_program.h" -#include "fd3_util.h" - - -int fd3_compile_shader(struct fd3_shader_variant *so, - const struct tgsi_token *tokens, - struct fd3_shader_key key); -int fd3_compile_shader_old(struct fd3_shader_variant *so, - const struct tgsi_token *tokens, - struct fd3_shader_key key); - -#endif /* FD3_COMPILER_H_ */ diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler_old.c b/src/gallium/drivers/freedreno/a3xx/fd3_compiler_old.c deleted file mode 100644 index 66f724b35c0..00000000000 --- a/src/gallium/drivers/freedreno/a3xx/fd3_compiler_old.c +++ /dev/null @@ -1,1524 +0,0 @@ -/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ - -/* - * Copyright (C) 2013 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark - */ - -#include - -#include "pipe/p_state.h" -#include "util/u_string.h" -#include "util/u_memory.h" -#include "util/u_inlines.h" -#include "tgsi/tgsi_parse.h" -#include "tgsi/tgsi_ureg.h" -#include "tgsi/tgsi_info.h" -#include "tgsi/tgsi_strings.h" -#include "tgsi/tgsi_dump.h" -#include "tgsi/tgsi_scan.h" - -#include "freedreno_lowering.h" - -#include "fd3_compiler.h" -#include "fd3_program.h" -#include "fd3_util.h" - -#include "instr-a3xx.h" -#include "ir3.h" - - -struct fd3_compile_context { - const struct tgsi_token *tokens; - bool free_tokens; - struct ir3 *ir; - struct ir3_block *block; - struct fd3_shader_variant *so; - - struct tgsi_parse_context parser; - unsigned type; - - struct tgsi_shader_info info; - - /* last input dst (for setting (ei) flag): */ - struct ir3_register *last_input; - - /* last instruction with relative addressing: */ - struct ir3_instruction *last_rel; - - /* for calculating input/output positions/linkages: */ - unsigned next_inloc; - - unsigned num_internal_temps; - struct tgsi_src_register internal_temps[6]; - - /* track registers which need to synchronize w/ "complex alu" cat3 - * instruction pipeline: - */ - regmask_t needs_ss; - - /* track registers which need to synchronize with texture fetch - * pipeline: - */ - regmask_t needs_sy; - - /* inputs start at r0, temporaries start after last input, and - * outputs start after last temporary. 
- * - * We could be more clever, because this is not a hw restriction, - * but probably best just to implement an optimizing pass to - * reduce the # of registers used and get rid of redundant mov's - * (to output register). - */ - unsigned base_reg[TGSI_FILE_COUNT]; - - /* idx/slot for last compiler generated immediate */ - unsigned immediate_idx; - - /* stack of branch instructions that start (potentially nested) - * branch instructions, so that we can fix up the branch targets - * so that we can fix up the branch target on the corresponding - * END instruction - */ - struct ir3_instruction *branch[16]; - unsigned int branch_count; - - /* used when dst is same as one of the src, to avoid overwriting a - * src element before the remaining scalar instructions that make - * up the vector operation - */ - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register *tmp_src; -}; - - -static void vectorize(struct fd3_compile_context *ctx, - struct ir3_instruction *instr, struct tgsi_dst_register *dst, - int nsrcs, ...); -static void create_mov(struct fd3_compile_context *ctx, - struct tgsi_dst_register *dst, struct tgsi_src_register *src); - -static unsigned -compile_init(struct fd3_compile_context *ctx, struct fd3_shader_variant *so, - const struct tgsi_token *tokens) -{ - unsigned ret, base = 0; - struct tgsi_shader_info *info = &ctx->info; - const struct fd_lowering_config lconfig = { - .color_two_side = so->key.color_two_side, - .lower_DST = true, - .lower_XPD = true, - .lower_SCS = true, - .lower_LRP = true, - .lower_FRC = true, - .lower_POW = true, - .lower_LIT = true, - .lower_EXP = true, - .lower_LOG = true, - .lower_DP4 = true, - .lower_DP3 = true, - .lower_DPH = true, - .lower_DP2 = true, - .lower_DP2A = true, - }; - - ctx->tokens = fd_transform_lowering(&lconfig, tokens, &ctx->info); - ctx->free_tokens = !!ctx->tokens; - if (!ctx->tokens) { - /* no lowering */ - ctx->tokens = tokens; - } - ctx->ir = so->ir; - ctx->block = ir3_block_create(ctx->ir, 0, 0, 0); - ctx->so = so; - ctx->last_input = NULL; - ctx->last_rel = NULL; - ctx->next_inloc = 8; - ctx->num_internal_temps = 0; - ctx->branch_count = 0; - - regmask_init(&ctx->needs_ss); - regmask_init(&ctx->needs_sy); - memset(ctx->base_reg, 0, sizeof(ctx->base_reg)); - - /* Immediates go after constants: */ - ctx->base_reg[TGSI_FILE_CONSTANT] = 0; - ctx->base_reg[TGSI_FILE_IMMEDIATE] = - info->file_max[TGSI_FILE_CONSTANT] + 1; - - /* if full precision and fragment shader, don't clobber - * r0.x w/ bary fetch: - */ - if ((so->type == SHADER_FRAGMENT) && !so->key.half_precision) - base = 1; - - /* Temporaries after outputs after inputs: */ - ctx->base_reg[TGSI_FILE_INPUT] = base; - ctx->base_reg[TGSI_FILE_OUTPUT] = base + - info->file_max[TGSI_FILE_INPUT] + 1; - ctx->base_reg[TGSI_FILE_TEMPORARY] = base + - info->file_max[TGSI_FILE_INPUT] + 1 + - info->file_max[TGSI_FILE_OUTPUT] + 1; - - so->first_immediate = ctx->base_reg[TGSI_FILE_IMMEDIATE]; - ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1); - - ret = tgsi_parse_init(&ctx->parser, ctx->tokens); - if (ret != TGSI_PARSE_OK) - return ret; - - ctx->type = ctx->parser.FullHeader.Processor.Processor; - - return ret; -} - -static void -compile_error(struct fd3_compile_context *ctx, const char *format, ...) 
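/* compile_init() above lays the TGSI register files out in one flat GPR
 * space: inputs start at "base" (1 for a full-precision fragment shader so
 * the bary.f base register r0.x isn't clobbered, 0 otherwise), outputs
 * follow the last input, and temporaries follow the last output.  A
 * standalone sketch of that layout computation; layout_files() and
 * struct reg_layout are hypothetical:
 */
struct reg_layout {
	unsigned input, output, temporary;
};

static void layout_files(unsigned base, unsigned ninputs, unsigned noutputs,
		struct reg_layout *l)
{
	l->input = base;
	l->output = base + ninputs;
	l->temporary = base + ninputs + noutputs;
}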
-{ - va_list ap; - va_start(ap, format); - _debug_vprintf(format, ap); - va_end(ap); - tgsi_dump(ctx->tokens, 0); - debug_assert(0); -} - -#define compile_assert(ctx, cond) do { \ - if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \ - } while (0) - -static void -compile_free(struct fd3_compile_context *ctx) -{ - if (ctx->free_tokens) - free((void *)ctx->tokens); - tgsi_parse_free(&ctx->parser); -} - -struct instr_translater { - void (*fxn)(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst); - unsigned tgsi_opc; - opc_t opc; - opc_t hopc; /* opc to use for half_precision mode, if different */ - unsigned arg; -}; - -static void -handle_last_rel(struct fd3_compile_context *ctx) -{ - if (ctx->last_rel) { - ctx->last_rel->flags |= IR3_INSTR_UL; - ctx->last_rel = NULL; - } -} - -static struct ir3_instruction * -instr_create(struct fd3_compile_context *ctx, int category, opc_t opc) -{ - return ir3_instr_create(ctx->block, category, opc); -} - -static void -add_nop(struct fd3_compile_context *ctx, unsigned count) -{ - while (count-- > 0) - instr_create(ctx, 0, OPC_NOP); -} - -static unsigned -src_flags(struct fd3_compile_context *ctx, struct ir3_register *reg) -{ - unsigned flags = 0; - - if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED)) - return flags; - - if (regmask_get(&ctx->needs_ss, reg)) { - flags |= IR3_INSTR_SS; - regmask_init(&ctx->needs_ss); - } - - if (regmask_get(&ctx->needs_sy, reg)) { - flags |= IR3_INSTR_SY; - regmask_init(&ctx->needs_sy); - } - - return flags; -} - -static struct ir3_register * -add_dst_reg(struct fd3_compile_context *ctx, struct ir3_instruction *instr, - const struct tgsi_dst_register *dst, unsigned chan) -{ - unsigned flags = 0, num = 0; - struct ir3_register *reg; - - switch (dst->File) { - case TGSI_FILE_OUTPUT: - case TGSI_FILE_TEMPORARY: - num = dst->Index + ctx->base_reg[dst->File]; - break; - case TGSI_FILE_ADDRESS: - num = REG_A0; - break; - default: - compile_error(ctx, "unsupported dst register file: %s\n", - tgsi_file_name(dst->File)); - break; - } - - if (dst->Indirect) - flags |= IR3_REG_RELATIV; - if (ctx->so->key.half_precision) - flags |= IR3_REG_HALF; - - reg = ir3_reg_create(instr, regid(num, chan), flags); - - if (dst->Indirect) - ctx->last_rel = instr; - - return reg; -} - -static struct ir3_register * -add_src_reg(struct fd3_compile_context *ctx, struct ir3_instruction *instr, - const struct tgsi_src_register *src, unsigned chan) -{ - unsigned flags = 0, num = 0; - struct ir3_register *reg; - - /* TODO we need to use a mov to temp for const >= 64.. or maybe - * we could use relative addressing.. - */ - compile_assert(ctx, src->Index < 64); - - switch (src->File) { - case TGSI_FILE_IMMEDIATE: - /* TODO if possible, use actual immediate instead of const.. but - * TGSI has vec4 immediates, we can only embed scalar (of limited - * size, depending on instruction..) 
- */ - case TGSI_FILE_CONSTANT: - flags |= IR3_REG_CONST; - num = src->Index + ctx->base_reg[src->File]; - break; - case TGSI_FILE_OUTPUT: - /* NOTE: we should only end up w/ OUTPUT file for things like - * clamp()'ing saturated dst instructions - */ - case TGSI_FILE_INPUT: - case TGSI_FILE_TEMPORARY: - num = src->Index + ctx->base_reg[src->File]; - break; - default: - compile_error(ctx, "unsupported src register file: %s\n", - tgsi_file_name(src->File)); - break; - } - - if (src->Absolute) - flags |= IR3_REG_ABS; - if (src->Negate) - flags |= IR3_REG_NEGATE; - if (src->Indirect) - flags |= IR3_REG_RELATIV; - if (ctx->so->key.half_precision) - flags |= IR3_REG_HALF; - - reg = ir3_reg_create(instr, regid(num, chan), flags); - - if (src->Indirect) - ctx->last_rel = instr; - - instr->flags |= src_flags(ctx, reg); - - return reg; -} - -static void -src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst) -{ - src->File = dst->File; - src->Indirect = dst->Indirect; - src->Dimension = dst->Dimension; - src->Index = dst->Index; - src->Absolute = 0; - src->Negate = 0; - src->SwizzleX = TGSI_SWIZZLE_X; - src->SwizzleY = TGSI_SWIZZLE_Y; - src->SwizzleZ = TGSI_SWIZZLE_Z; - src->SwizzleW = TGSI_SWIZZLE_W; -} - -/* Get internal-temp src/dst to use for a sequence of instructions - * generated by a single TGSI op. - */ -static struct tgsi_src_register * -get_internal_temp(struct fd3_compile_context *ctx, - struct tgsi_dst_register *tmp_dst) -{ - struct tgsi_src_register *tmp_src; - int n; - - tmp_dst->File = TGSI_FILE_TEMPORARY; - tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW; - tmp_dst->Indirect = 0; - tmp_dst->Dimension = 0; - - /* assign next temporary: */ - n = ctx->num_internal_temps++; - compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps)); - tmp_src = &ctx->internal_temps[n]; - - tmp_dst->Index = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1; - - src_from_dst(tmp_src, tmp_dst); - - return tmp_src; -} - -/* Get internal half-precision temp src/dst to use for a sequence of - * instructions generated by a single TGSI op. - */ -static struct tgsi_src_register * -get_internal_temp_hr(struct fd3_compile_context *ctx, - struct tgsi_dst_register *tmp_dst) -{ - struct tgsi_src_register *tmp_src; - int n; - - if (ctx->so->key.half_precision) - return get_internal_temp(ctx, tmp_dst); - - tmp_dst->File = TGSI_FILE_TEMPORARY; - tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW; - tmp_dst->Indirect = 0; - tmp_dst->Dimension = 0; - - /* assign next temporary: */ - n = ctx->num_internal_temps++; - compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps)); - tmp_src = &ctx->internal_temps[n]; - - /* just use hr0 because no one else should be using half- - * precision regs: - */ - tmp_dst->Index = 0; - - src_from_dst(tmp_src, tmp_dst); - - return tmp_src; -} - -static inline bool -is_const(struct tgsi_src_register *src) -{ - return (src->File == TGSI_FILE_CONSTANT) || - (src->File == TGSI_FILE_IMMEDIATE); -} - -static inline bool -is_relative(struct tgsi_src_register *src) -{ - return src->Indirect; -} - -static inline bool -is_rel_or_const(struct tgsi_src_register *src) -{ - return is_relative(src) || is_const(src); -} - -static type_t -get_ftype(struct fd3_compile_context *ctx) -{ - return ctx->so->key.half_precision ? TYPE_F16 : TYPE_F32; -} - -static type_t -get_utype(struct fd3_compile_context *ctx) -{ - return ctx->so->key.half_precision ? 
TYPE_U16 : TYPE_U32; -} - -static unsigned -src_swiz(struct tgsi_src_register *src, int chan) -{ - switch (chan) { - case 0: return src->SwizzleX; - case 1: return src->SwizzleY; - case 2: return src->SwizzleZ; - case 3: return src->SwizzleW; - } - assert(0); - return 0; -} - -/* for instructions that cannot take a const register as src, if needed - * generate a move to temporary gpr: - */ -static struct tgsi_src_register * -get_unconst(struct fd3_compile_context *ctx, struct tgsi_src_register *src) -{ - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register *tmp_src; - - compile_assert(ctx, is_rel_or_const(src)); - - tmp_src = get_internal_temp(ctx, &tmp_dst); - - create_mov(ctx, &tmp_dst, src); - - return tmp_src; -} - -static void -get_immediate(struct fd3_compile_context *ctx, - struct tgsi_src_register *reg, uint32_t val) -{ - unsigned neg, swiz, idx, i; - /* actually maps 1:1 currently.. not sure if that is safe to rely on: */ - static const unsigned swiz2tgsi[] = { - TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W, - }; - - for (i = 0; i < ctx->immediate_idx; i++) { - swiz = i % 4; - idx = i / 4; - - if (ctx->so->immediates[idx].val[swiz] == val) { - neg = 0; - break; - } - - if (ctx->so->immediates[idx].val[swiz] == -val) { - neg = 1; - break; - } - } - - if (i == ctx->immediate_idx) { - /* need to generate a new immediate: */ - swiz = i % 4; - idx = i / 4; - neg = 0; - ctx->so->immediates[idx].val[swiz] = val; - ctx->so->immediates_count = idx + 1; - ctx->immediate_idx++; - } - - reg->File = TGSI_FILE_IMMEDIATE; - reg->Indirect = 0; - reg->Dimension = 0; - reg->Index = idx; - reg->Absolute = 0; - reg->Negate = neg; - reg->SwizzleX = swiz2tgsi[swiz]; - reg->SwizzleY = swiz2tgsi[swiz]; - reg->SwizzleZ = swiz2tgsi[swiz]; - reg->SwizzleW = swiz2tgsi[swiz]; -} - -static void -create_mov(struct fd3_compile_context *ctx, struct tgsi_dst_register *dst, - struct tgsi_src_register *src) -{ - type_t type_mov = get_ftype(ctx); - unsigned i; - - for (i = 0; i < 4; i++) { - /* move to destination: */ - if (dst->WriteMask & (1 << i)) { - struct ir3_instruction *instr; - - if (src->Absolute || src->Negate) { - /* can't have abs or neg on a mov instr, so use - * absneg.f instead to handle these cases: - */ - instr = instr_create(ctx, 2, OPC_ABSNEG_F); - } else { - instr = instr_create(ctx, 1, 0); - instr->cat1.src_type = type_mov; - instr->cat1.dst_type = type_mov; - } - - add_dst_reg(ctx, instr, dst, i); - add_src_reg(ctx, instr, src, src_swiz(src, i)); - } else { - add_nop(ctx, 1); - } - } -} - -static void -create_clamp(struct fd3_compile_context *ctx, - struct tgsi_dst_register *dst, struct tgsi_src_register *val, - struct tgsi_src_register *minval, struct tgsi_src_register *maxval) -{ - struct ir3_instruction *instr; - - instr = instr_create(ctx, 2, OPC_MAX_F); - vectorize(ctx, instr, dst, 2, val, 0, minval, 0); - - instr = instr_create(ctx, 2, OPC_MIN_F); - vectorize(ctx, instr, dst, 2, val, 0, maxval, 0); -} - -static void -create_clamp_imm(struct fd3_compile_context *ctx, - struct tgsi_dst_register *dst, - uint32_t minval, uint32_t maxval) -{ - struct tgsi_src_register minconst, maxconst; - struct tgsi_src_register src; - - src_from_dst(&src, dst); - - get_immediate(ctx, &minconst, minval); - get_immediate(ctx, &maxconst, maxval); - - create_clamp(ctx, dst, &src, &minconst, &maxconst); -} - -static struct tgsi_dst_register * -get_dst(struct fd3_compile_context *ctx, struct tgsi_full_instruction *inst) -{ - struct tgsi_dst_register *dst = &inst->Dst[0].Register; - 
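/* get_immediate() above packs compiler-generated scalar constants into the
 * shader's vec4 immediate slots, reusing an existing slot when it already
 * holds the value (the real helper also reuses a slot holding the negated
 * value by setting the src negate flag).  A standalone sketch of the
 * find-or-append part over a flat array of scalar slots;
 * find_or_add_immed() is a hypothetical reference helper:
 */
static unsigned find_or_add_immed(unsigned vals[], unsigned *count,
		unsigned val)
{
	unsigned i;
	for (i = 0; i < *count; i++)
		if (vals[i] == val)
			return i;	/* reuse an existing scalar slot */
	vals[*count] = val;		/* otherwise append a new one */
	return (*count)++;
}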
unsigned i; - for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { - struct tgsi_src_register *src = &inst->Src[i].Register; - if ((src->File == dst->File) && (src->Index == dst->Index)) { - if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) && - (src->SwizzleX == TGSI_SWIZZLE_X) && - (src->SwizzleY == TGSI_SWIZZLE_Y) && - (src->SwizzleZ == TGSI_SWIZZLE_Z) && - (src->SwizzleW == TGSI_SWIZZLE_W)) - continue; - ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst); - ctx->tmp_dst.WriteMask = dst->WriteMask; - dst = &ctx->tmp_dst; - break; - } - } - return dst; -} - -static void -put_dst(struct fd3_compile_context *ctx, struct tgsi_full_instruction *inst, - struct tgsi_dst_register *dst) -{ - /* if necessary, add mov back into original dst: */ - if (dst != &inst->Dst[0].Register) { - create_mov(ctx, &inst->Dst[0].Register, ctx->tmp_src); - } -} - -/* helper to generate the necessary repeat and/or additional instructions - * to turn a scalar instruction into a vector operation: - */ -static void -vectorize(struct fd3_compile_context *ctx, struct ir3_instruction *instr, - struct tgsi_dst_register *dst, int nsrcs, ...) -{ - va_list ap; - int i, j, n = 0; - bool indirect = dst->Indirect; - - add_dst_reg(ctx, instr, dst, TGSI_SWIZZLE_X); - - va_start(ap, nsrcs); - for (j = 0; j < nsrcs; j++) { - struct tgsi_src_register *src = - va_arg(ap, struct tgsi_src_register *); - unsigned flags = va_arg(ap, unsigned); - struct ir3_register *reg; - if (flags & IR3_REG_IMMED) { - reg = ir3_reg_create(instr, 0, IR3_REG_IMMED); - /* this is an ugly cast.. should have put flags first! */ - reg->iim_val = *(int *)&src; - } else { - reg = add_src_reg(ctx, instr, src, TGSI_SWIZZLE_X); - indirect |= src->Indirect; - } - reg->flags |= flags & ~IR3_REG_NEGATE; - if (flags & IR3_REG_NEGATE) - reg->flags ^= IR3_REG_NEGATE; - } - va_end(ap); - - for (i = 0; i < 4; i++) { - if (dst->WriteMask & (1 << i)) { - struct ir3_instruction *cur; - - if (n++ == 0) { - cur = instr; - } else { - cur = ir3_instr_clone(instr); - cur->flags &= ~(IR3_INSTR_SY | IR3_INSTR_SS | IR3_INSTR_JP); - } - - /* fix-up dst register component: */ - cur->regs[0]->num = regid(cur->regs[0]->num >> 2, i); - - /* fix-up src register component: */ - va_start(ap, nsrcs); - for (j = 0; j < nsrcs; j++) { - struct tgsi_src_register *src = - va_arg(ap, struct tgsi_src_register *); - unsigned flags = va_arg(ap, unsigned); - if (!(flags & IR3_REG_IMMED)) { - cur->regs[j+1]->num = - regid(cur->regs[j+1]->num >> 2, - src_swiz(src, i)); - cur->flags |= src_flags(ctx, cur->regs[j+1]); - } - } - va_end(ap); - - if (indirect) - ctx->last_rel = cur; - } - } - - /* pad w/ nop's.. at least until we are clever enough to - * figure out if we really need to.. - */ - add_nop(ctx, 4 - n); -} - -/* - * Handlers for TGSI instructions which do not have a 1:1 mapping to - * native instructions: - */ - -static void -trans_clamp(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct tgsi_dst_register *dst = get_dst(ctx, inst); - struct tgsi_src_register *src0 = &inst->Src[0].Register; - struct tgsi_src_register *src1 = &inst->Src[1].Register; - struct tgsi_src_register *src2 = &inst->Src[2].Register; - - create_clamp(ctx, dst, src0, src1, src2); - - put_dst(ctx, inst, dst); -} - -/* ARL(x) = x, but mova from hrN.x to a0.. 
*/ -static void -trans_arl(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr; - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register *tmp_src; - struct tgsi_dst_register *dst = &inst->Dst[0].Register; - struct tgsi_src_register *src = &inst->Src[0].Register; - unsigned chan = src->SwizzleX; - compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS); - - handle_last_rel(ctx); - - tmp_src = get_internal_temp_hr(ctx, &tmp_dst); - - /* cov.{f32,f16}s16 Rtmp, Rsrc */ - instr = instr_create(ctx, 1, 0); - instr->cat1.src_type = get_ftype(ctx); - instr->cat1.dst_type = TYPE_S16; - add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF; - add_src_reg(ctx, instr, src, chan); - - add_nop(ctx, 3); - - /* shl.b Rtmp, Rtmp, 2 */ - instr = instr_create(ctx, 2, OPC_SHL_B); - add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF; - add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF; - ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2; - - add_nop(ctx, 3); - - /* mova a0, Rtmp */ - instr = instr_create(ctx, 1, 0); - instr->cat1.src_type = TYPE_S16; - instr->cat1.dst_type = TYPE_S16; - add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF; - add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF; - - /* need to ensure 5 instr slots before a0 is used: */ - add_nop(ctx, 6); -} - -/* texture fetch/sample instructions: */ -static void -trans_samp(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_register *r; - struct ir3_instruction *instr; - struct tgsi_src_register *coord = &inst->Src[0].Register; - struct tgsi_src_register *samp = &inst->Src[1].Register; - unsigned tex = inst->Texture.Texture; - int8_t *order; - unsigned i, flags = 0, src_wrmask; - bool needs_mov = false; - - switch (t->arg) { - case TGSI_OPCODE_TEX: - if (tex == TGSI_TEXTURE_2D) { - order = (int8_t[4]){ 0, 1, -1, -1 }; - src_wrmask = TGSI_WRITEMASK_XY; - } else { - order = (int8_t[4]){ 0, 1, 2, -1 }; - src_wrmask = TGSI_WRITEMASK_XYZ; - } - break; - case TGSI_OPCODE_TXP: - if (tex == TGSI_TEXTURE_2D) { - order = (int8_t[4]){ 0, 1, 3, -1 }; - src_wrmask = TGSI_WRITEMASK_XYZ; - } else { - order = (int8_t[4]){ 0, 1, 2, 3 }; - src_wrmask = TGSI_WRITEMASK_XYZW; - } - flags |= IR3_INSTR_P; - break; - default: - compile_assert(ctx, 0); - break; - } - - if ((tex == TGSI_TEXTURE_3D) || (tex == TGSI_TEXTURE_CUBE)) { - add_nop(ctx, 3); - flags |= IR3_INSTR_3D; - } - - /* cat5 instruction cannot seem to handle const or relative: */ - if (is_rel_or_const(coord)) - needs_mov = true; - - /* The texture sample instructions need to coord in successive - * registers/components (ie. src.xy but not src.yx). And TXP - * needs the .w component in .z for 2D.. 
so in some cases we - * might need to emit some mov instructions to shuffle things - * around: - */ - for (i = 1; (i < 4) && (order[i] >= 0) && !needs_mov; i++) - if (src_swiz(coord, i) != (src_swiz(coord, 0) + order[i])) - needs_mov = true; - - if (needs_mov) { - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register *tmp_src; - unsigned j; - - type_t type_mov = get_ftype(ctx); - - /* need to move things around: */ - tmp_src = get_internal_temp(ctx, &tmp_dst); - - for (j = 0; (j < 4) && (order[j] >= 0); j++) { - instr = instr_create(ctx, 1, 0); - instr->cat1.src_type = type_mov; - instr->cat1.dst_type = type_mov; - add_dst_reg(ctx, instr, &tmp_dst, j); - add_src_reg(ctx, instr, coord, - src_swiz(coord, order[j])); - } - - coord = tmp_src; - - add_nop(ctx, 4 - j); - } - - instr = instr_create(ctx, 5, t->opc); - instr->cat5.type = get_ftype(ctx); - instr->cat5.samp = samp->Index; - instr->cat5.tex = samp->Index; - instr->flags |= flags; - - r = add_dst_reg(ctx, instr, &inst->Dst[0].Register, 0); - r->wrmask = inst->Dst[0].Register.WriteMask; - - add_src_reg(ctx, instr, coord, coord->SwizzleX)->wrmask = src_wrmask; - - /* after add_src_reg() so we don't set (sy) on sam instr itself! */ - regmask_set(&ctx->needs_sy, r); -} - -/* - * SEQ(a,b) = (a == b) ? 1.0 : 0.0 - * cmps.f.eq tmp0, b, a - * cov.u16f16 dst, tmp0 - * - * SNE(a,b) = (a != b) ? 1.0 : 0.0 - * cmps.f.eq tmp0, b, a - * add.s tmp0, tmp0, -1 - * sel.f16 dst, {0.0}, tmp0, {1.0} - * - * SGE(a,b) = (a >= b) ? 1.0 : 0.0 - * cmps.f.ge tmp0, a, b - * cov.u16f16 dst, tmp0 - * - * SLE(a,b) = (a <= b) ? 1.0 : 0.0 - * cmps.f.ge tmp0, b, a - * cov.u16f16 dst, tmp0 - * - * SGT(a,b) = (a > b) ? 1.0 : 0.0 - * cmps.f.ge tmp0, b, a - * add.s tmp0, tmp0, -1 - * sel.f16 dst, {0.0}, tmp0, {1.0} - * - * SLT(a,b) = (a < b) ? 1.0 : 0.0 - * cmps.f.ge tmp0, a, b - * add.s tmp0, tmp0, -1 - * sel.f16 dst, {0.0}, tmp0, {1.0} - * - * CMP(a,b,c) = (a < 0.0) ? 
b : c - * cmps.f.ge tmp0, a, {0.0} - * add.s tmp0, tmp0, -1 - * sel.f16 dst, c, tmp0, b - */ -static void -trans_cmp(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr; - struct tgsi_dst_register tmp_dst; - struct tgsi_src_register *tmp_src; - struct tgsi_src_register constval0, constval1; - /* final instruction for CMP() uses orig src1 and src2: */ - struct tgsi_dst_register *dst = get_dst(ctx, inst); - struct tgsi_src_register *a0, *a1; - unsigned condition; - - tmp_src = get_internal_temp(ctx, &tmp_dst); - - switch (t->tgsi_opc) { - case TGSI_OPCODE_SEQ: - case TGSI_OPCODE_SNE: - a0 = &inst->Src[1].Register; /* b */ - a1 = &inst->Src[0].Register; /* a */ - condition = IR3_COND_EQ; - break; - case TGSI_OPCODE_SGE: - case TGSI_OPCODE_SLT: - a0 = &inst->Src[0].Register; /* a */ - a1 = &inst->Src[1].Register; /* b */ - condition = IR3_COND_GE; - break; - case TGSI_OPCODE_SLE: - case TGSI_OPCODE_SGT: - a0 = &inst->Src[1].Register; /* b */ - a1 = &inst->Src[0].Register; /* a */ - condition = IR3_COND_GE; - break; - case TGSI_OPCODE_CMP: - get_immediate(ctx, &constval0, fui(0.0)); - a0 = &inst->Src[0].Register; /* a */ - a1 = &constval0; /* {0.0} */ - condition = IR3_COND_GE; - break; - default: - compile_assert(ctx, 0); - return; - } - - if (is_const(a0) && is_const(a1)) - a0 = get_unconst(ctx, a0); - - /* cmps.f.ge tmp, a0, a1 */ - instr = instr_create(ctx, 2, OPC_CMPS_F); - instr->cat2.condition = condition; - vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0); - - switch (t->tgsi_opc) { - case TGSI_OPCODE_SEQ: - case TGSI_OPCODE_SGE: - case TGSI_OPCODE_SLE: - /* cov.u16f16 dst, tmp0 */ - instr = instr_create(ctx, 1, 0); - instr->cat1.src_type = get_utype(ctx); - instr->cat1.dst_type = get_ftype(ctx); - vectorize(ctx, instr, dst, 1, tmp_src, 0); - break; - case TGSI_OPCODE_SNE: - case TGSI_OPCODE_SGT: - case TGSI_OPCODE_SLT: - case TGSI_OPCODE_CMP: - /* add.s tmp, tmp, -1 */ - instr = instr_create(ctx, 2, OPC_ADD_S); - vectorize(ctx, instr, &tmp_dst, 2, tmp_src, 0, -1, IR3_REG_IMMED); - - if (t->tgsi_opc == TGSI_OPCODE_CMP) { - /* sel.{f32,f16} dst, src2, tmp, src1 */ - instr = instr_create(ctx, 3, - ctx->so->key.half_precision ? OPC_SEL_F16 : OPC_SEL_F32); - vectorize(ctx, instr, dst, 3, - &inst->Src[2].Register, 0, - tmp_src, 0, - &inst->Src[1].Register, 0); - } else { - get_immediate(ctx, &constval0, fui(0.0)); - get_immediate(ctx, &constval1, fui(1.0)); - /* sel.{f32,f16} dst, {0.0}, tmp0, {1.0} */ - instr = instr_create(ctx, 3, - ctx->so->key.half_precision ? OPC_SEL_F16 : OPC_SEL_F32); - vectorize(ctx, instr, dst, 3, - &constval0, 0, tmp_src, 0, &constval1, 0); - } - - break; - } - - put_dst(ctx, inst, dst); -} - -/* - * Conditional / Flow control - */ - -static unsigned -find_instruction(struct fd3_compile_context *ctx, struct ir3_instruction *instr) -{ - unsigned i; - for (i = 0; i < ctx->ir->instrs_count; i++) - if (ctx->ir->instrs[i] == instr) - return i; - return ~0; -} - -static void -push_branch(struct fd3_compile_context *ctx, struct ir3_instruction *instr) -{ - ctx->branch[ctx->branch_count++] = instr; -} - -static void -pop_branch(struct fd3_compile_context *ctx) -{ - struct ir3_instruction *instr; - - /* if we were clever enough, we'd patch this up after the fact, - * and set (jp) flag on whatever the next instruction was, rather - * than inserting an extra nop.. 
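push_branch()/pop_branch() here and the trans_if()/trans_else()/trans_endif() handlers that follow leave the branch offset unresolved until the matching construct closes, then patch cat0.immed to the number of instructions emitted after the branch. A toy standalone model of that bookkeeping, using plain instruction indices and omitting the extra (jp) nop the real pop_branch() emits:

#include <stdio.h>

#define MAX_INSTRS 64
#define MAX_NEST    8

static int      branch_offset[MAX_INSTRS]; /* relative target, patched late */
static unsigned instr_count;
static unsigned branch_stack[MAX_NEST], branch_sp;

static unsigned emit(void)                { return instr_count++; }
static void     push_branch(unsigned idx) { branch_stack[branch_sp++] = idx; }

static void pop_branch(void)
{
	unsigned idx = branch_stack[--branch_sp];
	/* same formula as above: instructions emitted after the branch */
	branch_offset[idx] = instr_count - idx - 1;
}

int main(void)
{
	emit();                   /* cmps.f p0.x, ...          */
	unsigned br = emit();     /* br !p0.x, <patched later> */
	push_branch(br);
	emit(); emit();           /* "then" leg                */
	unsigned jmp = emit();    /* jump past the else leg    */
	pop_branch();             /* br now skips the then leg and the jump */
	push_branch(jmp);
	emit();                   /* "else" leg                */
	pop_branch();             /* the jump now skips the else leg */

	printf("br +%d, jump +%d\n", branch_offset[br], branch_offset[jmp]);  /* br +3, jump +1 */
	return 0;
}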
- */ - instr = instr_create(ctx, 0, OPC_NOP); - instr->flags |= IR3_INSTR_JP; - - /* pop the branch instruction from the stack and fix up branch target: */ - instr = ctx->branch[--ctx->branch_count]; - instr->cat0.immed = ctx->ir->instrs_count - find_instruction(ctx, instr) - 1; -} - -/* We probably don't really want to translate if/else/endif into branches.. - * the blob driver evaluates both legs of the if and then uses the sel - * instruction to pick which sides of the branch to "keep".. but figuring - * that out will take somewhat more compiler smarts. So hopefully branches - * don't kill performance too badly. - */ -static void -trans_if(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr; - struct tgsi_src_register *src = &inst->Src[0].Register; - struct tgsi_src_register constval; - - get_immediate(ctx, &constval, fui(0.0)); - - if (is_const(src)) - src = get_unconst(ctx, src); - - instr = instr_create(ctx, 2, OPC_CMPS_F); - ir3_reg_create(instr, regid(REG_P0, 0), 0); - add_src_reg(ctx, instr, src, src->SwizzleX); - add_src_reg(ctx, instr, &constval, constval.SwizzleX); - instr->cat2.condition = IR3_COND_EQ; - - instr = instr_create(ctx, 0, OPC_BR); - push_branch(ctx, instr); -} - -static void -trans_else(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct ir3_instruction *instr; - - /* for first half of if/else/endif, generate a jump past the else: */ - instr = instr_create(ctx, 0, OPC_JUMP); - - pop_branch(ctx); - push_branch(ctx, instr); -} - -static void -trans_endif(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - pop_branch(ctx); -} - -/* - * Handlers for TGSI instructions which do have 1:1 mapping to native - * instructions: - */ - -static void -instr_cat0(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - instr_create(ctx, 0, t->opc); -} - -static void -instr_cat1(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct tgsi_dst_register *dst = get_dst(ctx, inst); - struct tgsi_src_register *src = &inst->Src[0].Register; - - /* mov instructions can't handle a negate on src: */ - if (src->Negate) { - struct tgsi_src_register constval; - struct ir3_instruction *instr; - - /* since right now, we are using uniformly either TYPE_F16 or - * TYPE_F32, and we don't utilize the conversion possibilities - * of mov instructions, we can get away with substituting an - * add.f which can handle negate. Might need to revisit this - * in the future if we start supporting widening/narrowing or - * conversion to/from integer.. 
- */ - instr = instr_create(ctx, 2, OPC_ADD_F); - get_immediate(ctx, &constval, fui(0.0)); - vectorize(ctx, instr, dst, 2, src, 0, &constval, 0); - } else { - create_mov(ctx, dst, src); - /* create_mov() generates vector sequence, so no vectorize() */ - } - put_dst(ctx, inst, dst); -} - -static void -instr_cat2(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct tgsi_dst_register *dst = get_dst(ctx, inst); - struct tgsi_src_register *src0 = &inst->Src[0].Register; - struct tgsi_src_register *src1 = &inst->Src[1].Register; - struct ir3_instruction *instr; - unsigned src0_flags = 0, src1_flags = 0; - - switch (t->tgsi_opc) { - case TGSI_OPCODE_ABS: - src0_flags = IR3_REG_ABS; - break; - case TGSI_OPCODE_SUB: - src1_flags = IR3_REG_NEGATE; - break; - } - - switch (t->opc) { - case OPC_ABSNEG_F: - case OPC_ABSNEG_S: - case OPC_CLZ_B: - case OPC_CLZ_S: - case OPC_SIGN_F: - case OPC_FLOOR_F: - case OPC_CEIL_F: - case OPC_RNDNE_F: - case OPC_RNDAZ_F: - case OPC_TRUNC_F: - case OPC_NOT_B: - case OPC_BFREV_B: - case OPC_SETRM: - case OPC_CBITS_B: - /* these only have one src reg */ - instr = instr_create(ctx, 2, t->opc); - vectorize(ctx, instr, dst, 1, src0, src0_flags); - break; - default: - if (is_const(src0) && is_const(src1)) - src0 = get_unconst(ctx, src0); - - instr = instr_create(ctx, 2, t->opc); - vectorize(ctx, instr, dst, 2, src0, src0_flags, - src1, src1_flags); - break; - } - - put_dst(ctx, inst, dst); -} - -static void -instr_cat3(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct tgsi_dst_register *dst = get_dst(ctx, inst); - struct tgsi_src_register *src0 = &inst->Src[0].Register; - struct tgsi_src_register *src1 = &inst->Src[1].Register; - struct ir3_instruction *instr; - - /* in particular, can't handle const for src1 for cat3.. - * for mad, we can swap first two src's if needed: - */ - if (is_rel_or_const(src1)) { - if (is_mad(t->opc) && !is_rel_or_const(src0)) { - struct tgsi_src_register *tmp; - tmp = src0; - src0 = src1; - src1 = tmp; - } else { - src1 = get_unconst(ctx, src1); - } - } - - instr = instr_create(ctx, 3, - ctx->so->key.half_precision ? t->hopc : t->opc); - vectorize(ctx, instr, dst, 3, src0, 0, src1, 0, - &inst->Src[2].Register, 0); - put_dst(ctx, inst, dst); -} - -static void -instr_cat4(const struct instr_translater *t, - struct fd3_compile_context *ctx, - struct tgsi_full_instruction *inst) -{ - struct tgsi_dst_register *dst = get_dst(ctx, inst); - struct tgsi_src_register *src = &inst->Src[0].Register; - struct ir3_instruction *instr; - unsigned i, n; - - /* seems like blob compiler avoids const as src.. */ - if (is_const(src)) - src = get_unconst(ctx, src); - - /* worst case: */ - add_nop(ctx, 6); - - /* we need to replicate into each component: */ - for (i = 0, n = 0; i < 4; i++) { - if (dst->WriteMask & (1 << i)) { - if (n++) - add_nop(ctx, 1); - instr = instr_create(ctx, 4, t->opc); - add_dst_reg(ctx, instr, dst, i); - add_src_reg(ctx, instr, src, src->SwizzleX); - } - } - - regmask_set(&ctx->needs_ss, instr->regs[0]); - put_dst(ctx, inst, dst); -} - -static const struct instr_translater translaters[TGSI_OPCODE_LAST] = { -#define INSTR(n, f, ...) 
\ - [TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ } - - INSTR(MOV, instr_cat1), - INSTR(RCP, instr_cat4, .opc = OPC_RCP), - INSTR(RSQ, instr_cat4, .opc = OPC_RSQ), - INSTR(SQRT, instr_cat4, .opc = OPC_SQRT), - INSTR(MUL, instr_cat2, .opc = OPC_MUL_F), - INSTR(ADD, instr_cat2, .opc = OPC_ADD_F), - INSTR(SUB, instr_cat2, .opc = OPC_ADD_F), - INSTR(MIN, instr_cat2, .opc = OPC_MIN_F), - INSTR(MAX, instr_cat2, .opc = OPC_MAX_F), - INSTR(MAD, instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16), - INSTR(TRUNC, instr_cat2, .opc = OPC_TRUNC_F), - INSTR(CLAMP, trans_clamp), - INSTR(FLR, instr_cat2, .opc = OPC_FLOOR_F), - INSTR(ROUND, instr_cat2, .opc = OPC_RNDNE_F), - INSTR(SSG, instr_cat2, .opc = OPC_SIGN_F), - INSTR(ARL, trans_arl), - INSTR(EX2, instr_cat4, .opc = OPC_EXP2), - INSTR(LG2, instr_cat4, .opc = OPC_LOG2), - INSTR(ABS, instr_cat2, .opc = OPC_ABSNEG_F), - INSTR(COS, instr_cat4, .opc = OPC_COS), - INSTR(SIN, instr_cat4, .opc = OPC_SIN), - INSTR(TEX, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TEX), - INSTR(TXP, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TXP), - INSTR(SGT, trans_cmp), - INSTR(SLT, trans_cmp), - INSTR(SGE, trans_cmp), - INSTR(SLE, trans_cmp), - INSTR(SNE, trans_cmp), - INSTR(SEQ, trans_cmp), - INSTR(CMP, trans_cmp), - INSTR(IF, trans_if), - INSTR(ELSE, trans_else), - INSTR(ENDIF, trans_endif), - INSTR(END, instr_cat0, .opc = OPC_END), - INSTR(KILL, instr_cat0, .opc = OPC_KILL), -}; - -static fd3_semantic -decl_semantic(const struct tgsi_declaration_semantic *sem) -{ - return fd3_semantic_name(sem->Name, sem->Index); -} - -static int -decl_in(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl) -{ - struct fd3_shader_variant *so = ctx->so; - unsigned base = ctx->base_reg[TGSI_FILE_INPUT]; - unsigned i, flags = 0; - int nop = 0; - - /* I don't think we should get frag shader input without - * semantic info? Otherwise how do inputs get linked to - * vert outputs? 
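The translaters[] table above is a plain designated-initializer dispatch table: compile_instructions() further down looks up inst->Instruction.Opcode and calls t->fxn if one is registered. The same pattern in miniature, with invented opcode and field names:

#include <stdio.h>

enum opcode { OP_MOV, OP_ADD, OP_MUL, OP_LAST };

struct translater {
	void (*fxn)(const struct translater *t, int arg);
	const char *native;
};

static void emit_alu(const struct translater *t, int arg)
{
	printf("%s r%d\n", t->native, arg);
}

/* entries not listed stay zero, so a NULL fxn means "unsupported opcode" */
static const struct translater translaters[OP_LAST] = {
	[OP_MOV] = { .fxn = emit_alu, .native = "mov"   },
	[OP_ADD] = { .fxn = emit_alu, .native = "add.f" },
	[OP_MUL] = { .fxn = emit_alu, .native = "mul.f" },
};

int main(void)
{
	enum opcode op = OP_ADD;
	if (translaters[op].fxn)
		translaters[op].fxn(&translaters[op], 1);  /* "add.f r1" */
	return 0;
}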
- */ - compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) || - decl->Declaration.Semantic); - - if (ctx->so->key.half_precision) - flags |= IR3_REG_HALF; - - for (i = decl->Range.First; i <= decl->Range.Last; i++) { - unsigned n = so->inputs_count++; - unsigned r = regid(i + base, 0); - unsigned ncomp; - - /* TODO use ctx->info.input_usage_mask[decl->Range.n] to figure out ncomp: */ - ncomp = 4; - - DBG("decl in -> r%d", i + base); // XXX - - compile_assert(ctx, n < ARRAY_SIZE(so->inputs)); - - so->inputs[n].semantic = decl_semantic(&decl->Semantic); - so->inputs[n].compmask = (1 << ncomp) - 1; - so->inputs[n].ncomp = ncomp; - so->inputs[n].regid = r; - so->inputs[n].inloc = ctx->next_inloc; - so->inputs[n].bary = true; /* all that is supported */ - ctx->next_inloc += ncomp; - - so->total_in += ncomp; - - /* for frag shaders, we need to generate the corresponding bary instr: */ - if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { - unsigned j; - - for (j = 0; j < ncomp; j++) { - struct ir3_instruction *instr; - struct ir3_register *dst; - - instr = instr_create(ctx, 2, OPC_BARY_F); - - /* dst register: */ - dst = ir3_reg_create(instr, r + j, flags); - ctx->last_input = dst; - - /* input position: */ - ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = - so->inputs[n].inloc + j - 8; - - /* input base (always r0.xy): */ - ir3_reg_create(instr, regid(0,0), 0)->wrmask = 0x3; - } - - nop = 6; - } - } - - return nop; -} - -static void -decl_out(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl) -{ - struct fd3_shader_variant *so = ctx->so; - unsigned base = ctx->base_reg[TGSI_FILE_OUTPUT]; - unsigned comp = 0; - unsigned name = decl->Semantic.Name; - unsigned i; - - compile_assert(ctx, decl->Declaration.Semantic); // TODO is this ever not true? 
- - DBG("decl out[%d] -> r%d", name, decl->Range.First + base); // XXX - - if (ctx->type == TGSI_PROCESSOR_VERTEX) { - switch (name) { - case TGSI_SEMANTIC_POSITION: - so->writes_pos = true; - break; - case TGSI_SEMANTIC_PSIZE: - so->writes_psize = true; - break; - case TGSI_SEMANTIC_COLOR: - case TGSI_SEMANTIC_BCOLOR: - case TGSI_SEMANTIC_GENERIC: - case TGSI_SEMANTIC_FOG: - case TGSI_SEMANTIC_TEXCOORD: - break; - default: - compile_error(ctx, "unknown VS semantic name: %s\n", - tgsi_semantic_names[name]); - } - } else { - switch (name) { - case TGSI_SEMANTIC_POSITION: - comp = 2; /* tgsi will write to .z component */ - so->writes_pos = true; - break; - case TGSI_SEMANTIC_COLOR: - break; - default: - compile_error(ctx, "unknown FS semantic name: %s\n", - tgsi_semantic_names[name]); - } - } - - for (i = decl->Range.First; i <= decl->Range.Last; i++) { - unsigned n = so->outputs_count++; - compile_assert(ctx, n < ARRAY_SIZE(so->outputs)); - so->outputs[n].semantic = decl_semantic(&decl->Semantic); - so->outputs[n].regid = regid(i + base, comp); - } -} - -static void -decl_samp(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl) -{ - ctx->so->has_samp = true; -} - -static void -compile_instructions(struct fd3_compile_context *ctx) -{ - struct ir3 *ir = ctx->ir; - int nop = 0; - - while (!tgsi_parse_end_of_tokens(&ctx->parser)) { - tgsi_parse_token(&ctx->parser); - - switch (ctx->parser.FullToken.Token.Type) { - case TGSI_TOKEN_TYPE_DECLARATION: { - struct tgsi_full_declaration *decl = - &ctx->parser.FullToken.FullDeclaration; - if (decl->Declaration.File == TGSI_FILE_OUTPUT) { - decl_out(ctx, decl); - } else if (decl->Declaration.File == TGSI_FILE_INPUT) { - nop = decl_in(ctx, decl); - } else if (decl->Declaration.File == TGSI_FILE_SAMPLER) { - decl_samp(ctx, decl); - } - break; - } - case TGSI_TOKEN_TYPE_IMMEDIATE: { - /* TODO: if we know the immediate is small enough, and only - * used with instructions that can embed an immediate, we - * can skip this: - */ - struct tgsi_full_immediate *imm = - &ctx->parser.FullToken.FullImmediate; - unsigned n = ctx->so->immediates_count++; - memcpy(ctx->so->immediates[n].val, imm->u, 16); - break; - } - case TGSI_TOKEN_TYPE_INSTRUCTION: { - struct tgsi_full_instruction *inst = - &ctx->parser.FullToken.FullInstruction; - unsigned opc = inst->Instruction.Opcode; - const struct instr_translater *t = &translaters[opc]; - - add_nop(ctx, nop); - nop = 0; - - if (t->fxn) { - t->fxn(t, ctx, inst); - ctx->num_internal_temps = 0; - } else { - compile_error(ctx, "unknown TGSI opc: %s\n", - tgsi_get_opcode_name(opc)); - } - - switch (inst->Instruction.Saturate) { - case TGSI_SAT_ZERO_ONE: - create_clamp_imm(ctx, &inst->Dst[0].Register, - fui(0.0), fui(1.0)); - break; - case TGSI_SAT_MINUS_PLUS_ONE: - create_clamp_imm(ctx, &inst->Dst[0].Register, - fui(-1.0), fui(1.0)); - break; - } - - break; - } - default: - break; - } - } - - if (ir->instrs_count > 0) - ir->instrs[0]->flags |= IR3_INSTR_SS | IR3_INSTR_SY; - - if (ctx->last_input) - ctx->last_input->flags |= IR3_REG_EI; - - handle_last_rel(ctx); -} - -int -fd3_compile_shader_old(struct fd3_shader_variant *so, - const struct tgsi_token *tokens, struct fd3_shader_key key) -{ - struct fd3_compile_context ctx; - - assert(!so->ir); - - so->ir = ir3_create(); - - assert(so->ir); - - if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) - return -1; - - compile_instructions(&ctx); - - compile_free(&ctx); - - return 0; -} diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c 
b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c index 4b2d94103f5..89af740c07c 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c @@ -44,7 +44,7 @@ static void emit_vertexbufs(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct fd3_shader_key key) + struct ir3_shader_key key) { struct fd_vertex_stateobj *vtx = ctx->vtx; struct fd_vertexbuf_stateobj *vertexbuf = &ctx->vertexbuf; @@ -70,7 +70,7 @@ emit_vertexbufs(struct fd_context *ctx, struct fd_ringbuffer *ring, static void draw_impl(struct fd_context *ctx, const struct pipe_draw_info *info, - struct fd_ringbuffer *ring, unsigned dirty, struct fd3_shader_key key) + struct fd_ringbuffer *ring, unsigned dirty, struct ir3_shader_key key) { fd3_emit_state(ctx, ring, &ctx->prog, dirty, key); @@ -99,7 +99,7 @@ static void fd3_draw(struct fd_context *ctx, const struct pipe_draw_info *info) { unsigned dirty = ctx->dirty; - struct fd3_shader_key key = { + struct ir3_shader_key key = { /* do binning pass first: */ .binning_pass = true, .color_two_side = ctx->rasterizer ? ctx->rasterizer->light_twoside : false, @@ -127,7 +127,7 @@ fd3_clear_binning(struct fd_context *ctx, unsigned dirty) { struct fd3_context *fd3_ctx = fd3_context(ctx); struct fd_ringbuffer *ring = ctx->binning_ring; - struct fd3_shader_key key = { + struct ir3_shader_key key = { .binning_pass = true, .half_precision = true, }; @@ -168,7 +168,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers, struct fd_ringbuffer *ring = ctx->ring; unsigned dirty = ctx->dirty; unsigned ce, i; - struct fd3_shader_key key = { + struct ir3_shader_key key = { .half_precision = true, }; diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c index 1e4de26406a..44932dc241d 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c @@ -87,7 +87,7 @@ static void emit_constants(struct fd_ringbuffer *ring, enum adreno_state_block sb, struct fd_constbuf_stateobj *constbuf, - struct fd3_shader_variant *shader) + struct ir3_shader_variant *shader) { uint32_t enabled_mask = constbuf->enabled_mask; uint32_t first_immediate; @@ -291,7 +291,7 @@ fd3_emit_gmem_restore_tex(struct fd_ringbuffer *ring, struct pipe_surface *psurf void fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, - struct fd3_shader_variant *vp, + struct ir3_shader_variant *vp, struct fd3_vertex_buf *vbufs, uint32_t n) { uint32_t i, j, last = 0; @@ -350,10 +350,10 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, void fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, struct fd_program_stateobj *prog, uint32_t dirty, - struct fd3_shader_key key) + struct ir3_shader_key key) { - struct fd3_shader_variant *vp; - struct fd3_shader_variant *fp; + struct ir3_shader_variant *vp; + struct ir3_shader_variant *fp; fp = fd3_shader_variant(prog->fp, key); vp = fd3_shader_variant(prog->vp, key); diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h index f2ae4dc295e..5735c9f873d 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h @@ -33,7 +33,7 @@ #include "freedreno_context.h" #include "fd3_util.h" - +#include "ir3_shader.h" struct fd_ringbuffer; enum adreno_state_block; @@ -56,11 +56,11 @@ struct fd3_vertex_buf { }; void fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, - struct fd3_shader_variant *vp, + struct ir3_shader_variant *vp, struct fd3_vertex_buf 
*vbufs, uint32_t n); void fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, struct fd_program_stateobj *prog, uint32_t dirty, - struct fd3_shader_key key); + struct ir3_shader_key key); void fd3_emit_restore(struct fd_context *ctx); #endif /* FD3_EMIT_H */ diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c index 8519a90ccfa..6828d0e1fb4 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c @@ -43,7 +43,7 @@ #include "fd3_util.h" #include "fd3_zsa.h" -static const struct fd3_shader_key key = { +static const struct ir3_shader_key key = { // XXX should set this based on render target format! We don't // want half_precision if float32 render target!!! .half_precision = true, diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c index 164b1521a89..78c71d42e39 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c @@ -38,176 +38,23 @@ #include "freedreno_program.h" #include "fd3_program.h" -#include "fd3_compiler.h" #include "fd3_emit.h" #include "fd3_texture.h" #include "fd3_util.h" static void -delete_variant(struct fd3_shader_variant *v) +delete_shader_stateobj(struct fd3_shader_stateobj *so) { - ir3_destroy(v->ir); - fd_bo_del(v->bo); - free(v); -} - -static void -assemble_variant(struct fd3_shader_variant *so) -{ - struct fd_context *ctx = fd_context(so->so->pctx); - uint32_t sz, *bin; - - bin = ir3_assemble(so->ir, &so->info); - sz = so->info.sizedwords * 4; - - so->bo = fd_bo_new(ctx->dev, sz, - DRM_FREEDRENO_GEM_CACHE_WCOMBINE | - DRM_FREEDRENO_GEM_TYPE_KMEM); - - memcpy(fd_bo_map(so->bo), bin, sz); - - free(bin); - - so->instrlen = so->info.sizedwords / 8; - so->constlen = so->info.max_const + 1; -} - -/* for vertex shader, the inputs are loaded into registers before the shader - * is executed, so max_regs from the shader instructions might not properly - * reflect the # of registers actually used: - */ -static void -fixup_vp_regfootprint(struct fd3_shader_variant *so) -{ - unsigned i; - for (i = 0; i < so->inputs_count; i++) { - if (so->inputs[i].compmask) { - uint32_t regid = (so->inputs[i].regid + 3) >> 2; - so->info.max_reg = MAX2(so->info.max_reg, regid); - } - } - for (i = 0; i < so->outputs_count; i++) { - uint32_t regid = (so->outputs[i].regid + 3) >> 2; - so->info.max_reg = MAX2(so->info.max_reg, regid); - } -} - -static struct fd3_shader_variant * -create_variant(struct fd3_shader_stateobj *so, struct fd3_shader_key key) -{ - struct fd3_shader_variant *v = CALLOC_STRUCT(fd3_shader_variant); - const struct tgsi_token *tokens = so->tokens; - int ret; - - if (!v) - return NULL; - - v->so = so; - v->key = key; - v->type = so->type; - - if (fd_mesa_debug & FD_DBG_DISASM) { - DBG("dump tgsi: type=%d, k={bp=%u,cts=%u,hp=%u}", so->type, - key.binning_pass, key.color_two_side, key.half_precision); - tgsi_dump(tokens, 0); - } - - if (!(fd_mesa_debug & FD_DBG_NOOPT)) { - ret = fd3_compile_shader(v, tokens, key); - if (ret) { - debug_error("new compiler failed, trying fallback!"); - - v->inputs_count = 0; - v->outputs_count = 0; - v->total_in = 0; - v->has_samp = false; - v->immediates_count = 0; - } - } else { - ret = -1; /* force fallback to old compiler */ - } - - if (ret) - ret = fd3_compile_shader_old(v, tokens, key); - - if (ret) { - debug_error("compile failed!"); - goto fail; - } - - assemble_variant(v); - if (!v->bo) { - 
debug_error("assemble failed!"); - goto fail; - } - - if (so->type == SHADER_VERTEX) - fixup_vp_regfootprint(v); - - if (fd_mesa_debug & FD_DBG_DISASM) { - DBG("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type, - key.binning_pass, key.color_two_side, key.half_precision); - disasm_a3xx(fd_bo_map(v->bo), v->info.sizedwords, 0, v->type); - } - - return v; - -fail: - delete_variant(v); - return NULL; -} - -struct fd3_shader_variant * -fd3_shader_variant(struct fd3_shader_stateobj *so, struct fd3_shader_key key) -{ - struct fd3_shader_variant *v; - - /* some shader key values only apply to vertex or frag shader, - * so normalize the key to avoid constructing multiple identical - * variants: - */ - if (so->type == SHADER_FRAGMENT) { - key.binning_pass = false; - } - if (so->type == SHADER_VERTEX) { - key.color_two_side = false; - key.half_precision = false; - } - - for (v = so->variants; v; v = v->next) - if (!memcmp(&key, &v->key, sizeof(key))) - return v; - - /* compile new variant if it doesn't exist already: */ - v = create_variant(so, key); - v->next = so->variants; - so->variants = v; - - return v; -} - - -static void -delete_shader(struct fd3_shader_stateobj *so) -{ - struct fd3_shader_variant *v, *t; - for (v = so->variants; v; ) { - t = v; - v = v->next; - delete_variant(t); - } - free((void *)so->tokens); + ir3_shader_destroy(so->shader); free(so); } static struct fd3_shader_stateobj * -create_shader(struct pipe_context *pctx, const struct pipe_shader_state *cso, +create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state *cso, enum shader_t type) { struct fd3_shader_stateobj *so = CALLOC_STRUCT(fd3_shader_stateobj); - so->pctx = pctx; - so->type = type; - so->tokens = tgsi_dup_tokens(cso->tokens); + so->shader = ir3_shader_create(pctx, cso->tokens, type); return so; } @@ -215,32 +62,32 @@ static void * fd3_fp_state_create(struct pipe_context *pctx, const struct pipe_shader_state *cso) { - return create_shader(pctx, cso, SHADER_FRAGMENT); + return create_shader_stateobj(pctx, cso, SHADER_FRAGMENT); } static void fd3_fp_state_delete(struct pipe_context *pctx, void *hwcso) { struct fd3_shader_stateobj *so = hwcso; - delete_shader(so); + delete_shader_stateobj(so); } static void * fd3_vp_state_create(struct pipe_context *pctx, const struct pipe_shader_state *cso) { - return create_shader(pctx, cso, SHADER_VERTEX); + return create_shader_stateobj(pctx, cso, SHADER_VERTEX); } static void fd3_vp_state_delete(struct pipe_context *pctx, void *hwcso) { struct fd3_shader_stateobj *so = hwcso; - delete_shader(so); + delete_shader_stateobj(so); } static void -emit_shader(struct fd_ringbuffer *ring, const struct fd3_shader_variant *so) +emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so) { const struct ir3_info *si = &so->info; enum adreno_state_block sb; @@ -281,7 +128,7 @@ emit_shader(struct fd_ringbuffer *ring, const struct fd3_shader_variant *so) } static int -find_output(const struct fd3_shader_variant *so, fd3_semantic semantic) +find_output(const struct ir3_shader_variant *so, ir3_semantic semantic) { int j; @@ -297,7 +144,7 @@ find_output(const struct fd3_shader_variant *so, fd3_semantic semantic) */ if (sem2name(semantic) == TGSI_SEMANTIC_BCOLOR) { unsigned idx = sem2idx(semantic); - return find_output(so, fd3_semantic_name(TGSI_SEMANTIC_COLOR, idx)); + return find_output(so, ir3_semantic_name(TGSI_SEMANTIC_COLOR, idx)); } debug_assert(0); @@ -306,7 +153,7 @@ find_output(const struct fd3_shader_variant *so, fd3_semantic semantic) } 
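fd3_shader_variant() above, which this patch turns into the generation-neutral ir3_shader_variant(), is a small variant cache: the key is first normalized so stage-irrelevant bits cannot create duplicate variants, an existing variant is then found by memcmp() on the key, and only on a miss is a new one compiled and pushed onto the list. A condensed sketch of that pattern with stand-in types:

#include <stdlib.h>
#include <string.h>

struct shader_key {
	unsigned binning_pass : 1;    /* vertex-shader-only   */
	unsigned color_two_side : 1;  /* fragment-shader-only */
	unsigned half_precision : 1;  /* fragment-shader-only */
};

struct variant {
	struct shader_key key;
	struct variant *next;
	/* compiled ir / bo / info would live here */
};

struct shader {
	int is_frag;
	struct variant *variants;
};

static struct variant *lookup_variant(struct shader *s, struct shader_key key)
{
	struct variant *v;

	/* normalize: clear key bits that don't apply to this stage so they
	 * can't produce multiple identical variants
	 */
	if (s->is_frag)
		key.binning_pass = 0;
	else
		key.color_two_side = key.half_precision = 0;

	/* keys are built with designated initializers (everything else zero),
	 * so a raw memcmp() works as an equality test, as in the driver
	 */
	for (v = s->variants; v; v = v->next)
		if (!memcmp(&key, &v->key, sizeof(key)))
			return v;

	v = calloc(1, sizeof(*v));    /* compile + assemble would happen here */
	if (!v)
		return NULL;
	v->key = key;
	v->next = s->variants;
	s->variants = v;
	return v;
}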
static int -next_varying(const struct fd3_shader_variant *so, int i) +next_varying(const struct ir3_shader_variant *so, int i) { while (++i < so->inputs_count) if (so->inputs[i].compmask && so->inputs[i].bary) @@ -315,7 +162,7 @@ next_varying(const struct fd3_shader_variant *so, int i) } static uint32_t -find_output_regid(const struct fd3_shader_variant *so, fd3_semantic semantic) +find_output_regid(const struct ir3_shader_variant *so, ir3_semantic semantic) { int j; for (j = 0; j < so->outputs_count; j++) @@ -326,9 +173,9 @@ find_output_regid(const struct fd3_shader_variant *so, fd3_semantic semantic) void fd3_program_emit(struct fd_ringbuffer *ring, - struct fd_program_stateobj *prog, struct fd3_shader_key key) + struct fd_program_stateobj *prog, struct ir3_shader_key key) { - const struct fd3_shader_variant *vp, *fp; + const struct ir3_shader_variant *vp, *fp; const struct ir3_info *vsi, *fsi; uint32_t pos_regid, posz_regid, psize_regid, color_regid; int i, j, k; @@ -337,7 +184,7 @@ fd3_program_emit(struct fd_ringbuffer *ring, if (key.binning_pass) { /* use dummy stateobj to simplify binning vs non-binning: */ - static const struct fd3_shader_variant binning_fp = {}; + static const struct ir3_shader_variant binning_fp = {}; fp = &binning_fp; } else { fp = fd3_shader_variant(prog->fp, key); @@ -347,13 +194,13 @@ fd3_program_emit(struct fd_ringbuffer *ring, fsi = &fp->info; pos_regid = find_output_regid(vp, - fd3_semantic_name(TGSI_SEMANTIC_POSITION, 0)); + ir3_semantic_name(TGSI_SEMANTIC_POSITION, 0)); posz_regid = find_output_regid(fp, - fd3_semantic_name(TGSI_SEMANTIC_POSITION, 0)); + ir3_semantic_name(TGSI_SEMANTIC_POSITION, 0)); psize_regid = find_output_regid(vp, - fd3_semantic_name(TGSI_SEMANTIC_PSIZE, 0)); + ir3_semantic_name(TGSI_SEMANTIC_PSIZE, 0)); color_regid = find_output_regid(fp, - fd3_semantic_name(TGSI_SEMANTIC_COLOR, 0)); + ir3_semantic_name(TGSI_SEMANTIC_COLOR, 0)); /* we could probably divide this up into things that need to be * emitted if frag-prog is dirty vs if vert-prog is dirty.. 
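The fd3_semantic/ir3_semantic values looked up by find_output() and find_output_regid() in these hunks are just the TGSI semantic name and index packed into 16 bits, as the helpers removed from fd3_program.h further down show. In isolation:

#include <stdint.h>
#include <stdio.h>

/* semantic name in the high byte, semantic index in the low byte */
typedef uint16_t semantic_t;

static inline semantic_t semantic(uint8_t name, uint16_t index)
{
	return (name << 8) | (index & 0xff);
}

int main(void)
{
	semantic_t sem = semantic(1 /* a TGSI_SEMANTIC_* value */, 2);
	printf("name=%d index=%d\n", sem >> 8, sem & 0xff);
	return 0;
}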
@@ -522,16 +369,16 @@ fd3_program_emit(struct fd_ringbuffer *ring, A3XX_VPC_PACK_NUMNONPOSVSVAR(fp->total_in)); OUT_PKT0(ring, REG_A3XX_VPC_VARYING_INTERP_MODE(0), 4); - OUT_RING(ring, fp->so->vinterp[0]); /* VPC_VARYING_INTERP[0].MODE */ - OUT_RING(ring, fp->so->vinterp[1]); /* VPC_VARYING_INTERP[1].MODE */ - OUT_RING(ring, fp->so->vinterp[2]); /* VPC_VARYING_INTERP[2].MODE */ - OUT_RING(ring, fp->so->vinterp[3]); /* VPC_VARYING_INTERP[3].MODE */ + OUT_RING(ring, fp->shader->vinterp[0]); /* VPC_VARYING_INTERP[0].MODE */ + OUT_RING(ring, fp->shader->vinterp[1]); /* VPC_VARYING_INTERP[1].MODE */ + OUT_RING(ring, fp->shader->vinterp[2]); /* VPC_VARYING_INTERP[2].MODE */ + OUT_RING(ring, fp->shader->vinterp[3]); /* VPC_VARYING_INTERP[3].MODE */ OUT_PKT0(ring, REG_A3XX_VPC_VARYING_PS_REPL_MODE(0), 4); - OUT_RING(ring, fp->so->vpsrepl[0]); /* VPC_VARYING_PS_REPL[0].MODE */ - OUT_RING(ring, fp->so->vpsrepl[1]); /* VPC_VARYING_PS_REPL[1].MODE */ - OUT_RING(ring, fp->so->vpsrepl[2]); /* VPC_VARYING_PS_REPL[2].MODE */ - OUT_RING(ring, fp->so->vpsrepl[3]); /* VPC_VARYING_PS_REPL[3].MODE */ + OUT_RING(ring, fp->shader->vpsrepl[0]); /* VPC_VARYING_PS_REPL[0].MODE */ + OUT_RING(ring, fp->shader->vpsrepl[1]); /* VPC_VARYING_PS_REPL[1].MODE */ + OUT_RING(ring, fp->shader->vpsrepl[2]); /* VPC_VARYING_PS_REPL[2].MODE */ + OUT_RING(ring, fp->shader->vpsrepl[3]); /* VPC_VARYING_PS_REPL[3].MODE */ } OUT_PKT0(ring, REG_A3XX_VFD_VS_THREADING_THRESHOLD, 1); @@ -558,10 +405,10 @@ fix_blit_fp(struct pipe_context *pctx) struct fd_context *ctx = fd_context(pctx); struct fd3_shader_stateobj *so = ctx->blit_prog.fp; - so->vpsrepl[0] = 0x99999999; - so->vpsrepl[1] = 0x99999999; - so->vpsrepl[2] = 0x99999999; - so->vpsrepl[3] = 0x99999999; + so->shader->vpsrepl[0] = 0x99999999; + so->shader->vpsrepl[1] = 0x99999999; + so->shader->vpsrepl[2] = 0x99999999; + so->shader->vpsrepl[3] = 0x99999999; } void diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.h b/src/gallium/drivers/freedreno/a3xx/fd3_program.h index e2ed1cc3dda..cebaeecc5bc 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.h @@ -30,127 +30,22 @@ #define FD3_PROGRAM_H_ #include "pipe/p_context.h" - #include "freedreno_context.h" -#include "fd3_util.h" -#include "ir3.h" -#include "disasm.h" - -typedef uint16_t fd3_semantic; /* semantic name + index */ -static inline fd3_semantic -fd3_semantic_name(uint8_t name, uint16_t index) -{ - return (name << 8) | (index & 0xff); -} - -static inline uint8_t sem2name(fd3_semantic sem) -{ - return sem >> 8; -} - -static inline uint16_t sem2idx(fd3_semantic sem) -{ - return sem & 0xff; -} - -struct fd3_shader_variant { - struct fd_bo *bo; - - struct fd3_shader_key key; - - struct ir3_info info; - struct ir3 *ir; - - /* the instructions length is in units of instruction groups - * (4 instructions, 8 dwords): - */ - unsigned instrlen; - - /* the constants length is in units of vec4's, and is the sum of - * the uniforms and the built-in compiler constants - */ - unsigned constlen; - - /* About Linkage: - * + Let the frag shader determine the position/compmask for the - * varyings, since it is the place where we know if the varying - * is actually used, and if so, which components are used. So - * what the hw calls "outloc" is taken from the "inloc" of the - * frag shader. - * + From the vert shader, we only need the output regid - */ - - /* for frag shader, pos_regid holds the frag_pos, ie. 
what is passed - * to bary.f instructions - */ - uint8_t pos_regid; - bool frag_coord, frag_face; - - /* varyings/outputs: */ - unsigned outputs_count; - struct { - fd3_semantic semantic; - uint8_t regid; - } outputs[16 + 2]; /* +POSITION +PSIZE */ - bool writes_pos, writes_psize; - - /* vertices/inputs: */ - unsigned inputs_count; - struct { - fd3_semantic semantic; - uint8_t regid; - uint8_t compmask; - uint8_t ncomp; - /* in theory inloc of fs should match outloc of vs: */ - uint8_t inloc; - uint8_t bary; - } inputs[16 + 2]; /* +POSITION +FACE */ - - unsigned total_in; /* sum of inputs (scalar) */ - - /* do we have one or more texture sample instructions: */ - bool has_samp; - - /* const reg # of first immediate, ie. 1 == c1 - * (not regid, because TGSI thinks in terms of vec4 registers, - * not scalar registers) - */ - unsigned first_immediate; - unsigned immediates_count; - struct { - uint32_t val[4]; - } immediates[64]; - - /* shader varients form a linked list: */ - struct fd3_shader_variant *next; - - /* replicated here to avoid passing extra ptrs everywhere: */ - enum shader_t type; - struct fd3_shader_stateobj *so; -}; +#include "ir3_shader.h" struct fd3_shader_stateobj { - enum shader_t type; - - struct pipe_context *pctx; - const struct tgsi_token *tokens; - - struct fd3_shader_variant *variants; - - /* so far, only used for blit_prog shader.. values for - * VPC_VARYING_INTERP[i].MODE and VPC_VARYING_PS_REPL[i].MODE - * - * Possibly should be in fd3_program_variant? - */ - uint32_t vinterp[4], vpsrepl[4]; + struct ir3_shader *shader; }; -struct fd3_shader_variant * fd3_shader_variant(struct fd3_shader_stateobj *so, - struct fd3_shader_key key); - void fd3_program_emit(struct fd_ringbuffer *ring, - struct fd_program_stateobj *prog, struct fd3_shader_key key); + struct fd_program_stateobj *prog, struct ir3_shader_key key); void fd3_prog_init(struct pipe_context *pctx); +static inline struct ir3_shader_variant * +fd3_shader_variant(struct fd3_shader_stateobj *so, struct ir3_shader_key key) +{ + return ir3_shader_variant(so->shader, key); +} + #endif /* FD3_PROGRAM_H_ */ diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_util.h b/src/gallium/drivers/freedreno/a3xx/fd3_util.h index 6462d18f913..4681840b173 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_util.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_util.h @@ -43,22 +43,4 @@ enum a3xx_color_swap fd3_pipe2swap(enum pipe_format format); uint32_t fd3_tex_swiz(enum pipe_format format, unsigned swizzle_r, unsigned swizzle_g, unsigned swizzle_b, unsigned swizzle_a); -/* Configuration key used to identify a shader variant.. different - * shader variants can be used to implement features not supported - * in hw (two sided color), binning-pass vertex shader, etc. 
- * - * NOTE: this is declared here (rather than fd3_program.h) as it is - * passed around through a lot of the emit code in various parts - * which would otherwise not necessarily need to incl fd3_program.h - */ -struct fd3_shader_key { - /* vertex shader variant parameters: */ - unsigned binning_pass : 1; - - /* fragment shader variant parameters: */ - unsigned color_two_side : 1; - unsigned half_precision : 1; -}; -struct fd3_shader_variant; - #endif /* FD3_UTIL_H_ */ diff --git a/src/gallium/drivers/freedreno/a3xx/instr-a3xx.h b/src/gallium/drivers/freedreno/a3xx/instr-a3xx.h deleted file mode 100644 index c67f1037ced..00000000000 --- a/src/gallium/drivers/freedreno/a3xx/instr-a3xx.h +++ /dev/null @@ -1,691 +0,0 @@ -/* - * Copyright (c) 2013 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef INSTR_A3XX_H_ -#define INSTR_A3XX_H_ - -#define PACKED __attribute__((__packed__)) - -#include -#include - -typedef enum { - /* category 0: */ - OPC_NOP = 0, - OPC_BR = 1, - OPC_JUMP = 2, - OPC_CALL = 3, - OPC_RET = 4, - OPC_KILL = 5, - OPC_END = 6, - OPC_EMIT = 7, - OPC_CUT = 8, - OPC_CHMASK = 9, - OPC_CHSH = 10, - OPC_FLOW_REV = 11, - - /* category 1: */ - /* no opc.. all category 1 are variants of mov */ - - /* category 2: */ - OPC_ADD_F = 0, - OPC_MIN_F = 1, - OPC_MAX_F = 2, - OPC_MUL_F = 3, - OPC_SIGN_F = 4, - OPC_CMPS_F = 5, - OPC_ABSNEG_F = 6, - OPC_CMPV_F = 7, - /* 8 - invalid */ - OPC_FLOOR_F = 9, - OPC_CEIL_F = 10, - OPC_RNDNE_F = 11, - OPC_RNDAZ_F = 12, - OPC_TRUNC_F = 13, - /* 14-15 - invalid */ - OPC_ADD_U = 16, - OPC_ADD_S = 17, - OPC_SUB_U = 18, - OPC_SUB_S = 19, - OPC_CMPS_U = 20, - OPC_CMPS_S = 21, - OPC_MIN_U = 22, - OPC_MIN_S = 23, - OPC_MAX_U = 24, - OPC_MAX_S = 25, - OPC_ABSNEG_S = 26, - /* 27 - invalid */ - OPC_AND_B = 28, - OPC_OR_B = 29, - OPC_NOT_B = 30, - OPC_XOR_B = 31, - /* 32 - invalid */ - OPC_CMPV_U = 33, - OPC_CMPV_S = 34, - /* 35-47 - invalid */ - OPC_MUL_U = 48, - OPC_MUL_S = 49, - OPC_MULL_U = 50, - OPC_BFREV_B = 51, - OPC_CLZ_S = 52, - OPC_CLZ_B = 53, - OPC_SHL_B = 54, - OPC_SHR_B = 55, - OPC_ASHR_B = 56, - OPC_BARY_F = 57, - OPC_MGEN_B = 58, - OPC_GETBIT_B = 59, - OPC_SETRM = 60, - OPC_CBITS_B = 61, - OPC_SHB = 62, - OPC_MSAD = 63, - - /* category 3: */ - OPC_MAD_U16 = 0, - OPC_MADSH_U16 = 1, - OPC_MAD_S16 = 2, - OPC_MADSH_M16 = 3, /* should this be .s16? 
*/ - OPC_MAD_U24 = 4, - OPC_MAD_S24 = 5, - OPC_MAD_F16 = 6, - OPC_MAD_F32 = 7, - OPC_SEL_B16 = 8, - OPC_SEL_B32 = 9, - OPC_SEL_S16 = 10, - OPC_SEL_S32 = 11, - OPC_SEL_F16 = 12, - OPC_SEL_F32 = 13, - OPC_SAD_S16 = 14, - OPC_SAD_S32 = 15, - - /* category 4: */ - OPC_RCP = 0, - OPC_RSQ = 1, - OPC_LOG2 = 2, - OPC_EXP2 = 3, - OPC_SIN = 4, - OPC_COS = 5, - OPC_SQRT = 6, - // 7-63 - invalid - - /* category 5: */ - OPC_ISAM = 0, - OPC_ISAML = 1, - OPC_ISAMM = 2, - OPC_SAM = 3, - OPC_SAMB = 4, - OPC_SAML = 5, - OPC_SAMGQ = 6, - OPC_GETLOD = 7, - OPC_CONV = 8, - OPC_CONVM = 9, - OPC_GETSIZE = 10, - OPC_GETBUF = 11, - OPC_GETPOS = 12, - OPC_GETINFO = 13, - OPC_DSX = 14, - OPC_DSY = 15, - OPC_GATHER4R = 16, - OPC_GATHER4G = 17, - OPC_GATHER4B = 18, - OPC_GATHER4A = 19, - OPC_SAMGP0 = 20, - OPC_SAMGP1 = 21, - OPC_SAMGP2 = 22, - OPC_SAMGP3 = 23, - OPC_DSXPP_1 = 24, - OPC_DSYPP_1 = 25, - OPC_RGETPOS = 26, - OPC_RGETINFO = 27, - - /* category 6: */ - OPC_LDG = 0, /* load-global */ - OPC_LDL = 1, - OPC_LDP = 2, - OPC_STG = 3, /* store-global */ - OPC_STL = 4, - OPC_STP = 5, - OPC_STI = 6, - OPC_G2L = 7, - OPC_L2G = 8, - OPC_PREFETCH = 9, - OPC_LDLW = 10, - OPC_STLW = 11, - OPC_RESFMT = 14, - OPC_RESINFO = 15, - OPC_ATOMIC_ADD_L = 16, - OPC_ATOMIC_SUB_L = 17, - OPC_ATOMIC_XCHG_L = 18, - OPC_ATOMIC_INC_L = 19, - OPC_ATOMIC_DEC_L = 20, - OPC_ATOMIC_CMPXCHG_L = 21, - OPC_ATOMIC_MIN_L = 22, - OPC_ATOMIC_MAX_L = 23, - OPC_ATOMIC_AND_L = 24, - OPC_ATOMIC_OR_L = 25, - OPC_ATOMIC_XOR_L = 26, - OPC_LDGB_TYPED_4D = 27, - OPC_STGB_4D_4 = 28, - OPC_STIB = 29, - OPC_LDC_4 = 30, - OPC_LDLV = 31, - - /* meta instructions (category -1): */ - /* placeholder instr to mark inputs/outputs: */ - OPC_META_INPUT = 0, - OPC_META_OUTPUT = 1, - /* The "fan-in" and "fan-out" instructions are used for keeping - * track of instructions that write to multiple dst registers - * (fan-out) like texture sample instructions, or read multiple - * consecutive scalar registers (fan-in) (bary.f, texture samp) - */ - OPC_META_FO = 2, - OPC_META_FI = 3, - /* branches/flow control */ - OPC_META_FLOW = 4, - OPC_META_PHI = 5, - /* relative addressing */ - OPC_META_DEREF = 6, - - -} opc_t; - -typedef enum { - TYPE_F16 = 0, - TYPE_F32 = 1, - TYPE_U16 = 2, - TYPE_U32 = 3, - TYPE_S16 = 4, - TYPE_S32 = 5, - TYPE_U8 = 6, - TYPE_S8 = 7, // XXX I assume? 
-} type_t; - -static inline uint32_t type_size(type_t type) -{ - switch (type) { - case TYPE_F32: - case TYPE_U32: - case TYPE_S32: - return 32; - case TYPE_F16: - case TYPE_U16: - case TYPE_S16: - return 16; - case TYPE_U8: - case TYPE_S8: - return 8; - default: - assert(0); /* invalid type */ - return 0; - } -} - -static inline int type_float(type_t type) -{ - return (type == TYPE_F32) || (type == TYPE_F16); -} - -static inline int type_uint(type_t type) -{ - return (type == TYPE_U32) || (type == TYPE_U16) || (type == TYPE_U8); -} - -static inline int type_sint(type_t type) -{ - return (type == TYPE_S32) || (type == TYPE_S16) || (type == TYPE_S8); -} - -typedef union PACKED { - /* normal gpr or const src register: */ - struct PACKED { - uint32_t comp : 2; - uint32_t num : 10; - }; - /* for immediate val: */ - int32_t iim_val : 11; - /* to make compiler happy: */ - uint32_t dummy32; - uint32_t dummy10 : 10; - uint32_t dummy11 : 11; - uint32_t dummy12 : 12; - uint32_t dummy13 : 13; - uint32_t dummy8 : 8; -} reg_t; - -/* special registers: */ -#define REG_A0 61 /* address register */ -#define REG_P0 62 /* predicate register */ - -static inline int reg_special(reg_t reg) -{ - return (reg.num == REG_A0) || (reg.num == REG_P0); -} - -typedef struct PACKED { - /* dword0: */ - int16_t immed : 16; - uint32_t dummy1 : 16; - - /* dword1: */ - uint32_t dummy2 : 8; - uint32_t repeat : 3; - uint32_t dummy3 : 1; - uint32_t ss : 1; - uint32_t dummy4 : 7; - uint32_t inv : 1; - uint32_t comp : 2; - uint32_t opc : 4; - uint32_t jmp_tgt : 1; - uint32_t sync : 1; - uint32_t opc_cat : 3; -} instr_cat0_t; - -typedef struct PACKED { - /* dword0: */ - union PACKED { - /* for normal src register: */ - struct PACKED { - uint32_t src : 11; - /* at least low bit of pad must be zero or it will - * look like a address relative src - */ - uint32_t pad : 21; - }; - /* for address relative: */ - struct PACKED { - int32_t off : 10; - uint32_t src_rel_c : 1; - uint32_t src_rel : 1; - uint32_t unknown : 20; - }; - /* for immediate: */ - int32_t iim_val; - float fim_val; - }; - - /* dword1: */ - uint32_t dst : 8; - uint32_t repeat : 3; - uint32_t src_r : 1; - uint32_t ss : 1; - uint32_t ul : 1; - uint32_t dst_type : 3; - uint32_t dst_rel : 1; - uint32_t src_type : 3; - uint32_t src_c : 1; - uint32_t src_im : 1; - uint32_t even : 1; - uint32_t pos_inf : 1; - uint32_t must_be_0 : 2; - uint32_t jmp_tgt : 1; - uint32_t sync : 1; - uint32_t opc_cat : 3; -} instr_cat1_t; - -typedef struct PACKED { - /* dword0: */ - union PACKED { - struct PACKED { - uint32_t src1 : 11; - uint32_t must_be_zero1: 2; - uint32_t src1_im : 1; /* immediate */ - uint32_t src1_neg : 1; /* negate */ - uint32_t src1_abs : 1; /* absolute value */ - }; - struct PACKED { - uint32_t src1 : 10; - uint32_t src1_c : 1; /* relative-const */ - uint32_t src1_rel : 1; /* relative address */ - uint32_t must_be_zero : 1; - uint32_t dummy : 3; - } rel1; - struct PACKED { - uint32_t src1 : 12; - uint32_t src1_c : 1; /* const */ - uint32_t dummy : 3; - } c1; - }; - - union PACKED { - struct PACKED { - uint32_t src2 : 11; - uint32_t must_be_zero2: 2; - uint32_t src2_im : 1; /* immediate */ - uint32_t src2_neg : 1; /* negate */ - uint32_t src2_abs : 1; /* absolute value */ - }; - struct PACKED { - uint32_t src2 : 10; - uint32_t src2_c : 1; /* relative-const */ - uint32_t src2_rel : 1; /* relative address */ - uint32_t must_be_zero : 1; - uint32_t dummy : 3; - } rel2; - struct PACKED { - uint32_t src2 : 12; - uint32_t src2_c : 1; /* const */ - uint32_t dummy : 3; - } c2; - }; 
- - /* dword1: */ - uint32_t dst : 8; - uint32_t repeat : 3; - uint32_t src1_r : 1; - uint32_t ss : 1; - uint32_t ul : 1; /* dunno */ - uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */ - uint32_t ei : 1; - uint32_t cond : 3; - uint32_t src2_r : 1; - uint32_t full : 1; /* not half */ - uint32_t opc : 6; - uint32_t jmp_tgt : 1; - uint32_t sync : 1; - uint32_t opc_cat : 3; -} instr_cat2_t; - -typedef struct PACKED { - /* dword0: */ - union PACKED { - struct PACKED { - uint32_t src1 : 11; - uint32_t must_be_zero1: 2; - uint32_t src2_c : 1; - uint32_t src1_neg : 1; - uint32_t src2_r : 1; - }; - struct PACKED { - uint32_t src1 : 10; - uint32_t src1_c : 1; - uint32_t src1_rel : 1; - uint32_t must_be_zero : 1; - uint32_t dummy : 3; - } rel1; - struct PACKED { - uint32_t src1 : 12; - uint32_t src1_c : 1; - uint32_t dummy : 3; - } c1; - }; - - union PACKED { - struct PACKED { - uint32_t src3 : 11; - uint32_t must_be_zero2: 2; - uint32_t src3_r : 1; - uint32_t src2_neg : 1; - uint32_t src3_neg : 1; - }; - struct PACKED { - uint32_t src3 : 10; - uint32_t src3_c : 1; - uint32_t src3_rel : 1; - uint32_t must_be_zero : 1; - uint32_t dummy : 3; - } rel2; - struct PACKED { - uint32_t src3 : 12; - uint32_t src3_c : 1; - uint32_t dummy : 3; - } c2; - }; - - /* dword1: */ - uint32_t dst : 8; - uint32_t repeat : 3; - uint32_t src1_r : 1; - uint32_t ss : 1; - uint32_t ul : 1; - uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */ - uint32_t src2 : 8; - uint32_t opc : 4; - uint32_t jmp_tgt : 1; - uint32_t sync : 1; - uint32_t opc_cat : 3; -} instr_cat3_t; - -static inline bool instr_cat3_full(instr_cat3_t *cat3) -{ - switch (cat3->opc) { - case OPC_MAD_F16: - case OPC_MAD_U16: - case OPC_MAD_S16: - case OPC_SEL_B16: - case OPC_SEL_S16: - case OPC_SEL_F16: - case OPC_SAD_S16: - case OPC_SAD_S32: // really?? - return false; - default: - return true; - } -} - -typedef struct PACKED { - /* dword0: */ - union PACKED { - struct PACKED { - uint32_t src : 11; - uint32_t must_be_zero1: 2; - uint32_t src_im : 1; /* immediate */ - uint32_t src_neg : 1; /* negate */ - uint32_t src_abs : 1; /* absolute value */ - }; - struct PACKED { - uint32_t src : 10; - uint32_t src_c : 1; /* relative-const */ - uint32_t src_rel : 1; /* relative address */ - uint32_t must_be_zero : 1; - uint32_t dummy : 3; - } rel; - struct PACKED { - uint32_t src : 12; - uint32_t src_c : 1; /* const */ - uint32_t dummy : 3; - } c; - }; - uint32_t dummy1 : 16; /* seem to be ignored */ - - /* dword1: */ - uint32_t dst : 8; - uint32_t repeat : 3; - uint32_t src_r : 1; - uint32_t ss : 1; - uint32_t ul : 1; - uint32_t dst_half : 1; /* or widen/narrow.. ie. 
dst hrN <-> rN */ - uint32_t dummy2 : 5; /* seem to be ignored */ - uint32_t full : 1; /* not half */ - uint32_t opc : 6; - uint32_t jmp_tgt : 1; - uint32_t sync : 1; - uint32_t opc_cat : 3; -} instr_cat4_t; - -typedef struct PACKED { - /* dword0: */ - union PACKED { - /* normal case: */ - struct PACKED { - uint32_t full : 1; /* not half */ - uint32_t src1 : 8; - uint32_t src2 : 8; - uint32_t dummy1 : 4; /* seem to be ignored */ - uint32_t samp : 4; - uint32_t tex : 7; - } norm; - /* s2en case: */ - struct PACKED { - uint32_t full : 1; /* not half */ - uint32_t src1 : 8; - uint32_t src2 : 11; - uint32_t dummy1 : 1; - uint32_t src3 : 8; - uint32_t dummy2 : 3; - } s2en; - /* same in either case: */ - // XXX I think, confirm this - struct PACKED { - uint32_t full : 1; /* not half */ - uint32_t src1 : 8; - uint32_t pad : 23; - }; - }; - - /* dword1: */ - uint32_t dst : 8; - uint32_t wrmask : 4; /* write-mask */ - uint32_t type : 3; - uint32_t dummy2 : 1; /* seems to be ignored */ - uint32_t is_3d : 1; - - uint32_t is_a : 1; - uint32_t is_s : 1; - uint32_t is_s2en : 1; - uint32_t is_o : 1; - uint32_t is_p : 1; - - uint32_t opc : 5; - uint32_t jmp_tgt : 1; - uint32_t sync : 1; - uint32_t opc_cat : 3; -} instr_cat5_t; - -/* used for load instructions: */ -typedef struct PACKED { - /* dword0: */ - uint32_t must_be_one1 : 1; - int16_t off : 13; - uint32_t src : 8; - uint32_t dummy1 : 1; - uint32_t must_be_one2 : 1; - int32_t iim_val : 8; - - /* dword1: */ - uint32_t dst : 8; - uint32_t dummy2 : 9; - uint32_t type : 3; - uint32_t dummy3 : 2; - uint32_t opc : 5; - uint32_t jmp_tgt : 1; - uint32_t sync : 1; - uint32_t opc_cat : 3; -} instr_cat6a_t; - -/* used for store instructions: */ -typedef struct PACKED { - /* dword0: */ - uint32_t must_be_zero1 : 1; - uint32_t src : 8; - uint32_t off_hi : 5; /* high bits of 'off'... ugly! */ - uint32_t dummy1 : 9; - uint32_t must_be_one1 : 1; - int32_t iim_val : 8; - - /* dword1: */ - uint16_t off : 8; - uint32_t must_be_one2 : 1; - uint32_t dst : 8; - uint32_t type : 3; - uint32_t dummy2 : 2; - uint32_t opc : 5; - uint32_t jmp_tgt : 1; - uint32_t sync : 1; - uint32_t opc_cat : 3; -} instr_cat6b_t; - -typedef union PACKED { - instr_cat6a_t a; - instr_cat6b_t b; - struct PACKED { - /* dword0: */ - uint32_t pad1 : 24; - int32_t iim_val : 8; - - /* dword1: */ - uint32_t pad2 : 17; - uint32_t type : 3; - uint32_t pad3 : 2; - uint32_t opc : 5; - uint32_t jmp_tgt : 1; - uint32_t sync : 1; - uint32_t opc_cat : 3; - }; -} instr_cat6_t; - -typedef union PACKED { - instr_cat0_t cat0; - instr_cat1_t cat1; - instr_cat2_t cat2; - instr_cat3_t cat3; - instr_cat4_t cat4; - instr_cat5_t cat5; - instr_cat6_t cat6; - struct PACKED { - /* dword0: */ - uint64_t pad1 : 40; - uint32_t repeat : 3; /* cat0-cat4 */ - uint32_t pad2 : 1; - uint32_t ss : 1; /* cat1-cat4 (cat0??) */ - uint32_t ul : 1; /* cat2-cat4 (and cat1 in blob.. which may be bug??) 
*/ - uint32_t pad3 : 13; - uint32_t jmp_tgt : 1; - uint32_t sync : 1; - uint32_t opc_cat : 3; - - }; -} instr_t; - -static inline uint32_t instr_opc(instr_t *instr) -{ - switch (instr->opc_cat) { - case 0: return instr->cat0.opc; - case 1: return 0; - case 2: return instr->cat2.opc; - case 3: return instr->cat3.opc; - case 4: return instr->cat4.opc; - case 5: return instr->cat5.opc; - case 6: return instr->cat6.opc; - default: return 0; - } -} - -static inline bool is_mad(opc_t opc) -{ - switch (opc) { - case OPC_MAD_U16: - case OPC_MADSH_U16: - case OPC_MAD_S16: - case OPC_MADSH_M16: - case OPC_MAD_U24: - case OPC_MAD_S24: - case OPC_MAD_F16: - case OPC_MAD_F32: - return true; - default: - return false; - } -} - -#endif /* INSTR_A3XX_H_ */ diff --git a/src/gallium/drivers/freedreno/a3xx/ir3.c b/src/gallium/drivers/freedreno/a3xx/ir3.c deleted file mode 100644 index ea2a9251b28..00000000000 --- a/src/gallium/drivers/freedreno/a3xx/ir3.c +++ /dev/null @@ -1,675 +0,0 @@ -/* - * Copyright (c) 2012 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "ir3.h" - -#include -#include -#include -#include -#include -#include - -#include "freedreno_util.h" -#include "instr-a3xx.h" - -#define CHUNK_SZ 1020 - -struct ir3_heap_chunk { - struct ir3_heap_chunk *next; - uint32_t heap[CHUNK_SZ]; -}; - -static void grow_heap(struct ir3 *shader) -{ - struct ir3_heap_chunk *chunk = calloc(1, sizeof(*chunk)); - chunk->next = shader->chunk; - shader->chunk = chunk; - shader->heap_idx = 0; -} - -/* simple allocator to carve allocations out of an up-front allocated heap, - * so that we can free everything easily in one shot. 
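grow_heap() above and ir3_alloc() just below implement a chunked arena: allocations are carved out of the current chunk, a new chunk is prepended when it fills up, and ir3_destroy() frees the whole chain at once. A generic standalone version of the same pattern, with chunk size and alignment picked arbitrarily here:

#include <stdlib.h>

#define ARENA_CHUNK_SZ 4096

struct arena_chunk { struct arena_chunk *next; char mem[ARENA_CHUNK_SZ]; };
struct arena       { struct arena_chunk *chunk; size_t used; };

static void *arena_alloc(struct arena *a, size_t sz)
{
	sz = (sz + 7) & ~(size_t)7;    /* keep returned pointers aligned */
	if (sz > ARENA_CHUNK_SZ)       /* assumes small allocations, like ir3_alloc */
		return NULL;

	if (!a->chunk || a->used + sz > ARENA_CHUNK_SZ) {
		struct arena_chunk *c = calloc(1, sizeof(*c));
		if (!c)
			return NULL;
		c->next = a->chunk;    /* prepend a fresh chunk */
		a->chunk = c;
		a->used = 0;
	}

	void *ptr = a->chunk->mem + a->used;
	a->used += sz;
	return ptr;
}

static void arena_destroy(struct arena *a)
{
	while (a->chunk) {             /* free everything in one shot */
		struct arena_chunk *c = a->chunk;
		a->chunk = c->next;
		free(c);
	}
}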
- */ -void * ir3_alloc(struct ir3 *shader, int sz) -{ - void *ptr; - - sz = align(sz, 4) / 4; - - if ((shader->heap_idx + sz) > CHUNK_SZ) - grow_heap(shader); - - ptr = &shader->chunk->heap[shader->heap_idx]; - shader->heap_idx += sz; - - return ptr; -} - -struct ir3 * ir3_create(void) -{ - struct ir3 *shader = - calloc(1, sizeof(struct ir3)); - grow_heap(shader); - return shader; -} - -void ir3_destroy(struct ir3 *shader) -{ - while (shader->chunk) { - struct ir3_heap_chunk *chunk = shader->chunk; - shader->chunk = chunk->next; - free(chunk); - } - free(shader); -} - -#define iassert(cond) do { \ - if (!(cond)) { \ - assert(cond); \ - return -1; \ - } } while (0) - -static uint32_t reg(struct ir3_register *reg, struct ir3_info *info, - uint32_t repeat, uint32_t valid_flags) -{ - reg_t val = { .dummy32 = 0 }; - - assert(!(reg->flags & ~valid_flags)); - - if (!(reg->flags & IR3_REG_R)) - repeat = 0; - - if (reg->flags & IR3_REG_IMMED) { - val.iim_val = reg->iim_val; - } else { - int8_t components = util_last_bit(reg->wrmask); - int8_t max = (reg->num + repeat + components - 1) >> 2; - - val.comp = reg->num & 0x3; - val.num = reg->num >> 2; - - if (reg->flags & IR3_REG_CONST) { - info->max_const = MAX2(info->max_const, max); - } else if ((max != REG_A0) && (max != REG_P0)) { - if (reg->flags & IR3_REG_HALF) { - info->max_half_reg = MAX2(info->max_half_reg, max); - } else { - info->max_reg = MAX2(info->max_reg, max); - } - } - } - - return val.dummy32; -} - -static int emit_cat0(struct ir3_instruction *instr, void *ptr, - struct ir3_info *info) -{ - instr_cat0_t *cat0 = ptr; - - cat0->immed = instr->cat0.immed; - cat0->repeat = instr->repeat; - cat0->ss = !!(instr->flags & IR3_INSTR_SS); - cat0->inv = instr->cat0.inv; - cat0->comp = instr->cat0.comp; - cat0->opc = instr->opc; - cat0->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); - cat0->sync = !!(instr->flags & IR3_INSTR_SY); - cat0->opc_cat = 0; - - return 0; -} - -static uint32_t type_flags(type_t type) -{ - return (type_size(type) == 32) ? 
0 : IR3_REG_HALF; -} - -static int emit_cat1(struct ir3_instruction *instr, void *ptr, - struct ir3_info *info) -{ - struct ir3_register *dst = instr->regs[0]; - struct ir3_register *src = instr->regs[1]; - instr_cat1_t *cat1 = ptr; - - iassert(instr->regs_count == 2); - iassert(!((dst->flags ^ type_flags(instr->cat1.dst_type)) & IR3_REG_HALF)); - iassert((src->flags & IR3_REG_IMMED) || - !((src->flags ^ type_flags(instr->cat1.src_type)) & IR3_REG_HALF)); - - if (src->flags & IR3_REG_IMMED) { - cat1->iim_val = src->iim_val; - cat1->src_im = 1; - } else if (src->flags & IR3_REG_RELATIV) { - cat1->off = src->offset; - cat1->src_rel = 1; - cat1->src_rel_c = !!(src->flags & IR3_REG_CONST); - } else { - cat1->src = reg(src, info, instr->repeat, - IR3_REG_IMMED | IR3_REG_R | - IR3_REG_CONST | IR3_REG_HALF); - cat1->src_c = !!(src->flags & IR3_REG_CONST); - } - - cat1->dst = reg(dst, info, instr->repeat, - IR3_REG_RELATIV | IR3_REG_EVEN | - IR3_REG_R | IR3_REG_POS_INF | IR3_REG_HALF); - cat1->repeat = instr->repeat; - cat1->src_r = !!(src->flags & IR3_REG_R); - cat1->ss = !!(instr->flags & IR3_INSTR_SS); - cat1->ul = !!(instr->flags & IR3_INSTR_UL); - cat1->dst_type = instr->cat1.dst_type; - cat1->dst_rel = !!(dst->flags & IR3_REG_RELATIV); - cat1->src_type = instr->cat1.src_type; - cat1->even = !!(dst->flags & IR3_REG_EVEN); - cat1->pos_inf = !!(dst->flags & IR3_REG_POS_INF); - cat1->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); - cat1->sync = !!(instr->flags & IR3_INSTR_SY); - cat1->opc_cat = 1; - - return 0; -} - -static int emit_cat2(struct ir3_instruction *instr, void *ptr, - struct ir3_info *info) -{ - struct ir3_register *dst = instr->regs[0]; - struct ir3_register *src1 = instr->regs[1]; - struct ir3_register *src2 = instr->regs[2]; - instr_cat2_t *cat2 = ptr; - - iassert((instr->regs_count == 2) || (instr->regs_count == 3)); - - if (src1->flags & IR3_REG_RELATIV) { - iassert(src1->num < (1 << 10)); - cat2->rel1.src1 = reg(src1, info, instr->repeat, - IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE | - IR3_REG_ABS | IR3_REG_R | IR3_REG_HALF); - cat2->rel1.src1_c = !!(src1->flags & IR3_REG_CONST); - cat2->rel1.src1_rel = 1; - } else if (src1->flags & IR3_REG_CONST) { - iassert(src1->num < (1 << 12)); - cat2->c1.src1 = reg(src1, info, instr->repeat, - IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_ABS | - IR3_REG_R | IR3_REG_HALF); - cat2->c1.src1_c = 1; - } else { - iassert(src1->num < (1 << 11)); - cat2->src1 = reg(src1, info, instr->repeat, - IR3_REG_IMMED | IR3_REG_NEGATE | IR3_REG_ABS | - IR3_REG_R | IR3_REG_HALF); - } - cat2->src1_im = !!(src1->flags & IR3_REG_IMMED); - cat2->src1_neg = !!(src1->flags & IR3_REG_NEGATE); - cat2->src1_abs = !!(src1->flags & IR3_REG_ABS); - cat2->src1_r = !!(src1->flags & IR3_REG_R); - - if (src2) { - iassert((src2->flags & IR3_REG_IMMED) || - !((src1->flags ^ src2->flags) & IR3_REG_HALF)); - - if (src2->flags & IR3_REG_RELATIV) { - iassert(src2->num < (1 << 10)); - cat2->rel2.src2 = reg(src2, info, instr->repeat, - IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE | - IR3_REG_ABS | IR3_REG_R | IR3_REG_HALF); - cat2->rel2.src2_c = !!(src2->flags & IR3_REG_CONST); - cat2->rel2.src2_rel = 1; - } else if (src2->flags & IR3_REG_CONST) { - iassert(src2->num < (1 << 12)); - cat2->c2.src2 = reg(src2, info, instr->repeat, - IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_ABS | - IR3_REG_R | IR3_REG_HALF); - cat2->c2.src2_c = 1; - } else { - iassert(src2->num < (1 << 11)); - cat2->src2 = reg(src2, info, instr->repeat, - IR3_REG_IMMED | IR3_REG_NEGATE | IR3_REG_ABS | - IR3_REG_R 
| IR3_REG_HALF); - } - - cat2->src2_im = !!(src2->flags & IR3_REG_IMMED); - cat2->src2_neg = !!(src2->flags & IR3_REG_NEGATE); - cat2->src2_abs = !!(src2->flags & IR3_REG_ABS); - cat2->src2_r = !!(src2->flags & IR3_REG_R); - } - - cat2->dst = reg(dst, info, instr->repeat, - IR3_REG_R | IR3_REG_EI | IR3_REG_HALF); - cat2->repeat = instr->repeat; - cat2->ss = !!(instr->flags & IR3_INSTR_SS); - cat2->ul = !!(instr->flags & IR3_INSTR_UL); - cat2->dst_half = !!((src1->flags ^ dst->flags) & IR3_REG_HALF); - cat2->ei = !!(dst->flags & IR3_REG_EI); - cat2->cond = instr->cat2.condition; - cat2->full = ! (src1->flags & IR3_REG_HALF); - cat2->opc = instr->opc; - cat2->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); - cat2->sync = !!(instr->flags & IR3_INSTR_SY); - cat2->opc_cat = 2; - - return 0; -} - -static int emit_cat3(struct ir3_instruction *instr, void *ptr, - struct ir3_info *info) -{ - struct ir3_register *dst = instr->regs[0]; - struct ir3_register *src1 = instr->regs[1]; - struct ir3_register *src2 = instr->regs[2]; - struct ir3_register *src3 = instr->regs[3]; - instr_cat3_t *cat3 = ptr; - uint32_t src_flags = 0; - - switch (instr->opc) { - case OPC_MAD_F16: - case OPC_MAD_U16: - case OPC_MAD_S16: - case OPC_SEL_B16: - case OPC_SEL_S16: - case OPC_SEL_F16: - case OPC_SAD_S16: - case OPC_SAD_S32: // really?? - src_flags |= IR3_REG_HALF; - break; - default: - break; - } - - iassert(instr->regs_count == 4); - iassert(!((src1->flags ^ src_flags) & IR3_REG_HALF)); - iassert(!((src2->flags ^ src_flags) & IR3_REG_HALF)); - iassert(!((src3->flags ^ src_flags) & IR3_REG_HALF)); - - if (src1->flags & IR3_REG_RELATIV) { - iassert(src1->num < (1 << 10)); - cat3->rel1.src1 = reg(src1, info, instr->repeat, - IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE | - IR3_REG_R | IR3_REG_HALF); - cat3->rel1.src1_c = !!(src1->flags & IR3_REG_CONST); - cat3->rel1.src1_rel = 1; - } else if (src1->flags & IR3_REG_CONST) { - iassert(src1->num < (1 << 12)); - cat3->c1.src1 = reg(src1, info, instr->repeat, - IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_R | - IR3_REG_HALF); - cat3->c1.src1_c = 1; - } else { - iassert(src1->num < (1 << 11)); - cat3->src1 = reg(src1, info, instr->repeat, - IR3_REG_NEGATE | IR3_REG_R | IR3_REG_HALF); - } - - cat3->src1_neg = !!(src1->flags & IR3_REG_NEGATE); - cat3->src1_r = !!(src1->flags & IR3_REG_R); - - cat3->src2 = reg(src2, info, instr->repeat, - IR3_REG_CONST | IR3_REG_NEGATE | - IR3_REG_R | IR3_REG_HALF); - cat3->src2_c = !!(src2->flags & IR3_REG_CONST); - cat3->src2_neg = !!(src2->flags & IR3_REG_NEGATE); - cat3->src2_r = !!(src2->flags & IR3_REG_R); - - - if (src3->flags & IR3_REG_RELATIV) { - iassert(src3->num < (1 << 10)); - cat3->rel2.src3 = reg(src3, info, instr->repeat, - IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE | - IR3_REG_R | IR3_REG_HALF); - cat3->rel2.src3_c = !!(src3->flags & IR3_REG_CONST); - cat3->rel2.src3_rel = 1; - } else if (src3->flags & IR3_REG_CONST) { - iassert(src3->num < (1 << 12)); - cat3->c2.src3 = reg(src3, info, instr->repeat, - IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_R | - IR3_REG_HALF); - cat3->c2.src3_c = 1; - } else { - iassert(src3->num < (1 << 11)); - cat3->src3 = reg(src3, info, instr->repeat, - IR3_REG_NEGATE | IR3_REG_R | IR3_REG_HALF); - } - - cat3->src3_neg = !!(src3->flags & IR3_REG_NEGATE); - cat3->src3_r = !!(src3->flags & IR3_REG_R); - - cat3->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); - cat3->repeat = instr->repeat; - cat3->ss = !!(instr->flags & IR3_INSTR_SS); - cat3->ul = !!(instr->flags & IR3_INSTR_UL); - 
cat3->dst_half = !!((src_flags ^ dst->flags) & IR3_REG_HALF); - cat3->opc = instr->opc; - cat3->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); - cat3->sync = !!(instr->flags & IR3_INSTR_SY); - cat3->opc_cat = 3; - - return 0; -} - -static int emit_cat4(struct ir3_instruction *instr, void *ptr, - struct ir3_info *info) -{ - struct ir3_register *dst = instr->regs[0]; - struct ir3_register *src = instr->regs[1]; - instr_cat4_t *cat4 = ptr; - - iassert(instr->regs_count == 2); - - if (src->flags & IR3_REG_RELATIV) { - iassert(src->num < (1 << 10)); - cat4->rel.src = reg(src, info, instr->repeat, - IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE | - IR3_REG_ABS | IR3_REG_R | IR3_REG_HALF); - cat4->rel.src_c = !!(src->flags & IR3_REG_CONST); - cat4->rel.src_rel = 1; - } else if (src->flags & IR3_REG_CONST) { - iassert(src->num < (1 << 12)); - cat4->c.src = reg(src, info, instr->repeat, - IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_ABS | - IR3_REG_R | IR3_REG_HALF); - cat4->c.src_c = 1; - } else { - iassert(src->num < (1 << 11)); - cat4->src = reg(src, info, instr->repeat, - IR3_REG_IMMED | IR3_REG_NEGATE | IR3_REG_ABS | - IR3_REG_R | IR3_REG_HALF); - } - - cat4->src_im = !!(src->flags & IR3_REG_IMMED); - cat4->src_neg = !!(src->flags & IR3_REG_NEGATE); - cat4->src_abs = !!(src->flags & IR3_REG_ABS); - cat4->src_r = !!(src->flags & IR3_REG_R); - - cat4->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); - cat4->repeat = instr->repeat; - cat4->ss = !!(instr->flags & IR3_INSTR_SS); - cat4->ul = !!(instr->flags & IR3_INSTR_UL); - cat4->dst_half = !!((src->flags ^ dst->flags) & IR3_REG_HALF); - cat4->full = ! (src->flags & IR3_REG_HALF); - cat4->opc = instr->opc; - cat4->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); - cat4->sync = !!(instr->flags & IR3_INSTR_SY); - cat4->opc_cat = 4; - - return 0; -} - -static int emit_cat5(struct ir3_instruction *instr, void *ptr, - struct ir3_info *info) -{ - struct ir3_register *dst = instr->regs[0]; - struct ir3_register *src1 = instr->regs[1]; - struct ir3_register *src2 = instr->regs[2]; - struct ir3_register *src3 = instr->regs[3]; - instr_cat5_t *cat5 = ptr; - - iassert(!((dst->flags ^ type_flags(instr->cat5.type)) & IR3_REG_HALF)); - - if (src1) { - cat5->full = ! 
(src1->flags & IR3_REG_HALF); - cat5->src1 = reg(src1, info, instr->repeat, IR3_REG_HALF); - } - - - if (instr->flags & IR3_INSTR_S2EN) { - if (src2) { - iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF)); - cat5->s2en.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF); - } - if (src3) { - iassert(src3->flags & IR3_REG_HALF); - cat5->s2en.src3 = reg(src3, info, instr->repeat, IR3_REG_HALF); - } - iassert(!(instr->cat5.samp | instr->cat5.tex)); - } else { - iassert(!src3); - if (src2) { - iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF)); - cat5->norm.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF); - } - cat5->norm.samp = instr->cat5.samp; - cat5->norm.tex = instr->cat5.tex; - } - - cat5->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); - cat5->wrmask = dst->wrmask; - cat5->type = instr->cat5.type; - cat5->is_3d = !!(instr->flags & IR3_INSTR_3D); - cat5->is_a = !!(instr->flags & IR3_INSTR_A); - cat5->is_s = !!(instr->flags & IR3_INSTR_S); - cat5->is_s2en = !!(instr->flags & IR3_INSTR_S2EN); - cat5->is_o = !!(instr->flags & IR3_INSTR_O); - cat5->is_p = !!(instr->flags & IR3_INSTR_P); - cat5->opc = instr->opc; - cat5->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); - cat5->sync = !!(instr->flags & IR3_INSTR_SY); - cat5->opc_cat = 5; - - return 0; -} - -static int emit_cat6(struct ir3_instruction *instr, void *ptr, - struct ir3_info *info) -{ - struct ir3_register *dst = instr->regs[0]; - struct ir3_register *src = instr->regs[1]; - instr_cat6_t *cat6 = ptr; - - iassert(instr->regs_count == 2); - - switch (instr->opc) { - /* load instructions: */ - case OPC_LDG: - case OPC_LDP: - case OPC_LDL: - case OPC_LDLW: - case OPC_LDLV: - case OPC_PREFETCH: { - instr_cat6a_t *cat6a = ptr; - - iassert(!((dst->flags ^ type_flags(instr->cat6.type)) & IR3_REG_HALF)); - - cat6a->must_be_one1 = 1; - cat6a->must_be_one2 = 1; - cat6a->off = instr->cat6.offset; - cat6a->src = reg(src, info, instr->repeat, 0); - cat6a->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); - break; - } - /* store instructions: */ - case OPC_STG: - case OPC_STP: - case OPC_STL: - case OPC_STLW: - case OPC_STI: { - instr_cat6b_t *cat6b = ptr; - uint32_t src_flags = type_flags(instr->cat6.type); - uint32_t dst_flags = (instr->opc == OPC_STI) ? 
IR3_REG_HALF : 0; - - iassert(!((src->flags ^ src_flags) & IR3_REG_HALF)); - - cat6b->must_be_one1 = 1; - cat6b->must_be_one2 = 1; - cat6b->src = reg(src, info, instr->repeat, src_flags); - cat6b->off_hi = instr->cat6.offset >> 8; - cat6b->off = instr->cat6.offset; - cat6b->dst = reg(dst, info, instr->repeat, IR3_REG_R | dst_flags); - - break; - } - default: - // TODO - break; - } - - cat6->iim_val = instr->cat6.iim_val; - cat6->type = instr->cat6.type; - cat6->opc = instr->opc; - cat6->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); - cat6->sync = !!(instr->flags & IR3_INSTR_SY); - cat6->opc_cat = 6; - - return 0; -} - -static int (*emit[])(struct ir3_instruction *instr, void *ptr, - struct ir3_info *info) = { - emit_cat0, emit_cat1, emit_cat2, emit_cat3, emit_cat4, emit_cat5, emit_cat6, -}; - -void * ir3_assemble(struct ir3 *shader, struct ir3_info *info) -{ - uint32_t *ptr, *dwords; - uint32_t i; - - info->max_reg = -1; - info->max_half_reg = -1; - info->max_const = -1; - info->instrs_count = 0; - - /* need a integer number of instruction "groups" (sets of four - * instructions), so pad out w/ NOPs if needed: - * (each instruction is 64bits) - */ - info->sizedwords = 2 * align(shader->instrs_count, 4); - - ptr = dwords = calloc(1, 4 * info->sizedwords); - - for (i = 0; i < shader->instrs_count; i++) { - struct ir3_instruction *instr = shader->instrs[i]; - int ret = emit[instr->category](instr, dwords, info); - if (ret) - goto fail; - info->instrs_count += 1 + instr->repeat; - dwords += 2; - } - - return ptr; - -fail: - free(ptr); - return NULL; -} - -static struct ir3_register * reg_create(struct ir3 *shader, - int num, int flags) -{ - struct ir3_register *reg = - ir3_alloc(shader, sizeof(struct ir3_register)); - reg->wrmask = 1; - reg->flags = flags; - reg->num = num; - return reg; -} - -static void insert_instr(struct ir3 *shader, - struct ir3_instruction *instr) -{ -#ifdef DEBUG - static uint32_t serialno = 0; - instr->serialno = ++serialno; -#endif - if (shader->instrs_count == shader->instrs_sz) { - shader->instrs_sz = MAX2(2 * shader->instrs_sz, 16); - shader->instrs = realloc(shader->instrs, - shader->instrs_sz * sizeof(shader->instrs[0])); - } - shader->instrs[shader->instrs_count++] = instr; -} - -struct ir3_block * ir3_block_create(struct ir3 *shader, - unsigned ntmp, unsigned nin, unsigned nout) -{ - struct ir3_block *block; - unsigned size; - char *ptr; - - size = sizeof(*block); - size += sizeof(block->temporaries[0]) * ntmp; - size += sizeof(block->inputs[0]) * nin; - size += sizeof(block->outputs[0]) * nout; - - ptr = ir3_alloc(shader, size); - - block = (void *)ptr; - ptr += sizeof(*block); - - block->temporaries = (void *)ptr; - block->ntemporaries = ntmp; - ptr += sizeof(block->temporaries[0]) * ntmp; - - block->inputs = (void *)ptr; - block->ninputs = nin; - ptr += sizeof(block->inputs[0]) * nin; - - block->outputs = (void *)ptr; - block->noutputs = nout; - ptr += sizeof(block->outputs[0]) * nout; - - block->shader = shader; - - return block; -} - -struct ir3_instruction * ir3_instr_create(struct ir3_block *block, - int category, opc_t opc) -{ - struct ir3_instruction *instr = - ir3_alloc(block->shader, sizeof(struct ir3_instruction)); - instr->block = block; - instr->category = category; - instr->opc = opc; - insert_instr(block->shader, instr); - return instr; -} - -struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr) -{ - struct ir3_instruction *new_instr = - ir3_alloc(instr->block->shader, sizeof(struct ir3_instruction)); - unsigned i; - - *new_instr 
= *instr; - insert_instr(instr->block->shader, new_instr); - - /* clone registers: */ - new_instr->regs_count = 0; - for (i = 0; i < instr->regs_count; i++) { - struct ir3_register *reg = instr->regs[i]; - struct ir3_register *new_reg = - ir3_reg_create(new_instr, reg->num, reg->flags); - *new_reg = *reg; - } - - return new_instr; -} - -struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, - int num, int flags) -{ - struct ir3_register *reg = reg_create(instr->block->shader, num, flags); - assert(instr->regs_count < ARRAY_SIZE(instr->regs)); - instr->regs[instr->regs_count++] = reg; - return reg; -} diff --git a/src/gallium/drivers/freedreno/a3xx/ir3.h b/src/gallium/drivers/freedreno/a3xx/ir3.h deleted file mode 100644 index 9ed914ba2e4..00000000000 --- a/src/gallium/drivers/freedreno/a3xx/ir3.h +++ /dev/null @@ -1,480 +0,0 @@ -/* - * Copyright (c) 2013 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef IR3_H_ -#define IR3_H_ - -#include -#include - -#include "instr-a3xx.h" -#include "disasm.h" /* TODO move 'enum shader_t' somewhere else.. */ - -/* low level intermediate representation of an adreno shader program */ - -struct ir3; -struct ir3_instruction; -struct ir3_block; - -struct ir3 * fd_asm_parse(const char *src); - -struct ir3_info { - uint16_t sizedwords; - uint16_t instrs_count; /* expanded to account for rpt's */ - /* NOTE: max_reg, etc, does not include registers not touched - * by the shader (ie. vertex fetched via VFD_DECODE but not - * touched by shader) - */ - int8_t max_reg; /* highest GPR # used by shader */ - int8_t max_half_reg; - int8_t max_const; -}; - -struct ir3_register { - enum { - IR3_REG_CONST = 0x001, - IR3_REG_IMMED = 0x002, - IR3_REG_HALF = 0x004, - IR3_REG_RELATIV= 0x008, - IR3_REG_R = 0x010, - IR3_REG_NEGATE = 0x020, - IR3_REG_ABS = 0x040, - IR3_REG_EVEN = 0x080, - IR3_REG_POS_INF= 0x100, - /* (ei) flag, end-input? Set on last bary, presumably to signal - * that the shader needs no more input: - */ - IR3_REG_EI = 0x200, - /* meta-flags, for intermediate stages of IR, ie. 
- * before register assignment is done: - */ - IR3_REG_SSA = 0x1000, /* 'instr' is ptr to assigning instr */ - IR3_REG_IA = 0x2000, /* meta-input dst is "assigned" */ - IR3_REG_ADDR = 0x4000, /* register is a0.x */ - } flags; - union { - /* normal registers: - * the component is in the low two bits of the reg #, so - * rN.x becomes: (N << 2) | x - */ - int num; - /* immediate: */ - int iim_val; - float fim_val; - /* relative: */ - int offset; - /* for IR3_REG_SSA, src registers contain ptr back to - * assigning instruction. - */ - struct ir3_instruction *instr; - }; - - /* used for cat5 instructions, but also for internal/IR level - * tracking of what registers are read/written by an instruction. - * wrmask may be a bad name since it is used to represent both - * src and dst that touch multiple adjacent registers. - */ - int wrmask; -}; - -struct ir3_instruction { - struct ir3_block *block; - int category; - opc_t opc; - enum { - /* (sy) flag is set on first instruction, and after sample - * instructions (probably just on RAW hazard). - */ - IR3_INSTR_SY = 0x001, - /* (ss) flag is set on first instruction, and first instruction - * to depend on the result of "long" instructions (RAW hazard): - * - * rcp, rsq, log2, exp2, sin, cos, sqrt - * - * It seems to synchronize until all in-flight instructions are - * completed, for example: - * - * rsq hr1.w, hr1.w - * add.f hr2.z, (neg)hr2.z, hc0.y - * mul.f hr2.w, (neg)hr2.y, (neg)hr2.y - * rsq hr2.x, hr2.x - * (rpt1)nop - * mad.f16 hr2.w, hr2.z, hr2.z, hr2.w - * nop - * mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w - * (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w - * (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x - * - * The last mul.f does not have (ss) set, presumably because the - * (ss) on the previous instruction does the job. - * - * The blob driver also seems to set it on WAR hazards, although - * not really clear if this is needed or just blob compiler being - * sloppy. So far I haven't found a case where removing the (ss) - * causes problems for WAR hazard, but I could just be getting - * lucky: - * - * rcp r1.y, r3.y - * (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z - * - */ - IR3_INSTR_SS = 0x002, - /* (jp) flag is set on jump targets: - */ - IR3_INSTR_JP = 0x004, - IR3_INSTR_UL = 0x008, - IR3_INSTR_3D = 0x010, - IR3_INSTR_A = 0x020, - IR3_INSTR_O = 0x040, - IR3_INSTR_P = 0x080, - IR3_INSTR_S = 0x100, - IR3_INSTR_S2EN = 0x200, - /* meta-flags, for intermediate stages of IR, ie. - * before register assignment is done: - */ - IR3_INSTR_MARK = 0x1000, - } flags; - int repeat; - unsigned regs_count; - struct ir3_register *regs[5]; - union { - struct { - char inv; - char comp; - int immed; - } cat0; - struct { - type_t src_type, dst_type; - } cat1; - struct { - enum { - IR3_COND_LT = 0, - IR3_COND_LE = 1, - IR3_COND_GT = 2, - IR3_COND_GE = 3, - IR3_COND_EQ = 4, - IR3_COND_NE = 5, - } condition; - } cat2; - struct { - unsigned samp, tex; - type_t type; - } cat5; - struct { - type_t type; - int offset; - int iim_val; - } cat6; - /* for meta-instructions, just used to hold extra data - * before instruction scheduling, etc - */ - struct { - int off; /* component/offset */ - } fo; - struct { - struct ir3_block *if_block, *else_block; - } flow; - struct { - struct ir3_block *block; - } inout; - }; - - /* transient values used during various algorithms: */ - union { - /* The instruction depth is the max dependency distance to output. - * - * You can also think of it as the "cost", if we did any sort of - * optimization for register footprint. Ie. 
a value that is just - * result of moving a const to a reg would have a low cost, so to - * it could make sense to duplicate the instruction at various - * points where the result is needed to reduce register footprint. - */ - unsigned depth; - }; - struct ir3_instruction *next; -#ifdef DEBUG - uint32_t serialno; -#endif -}; - -struct ir3_heap_chunk; - -struct ir3 { - unsigned instrs_count, instrs_sz; - struct ir3_instruction **instrs; - unsigned heap_idx; - struct ir3_heap_chunk *chunk; -}; - -struct ir3_block { - struct ir3 *shader; - unsigned ntemporaries, ninputs, noutputs; - /* maps TGSI_FILE_TEMPORARY index back to the assigning instruction: */ - struct ir3_instruction **temporaries; - struct ir3_instruction **inputs; - struct ir3_instruction **outputs; - /* only a single address register: */ - struct ir3_instruction *address; - struct ir3_block *parent; - struct ir3_instruction *head; -}; - -struct ir3 * ir3_create(void); -void ir3_destroy(struct ir3 *shader); -void * ir3_assemble(struct ir3 *shader, - struct ir3_info *info); -void * ir3_alloc(struct ir3 *shader, int sz); - -struct ir3_block * ir3_block_create(struct ir3 *shader, - unsigned ntmp, unsigned nin, unsigned nout); - -struct ir3_instruction * ir3_instr_create(struct ir3_block *block, - int category, opc_t opc); -struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr); -const char *ir3_instr_name(struct ir3_instruction *instr); - -struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, - int num, int flags); - - -static inline bool ir3_instr_check_mark(struct ir3_instruction *instr) -{ - if (instr->flags & IR3_INSTR_MARK) - return true; /* already visited */ - instr->flags ^= IR3_INSTR_MARK; - return false; -} - -static inline void ir3_clear_mark(struct ir3 *shader) -{ - /* TODO would be nice to drop the instruction array.. for - * new compiler, _clear_mark() is all we use it for, and - * we could probably manage a linked list instead.. 
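A note on the mark bit, since every pass in this patch leans on it: ir3_clear_mark() plus ir3_instr_check_mark() is the "visited" set for walking the SSA graph from the block outputs, and that is the shape shared by the cp, depth, flatten and dump passes further down. A minimal sketch of the idiom, not part of the patch (visit_instr/run_pass are made-up names):

    /* illustrative only: the usual shape of a pass over the SSA graph,
     * driven from block outputs, with the mark bit as the visited set.
     */
    #include "ir3.h"

    static void visit_instr(struct ir3_instruction *instr)      /* hypothetical */
    {
        unsigned i;

        if (ir3_instr_check_mark(instr))
            return;                        /* already visited */

        /* recurse into SSA sources first: */
        for (i = 1; i < instr->regs_count; i++) {
            struct ir3_register *src = instr->regs[i];
            if (src->flags & IR3_REG_SSA)
                visit_instr(src->instr);
        }

        /* ... per-instruction work would go here ... */
    }

    static void run_pass(struct ir3_block *block)                /* hypothetical */
    {
        unsigned i;

        ir3_clear_mark(block->shader);
        for (i = 0; i < block->noutputs; i++)
            if (block->outputs[i])
                visit_instr(block->outputs[i]);
    }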
- */ - unsigned i; - for (i = 0; i < shader->instrs_count; i++) { - struct ir3_instruction *instr = shader->instrs[i]; - instr->flags &= ~IR3_INSTR_MARK; - } -} - -static inline int ir3_instr_regno(struct ir3_instruction *instr, - struct ir3_register *reg) -{ - unsigned i; - for (i = 0; i < instr->regs_count; i++) - if (reg == instr->regs[i]) - return i; - return -1; -} - - -/* comp: - * 0 - x - * 1 - y - * 2 - z - * 3 - w - */ -static inline uint32_t regid(int num, int comp) -{ - return (num << 2) | (comp & 0x3); -} - -static inline uint32_t reg_num(struct ir3_register *reg) -{ - return reg->num >> 2; -} - -static inline uint32_t reg_comp(struct ir3_register *reg) -{ - return reg->num & 0x3; -} - -static inline bool is_flow(struct ir3_instruction *instr) -{ - return (instr->category == 0); -} - -static inline bool is_kill(struct ir3_instruction *instr) -{ - return is_flow(instr) && (instr->opc == OPC_KILL); -} - -static inline bool is_nop(struct ir3_instruction *instr) -{ - return is_flow(instr) && (instr->opc == OPC_NOP); -} - -static inline bool is_alu(struct ir3_instruction *instr) -{ - return (1 <= instr->category) && (instr->category <= 3); -} - -static inline bool is_sfu(struct ir3_instruction *instr) -{ - return (instr->category == 4); -} - -static inline bool is_tex(struct ir3_instruction *instr) -{ - return (instr->category == 5); -} - -static inline bool is_input(struct ir3_instruction *instr) -{ - return (instr->category == 2) && (instr->opc == OPC_BARY_F); -} - -static inline bool is_meta(struct ir3_instruction *instr) -{ - /* TODO how should we count PHI (and maybe fan-in/out) which - * might actually contribute some instructions to the final - * result? - */ - return (instr->category == -1); -} - -static inline bool is_addr(struct ir3_instruction *instr) -{ - return is_meta(instr) && (instr->opc == OPC_META_DEREF); -} - -static inline bool writes_addr(struct ir3_instruction *instr) -{ - if (instr->regs_count > 0) { - struct ir3_register *dst = instr->regs[0]; - return !!(dst->flags & IR3_REG_ADDR); - } - return false; -} - -static inline bool writes_pred(struct ir3_instruction *instr) -{ - if (instr->regs_count > 0) { - struct ir3_register *dst = instr->regs[0]; - return reg_num(dst) == REG_P0; - } - return false; -} - -static inline bool reg_gpr(struct ir3_register *r) -{ - if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_SSA | IR3_REG_ADDR)) - return false; - if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0)) - return false; - return true; -} - -/* dump: */ -#include -void ir3_dump(struct ir3 *shader, const char *name, - struct ir3_block *block /* XXX maybe 'block' ptr should move to ir3? 
*/, - FILE *f); -void ir3_dump_instr_single(struct ir3_instruction *instr); -void ir3_dump_instr_list(struct ir3_instruction *instr); - -/* flatten if/else: */ -int ir3_block_flatten(struct ir3_block *block); - -/* depth calculation: */ -int ir3_delayslots(struct ir3_instruction *assigner, - struct ir3_instruction *consumer, unsigned n); -void ir3_block_depth(struct ir3_block *block); - -/* copy-propagate: */ -void ir3_block_cp(struct ir3_block *block); - -/* scheduling: */ -void ir3_block_sched(struct ir3_block *block); - -/* register assignment: */ -int ir3_block_ra(struct ir3_block *block, enum shader_t type, - bool half_precision, bool frag_coord, bool frag_face, - bool *has_samp); - -#ifndef ARRAY_SIZE -# define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) -#endif - -/* ************************************************************************* */ -/* split this out or find some helper to use.. like main/bitset.h.. */ - -#include - -#define MAX_REG 256 - -typedef uint8_t regmask_t[2 * MAX_REG / 8]; - -static inline unsigned regmask_idx(struct ir3_register *reg) -{ - unsigned num = reg->num; - assert(num < MAX_REG); - if (reg->flags & IR3_REG_HALF) - num += MAX_REG; - return num; -} - -static inline void regmask_init(regmask_t *regmask) -{ - memset(regmask, 0, sizeof(*regmask)); -} - -static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg) -{ - unsigned idx = regmask_idx(reg); - unsigned i; - for (i = 0; i < 4; i++, idx++) - if (reg->wrmask & (1 << i)) - (*regmask)[idx / 8] |= 1 << (idx % 8); -} - -/* set bits in a if not set in b, conceptually: - * a |= (reg & ~b) - */ -static inline void regmask_set_if_not(regmask_t *a, - struct ir3_register *reg, regmask_t *b) -{ - unsigned idx = regmask_idx(reg); - unsigned i; - for (i = 0; i < 4; i++, idx++) - if (reg->wrmask & (1 << i)) - if (!((*b)[idx / 8] & (1 << (idx % 8)))) - (*a)[idx / 8] |= 1 << (idx % 8); -} - -static inline unsigned regmask_get(regmask_t *regmask, - struct ir3_register *reg) -{ - unsigned idx = regmask_idx(reg); - unsigned i; - for (i = 0; i < 4; i++, idx++) - if (reg->wrmask & (1 << i)) - if ((*regmask)[idx / 8] & (1 << (idx % 8))) - return true; - return false; -} - -/* ************************************************************************* */ - -#endif /* IR3_H_ */ diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_cp.c b/src/gallium/drivers/freedreno/a3xx/ir3_cp.c deleted file mode 100644 index 73c2a27c6eb..00000000000 --- a/src/gallium/drivers/freedreno/a3xx/ir3_cp.c +++ /dev/null @@ -1,158 +0,0 @@ -/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ - -/* - * Copyright (C) 2014 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark - */ - -#include "ir3.h" - -/* - * Copy Propagate: - * - * TODO probably want some sort of visitor sort of interface to - * avoid duplicating the same graph traversal logic everywhere.. - * - */ - -static void block_cp(struct ir3_block *block); -static struct ir3_instruction * instr_cp(struct ir3_instruction *instr, bool keep); - -static bool is_eligible_mov(struct ir3_instruction *instr) -{ - if ((instr->category == 1) && - (instr->cat1.src_type == instr->cat1.dst_type)) { - struct ir3_register *dst = instr->regs[0]; - struct ir3_register *src = instr->regs[1]; - if (dst->flags & IR3_REG_ADDR) - return false; - if ((src->flags & IR3_REG_SSA) && - /* TODO: propagate abs/neg modifiers if possible */ - !(src->flags & (IR3_REG_ABS | IR3_REG_NEGATE | IR3_REG_RELATIV))) - return true; - } - return false; -} - -static void walk_children(struct ir3_instruction *instr, bool keep) -{ - unsigned i; - - /* walk down the graph from each src: */ - for (i = 1; i < instr->regs_count; i++) { - struct ir3_register *src = instr->regs[i]; - if (src->flags & IR3_REG_SSA) - src->instr = instr_cp(src->instr, keep); - } -} - -static struct ir3_instruction * -instr_cp_fanin(struct ir3_instruction *instr) -{ - unsigned i; - - /* we need to handle fanin specially, to detect cases - * when we need to keep a mov - */ - - for (i = 1; i < instr->regs_count; i++) { - struct ir3_register *src = instr->regs[i]; - if (src->flags & IR3_REG_SSA) { - struct ir3_instruction *cand = - instr_cp(src->instr, false); - - /* if the candidate is a fanout, then keep - * the move. - * - * This is a bit, um, fragile, but it should - * catch the extra mov's that the front-end - * puts in for us already in these cases. 
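The eligibility test above is the whole trick of this pass: a mov whose source and destination types match, whose source is still in SSA form and carries no abs/neg/relative modifier, and which does not write a0.x adds nothing, so its consumers can read straight from the mov's producer. A single-step sketch of that rewrite, not part of the patch (bypass_one_copy is a made-up name; the real instr_cp() below recurses so whole chains of movs collapse, and deliberately keeps movs that feed fanin/fanout metas):

    /* illustrative only: rewire one SSA source past an eligible mov,
     * so the consumer reads directly from the mov's producer.
     */
    static void bypass_one_copy(struct ir3_instruction *consumer, unsigned n)
    {
        struct ir3_register *src = consumer->regs[n];

        if ((src->flags & IR3_REG_SSA) && is_eligible_mov(src->instr))
            src->instr = src->instr->regs[1]->instr;
    }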
- */ - if (is_meta(cand) && (cand->opc == OPC_META_FO)) - cand = instr_cp(src->instr, true); - - src->instr = cand; - } - } - - walk_children(instr, false); - - return instr; - -} - -static struct ir3_instruction * -instr_cp(struct ir3_instruction *instr, bool keep) -{ - /* if we've already visited this instruction, bail now: */ - if (ir3_instr_check_mark(instr)) - return instr; - - if (is_meta(instr) && (instr->opc == OPC_META_FI)) - return instr_cp_fanin(instr); - - if (is_eligible_mov(instr) && !keep) { - struct ir3_register *src = instr->regs[1]; - return instr_cp(src->instr, false); - } - - walk_children(instr, false); - - return instr; -} - -static void block_cp(struct ir3_block *block) -{ - unsigned i, j; - - for (i = 0; i < block->noutputs; i++) { - if (block->outputs[i]) { - struct ir3_instruction *out = - instr_cp(block->outputs[i], false); - - /* To deal with things like this: - * - * 43: MOV OUT[2], TEMP[5] - * 44: MOV OUT[0], TEMP[5] - * - * we need to ensure that no two outputs point to - * the same instruction - */ - for (j = 0; j < i; j++) { - if (block->outputs[j] == out) { - out = instr_cp(block->outputs[i], true); - break; - } - } - - block->outputs[i] = out; - } - } -} - -void ir3_block_cp(struct ir3_block *block) -{ - ir3_clear_mark(block->shader); - block_cp(block); -} diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_depth.c b/src/gallium/drivers/freedreno/a3xx/ir3_depth.c deleted file mode 100644 index dcc0362f0c8..00000000000 --- a/src/gallium/drivers/freedreno/a3xx/ir3_depth.c +++ /dev/null @@ -1,159 +0,0 @@ -/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ - -/* - * Copyright (C) 2014 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark - */ - -#include "util/u_math.h" - -#include "ir3.h" - -/* - * Instruction Depth: - * - * Calculates weighted instruction depth, ie. the sum of # of needed - * instructions plus delay slots back to original input (ie INPUT or - * CONST). That is to say, an instructions depth is: - * - * depth(instr) { - * d = 0; - * // for each src register: - * foreach (src in instr->regs[1..n]) - * d = max(d, delayslots(src->instr, n) + depth(src->instr)); - * return d + 1; - * } - * - * After an instruction's depth is calculated, it is inserted into the - * blocks depth sorted list, which is used by the scheduling pass. 
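As a concrete run of the depth formula above, take a straight-line chain where each instruction reads only the previous result: a meta:input feeding a mul.f, feeding an add.f, feeding an rsq. Using the delay-slot counts returned by ir3_delayslots() below (meta assigner: 0, alu -> alu: 3, alu feeding a cat4 sfu: 6), and remembering that only non-meta instructions get the +1:

    depth(input) = 0                       (meta, no +1)
    depth(mul.f) = (0 + 0) + 1 = 1
    depth(add.f) = (3 + 1) + 1 = 5
    depth(rsq)   = (6 + 5) + 1 = 12

insert_by_depth() below then keeps block->head sorted by decreasing depth, so the scheduling pass considers the deepest chains first.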
- */ - -/* calculate required # of delay slots between the instruction that - * assigns a value and the one that consumes - */ -int ir3_delayslots(struct ir3_instruction *assigner, - struct ir3_instruction *consumer, unsigned n) -{ - /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal - * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch - * handled with sync bits - */ - - if (is_meta(assigner)) - return 0; - - if (writes_addr(assigner)) - return 6; - - /* handled via sync flags: */ - if (is_sfu(assigner) || is_tex(assigner)) - return 0; - - /* assigner must be alu: */ - if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer)) { - return 6; - } else if ((consumer->category == 3) && - is_mad(consumer->opc) && (n == 2)) { - /* special case, 3rd src to cat3 not required on first cycle */ - return 1; - } else { - return 3; - } -} - -static void insert_by_depth(struct ir3_instruction *instr) -{ - struct ir3_block *block = instr->block; - struct ir3_instruction *n = block->head; - struct ir3_instruction *p = NULL; - - while (n && (n != instr) && (n->depth > instr->depth)) { - p = n; - n = n->next; - } - - instr->next = n; - if (p) - p->next = instr; - else - block->head = instr; -} - -static void ir3_instr_depth(struct ir3_instruction *instr) -{ - unsigned i; - - /* if we've already visited this instruction, bail now: */ - if (ir3_instr_check_mark(instr)) - return; - - instr->depth = 0; - - for (i = 1; i < instr->regs_count; i++) { - struct ir3_register *src = instr->regs[i]; - if (src->flags & IR3_REG_SSA) { - unsigned sd; - - /* visit child to compute it's depth: */ - ir3_instr_depth(src->instr); - - sd = ir3_delayslots(src->instr, instr, i-1) + - src->instr->depth; - - instr->depth = MAX2(instr->depth, sd); - } - } - - /* meta-instructions don't add cycles, other than PHI.. which - * might translate to a real instruction.. - * - * well, not entirely true, fan-in/out, etc might need to need - * to generate some extra mov's in edge cases, etc.. probably - * we might want to do depth calculation considering the worst - * case for these?? 
- */ - if (!is_meta(instr)) - instr->depth++; - - insert_by_depth(instr); -} - -void ir3_block_depth(struct ir3_block *block) -{ - unsigned i; - - block->head = NULL; - - ir3_clear_mark(block->shader); - for (i = 0; i < block->noutputs; i++) - if (block->outputs[i]) - ir3_instr_depth(block->outputs[i]); - - /* at this point, any unvisited input is unused: */ - for (i = 0; i < block->ninputs; i++) { - struct ir3_instruction *in = block->inputs[i]; - if (in && !ir3_instr_check_mark(in)) - block->inputs[i] = NULL; - } -} diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_dump.c b/src/gallium/drivers/freedreno/a3xx/ir3_dump.c deleted file mode 100644 index 1a6f49d51cd..00000000000 --- a/src/gallium/drivers/freedreno/a3xx/ir3_dump.c +++ /dev/null @@ -1,425 +0,0 @@ -/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ - -/* - * Copyright (C) 2014 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark - */ - -#include - -#include "ir3.h" - -#define PTRID(x) ((unsigned long)(x)) - -struct ir3_dump_ctx { - FILE *f; - bool verbose; -}; - -static void dump_instr_name(struct ir3_dump_ctx *ctx, - struct ir3_instruction *instr) -{ - /* for debugging: */ - if (ctx->verbose) { -#ifdef DEBUG - fprintf(ctx->f, "%04u:", instr->serialno); -#endif - fprintf(ctx->f, "%03u: ", instr->depth); - } - - if (instr->flags & IR3_INSTR_SY) - fprintf(ctx->f, "(sy)"); - if (instr->flags & IR3_INSTR_SS) - fprintf(ctx->f, "(ss)"); - - if (is_meta(instr)) { - switch(instr->opc) { - case OPC_META_PHI: - fprintf(ctx->f, "Φ"); - break; - case OPC_META_DEREF: - fprintf(ctx->f, "(*)"); - break; - default: - /* shouldn't hit here.. 
just for debugging: */ - switch (instr->opc) { - case OPC_META_INPUT: fprintf(ctx->f, "_meta:in"); break; - case OPC_META_OUTPUT: fprintf(ctx->f, "_meta:out"); break; - case OPC_META_FO: fprintf(ctx->f, "_meta:fo"); break; - case OPC_META_FI: fprintf(ctx->f, "_meta:fi"); break; - case OPC_META_FLOW: fprintf(ctx->f, "_meta:flow"); break; - - default: fprintf(ctx->f, "_meta:%d", instr->opc); break; - } - break; - } - } else if (instr->category == 1) { - static const char *type[] = { - [TYPE_F16] = "f16", - [TYPE_F32] = "f32", - [TYPE_U16] = "u16", - [TYPE_U32] = "u32", - [TYPE_S16] = "s16", - [TYPE_S32] = "s32", - [TYPE_U8] = "u8", - [TYPE_S8] = "s8", - }; - if (instr->cat1.src_type == instr->cat1.dst_type) - fprintf(ctx->f, "mov"); - else - fprintf(ctx->f, "cov"); - fprintf(ctx->f, ".%s%s", type[instr->cat1.src_type], type[instr->cat1.dst_type]); - } else { - fprintf(ctx->f, "%s", ir3_instr_name(instr)); - if (instr->flags & IR3_INSTR_3D) - fprintf(ctx->f, ".3d"); - if (instr->flags & IR3_INSTR_A) - fprintf(ctx->f, ".a"); - if (instr->flags & IR3_INSTR_O) - fprintf(ctx->f, ".o"); - if (instr->flags & IR3_INSTR_P) - fprintf(ctx->f, ".p"); - if (instr->flags & IR3_INSTR_S) - fprintf(ctx->f, ".s"); - if (instr->flags & IR3_INSTR_S2EN) - fprintf(ctx->f, ".s2en"); - } -} - -static void dump_reg_name(struct ir3_dump_ctx *ctx, - struct ir3_register *reg) -{ - if ((reg->flags & IR3_REG_ABS) && (reg->flags & IR3_REG_NEGATE)) - fprintf(ctx->f, "(absneg)"); - else if (reg->flags & IR3_REG_NEGATE) - fprintf(ctx->f, "(neg)"); - else if (reg->flags & IR3_REG_ABS) - fprintf(ctx->f, "(abs)"); - - if (reg->flags & IR3_REG_IMMED) { - fprintf(ctx->f, "imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val); - } else if (reg->flags & IR3_REG_SSA) { - if (ctx->verbose) { - fprintf(ctx->f, "_["); - dump_instr_name(ctx, reg->instr); - fprintf(ctx->f, "]"); - } - } else { - if (reg->flags & IR3_REG_HALF) - fprintf(ctx->f, "h"); - if (reg->flags & IR3_REG_CONST) - fprintf(ctx->f, "c%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]); - else - fprintf(ctx->f, "r%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]); - } -} - -static void ir3_instr_dump(struct ir3_dump_ctx *ctx, - struct ir3_instruction *instr); -static void ir3_block_dump(struct ir3_dump_ctx *ctx, - struct ir3_block *block, const char *name); - -static void dump_instr(struct ir3_dump_ctx *ctx, - struct ir3_instruction *instr) -{ - /* if we've already visited this instruction, bail now: */ - if (ir3_instr_check_mark(instr)) - return; - - /* some meta-instructions need to be handled specially: */ - if (is_meta(instr)) { - if ((instr->opc == OPC_META_FO) || - (instr->opc == OPC_META_FI)) { - unsigned i; - for (i = 1; i < instr->regs_count; i++) { - struct ir3_register *reg = instr->regs[i]; - if (reg->flags & IR3_REG_SSA) - dump_instr(ctx, reg->instr); - } - } else if (instr->opc == OPC_META_FLOW) { - struct ir3_register *reg = instr->regs[1]; - ir3_block_dump(ctx, instr->flow.if_block, "if"); - if (instr->flow.else_block) - ir3_block_dump(ctx, instr->flow.else_block, "else"); - if (reg->flags & IR3_REG_SSA) - dump_instr(ctx, reg->instr); - } else if ((instr->opc == OPC_META_PHI) || - (instr->opc == OPC_META_DEREF)) { - /* treat like a normal instruction: */ - ir3_instr_dump(ctx, instr); - } - } else { - ir3_instr_dump(ctx, instr); - } -} - -/* arrarraggh! 
if link is to something outside of the current block, we - * need to defer emitting the link until the end of the block, since the - * edge triggers pre-creation of the node it links to inside the cluster, - * even though it is meant to be outside.. - */ -static struct { - char buf[40960]; - unsigned n; -} edge_buf; - -/* helper to print or defer: */ -static void printdef(struct ir3_dump_ctx *ctx, - bool defer, const char *fmt, ...) -{ - va_list ap; - va_start(ap, fmt); - if (defer) { - unsigned n = edge_buf.n; - n += vsnprintf(&edge_buf.buf[n], sizeof(edge_buf.buf) - n, - fmt, ap); - edge_buf.n = n; - } else { - vfprintf(ctx->f, fmt, ap); - } - va_end(ap); -} - -static void dump_link2(struct ir3_dump_ctx *ctx, - struct ir3_instruction *instr, const char *target, bool defer) -{ - /* some meta-instructions need to be handled specially: */ - if (is_meta(instr)) { - if (instr->opc == OPC_META_INPUT) { - printdef(ctx, defer, "input%lx::w -> %s", - PTRID(instr->inout.block), - instr->regs[0]->num, target); - } else if (instr->opc == OPC_META_FO) { - struct ir3_register *reg = instr->regs[1]; - dump_link2(ctx, reg->instr, target, defer); - printdef(ctx, defer, "[label=\".%c\"]", - "xyzw"[instr->fo.off & 0x3]); - } else if (instr->opc == OPC_META_FI) { - unsigned i; - - /* recursively dump all parents and links */ - for (i = 1; i < instr->regs_count; i++) { - struct ir3_register *reg = instr->regs[i]; - if (reg->flags & IR3_REG_SSA) { - dump_link2(ctx, reg->instr, target, defer); - printdef(ctx, defer, "[label=\".%c\"]", - "xyzw"[(i - 1) & 0x3]); - } - } - } else if (instr->opc == OPC_META_OUTPUT) { - printdef(ctx, defer, "output%lx::w -> %s", - PTRID(instr->inout.block), - instr->regs[0]->num, target); - } else if ((instr->opc == OPC_META_PHI) || - (instr->opc == OPC_META_DEREF)) { - /* treat like a normal instruction: */ - printdef(ctx, defer, "instr%lx: -> %s", PTRID(instr), target); - } - } else { - printdef(ctx, defer, "instr%lx: -> %s", PTRID(instr), target); - } -} - -static void dump_link(struct ir3_dump_ctx *ctx, - struct ir3_instruction *instr, - struct ir3_block *block, const char *target) -{ - bool defer = instr->block != block; - dump_link2(ctx, instr, target, defer); - printdef(ctx, defer, "\n"); -} - -static struct ir3_register *follow_flow(struct ir3_register *reg) -{ - if (reg->flags & IR3_REG_SSA) { - struct ir3_instruction *instr = reg->instr; - /* go with the flow.. 
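Since this dump pass is mostly useful interactively: ir3_dump() writes a graphviz digraph of the SSA graph (see the "digraph G {" preamble further down), so a typical debugging flow is to dump to a .dot file and render it with graphviz. A small sketch, not part of the patch; the file name and wrapper function are made up:

    /* illustrative only: dump the IR to a file and render it with e.g.
     *   dot -Tpng ir3.dot -o ir3.png
     */
    #include <stdio.h>
    #include "ir3.h"

    static void dump_to_file(struct ir3 *shader, struct ir3_block *block)
    {
        FILE *f = fopen("ir3.dot", "w");
        if (!f)
            return;
        ir3_dump(shader, "shader", block, f);
        fclose(f);
    }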
*/ - if (is_meta(instr) && (instr->opc == OPC_META_FLOW)) - return instr->regs[1]; - } - return reg; -} - -static void ir3_instr_dump(struct ir3_dump_ctx *ctx, - struct ir3_instruction *instr) -{ - unsigned i; - - fprintf(ctx->f, "instr%lx [shape=record,style=filled,fillcolor=lightgrey,label=\"{", - PTRID(instr)); - dump_instr_name(ctx, instr); - - /* destination register: */ - fprintf(ctx->f, "|"); - - /* source register(s): */ - for (i = 1; i < instr->regs_count; i++) { - struct ir3_register *reg = follow_flow(instr->regs[i]); - - fprintf(ctx->f, "|"); - - if (reg->flags & IR3_REG_SSA) - fprintf(ctx->f, " ", (i - 1)); - - dump_reg_name(ctx, reg); - } - - fprintf(ctx->f, "}\"];\n"); - - /* and recursively dump dependent instructions: */ - for (i = 1; i < instr->regs_count; i++) { - struct ir3_register *reg = instr->regs[i]; - char target[32]; /* link target */ - - if (!(reg->flags & IR3_REG_SSA)) - continue; - - snprintf(target, sizeof(target), "instr%lx:", - PTRID(instr), (i - 1)); - - dump_instr(ctx, reg->instr); - dump_link(ctx, follow_flow(reg)->instr, instr->block, target); - } -} - -static void ir3_block_dump(struct ir3_dump_ctx *ctx, - struct ir3_block *block, const char *name) -{ - unsigned i, n; - - n = edge_buf.n; - - fprintf(ctx->f, "subgraph cluster%lx {\n", PTRID(block)); - fprintf(ctx->f, "label=\"%s\";\n", name); - - /* draw inputs: */ - fprintf(ctx->f, "input%lx [shape=record,label=\"inputs", PTRID(block)); - for (i = 0; i < block->ninputs; i++) - if (block->inputs[i]) - fprintf(ctx->f, "| i%u.%c", i, (i >> 2), "xyzw"[i & 0x3]); - fprintf(ctx->f, "\"];\n"); - - /* draw instruction graph: */ - for (i = 0; i < block->noutputs; i++) - dump_instr(ctx, block->outputs[i]); - - /* draw outputs: */ - fprintf(ctx->f, "output%lx [shape=record,label=\"outputs", PTRID(block)); - for (i = 0; i < block->noutputs; i++) - fprintf(ctx->f, "| o%u.%c", i, (i >> 2), "xyzw"[i & 0x3]); - fprintf(ctx->f, "\"];\n"); - - /* and links to outputs: */ - for (i = 0; i < block->noutputs; i++) { - char target[32]; /* link target */ - - /* NOTE: there could be outputs that are never assigned, - * so skip them - */ - if (!block->outputs[i]) - continue; - - snprintf(target, sizeof(target), "output%lx::e", - PTRID(block), i); - - dump_link(ctx, block->outputs[i], block, target); - } - - fprintf(ctx->f, "}\n"); - - /* and links to inputs: */ - if (block->parent) { - for (i = 0; i < block->ninputs; i++) { - char target[32]; /* link target */ - - if (!block->inputs[i]) - continue; - - dump_instr(ctx, block->inputs[i]); - - snprintf(target, sizeof(target), "input%lx::e", - PTRID(block), i); - - dump_link(ctx, block->inputs[i], block, target); - } - } - - /* dump deferred edges: */ - if (edge_buf.n > n) { - fprintf(ctx->f, "%*s", edge_buf.n - n, &edge_buf.buf[n]); - edge_buf.n = n; - } -} - -void ir3_dump(struct ir3 *shader, const char *name, - struct ir3_block *block /* XXX maybe 'block' ptr should move to ir3? */, - FILE *f) -{ - struct ir3_dump_ctx ctx = { - .f = f, - }; - ir3_clear_mark(shader); - fprintf(ctx.f, "digraph G {\n"); - fprintf(ctx.f, "rankdir=RL;\n"); - fprintf(ctx.f, "nodesep=0.25;\n"); - fprintf(ctx.f, "ranksep=1.5;\n"); - ir3_block_dump(&ctx, block, name); - fprintf(ctx.f, "}\n"); -} - -/* - * For Debugging: - */ - -void -ir3_dump_instr_single(struct ir3_instruction *instr) -{ - struct ir3_dump_ctx ctx = { - .f = stdout, - .verbose = true, - }; - unsigned i; - - dump_instr_name(&ctx, instr); - for (i = 0; i < instr->regs_count; i++) { - struct ir3_register *reg = instr->regs[i]; - printf(i ? 
", " : " "); - dump_reg_name(&ctx, reg); - } - printf("\n"); -} - -void -ir3_dump_instr_list(struct ir3_instruction *instr) -{ - unsigned n = 0; - - while (instr) { - ir3_dump_instr_single(instr); - if (!is_meta(instr)) - n++; - instr = instr->next; - } - printf("%u instructions\n", n); -} diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_flatten.c b/src/gallium/drivers/freedreno/a3xx/ir3_flatten.c deleted file mode 100644 index 9389227034c..00000000000 --- a/src/gallium/drivers/freedreno/a3xx/ir3_flatten.c +++ /dev/null @@ -1,155 +0,0 @@ -/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ - -/* - * Copyright (C) 2014 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark - */ - -#include - -#include "ir3.h" - -/* - * Flatten: flatten out legs of if/else, etc - * - * TODO probably should use some heuristic to decide to not flatten - * if one side of the other is too large / deeply nested / whatever? - */ - -struct ir3_flatten_ctx { - struct ir3_block *block; - unsigned cnt; -}; - -static struct ir3_register *unwrap(struct ir3_register *reg) -{ - - if (reg->flags & IR3_REG_SSA) { - struct ir3_instruction *instr = reg->instr; - if (is_meta(instr)) { - switch (instr->opc) { - case OPC_META_OUTPUT: - case OPC_META_FLOW: - if (instr->regs_count > 1) - return instr->regs[1]; - return NULL; - default: - break; - } - } - } - return reg; -} - -static void ir3_instr_flatten(struct ir3_flatten_ctx *ctx, - struct ir3_instruction *instr) -{ - unsigned i; - - /* if we've already visited this instruction, bail now: */ - if (ir3_instr_check_mark(instr)) - return; - - instr->block = ctx->block; - - /* TODO: maybe some threshold to decide whether to - * flatten or not?? - */ - if (is_meta(instr)) { - if (instr->opc == OPC_META_PHI) { - struct ir3_register *cond, *t, *f; - - cond = unwrap(instr->regs[1]); - t = unwrap(instr->regs[2]); /* true val */ - f = unwrap(instr->regs[3]); /* false val */ - - /* must have cond, but t or f may be null if only written - * one one side of the if/else (in which case we can just - * convert the PHI to a simple move). 
- */ - assert(cond); - assert(t || f); - - if (t && f) { - /* convert the PHI instruction to sel.{b16,b32} */ - instr->category = 3; - - /* instruction type based on dst size: */ - if (instr->regs[0]->flags & IR3_REG_HALF) - instr->opc = OPC_SEL_B16; - else - instr->opc = OPC_SEL_B32; - - instr->regs[1] = t; - instr->regs[2] = cond; - instr->regs[3] = f; - } else { - /* convert to simple mov: */ - instr->category = 1; - instr->cat1.dst_type = TYPE_F32; - instr->cat1.src_type = TYPE_F32; - instr->regs_count = 2; - instr->regs[1] = t ? t : f; - } - - ctx->cnt++; - } else if ((instr->opc == OPC_META_INPUT) && - (instr->regs_count == 2)) { - type_t ftype; - - if (instr->regs[0]->flags & IR3_REG_HALF) - ftype = TYPE_F16; - else - ftype = TYPE_F32; - - /* convert meta:input to mov: */ - instr->category = 1; - instr->cat1.src_type = ftype; - instr->cat1.dst_type = ftype; - } - } - - /* recursively visit children: */ - for (i = 1; i < instr->regs_count; i++) { - struct ir3_register *src = instr->regs[i]; - if (src->flags & IR3_REG_SSA) - ir3_instr_flatten(ctx, src->instr); - } -} - -/* return >= 0 is # of phi's flattened, < 0 is error */ -int ir3_block_flatten(struct ir3_block *block) -{ - struct ir3_flatten_ctx ctx = { - .block = block, - }; - unsigned i; - - ir3_clear_mark(block->shader); - for(i = 0; i < block->noutputs; i++) - if (block->outputs[i]) - ir3_instr_flatten(&ctx, block->outputs[i]); - - return ctx.cnt; -} diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c deleted file mode 100644 index b916dd51393..00000000000 --- a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c +++ /dev/null @@ -1,790 +0,0 @@ -/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ - -/* - * Copyright (C) 2014 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark - */ - -#include "pipe/p_shader_tokens.h" -#include "util/u_math.h" - -#include "ir3.h" -#include "ir3_visitor.h" - -/* - * Register Assignment: - * - * NOTE: currently only works on a single basic block.. need to think - * about how multiple basic blocks are going to get scheduled. But - * I think I want to re-arrange how blocks work, ie. get rid of the - * block nesting thing.. - * - * NOTE: we could do register coalescing (eliminate moves) as part of - * the RA step.. OTOH I think we need to do scheduling before register - * assignment. 
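Before the details of the pass, it may help to see the liveness rule it is built on in isolation: walking forward along the instruction list (the ->next chain), a GPR is live if it is read before anything writes it, tracked with the regmask helpers from ir3.h. A stripped-down sketch, not part of the patch (scan_liveness is a made-up name; the real compute_liveregs() below additionally skips meta instructions, only counts already-assigned destinations as written, and accounts for the shader outputs):

    /* illustrative only: the core read-before-write liveness rule. */
    #include "ir3.h"

    static void scan_liveness(struct ir3_instruction *start, regmask_t *live)
    {
        regmask_t written;
        struct ir3_instruction *instr;
        unsigned i;

        regmask_init(live);
        regmask_init(&written);

        for (instr = start->next; instr; instr = instr->next) {
            /* sources read before being (re)written become live: */
            for (i = 1; i < instr->regs_count; i++)
                if (reg_gpr(instr->regs[i]))
                    regmask_set_if_not(live, instr->regs[i], &written);

            /* and from here on the destination counts as written: */
            if ((instr->regs_count > 0) && reg_gpr(instr->regs[0]))
                regmask_set(&written, instr->regs[0]);
        }
    }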
And if we remove a mov that effects scheduling (unless - * we leave a placeholder nop, which seems lame), so I'm not really - * sure how practical this is to do both in a single stage. But OTOH - * I'm not really sure a sane way for the CP stage to realize when it - * cannot remove a mov due to multi-register constraints.. - * - */ - -struct ir3_ra_ctx { - struct ir3_block *block; - enum shader_t type; - bool half_precision; - bool frag_coord; - bool frag_face; - bool has_samp; - int cnt; - bool error; -}; - -/* sorta ugly way to retrofit half-precision support.. rather than - * passing extra param around, just OR in a high bit. All the low - * value arithmetic (ie. +/- offset within a contiguous vec4, etc) - * will continue to work as long as you don't underflow (and that - * would go badly anyways). - */ -#define REG_HALF 0x8000 - -struct ir3_ra_assignment { - int8_t off; /* offset of instruction dst within range */ - uint8_t num; /* number of components for the range */ -}; - -static void ra_assign(struct ir3_ra_ctx *ctx, - struct ir3_instruction *assigner, int num); -static struct ir3_ra_assignment ra_calc(struct ir3_instruction *instr); - -/* - * Register Allocation: - */ - -#define REG(n, wm, f) (struct ir3_register){ \ - .flags = (f), \ - .num = (n), \ - .wrmask = TGSI_WRITEMASK_ ## wm, \ - } - -/* check that the register exists, is a GPR and is not special (a0/p0) */ -static struct ir3_register * reg_check(struct ir3_instruction *instr, unsigned n) -{ - if ((n < instr->regs_count) && reg_gpr(instr->regs[n])) - return instr->regs[n]; - return NULL; -} - -static int output_base(struct ir3_ra_ctx *ctx) -{ - /* ugg, for fragment shader we need to have input at r0.x - * (or at least if there is a way to configure it, I can't - * see how because the blob driver always uses r0.x (ie. - * all zeros) - */ - if (ctx->type == SHADER_FRAGMENT) { - if (ctx->half_precision) - return ctx->frag_face ? 4 : 3; - return ctx->frag_coord ? 8 : 4; - } - return 0; -} - -/* live means read before written */ -static void compute_liveregs(struct ir3_ra_ctx *ctx, - struct ir3_instruction *instr, regmask_t *liveregs) -{ - struct ir3_block *block = instr->block; - regmask_t written; - unsigned i, j; - - regmask_init(liveregs); - regmask_init(&written); - - for (instr = instr->next; instr; instr = instr->next) { - struct ir3_register *r; - - if (is_meta(instr)) - continue; - - /* check first src's read: */ - for (j = 1; j < instr->regs_count; j++) { - r = reg_check(instr, j); - if (r) - regmask_set_if_not(liveregs, r, &written); - } - - /* then dst written (if assigned already): */ - if (instr->flags & IR3_INSTR_MARK) { - r = reg_check(instr, 0); - if (r) - regmask_set(&written, r); - } - } - - /* be sure to account for output registers too: */ - for (i = 0; i < block->noutputs; i++) { - struct ir3_register reg = REG(output_base(ctx) + i, X, 0); - regmask_set_if_not(liveregs, ®, &written); - } -} - -/* calculate registers that are clobbered before last use of 'assigner'. - * This needs to be done backwards, although it could possibly be - * combined into compute_liveregs(). (Ie. compute_liveregs() could - * reverse the list, then do this part backwards reversing the list - * again back to original order.) Otoh, probably I should try to - * construct a proper interference graph instead. - * - * XXX this need to follow the same recursion path that is used for - * to rename/assign registers (ie. ra_assign_src()).. 
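/*
 * (Illustrative aside, not part of the patch: a minimal standalone sketch
 * of the REG_HALF trick described above -- the half-precision flag rides
 * along in a high bit of the assigned register number, so small +/- offset
 * arithmetic on the low bits keeps working.  The ex_* names are invented
 * for illustration only.)
 */
#include <assert.h>
#include <stdbool.h>

#define EX_REG_HALF 0x8000

static int  ex_pack(int num, bool half) { return num | (half ? EX_REG_HALF : 0); }
static bool ex_is_half(int packed)      { return (packed & EX_REG_HALF) != 0; }
static int  ex_num(int packed)          { return packed & ~EX_REG_HALF; }

static void ex_reg_half_demo(void)
{
	int r = ex_pack(4, true);     /* "hr4" */
	assert(ex_is_half(r + 2));    /* offsetting within a vec4 keeps the flag */
	assert(ex_num(r + 2) == 6);
}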
this is a bit - * ugly right now, maybe refactor into node iterator sort of things - * that iterates nodes in the correct order? - */ -static bool compute_clobbers(struct ir3_ra_ctx *ctx, - struct ir3_instruction *instr, struct ir3_instruction *assigner, - regmask_t *liveregs) -{ - unsigned i; - bool live = false, was_live = false; - - if (instr == NULL) { - struct ir3_block *block = ctx->block; - - /* if at the end, check outputs: */ - for (i = 0; i < block->noutputs; i++) - if (block->outputs[i] == assigner) - return true; - return false; - } - - for (i = 1; i < instr->regs_count; i++) { - struct ir3_register *reg = instr->regs[i]; - if ((reg->flags & IR3_REG_SSA) && (reg->instr == assigner)) { - if (is_meta(instr)) { - switch (instr->opc) { - case OPC_META_INPUT: - // TODO - assert(0); - break; - case OPC_META_FO: - case OPC_META_FI: - was_live |= compute_clobbers(ctx, instr->next, - instr, liveregs); - break; - default: - break; - } - } - live = true; - break; - } - } - - was_live |= compute_clobbers(ctx, instr->next, assigner, liveregs); - - if (was_live && (instr->regs_count > 0) && - (instr->flags & IR3_INSTR_MARK) && - !is_meta(instr)) - regmask_set(liveregs, instr->regs[0]); - - return live || was_live; -} - -static int find_available(regmask_t *liveregs, int size, bool half) -{ - unsigned i; - unsigned f = half ? IR3_REG_HALF : 0; - for (i = 0; i < MAX_REG - size; i++) { - if (!regmask_get(liveregs, ®(i, X, f))) { - unsigned start = i++; - for (; (i < MAX_REG) && ((i - start) < size); i++) - if (regmask_get(liveregs, ®(i, X, f))) - break; - if ((i - start) >= size) - return start; - } - } - assert(0); - return -1; -} - -static int alloc_block(struct ir3_ra_ctx *ctx, - struct ir3_instruction *instr, int size) -{ - if (!instr) { - /* special case, allocating shader outputs. At this - * point, nothing is allocated, just start the shader - * outputs at r0.x and let compute_liveregs() take - * care of the rest from here: - */ - return 0; - } else { - struct ir3_register *dst = instr->regs[0]; - regmask_t liveregs; - - compute_liveregs(ctx, instr, &liveregs); - - // XXX XXX XXX XXX XXX XXX XXX XXX XXX - // XXX hack.. maybe ra_calc should give us a list of - // instrs to compute_clobbers() on? - if (is_meta(instr) && (instr->opc == OPC_META_INPUT) && - (instr->regs_count == 1)) { - unsigned i, base = instr->regs[0]->num & ~0x3; - for (i = 0; i < 4; i++) { - struct ir3_instruction *in = ctx->block->inputs[base + i]; - if (in) - compute_clobbers(ctx, in->next, in, &liveregs); - } - } else - // XXX XXX XXX XXX XXX XXX XXX XXX XXX - compute_clobbers(ctx, instr->next, instr, &liveregs); - - return find_available(&liveregs, size, - !!(dst->flags & IR3_REG_HALF)); - } -} - -/* - * Constraint Calculation: - */ - -struct ra_calc_visitor { - struct ir3_visitor base; - struct ir3_ra_assignment a; -}; - -static inline struct ra_calc_visitor *ra_calc_visitor(struct ir3_visitor *v) -{ - return (struct ra_calc_visitor *)v; -} - -/* calculate register assignment for the instruction. If the register - * written by this instruction is required to be part of a range, to - * handle other (input/output/sam/bary.f/etc) contiguous register range - * constraints, that is calculated handled here. 
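/*
 * (Illustrative aside, not part of the patch: a rough standalone picture of
 * what find_available() above does -- scan for the first run of 'size'
 * consecutive free registers.  Here the live set is just a 64-bit mask and
 * the ex_* names are invented; the real code walks a regmask_t and also
 * distinguishes half-precision registers.)
 */
#include <stdint.h>

static int ex_find_run(uint64_t live, int size)
{
	for (int start = 0; start + size <= 64; start++) {
		int n = 0;
		while ((n < size) && !(live & (1ull << (start + n))))
			n++;
		if (n == size)
			return start;   /* found a free run */
		start += n;             /* skip past the blocking register */
	}
	return -1;                      /* no room */
}
/* e.g. with r0-r2 live (mask 0x7) and size == 2 this returns 3 */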
- */ -static void ra_calc_dst(struct ir3_visitor *v, - struct ir3_instruction *instr, struct ir3_register *reg) -{ - struct ra_calc_visitor *c = ra_calc_visitor(v); - if (is_tex(instr)) { - c->a.off = 0; - c->a.num = 4; - } else { - c->a.off = 0; - c->a.num = 1; - } -} - -static void -ra_calc_dst_shader_input(struct ir3_visitor *v, - struct ir3_instruction *instr, struct ir3_register *reg) -{ - struct ra_calc_visitor *c = ra_calc_visitor(v); - struct ir3_block *block = instr->block; - struct ir3_register *dst = instr->regs[0]; - unsigned base = dst->num & ~0x3; - unsigned i, num = 0; - - assert(!(dst->flags & IR3_REG_IA)); - - /* check what input components we need: */ - for (i = 0; i < 4; i++) { - unsigned idx = base + i; - if ((idx < block->ninputs) && block->inputs[idx]) - num = i + 1; - } - - c->a.off = dst->num - base; - c->a.num = num; -} - -static void ra_calc_src_fanin(struct ir3_visitor *v, - struct ir3_instruction *instr, struct ir3_register *reg) -{ - struct ra_calc_visitor *c = ra_calc_visitor(v); - unsigned srcn = ir3_instr_regno(instr, reg) - 1; - c->a.off += srcn; - c->a.num += srcn; - c->a.num = MAX2(c->a.num, instr->regs_count - 1); -} - -static const struct ir3_visitor_funcs calc_visitor_funcs = { - .instr = ir3_visit_instr, - .dst_shader_input = ra_calc_dst_shader_input, - .dst_fanout = ra_calc_dst, - .dst_fanin = ra_calc_dst, - .dst = ra_calc_dst, - .src_fanout = ir3_visit_reg, - .src_fanin = ra_calc_src_fanin, - .src = ir3_visit_reg, -}; - -static struct ir3_ra_assignment ra_calc(struct ir3_instruction *assigner) -{ - struct ra_calc_visitor v = { - .base.funcs = &calc_visitor_funcs, - }; - - ir3_visit_instr(&v.base, assigner); - - return v.a; -} - -/* - * Register Assignment: - */ - -struct ra_assign_visitor { - struct ir3_visitor base; - struct ir3_ra_ctx *ctx; - int num; -}; - -static inline struct ra_assign_visitor *ra_assign_visitor(struct ir3_visitor *v) -{ - return (struct ra_assign_visitor *)v; -} - -static type_t half_type(type_t type) -{ - switch (type) { - case TYPE_F32: return TYPE_F16; - case TYPE_U32: return TYPE_U16; - case TYPE_S32: return TYPE_S16; - /* instructions may already be fixed up: */ - case TYPE_F16: - case TYPE_U16: - case TYPE_S16: - return type; - default: - assert(0); - return ~0; - } -} - -/* some instructions need fix-up if dst register is half precision: */ -static void fixup_half_instr_dst(struct ir3_instruction *instr) -{ - switch (instr->category) { - case 1: /* move instructions */ - instr->cat1.dst_type = half_type(instr->cat1.dst_type); - break; - case 3: - switch (instr->opc) { - case OPC_MAD_F32: - instr->opc = OPC_MAD_F16; - break; - case OPC_SEL_B32: - instr->opc = OPC_SEL_B16; - break; - case OPC_SEL_S32: - instr->opc = OPC_SEL_S16; - break; - case OPC_SEL_F32: - instr->opc = OPC_SEL_F16; - break; - case OPC_SAD_S32: - instr->opc = OPC_SAD_S16; - break; - /* instructions may already be fixed up: */ - case OPC_MAD_F16: - case OPC_SEL_B16: - case OPC_SEL_S16: - case OPC_SEL_F16: - case OPC_SAD_S16: - break; - default: - assert(0); - break; - } - break; - case 5: - instr->cat5.type = half_type(instr->cat5.type); - break; - } -} -/* some instructions need fix-up if src register is half precision: */ -static void fixup_half_instr_src(struct ir3_instruction *instr) -{ - switch (instr->category) { - case 1: /* move instructions */ - instr->cat1.src_type = half_type(instr->cat1.src_type); - break; - } -} - -static void ra_assign_reg(struct ir3_visitor *v, - struct ir3_instruction *instr, struct ir3_register *reg) -{ - struct 
ra_assign_visitor *a = ra_assign_visitor(v); - - if (is_flow(instr) && (instr->opc == OPC_KILL)) - return; - - reg->flags &= ~IR3_REG_SSA; - reg->num = a->num & ~REG_HALF; - - assert(reg->num >= 0); - - if (a->num & REG_HALF) { - reg->flags |= IR3_REG_HALF; - /* if dst reg being assigned, patch up the instr: */ - if (reg == instr->regs[0]) - fixup_half_instr_dst(instr); - else - fixup_half_instr_src(instr); - } -} - -static void ra_assign_dst_shader_input(struct ir3_visitor *v, - struct ir3_instruction *instr, struct ir3_register *reg) -{ - struct ra_assign_visitor *a = ra_assign_visitor(v); - unsigned i, base = reg->num & ~0x3; - int off = base - reg->num; - - ra_assign_reg(v, instr, reg); - reg->flags |= IR3_REG_IA; - - /* trigger assignment of all our companion input components: */ - for (i = 0; i < 4; i++) { - struct ir3_instruction *in = instr->block->inputs[i+base]; - if (in && is_meta(in) && (in->opc == OPC_META_INPUT)) - ra_assign(a->ctx, in, a->num + off + i); - } -} - -static void ra_assign_dst_fanout(struct ir3_visitor *v, - struct ir3_instruction *instr, struct ir3_register *reg) -{ - struct ra_assign_visitor *a = ra_assign_visitor(v); - struct ir3_register *src = instr->regs[1]; - ra_assign_reg(v, instr, reg); - if (src->flags & IR3_REG_SSA) - ra_assign(a->ctx, src->instr, a->num - instr->fo.off); -} - -static void ra_assign_src_fanout(struct ir3_visitor *v, - struct ir3_instruction *instr, struct ir3_register *reg) -{ - struct ra_assign_visitor *a = ra_assign_visitor(v); - ra_assign_reg(v, instr, reg); - ra_assign(a->ctx, instr, a->num + instr->fo.off); -} - - -static void ra_assign_src_fanin(struct ir3_visitor *v, - struct ir3_instruction *instr, struct ir3_register *reg) -{ - struct ra_assign_visitor *a = ra_assign_visitor(v); - unsigned j, srcn = ir3_instr_regno(instr, reg) - 1; - ra_assign_reg(v, instr, reg); - ra_assign(a->ctx, instr, a->num - srcn); - for (j = 1; j < instr->regs_count; j++) { - struct ir3_register *reg = instr->regs[j]; - if (reg->flags & IR3_REG_SSA) /* could be renamed already */ - ra_assign(a->ctx, reg->instr, a->num - srcn + j - 1); - } -} - -static const struct ir3_visitor_funcs assign_visitor_funcs = { - .instr = ir3_visit_instr, - .dst_shader_input = ra_assign_dst_shader_input, - .dst_fanout = ra_assign_dst_fanout, - .dst_fanin = ra_assign_reg, - .dst = ra_assign_reg, - .src_fanout = ra_assign_src_fanout, - .src_fanin = ra_assign_src_fanin, - .src = ra_assign_reg, -}; - -static void ra_assign(struct ir3_ra_ctx *ctx, - struct ir3_instruction *assigner, int num) -{ - struct ra_assign_visitor v = { - .base.funcs = &assign_visitor_funcs, - .ctx = ctx, - .num = num, - }; - - /* if we've already visited this instruction, bail now: */ - if (ir3_instr_check_mark(assigner)) { - debug_assert(assigner->regs[0]->num == (num & ~REG_HALF)); - if (assigner->regs[0]->num != (num & ~REG_HALF)) { - /* impossible situation, should have been resolved - * at an earlier stage by inserting extra mov's: - */ - ctx->error = true; - } - return; - } - - ir3_visit_instr(&v.base, assigner); -} - -/* - * - */ - -static void ir3_instr_ra(struct ir3_ra_ctx *ctx, - struct ir3_instruction *instr) -{ - struct ir3_register *dst; - unsigned num; - - /* skip over nop's */ - if (instr->regs_count == 0) - return; - - dst = instr->regs[0]; - - /* if we've already visited this instruction, bail now: */ - if (instr->flags & IR3_INSTR_MARK) - return; - - /* allocate register(s): */ - if (is_addr(instr)) { - num = instr->regs[2]->num; - } else if (reg_gpr(dst)) { - struct ir3_ra_assignment 
a; - a = ra_calc(instr); - num = alloc_block(ctx, instr, a.num) + a.off; - } else if (dst->flags & IR3_REG_ADDR) { - dst->flags &= ~IR3_REG_ADDR; - num = regid(REG_A0, 0) | REG_HALF; - } else { - /* predicate register (p0).. etc */ - return; - } - - ra_assign(ctx, instr, num); -} - -/* flatten into shader: */ -// XXX this should probably be somewhere else: -static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block) -{ - struct ir3_instruction *n; - struct ir3 *shader = block->shader; - struct ir3_instruction *end = - ir3_instr_create(block, 0, OPC_END); - struct ir3_instruction *last_input = NULL; - struct ir3_instruction *last_rel = NULL; - regmask_t needs_ss_war; /* write after read */ - regmask_t needs_ss; - regmask_t needs_sy; - - regmask_init(&needs_ss_war); - regmask_init(&needs_ss); - regmask_init(&needs_sy); - - shader->instrs_count = 0; - - for (n = block->head; n; n = n->next) { - struct ir3_register *reg; - unsigned i; - - if (is_meta(n)) - continue; - - for (i = 1; i < n->regs_count; i++) { - reg = n->regs[i]; - - if (reg_gpr(reg)) { - - /* TODO: we probably only need (ss) for alu - * instr consuming sfu result.. need to make - * some tests for both this and (sy).. - */ - if (regmask_get(&needs_ss, reg)) { - n->flags |= IR3_INSTR_SS; - regmask_init(&needs_ss); - } - - if (regmask_get(&needs_sy, reg)) { - n->flags |= IR3_INSTR_SY; - regmask_init(&needs_sy); - } - } - - /* TODO: is it valid to have address reg loaded from a - * relative src (ie. mova a0, c)? If so, the - * last_rel check below should be moved ahead of this: - */ - if (reg->flags & IR3_REG_RELATIV) - last_rel = n; - } - - if (n->regs_count > 0) { - reg = n->regs[0]; - if (regmask_get(&needs_ss_war, reg)) { - n->flags |= IR3_INSTR_SS; - regmask_init(&needs_ss_war); // ??? I assume? - } - - if (last_rel && (reg->num == regid(REG_A0, 0))) { - last_rel->flags |= IR3_INSTR_UL; - last_rel = NULL; - } - } - - /* cat5+ does not have an (ss) bit, if needed we need to - * insert a nop to carry the sync flag. Would be kinda - * clever if we were aware of this during scheduling, but - * this should be a pretty rare case: - */ - if ((n->flags & IR3_INSTR_SS) && (n->category >= 5)) { - struct ir3_instruction *nop; - nop = ir3_instr_create(block, 0, OPC_NOP); - nop->flags |= IR3_INSTR_SS; - n->flags &= ~IR3_INSTR_SS; - } - - /* need to be able to set (ss) on first instruction: */ - if ((shader->instrs_count == 0) && (n->category >= 5)) - ir3_instr_create(block, 0, OPC_NOP); - - if (is_nop(n) && shader->instrs_count) { - struct ir3_instruction *last = - shader->instrs[shader->instrs_count-1]; - if (is_nop(last) && (last->repeat < 5)) { - last->repeat++; - last->flags |= n->flags; - continue; - } - } - - shader->instrs[shader->instrs_count++] = n; - - if (is_sfu(n)) - regmask_set(&needs_ss, n->regs[0]); - - if (is_tex(n)) { - /* this ends up being the # of samp instructions.. but that - * is ok, everything else only cares whether it is zero or - * not. We do this here, rather than when we encounter a - * SAMP decl, because (especially in binning pass shader) - * the samp instruction(s) could get eliminated if the - * result is not used. 
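/*
 * (Illustrative aside, not part of the patch: a sketch of the nop folding
 * done in legalize() above -- a run of nop's collapses into the (rptN)
 * field of the previous nop, and N is capped at 5, so one encoded nop
 * stands for at most 6 nop cycles.  ex_nop_instrs_needed() is an invented
 * helper for illustration.)
 */
static unsigned ex_nop_instrs_needed(unsigned nop_cycles)
{
	/* each emitted nop covers 1 + repeat cycles, with repeat <= 5 */
	return (nop_cycles + 5) / 6;
}
/* e.g. 8 nop cycles -> nop (rpt5) followed by nop (rpt1): two instructions */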
- */ - ctx->has_samp = true; - regmask_set(&needs_sy, n->regs[0]); - } - - /* both tex/sfu appear to not always immediately consume - * their src register(s): - */ - if (is_tex(n) || is_sfu(n)) { - for (i = 1; i < n->regs_count; i++) { - reg = n->regs[i]; - if (reg_gpr(reg)) - regmask_set(&needs_ss_war, reg); - } - } - - if (is_input(n)) - last_input = n; - } - - if (last_input) - last_input->regs[0]->flags |= IR3_REG_EI; - - if (last_rel) - last_rel->flags |= IR3_INSTR_UL; - - shader->instrs[shader->instrs_count++] = end; - - shader->instrs[0]->flags |= IR3_INSTR_SS | IR3_INSTR_SY; -} - -static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block) -{ - struct ir3_instruction *n; - - if (!block->parent) { - unsigned i, j; - int base, off = output_base(ctx); - - base = alloc_block(ctx, NULL, block->noutputs + off); - - if (ctx->half_precision) - base |= REG_HALF; - - for (i = 0; i < block->noutputs; i++) - if (block->outputs[i] && !is_kill(block->outputs[i])) - ra_assign(ctx, block->outputs[i], base + i + off); - - if (ctx->type == SHADER_FRAGMENT) { - i = 0; - if (ctx->frag_face) { - /* if we have frag_face, it gets hr0.x */ - ra_assign(ctx, block->inputs[i], REG_HALF | 0); - i += 4; - } - for (j = 0; i < block->ninputs; i++, j++) - if (block->inputs[i]) - ra_assign(ctx, block->inputs[i], (base & ~REG_HALF) + j); - } else { - for (i = 0; i < block->ninputs; i++) - if (block->inputs[i]) - ir3_instr_ra(ctx, block->inputs[i]); - } - } - - /* then loop over instruction list and assign registers: - */ - n = block->head; - while (n) { - ir3_instr_ra(ctx, n); - if (ctx->error) - return -1; - n = n->next; - } - - legalize(ctx, block); - - return 0; -} - -int ir3_block_ra(struct ir3_block *block, enum shader_t type, - bool half_precision, bool frag_coord, bool frag_face, - bool *has_samp) -{ - struct ir3_ra_ctx ctx = { - .block = block, - .type = type, - .half_precision = half_precision, - .frag_coord = frag_coord, - .frag_face = frag_face, - }; - int ret; - - ir3_clear_mark(block->shader); - ret = block_ra(&ctx, block); - *has_samp = ctx.has_samp; - - return ret; -} diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_sched.c b/src/gallium/drivers/freedreno/a3xx/ir3_sched.c deleted file mode 100644 index 3ef67731926..00000000000 --- a/src/gallium/drivers/freedreno/a3xx/ir3_sched.c +++ /dev/null @@ -1,401 +0,0 @@ -/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ - -/* - * Copyright (C) 2014 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark - */ - - -#include "util/u_math.h" - -#include "ir3.h" - -enum { - SCHEDULED = -1, - DELAYED = -2, -}; - -/* - * Instruction Scheduling: - * - * Using the depth sorted list from depth pass, attempt to recursively - * schedule deepest unscheduled path. The first instruction that cannot - * be scheduled, returns the required delay slots it needs, at which - * point we return back up to the top and attempt to schedule by next - * highest depth. After a sufficient number of instructions have been - * scheduled, return back to beginning of list and start again. If you - * reach the end of depth sorted list without being able to insert any - * instruction, insert nop's. Repeat until no more unscheduled - * instructions. - * - * There are a few special cases that need to be handled, since sched - * is currently independent of register allocation. Usages of address - * register (a0.x) or predicate register (p0.x) must be serialized. Ie. - * if you have two pairs of instructions that write the same special - * register and then read it, then those pairs cannot be interleaved. - * To solve this, when we are in such a scheduling "critical section", - * and we encounter a conflicting write to a special register, we try - * to schedule any remaining instructions that use that value first. - */ - -struct ir3_sched_ctx { - struct ir3_instruction *scheduled; /* last scheduled instr */ - struct ir3_instruction *addr; /* current a0.x user, if any */ - struct ir3_instruction *pred; /* current p0.x user, if any */ - unsigned cnt; -}; - -static struct ir3_instruction * -deepest(struct ir3_instruction **srcs, unsigned nsrcs) -{ - struct ir3_instruction *d = NULL; - unsigned i = 0, id = 0; - - while ((i < nsrcs) && !(d = srcs[id = i])) - i++; - - if (!d) - return NULL; - - for (; i < nsrcs; i++) - if (srcs[i] && (srcs[i]->depth > d->depth)) - d = srcs[id = i]; - - srcs[id] = NULL; - - return d; -} - -static unsigned distance(struct ir3_sched_ctx *ctx, - struct ir3_instruction *instr, unsigned maxd) -{ - struct ir3_instruction *n = ctx->scheduled; - unsigned d = 0; - while (n && (n != instr) && (d < maxd)) { - if (is_alu(n) || is_flow(n)) - d++; - n = n->next; - } - return d; -} - -/* TODO maybe we want double linked list? */ -static struct ir3_instruction * prev(struct ir3_instruction *instr) -{ - struct ir3_instruction *p = instr->block->head; - while (p && (p->next != instr)) - p = p->next; - return p; -} - -static void schedule(struct ir3_sched_ctx *ctx, - struct ir3_instruction *instr, bool remove) -{ - struct ir3_block *block = instr->block; - - /* maybe there is a better way to handle this than just stuffing - * a nop.. ideally we'd know about this constraint in the - * scheduling and depth calculation.. - */ - if (ctx->scheduled && is_sfu(ctx->scheduled) && is_sfu(instr)) - schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false); - - /* remove from depth list: - */ - if (remove) { - struct ir3_instruction *p = prev(instr); - - /* NOTE: this can happen for inputs which are not - * read.. in that case there is no need to schedule - * the input, so just bail: - */ - if (instr != (p ? 
p->next : block->head)) - return; - - if (p) - p->next = instr->next; - else - block->head = instr->next; - } - - if (writes_addr(instr)) { - assert(ctx->addr == NULL); - ctx->addr = instr; - } - - if (writes_pred(instr)) { - assert(ctx->pred == NULL); - ctx->pred = instr; - } - - instr->flags |= IR3_INSTR_MARK; - - instr->next = ctx->scheduled; - ctx->scheduled = instr; - - ctx->cnt++; -} - -/* - * Delay-slot calculation. Follows fanin/fanout. - */ - -static unsigned delay_calc2(struct ir3_sched_ctx *ctx, - struct ir3_instruction *assigner, - struct ir3_instruction *consumer, unsigned srcn) -{ - unsigned delay = 0; - - if (is_meta(assigner)) { - unsigned i; - for (i = 1; i < assigner->regs_count; i++) { - struct ir3_register *reg = assigner->regs[i]; - if (reg->flags & IR3_REG_SSA) { - unsigned d = delay_calc2(ctx, reg->instr, - consumer, srcn); - delay = MAX2(delay, d); - } - } - } else { - delay = ir3_delayslots(assigner, consumer, srcn); - delay -= distance(ctx, assigner, delay); - } - - return delay; -} - -static unsigned delay_calc(struct ir3_sched_ctx *ctx, - struct ir3_instruction *instr) -{ - unsigned i, delay = 0; - - for (i = 1; i < instr->regs_count; i++) { - struct ir3_register *reg = instr->regs[i]; - if (reg->flags & IR3_REG_SSA) { - unsigned d = delay_calc2(ctx, reg->instr, - instr, i - 1); - delay = MAX2(delay, d); - } - } - - return delay; -} - -/* A negative return value signals that an instruction has been newly - * scheduled, return back up to the top of the stack (to block_sched()) - */ -static int trysched(struct ir3_sched_ctx *ctx, - struct ir3_instruction *instr) -{ - struct ir3_instruction *srcs[ARRAY_SIZE(instr->regs) - 1]; - struct ir3_instruction *src; - unsigned i, delay, nsrcs = 0; - - /* if already scheduled: */ - if (instr->flags & IR3_INSTR_MARK) - return 0; - - /* figure out our src's: */ - for (i = 1; i < instr->regs_count; i++) { - struct ir3_register *reg = instr->regs[i]; - if (reg->flags & IR3_REG_SSA) - srcs[nsrcs++] = reg->instr; - } - - /* for each src register in sorted order: - */ - delay = 0; - while ((src = deepest(srcs, nsrcs))) { - delay = trysched(ctx, src); - if (delay) - return delay; - } - - /* all our dependents are scheduled, figure out if - * we have enough delay slots to schedule ourself: - */ - delay = delay_calc(ctx, instr); - if (delay) - return delay; - - /* if this is a write to address/predicate register, and that - * register is currently in use, we need to defer until it is - * free: - */ - if (writes_addr(instr) && ctx->addr) { - assert(ctx->addr != instr); - return DELAYED; - } - if (writes_pred(instr) && ctx->pred) { - assert(ctx->pred != instr); - return DELAYED; - } - - schedule(ctx, instr, true); - return SCHEDULED; -} - -static struct ir3_instruction * reverse(struct ir3_instruction *instr) -{ - struct ir3_instruction *reversed = NULL; - while (instr) { - struct ir3_instruction *next = instr->next; - instr->next = reversed; - reversed = instr; - instr = next; - } - return reversed; -} - -static bool uses_current_addr(struct ir3_sched_ctx *ctx, - struct ir3_instruction *instr) -{ - unsigned i; - for (i = 1; i < instr->regs_count; i++) { - struct ir3_register *reg = instr->regs[i]; - if (reg->flags & IR3_REG_SSA) { - if (is_addr(reg->instr)) { - struct ir3_instruction *addr; - addr = reg->instr->regs[1]->instr; /* the mova */ - if (ctx->addr == addr) - return true; - } - } - } - return false; -} - -static bool uses_current_pred(struct ir3_sched_ctx *ctx, - struct ir3_instruction *instr) -{ - unsigned i; - for (i = 1; i < 
instr->regs_count; i++) { - struct ir3_register *reg = instr->regs[i]; - if ((reg->flags & IR3_REG_SSA) && (ctx->pred == reg->instr)) - return true; - } - return false; -} - -/* when we encounter an instruction that writes to the address register - * when it is in use, we delay that instruction and try to schedule all - * other instructions using the current address register: - */ -static int block_sched_undelayed(struct ir3_sched_ctx *ctx, - struct ir3_block *block) -{ - struct ir3_instruction *instr = block->head; - bool addr_in_use = false; - bool pred_in_use = false; - unsigned cnt = ~0; - - while (instr) { - struct ir3_instruction *next = instr->next; - bool addr = uses_current_addr(ctx, instr); - bool pred = uses_current_pred(ctx, instr); - - if (addr || pred) { - int ret = trysched(ctx, instr); - if (ret == SCHEDULED) - cnt = 0; - else if (ret > 0) - cnt = MIN2(cnt, ret); - if (addr) - addr_in_use = true; - if (pred) - pred_in_use = true; - } - - instr = next; - } - - if (!addr_in_use) - ctx->addr = NULL; - - if (!pred_in_use) - ctx->pred = NULL; - - return cnt; -} - -static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block) -{ - struct ir3_instruction *instr; - - /* schedule all the shader input's (meta-instr) first so that - * the RA step sees that the input registers contain a value - * from the start of the shader: - */ - if (!block->parent) { - unsigned i; - for (i = 0; i < block->ninputs; i++) { - struct ir3_instruction *in = block->inputs[i]; - if (in) - schedule(ctx, in, true); - } - } - - while ((instr = block->head)) { - /* NOTE: always grab next *before* trysched(), in case the - * instruction is actually scheduled (and therefore moved - * from depth list into scheduled list) - */ - struct ir3_instruction *next = instr->next; - int cnt = trysched(ctx, instr); - - if (cnt == DELAYED) - cnt = block_sched_undelayed(ctx, block); - - /* -1 is signal to return up stack, but to us means same as 0: */ - cnt = MAX2(0, cnt); - cnt += ctx->cnt; - instr = next; - - /* if deepest remaining instruction cannot be scheduled, try - * the increasingly more shallow instructions until needed - * number of delay slots is filled: - */ - while (instr && (cnt > ctx->cnt)) { - next = instr->next; - trysched(ctx, instr); - instr = next; - } - - /* and if we run out of instructions that can be scheduled, - * then it is time for nop's: - */ - while (cnt > ctx->cnt) - schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false); - } - - /* at this point, scheduled list is in reverse order, so fix that: */ - block->head = reverse(ctx->scheduled); -} - -void ir3_block_sched(struct ir3_block *block) -{ - struct ir3_sched_ctx ctx = {0}; - ir3_clear_mark(block->shader); - block_sched(&ctx, block); -} diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_visitor.h b/src/gallium/drivers/freedreno/a3xx/ir3_visitor.h deleted file mode 100644 index 1c60d1620ca..00000000000 --- a/src/gallium/drivers/freedreno/a3xx/ir3_visitor.h +++ /dev/null @@ -1,154 +0,0 @@ -/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ - -/* - * Copyright (C) 2014 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject 
to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark - */ - -#ifndef IR3_VISITOR_H_ -#define IR3_VISITOR_H_ - -/** - * Visitor which follows dst to src relationships between instructions, - * first visiting the dst (writer) instruction, followed by src (reader) - * instruction(s). - * - * TODO maybe we want multiple different visitors to walk the - * graph in different ways? - */ - -struct ir3_visitor; - -typedef void (*ir3_visit_instr_func)(struct ir3_visitor *v, - struct ir3_instruction *instr); - -typedef void (*ir3_visit_reg_func)(struct ir3_visitor *v, - struct ir3_instruction *instr, struct ir3_register *reg); - -struct ir3_visitor_funcs { - ir3_visit_instr_func instr; // TODO do we need?? - - ir3_visit_reg_func dst_shader_input; - ir3_visit_reg_func dst_block_input; - ir3_visit_reg_func dst_fanout; - ir3_visit_reg_func dst_fanin; - ir3_visit_reg_func dst; - - ir3_visit_reg_func src_block_input; - ir3_visit_reg_func src_fanout; - ir3_visit_reg_func src_fanin; - ir3_visit_reg_func src; -}; - -struct ir3_visitor { - const struct ir3_visitor_funcs *funcs; - bool error; -}; - -#include "util/u_debug.h" - -static void visit_instr_dst(struct ir3_visitor *v, - struct ir3_instruction *instr) -{ - struct ir3_register *reg = instr->regs[0]; - - if (is_meta(instr)) { - switch (instr->opc) { - case OPC_META_INPUT: - if (instr->regs_count == 1) - v->funcs->dst_shader_input(v, instr, reg); - else - v->funcs->dst_block_input(v, instr, reg); - return; - case OPC_META_FO: - v->funcs->dst_fanout(v, instr, reg); - return; - case OPC_META_FI: - v->funcs->dst_fanin(v, instr, reg); - return; - default: - break; - - } - } - - v->funcs->dst(v, instr, reg); -} - -static void visit_instr_src(struct ir3_visitor *v, - struct ir3_instruction *instr, struct ir3_register *reg) -{ - if (is_meta(instr)) { - switch (instr->opc) { - case OPC_META_INPUT: - /* shader-input does not have a src, only block input: */ - debug_assert(instr->regs_count == 2); - v->funcs->src_block_input(v, instr, reg); - return; - case OPC_META_FO: - v->funcs->src_fanout(v, instr, reg); - return; - case OPC_META_FI: - v->funcs->src_fanin(v, instr, reg); - return; - default: - break; - - } - } - - v->funcs->src(v, instr, reg); -} - -static void ir3_visit_instr(struct ir3_visitor *v, - struct ir3_instruction *instr) -{ - struct ir3_instruction *n; - - /* visit instruction that assigns value: */ - if (instr->regs_count > 0) - visit_instr_dst(v, instr); - - /* and of any following instructions which read that value: */ - n = instr->next; - while (n && !v->error) { - unsigned i; - - for (i = 1; i < n->regs_count; i++) { - struct ir3_register *reg = n->regs[i]; - if ((reg->flags & IR3_REG_SSA) && (reg->instr == instr)) - visit_instr_src(v, n, reg); - } - - n = n->next; - } -} - -static void ir3_visit_reg(struct ir3_visitor *v, - struct ir3_instruction *instr, struct ir3_register *reg) -{ 
- /* no-op */ -} - -#endif /* IR3_VISITOR_H_ */ diff --git a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c new file mode 100644 index 00000000000..8c3704bf658 --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c @@ -0,0 +1,805 @@ +/* + * Copyright (c) 2013 Rob Clark + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include + +#include "disasm.h" +#include "instr-a3xx.h" + +static enum debug_t debug; + +#define printf debug_printf + +static const char *levels[] = { + "", + "\t", + "\t\t", + "\t\t\t", + "\t\t\t\t", + "\t\t\t\t\t", + "\t\t\t\t\t\t", + "\t\t\t\t\t\t\t", + "\t\t\t\t\t\t\t\t", + "\t\t\t\t\t\t\t\t\t", + "x", + "x", + "x", + "x", + "x", + "x", +}; + +static const char *component = "xyzw"; + +static const char *type[] = { + [TYPE_F16] = "f16", + [TYPE_F32] = "f32", + [TYPE_U16] = "u16", + [TYPE_U32] = "u32", + [TYPE_S16] = "s16", + [TYPE_S32] = "s32", + [TYPE_U8] = "u8", + [TYPE_S8] = "s8", +}; + +static void print_reg(reg_t reg, bool full, bool r, bool c, bool im, + bool neg, bool abs, bool addr_rel) +{ + const char type = c ? 'c' : 'r'; + + // XXX I prefer - and || for neg/abs, but preserving format used + // by libllvm-a3xx for easy diffing.. + + if (abs && neg) + printf("(absneg)"); + else if (neg) + printf("(neg)"); + else if (abs) + printf("(abs)"); + + if (r) + printf("(r)"); + + if (im) { + printf("%d", reg.iim_val); + } else if (addr_rel) { + /* I would just use %+d but trying to make it diff'able with + * libllvm-a3xx... + */ + if (reg.iim_val < 0) + printf("%s%c", full ? "" : "h", type, -reg.iim_val); + else if (reg.iim_val > 0) + printf("%s%c", full ? "" : "h", type, reg.iim_val); + else + printf("%s%c", full ? "" : "h", type); + } else if ((reg.num == REG_A0) && !c) { + printf("a0.%c", component[reg.comp]); + } else if ((reg.num == REG_P0) && !c) { + printf("p0.%c", component[reg.comp]); + } else { + printf("%s%c%d.%c", full ? 
"" : "h", type, reg.num, component[reg.comp]); + } +} + + +/* current instruction repeat flag: */ +static unsigned repeat; + +static void print_reg_dst(reg_t reg, bool full, bool addr_rel) +{ + print_reg(reg, full, false, false, false, false, false, addr_rel); +} + +static void print_reg_src(reg_t reg, bool full, bool r, bool c, bool im, + bool neg, bool abs, bool addr_rel) +{ + print_reg(reg, full, r, c, im, neg, abs, addr_rel); +} + +static void print_instr_cat0(instr_t *instr) +{ + instr_cat0_t *cat0 = &instr->cat0; + + switch (cat0->opc) { + case OPC_KILL: + printf(" %sp0.%c", cat0->inv ? "!" : "", + component[cat0->comp]); + break; + case OPC_BR: + printf(" %sp0.%c, #%d", cat0->inv ? "!" : "", + component[cat0->comp], cat0->immed); + break; + case OPC_JUMP: + case OPC_CALL: + printf(" #%d", cat0->immed); + break; + } + + if ((debug & PRINT_VERBOSE) && (cat0->dummy1|cat0->dummy2|cat0->dummy3|cat0->dummy4)) + printf("\t{0: %x,%x,%x,%x}", cat0->dummy1, cat0->dummy2, cat0->dummy3, cat0->dummy4); +} + +static void print_instr_cat1(instr_t *instr) +{ + instr_cat1_t *cat1 = &instr->cat1; + + if (cat1->ul) + printf("(ul)"); + + if (cat1->src_type == cat1->dst_type) { + if ((cat1->src_type == TYPE_S16) && (((reg_t)cat1->dst).num == REG_A0)) { + /* special case (nmemonic?): */ + printf("mova"); + } else { + printf("mov.%s%s", type[cat1->src_type], type[cat1->dst_type]); + } + } else { + printf("cov.%s%s", type[cat1->src_type], type[cat1->dst_type]); + } + + printf(" "); + + if (cat1->even) + printf("(even)"); + + if (cat1->pos_inf) + printf("(pos_infinity)"); + + print_reg_dst((reg_t)(cat1->dst), type_size(cat1->dst_type) == 32, + cat1->dst_rel); + + printf(", "); + + /* ugg, have to special case this.. vs print_reg().. */ + if (cat1->src_im) { + if (type_float(cat1->src_type)) + printf("(%f)", cat1->fim_val); + else + printf("%d", cat1->iim_val); + } else if (cat1->src_rel && !cat1->src_c) { + /* I would just use %+d but trying to make it diff'able with + * libllvm-a3xx... + */ + char type = cat1->src_rel_c ? 
'c' : 'r'; + if (cat1->off < 0) + printf("%c", type, -cat1->off); + else if (cat1->off > 0) + printf("%c", type, cat1->off); + else + printf("c"); + } else { + print_reg_src((reg_t)(cat1->src), type_size(cat1->src_type) == 32, + cat1->src_r, cat1->src_c, cat1->src_im, false, false, false); + } + + if ((debug & PRINT_VERBOSE) && (cat1->must_be_0)) + printf("\t{1: %x}", cat1->must_be_0); +} + +static void print_instr_cat2(instr_t *instr) +{ + instr_cat2_t *cat2 = &instr->cat2; + static const char *cond[] = { + "lt", + "le", + "gt", + "ge", + "eq", + "ne", + "?6?", + }; + + switch (cat2->opc) { + case OPC_CMPS_F: + case OPC_CMPS_U: + case OPC_CMPS_S: + case OPC_CMPV_F: + case OPC_CMPV_U: + case OPC_CMPV_S: + printf(".%s", cond[cat2->cond]); + break; + } + + printf(" "); + if (cat2->ei) + printf("(ei)"); + print_reg_dst((reg_t)(cat2->dst), cat2->full ^ cat2->dst_half, false); + printf(", "); + + if (cat2->c1.src1_c) { + print_reg_src((reg_t)(cat2->c1.src1), cat2->full, cat2->src1_r, + cat2->c1.src1_c, cat2->src1_im, cat2->src1_neg, + cat2->src1_abs, false); + } else if (cat2->rel1.src1_rel) { + print_reg_src((reg_t)(cat2->rel1.src1), cat2->full, cat2->src1_r, + cat2->rel1.src1_c, cat2->src1_im, cat2->src1_neg, + cat2->src1_abs, cat2->rel1.src1_rel); + } else { + print_reg_src((reg_t)(cat2->src1), cat2->full, cat2->src1_r, + false, cat2->src1_im, cat2->src1_neg, + cat2->src1_abs, false); + } + + switch (cat2->opc) { + case OPC_ABSNEG_F: + case OPC_ABSNEG_S: + case OPC_CLZ_B: + case OPC_CLZ_S: + case OPC_SIGN_F: + case OPC_FLOOR_F: + case OPC_CEIL_F: + case OPC_RNDNE_F: + case OPC_RNDAZ_F: + case OPC_TRUNC_F: + case OPC_NOT_B: + case OPC_BFREV_B: + case OPC_SETRM: + case OPC_CBITS_B: + /* these only have one src reg */ + break; + default: + printf(", "); + if (cat2->c2.src2_c) { + print_reg_src((reg_t)(cat2->c2.src2), cat2->full, cat2->src2_r, + cat2->c2.src2_c, cat2->src2_im, cat2->src2_neg, + cat2->src2_abs, false); + } else if (cat2->rel2.src2_rel) { + print_reg_src((reg_t)(cat2->rel2.src2), cat2->full, cat2->src2_r, + cat2->rel2.src2_c, cat2->src2_im, cat2->src2_neg, + cat2->src2_abs, cat2->rel2.src2_rel); + } else { + print_reg_src((reg_t)(cat2->src2), cat2->full, cat2->src2_r, + false, cat2->src2_im, cat2->src2_neg, + cat2->src2_abs, false); + } + break; + } +} + +static void print_instr_cat3(instr_t *instr) +{ + instr_cat3_t *cat3 = &instr->cat3; + bool full = instr_cat3_full(cat3); + + printf(" "); + print_reg_dst((reg_t)(cat3->dst), full ^ cat3->dst_half, false); + printf(", "); + if (cat3->c1.src1_c) { + print_reg_src((reg_t)(cat3->c1.src1), full, + cat3->src1_r, cat3->c1.src1_c, false, cat3->src1_neg, + false, false); + } else if (cat3->rel1.src1_rel) { + print_reg_src((reg_t)(cat3->rel1.src1), full, + cat3->src1_r, cat3->rel1.src1_c, false, cat3->src1_neg, + false, cat3->rel1.src1_rel); + } else { + print_reg_src((reg_t)(cat3->src1), full, + cat3->src1_r, false, false, cat3->src1_neg, + false, false); + } + printf(", "); + print_reg_src((reg_t)cat3->src2, full, + cat3->src2_r, cat3->src2_c, false, cat3->src2_neg, + false, false); + printf(", "); + if (cat3->c2.src3_c) { + print_reg_src((reg_t)(cat3->c2.src3), full, + cat3->src3_r, cat3->c2.src3_c, false, cat3->src3_neg, + false, false); + } else if (cat3->rel2.src3_rel) { + print_reg_src((reg_t)(cat3->rel2.src3), full, + cat3->src3_r, cat3->rel2.src3_c, false, cat3->src3_neg, + false, cat3->rel2.src3_rel); + } else { + print_reg_src((reg_t)(cat3->src3), full, + cat3->src3_r, false, false, cat3->src3_neg, + false, false); + } +} + 
+static void print_instr_cat4(instr_t *instr) +{ + instr_cat4_t *cat4 = &instr->cat4; + + printf(" "); + print_reg_dst((reg_t)(cat4->dst), cat4->full ^ cat4->dst_half, false); + printf(", "); + + if (cat4->c.src_c) { + print_reg_src((reg_t)(cat4->c.src), cat4->full, + cat4->src_r, cat4->c.src_c, cat4->src_im, + cat4->src_neg, cat4->src_abs, false); + } else if (cat4->rel.src_rel) { + print_reg_src((reg_t)(cat4->rel.src), cat4->full, + cat4->src_r, cat4->rel.src_c, cat4->src_im, + cat4->src_neg, cat4->src_abs, cat4->rel.src_rel); + } else { + print_reg_src((reg_t)(cat4->src), cat4->full, + cat4->src_r, false, cat4->src_im, + cat4->src_neg, cat4->src_abs, false); + } + + if ((debug & PRINT_VERBOSE) && (cat4->dummy1|cat4->dummy2)) + printf("\t{4: %x,%x}", cat4->dummy1, cat4->dummy2); +} + +static void print_instr_cat5(instr_t *instr) +{ + static const struct { + bool src1, src2, samp, tex; + } info[0x1f] = { + [OPC_ISAM] = { true, false, true, true, }, + [OPC_ISAML] = { true, true, true, true, }, + [OPC_ISAMM] = { true, false, true, true, }, + [OPC_SAM] = { true, false, true, true, }, + [OPC_SAMB] = { true, true, true, true, }, + [OPC_SAML] = { true, true, true, true, }, + [OPC_SAMGQ] = { true, false, true, true, }, + [OPC_GETLOD] = { true, false, true, true, }, + [OPC_CONV] = { true, true, true, true, }, + [OPC_CONVM] = { true, true, true, true, }, + [OPC_GETSIZE] = { true, false, false, true, }, + [OPC_GETBUF] = { false, false, false, true, }, + [OPC_GETPOS] = { true, false, false, true, }, + [OPC_GETINFO] = { false, false, false, true, }, + [OPC_DSX] = { true, false, false, false, }, + [OPC_DSY] = { true, false, false, false, }, + [OPC_GATHER4R] = { true, false, true, true, }, + [OPC_GATHER4G] = { true, false, true, true, }, + [OPC_GATHER4B] = { true, false, true, true, }, + [OPC_GATHER4A] = { true, false, true, true, }, + [OPC_SAMGP0] = { true, false, true, true, }, + [OPC_SAMGP1] = { true, false, true, true, }, + [OPC_SAMGP2] = { true, false, true, true, }, + [OPC_SAMGP3] = { true, false, true, true, }, + [OPC_DSXPP_1] = { true, false, false, false, }, + [OPC_DSYPP_1] = { true, false, false, false, }, + [OPC_RGETPOS] = { false, false, false, false, }, + [OPC_RGETINFO] = { false, false, false, false, }, + }; + instr_cat5_t *cat5 = &instr->cat5; + int i; + + if (cat5->is_3d) printf(".3d"); + if (cat5->is_a) printf(".a"); + if (cat5->is_o) printf(".o"); + if (cat5->is_p) printf(".p"); + if (cat5->is_s) printf(".s"); + if (cat5->is_s2en) printf(".s2en"); + + printf(" "); + + switch (cat5->opc) { + case OPC_DSXPP_1: + case OPC_DSYPP_1: + break; + default: + printf("(%s)", type[cat5->type]); + break; + } + + printf("("); + for (i = 0; i < 4; i++) + if (cat5->wrmask & (1 << i)) + printf("%c", "xyzw"[i]); + printf(")"); + + print_reg_dst((reg_t)(cat5->dst), type_size(cat5->type) == 32, false); + + if (info[cat5->opc].src1) { + printf(", "); + print_reg_src((reg_t)(cat5->src1), cat5->full, false, false, false, + false, false, false); + } + + if (cat5->is_s2en) { + printf(", "); + print_reg_src((reg_t)(cat5->s2en.src2), cat5->full, false, false, false, + false, false, false); + printf(", "); + print_reg_src((reg_t)(cat5->s2en.src3), false, false, false, false, + false, false, false); + } else { + if (cat5->is_o || info[cat5->opc].src2) { + printf(", "); + print_reg_src((reg_t)(cat5->norm.src2), cat5->full, + false, false, false, false, false, false); + } + if (info[cat5->opc].samp) + printf(", s#%d", cat5->norm.samp); + if (info[cat5->opc].tex) + printf(", t#%d", cat5->norm.tex); + } + + if (debug 
& PRINT_VERBOSE) { + if (cat5->is_s2en) { + if ((debug & PRINT_VERBOSE) && (cat5->s2en.dummy1|cat5->s2en.dummy2|cat5->dummy2)) + printf("\t{5: %x,%x,%x}", cat5->s2en.dummy1, cat5->s2en.dummy2, cat5->dummy2); + } else { + if ((debug & PRINT_VERBOSE) && (cat5->norm.dummy1|cat5->dummy2)) + printf("\t{5: %x,%x}", cat5->norm.dummy1, cat5->dummy2); + } + } +} + +static int32_t u2i(uint32_t val, int nbits) +{ + return ((val >> (nbits-1)) * ~((1 << nbits) - 1)) | val; +} + +static void print_instr_cat6(instr_t *instr) +{ + instr_cat6_t *cat6 = &instr->cat6; + + printf(".%s ", type[cat6->type]); + + switch (cat6->opc) { + case OPC_LDG: + case OPC_LDP: + case OPC_LDL: + case OPC_LDLW: + case OPC_LDLV: + /* load instructions: */ + print_reg_dst((reg_t)(cat6->a.dst), type_size(cat6->type) == 32, false); + printf(","); + switch (cat6->opc) { + case OPC_LDG: + printf("g"); + break; + case OPC_LDP: + printf("p"); + break; + case OPC_LDL: + case OPC_LDLW: + case OPC_LDLV: + printf("l"); + break; + } + printf("["); + print_reg_src((reg_t)(cat6->a.src), true, + false, false, false, false, false, false); + if (cat6->a.off) + printf("%+d", cat6->a.off); + printf("]"); + break; + case OPC_PREFETCH: + /* similar to load instructions: */ + printf("g["); + print_reg_src((reg_t)(cat6->a.src), true, + false, false, false, false, false, false); + if (cat6->a.off) + printf("%+d", cat6->a.off); + printf("]"); + break; + case OPC_STG: + case OPC_STP: + case OPC_STL: + case OPC_STLW: + /* store instructions: */ + switch (cat6->opc) { + case OPC_STG: + printf("g"); + break; + case OPC_STP: + printf("p"); + break; + case OPC_STL: + case OPC_STLW: + printf("l"); + break; + } + printf("["); + print_reg_dst((reg_t)(cat6->b.dst), true, false); + if (cat6->b.off || cat6->b.off_hi) + printf("%+d", u2i((cat6->b.off_hi << 8) | cat6->b.off, 13)); + printf("]"); + printf(","); + print_reg_src((reg_t)(cat6->b.src), type_size(cat6->type) == 32, + false, false, false, false, false, false); + + break; + case OPC_STI: + /* sti has same encoding as other store instructions, but + * slightly different syntax: + */ + print_reg_dst((reg_t)(cat6->b.dst), false /* XXX is it always half? */, false); + if (cat6->b.off || cat6->b.off_hi) + printf("%+d", u2i((cat6->b.off_hi << 8) | cat6->b.off, 13)); + printf(","); + print_reg_src((reg_t)(cat6->b.src), type_size(cat6->type) == 32, + false, false, false, false, false, false); + break; + } + + printf(", %d", cat6->iim_val); + + if (debug & PRINT_VERBOSE) { + switch (cat6->opc) { + case OPC_LDG: + case OPC_LDP: + /* load instructions: */ + if (cat6->a.dummy1|cat6->a.dummy2|cat6->a.dummy3) + printf("\t{6: %x,%x,%x}", cat6->a.dummy1, cat6->a.dummy2, cat6->a.dummy3); + if ((cat6->a.must_be_one1 != 1) || (cat6->a.must_be_one2 != 1)) + printf("{?? %d,%d ??}", cat6->a.must_be_one1, cat6->a.must_be_one2); + break; + case OPC_STG: + case OPC_STP: + case OPC_STI: + /* store instructions: */ + if (cat6->b.dummy1|cat6->b.dummy2) + printf("\t{6: %x,%x}", cat6->b.dummy1, cat6->b.dummy2); + if ((cat6->b.must_be_one1 != 1) || (cat6->b.must_be_one2 != 1) || + (cat6->b.must_be_zero1 != 0)) + printf("{?? 
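/*
 * (Illustrative aside, not part of the patch: a quick standalone check of
 * the sign-extension helper u2i() above, which widens an nbits-wide
 * two's-complement field to a signed 32-bit value.  ex_u2i is a copy made
 * only so this note is self-contained.)
 */
#include <stdint.h>
#include <assert.h>

static int32_t ex_u2i(uint32_t val, int nbits)
{
	return ((val >> (nbits - 1)) * ~((1 << nbits) - 1)) | val;
}

static void ex_u2i_demo(void)
{
	assert(ex_u2i(0x0005, 13) == 5);      /* sign bit clear: value unchanged */
	assert(ex_u2i(0x1fff, 13) == -1);     /* all-ones 13-bit field is -1     */
	assert(ex_u2i(0x1000, 13) == -4096);  /* most negative 13-bit offset     */
}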
%d,%d,%d ??}", cat6->b.must_be_one1, cat6->b.must_be_one2, + cat6->b.must_be_zero1); + break; + } + } +} + +/* size of largest OPC field of all the instruction categories: */ +#define NOPC_BITS 6 + +struct opc_info { + uint16_t cat; + uint16_t opc; + const char *name; + void (*print)(instr_t *instr); +} opcs[1 << (3+NOPC_BITS)] = { +#define OPC(cat, opc, name) [((cat) << NOPC_BITS) | (opc)] = { (cat), (opc), #name, print_instr_cat##cat } + /* category 0: */ + OPC(0, OPC_NOP, nop), + OPC(0, OPC_BR, br), + OPC(0, OPC_JUMP, jump), + OPC(0, OPC_CALL, call), + OPC(0, OPC_RET, ret), + OPC(0, OPC_KILL, kill), + OPC(0, OPC_END, end), + OPC(0, OPC_EMIT, emit), + OPC(0, OPC_CUT, cut), + OPC(0, OPC_CHMASK, chmask), + OPC(0, OPC_CHSH, chsh), + OPC(0, OPC_FLOW_REV, flow_rev), + + /* category 1: */ + OPC(1, 0, ), + + /* category 2: */ + OPC(2, OPC_ADD_F, add.f), + OPC(2, OPC_MIN_F, min.f), + OPC(2, OPC_MAX_F, max.f), + OPC(2, OPC_MUL_F, mul.f), + OPC(2, OPC_SIGN_F, sign.f), + OPC(2, OPC_CMPS_F, cmps.f), + OPC(2, OPC_ABSNEG_F, absneg.f), + OPC(2, OPC_CMPV_F, cmpv.f), + OPC(2, OPC_FLOOR_F, floor.f), + OPC(2, OPC_CEIL_F, ceil.f), + OPC(2, OPC_RNDNE_F, rndne.f), + OPC(2, OPC_RNDAZ_F, rndaz.f), + OPC(2, OPC_TRUNC_F, trunc.f), + OPC(2, OPC_ADD_U, add.u), + OPC(2, OPC_ADD_S, add.s), + OPC(2, OPC_SUB_U, sub.u), + OPC(2, OPC_SUB_S, sub.s), + OPC(2, OPC_CMPS_U, cmps.u), + OPC(2, OPC_CMPS_S, cmps.s), + OPC(2, OPC_MIN_U, min.u), + OPC(2, OPC_MIN_S, min.s), + OPC(2, OPC_MAX_U, max.u), + OPC(2, OPC_MAX_S, max.s), + OPC(2, OPC_ABSNEG_S, absneg.s), + OPC(2, OPC_AND_B, and.b), + OPC(2, OPC_OR_B, or.b), + OPC(2, OPC_NOT_B, not.b), + OPC(2, OPC_XOR_B, xor.b), + OPC(2, OPC_CMPV_U, cmpv.u), + OPC(2, OPC_CMPV_S, cmpv.s), + OPC(2, OPC_MUL_U, mul.u), + OPC(2, OPC_MUL_S, mul.s), + OPC(2, OPC_MULL_U, mull.u), + OPC(2, OPC_BFREV_B, bfrev.b), + OPC(2, OPC_CLZ_S, clz.s), + OPC(2, OPC_CLZ_B, clz.b), + OPC(2, OPC_SHL_B, shl.b), + OPC(2, OPC_SHR_B, shr.b), + OPC(2, OPC_ASHR_B, ashr.b), + OPC(2, OPC_BARY_F, bary.f), + OPC(2, OPC_MGEN_B, mgen.b), + OPC(2, OPC_GETBIT_B, getbit.b), + OPC(2, OPC_SETRM, setrm), + OPC(2, OPC_CBITS_B, cbits.b), + OPC(2, OPC_SHB, shb), + OPC(2, OPC_MSAD, msad), + + /* category 3: */ + OPC(3, OPC_MAD_U16, mad.u16), + OPC(3, OPC_MADSH_U16, madsh.u16), + OPC(3, OPC_MAD_S16, mad.s16), + OPC(3, OPC_MADSH_M16, madsh.m16), + OPC(3, OPC_MAD_U24, mad.u24), + OPC(3, OPC_MAD_S24, mad.s24), + OPC(3, OPC_MAD_F16, mad.f16), + OPC(3, OPC_MAD_F32, mad.f32), + OPC(3, OPC_SEL_B16, sel.b16), + OPC(3, OPC_SEL_B32, sel.b32), + OPC(3, OPC_SEL_S16, sel.s16), + OPC(3, OPC_SEL_S32, sel.s32), + OPC(3, OPC_SEL_F16, sel.f16), + OPC(3, OPC_SEL_F32, sel.f32), + OPC(3, OPC_SAD_S16, sad.s16), + OPC(3, OPC_SAD_S32, sad.s32), + + /* category 4: */ + OPC(4, OPC_RCP, rcp), + OPC(4, OPC_RSQ, rsq), + OPC(4, OPC_LOG2, log2), + OPC(4, OPC_EXP2, exp2), + OPC(4, OPC_SIN, sin), + OPC(4, OPC_COS, cos), + OPC(4, OPC_SQRT, sqrt), + + /* category 5: */ + OPC(5, OPC_ISAM, isam), + OPC(5, OPC_ISAML, isaml), + OPC(5, OPC_ISAMM, isamm), + OPC(5, OPC_SAM, sam), + OPC(5, OPC_SAMB, samb), + OPC(5, OPC_SAML, saml), + OPC(5, OPC_SAMGQ, samgq), + OPC(5, OPC_GETLOD, getlod), + OPC(5, OPC_CONV, conv), + OPC(5, OPC_CONVM, convm), + OPC(5, OPC_GETSIZE, getsize), + OPC(5, OPC_GETBUF, getbuf), + OPC(5, OPC_GETPOS, getpos), + OPC(5, OPC_GETINFO, getinfo), + OPC(5, OPC_DSX, dsx), + OPC(5, OPC_DSY, dsy), + OPC(5, OPC_GATHER4R, gather4r), + OPC(5, OPC_GATHER4G, gather4g), + OPC(5, OPC_GATHER4B, gather4b), + OPC(5, OPC_GATHER4A, gather4a), + OPC(5, OPC_SAMGP0, samgp0), + 
OPC(5, OPC_SAMGP1, samgp1), + OPC(5, OPC_SAMGP2, samgp2), + OPC(5, OPC_SAMGP3, samgp3), + OPC(5, OPC_DSXPP_1, dsxpp.1), + OPC(5, OPC_DSYPP_1, dsypp.1), + OPC(5, OPC_RGETPOS, rgetpos), + OPC(5, OPC_RGETINFO, rgetinfo), + + + /* category 6: */ + OPC(6, OPC_LDG, ldg), + OPC(6, OPC_LDL, ldl), + OPC(6, OPC_LDP, ldp), + OPC(6, OPC_STG, stg), + OPC(6, OPC_STL, stl), + OPC(6, OPC_STP, stp), + OPC(6, OPC_STI, sti), + OPC(6, OPC_G2L, g2l), + OPC(6, OPC_L2G, l2g), + OPC(6, OPC_PREFETCH, prefetch), + OPC(6, OPC_LDLW, ldlw), + OPC(6, OPC_STLW, stlw), + OPC(6, OPC_RESFMT, resfmt), + OPC(6, OPC_RESINFO, resinf), + OPC(6, OPC_ATOMIC_ADD_L, atomic.add.l), + OPC(6, OPC_ATOMIC_SUB_L, atomic.sub.l), + OPC(6, OPC_ATOMIC_XCHG_L, atomic.xchg.l), + OPC(6, OPC_ATOMIC_INC_L, atomic.inc.l), + OPC(6, OPC_ATOMIC_DEC_L, atomic.dec.l), + OPC(6, OPC_ATOMIC_CMPXCHG_L, atomic.cmpxchg.l), + OPC(6, OPC_ATOMIC_MIN_L, atomic.min.l), + OPC(6, OPC_ATOMIC_MAX_L, atomic.max.l), + OPC(6, OPC_ATOMIC_AND_L, atomic.and.l), + OPC(6, OPC_ATOMIC_OR_L, atomic.or.l), + OPC(6, OPC_ATOMIC_XOR_L, atomic.xor.l), + OPC(6, OPC_LDGB_TYPED_4D, ldgb.typed.4d), + OPC(6, OPC_STGB_4D_4, stgb.4d.4), + OPC(6, OPC_STIB, stib), + OPC(6, OPC_LDC_4, ldc.4), + OPC(6, OPC_LDLV, ldlv), + + +#undef OPC +}; + +#define GETINFO(instr) (&(opcs[((instr)->opc_cat << NOPC_BITS) | instr_opc(instr)])) + +// XXX hack.. probably should move this table somewhere common: +#include "ir3.h" +const char *ir3_instr_name(struct ir3_instruction *instr) +{ + if (instr->category == -1) return "??meta??"; + return opcs[(instr->category << NOPC_BITS) | instr->opc].name; +} + +static void print_instr(uint32_t *dwords, int level, int n) +{ + instr_t *instr = (instr_t *)dwords; + uint32_t opc = instr_opc(instr); + const char *name; + + printf("%s%04d[%08xx_%08xx] ", levels[level], n, dwords[1], dwords[0]); + +#if 0 + /* print unknown bits: */ + if (debug & PRINT_RAW) + printf("[%08xx_%08xx] ", dwords[1] & 0x001ff800, dwords[0] & 0x00000000); + + if (debug & PRINT_VERBOSE) + printf("%d,%02d ", instr->opc_cat, opc); +#endif + + /* NOTE: order flags are printed is a bit fugly.. but for now I + * try to match the order in llvm-a3xx disassembler for easy + * diff'ing.. 
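/*
 * (Illustrative aside, not part of the patch: how the opcs[] table above is
 * indexed -- see the OPC() and GETINFO() macros.  The 3-bit category and
 * the per-category opcode are packed into a single index, category in the
 * high bits.  ex_opc_index() is an invented helper for illustration.)
 */
#define EX_NOPC_BITS 6

static unsigned ex_opc_index(unsigned cat, unsigned opc)
{
	return (cat << EX_NOPC_BITS) | opc;
}
/* e.g. category 2, OPC_ADD_F (0) -> index 128; category 3, OPC_MAD_F32 (7) -> 199 */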
+ */ + + if (instr->sync) + printf("(sy)"); + if (instr->ss && (instr->opc_cat <= 4)) + printf("(ss)"); + if (instr->jmp_tgt) + printf("(jp)"); + if (instr->repeat && (instr->opc_cat <= 4)) { + printf("(rpt%d)", instr->repeat); + repeat = instr->repeat; + } else { + repeat = 0; + } + if (instr->ul && ((2 <= instr->opc_cat) && (instr->opc_cat <= 4))) + printf("(ul)"); + + name = GETINFO(instr)->name; + + if (name) { + printf("%s", name); + GETINFO(instr)->print(instr); + } else { + printf("unknown(%d,%d)", instr->opc_cat, opc); + } + + printf("\n"); +} + +int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, enum shader_t type) +{ + int i; + + assert((sizedwords % 2) == 0); + + for (i = 0; i < sizedwords; i += 2) + print_instr(&dwords[i], level, i/2); + + return 0; +} diff --git a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h new file mode 100644 index 00000000000..c67f1037ced --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h @@ -0,0 +1,691 @@ +/* + * Copyright (c) 2013 Rob Clark + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef INSTR_A3XX_H_ +#define INSTR_A3XX_H_ + +#define PACKED __attribute__((__packed__)) + +#include +#include + +typedef enum { + /* category 0: */ + OPC_NOP = 0, + OPC_BR = 1, + OPC_JUMP = 2, + OPC_CALL = 3, + OPC_RET = 4, + OPC_KILL = 5, + OPC_END = 6, + OPC_EMIT = 7, + OPC_CUT = 8, + OPC_CHMASK = 9, + OPC_CHSH = 10, + OPC_FLOW_REV = 11, + + /* category 1: */ + /* no opc.. 
all category 1 are variants of mov */ + + /* category 2: */ + OPC_ADD_F = 0, + OPC_MIN_F = 1, + OPC_MAX_F = 2, + OPC_MUL_F = 3, + OPC_SIGN_F = 4, + OPC_CMPS_F = 5, + OPC_ABSNEG_F = 6, + OPC_CMPV_F = 7, + /* 8 - invalid */ + OPC_FLOOR_F = 9, + OPC_CEIL_F = 10, + OPC_RNDNE_F = 11, + OPC_RNDAZ_F = 12, + OPC_TRUNC_F = 13, + /* 14-15 - invalid */ + OPC_ADD_U = 16, + OPC_ADD_S = 17, + OPC_SUB_U = 18, + OPC_SUB_S = 19, + OPC_CMPS_U = 20, + OPC_CMPS_S = 21, + OPC_MIN_U = 22, + OPC_MIN_S = 23, + OPC_MAX_U = 24, + OPC_MAX_S = 25, + OPC_ABSNEG_S = 26, + /* 27 - invalid */ + OPC_AND_B = 28, + OPC_OR_B = 29, + OPC_NOT_B = 30, + OPC_XOR_B = 31, + /* 32 - invalid */ + OPC_CMPV_U = 33, + OPC_CMPV_S = 34, + /* 35-47 - invalid */ + OPC_MUL_U = 48, + OPC_MUL_S = 49, + OPC_MULL_U = 50, + OPC_BFREV_B = 51, + OPC_CLZ_S = 52, + OPC_CLZ_B = 53, + OPC_SHL_B = 54, + OPC_SHR_B = 55, + OPC_ASHR_B = 56, + OPC_BARY_F = 57, + OPC_MGEN_B = 58, + OPC_GETBIT_B = 59, + OPC_SETRM = 60, + OPC_CBITS_B = 61, + OPC_SHB = 62, + OPC_MSAD = 63, + + /* category 3: */ + OPC_MAD_U16 = 0, + OPC_MADSH_U16 = 1, + OPC_MAD_S16 = 2, + OPC_MADSH_M16 = 3, /* should this be .s16? */ + OPC_MAD_U24 = 4, + OPC_MAD_S24 = 5, + OPC_MAD_F16 = 6, + OPC_MAD_F32 = 7, + OPC_SEL_B16 = 8, + OPC_SEL_B32 = 9, + OPC_SEL_S16 = 10, + OPC_SEL_S32 = 11, + OPC_SEL_F16 = 12, + OPC_SEL_F32 = 13, + OPC_SAD_S16 = 14, + OPC_SAD_S32 = 15, + + /* category 4: */ + OPC_RCP = 0, + OPC_RSQ = 1, + OPC_LOG2 = 2, + OPC_EXP2 = 3, + OPC_SIN = 4, + OPC_COS = 5, + OPC_SQRT = 6, + // 7-63 - invalid + + /* category 5: */ + OPC_ISAM = 0, + OPC_ISAML = 1, + OPC_ISAMM = 2, + OPC_SAM = 3, + OPC_SAMB = 4, + OPC_SAML = 5, + OPC_SAMGQ = 6, + OPC_GETLOD = 7, + OPC_CONV = 8, + OPC_CONVM = 9, + OPC_GETSIZE = 10, + OPC_GETBUF = 11, + OPC_GETPOS = 12, + OPC_GETINFO = 13, + OPC_DSX = 14, + OPC_DSY = 15, + OPC_GATHER4R = 16, + OPC_GATHER4G = 17, + OPC_GATHER4B = 18, + OPC_GATHER4A = 19, + OPC_SAMGP0 = 20, + OPC_SAMGP1 = 21, + OPC_SAMGP2 = 22, + OPC_SAMGP3 = 23, + OPC_DSXPP_1 = 24, + OPC_DSYPP_1 = 25, + OPC_RGETPOS = 26, + OPC_RGETINFO = 27, + + /* category 6: */ + OPC_LDG = 0, /* load-global */ + OPC_LDL = 1, + OPC_LDP = 2, + OPC_STG = 3, /* store-global */ + OPC_STL = 4, + OPC_STP = 5, + OPC_STI = 6, + OPC_G2L = 7, + OPC_L2G = 8, + OPC_PREFETCH = 9, + OPC_LDLW = 10, + OPC_STLW = 11, + OPC_RESFMT = 14, + OPC_RESINFO = 15, + OPC_ATOMIC_ADD_L = 16, + OPC_ATOMIC_SUB_L = 17, + OPC_ATOMIC_XCHG_L = 18, + OPC_ATOMIC_INC_L = 19, + OPC_ATOMIC_DEC_L = 20, + OPC_ATOMIC_CMPXCHG_L = 21, + OPC_ATOMIC_MIN_L = 22, + OPC_ATOMIC_MAX_L = 23, + OPC_ATOMIC_AND_L = 24, + OPC_ATOMIC_OR_L = 25, + OPC_ATOMIC_XOR_L = 26, + OPC_LDGB_TYPED_4D = 27, + OPC_STGB_4D_4 = 28, + OPC_STIB = 29, + OPC_LDC_4 = 30, + OPC_LDLV = 31, + + /* meta instructions (category -1): */ + /* placeholder instr to mark inputs/outputs: */ + OPC_META_INPUT = 0, + OPC_META_OUTPUT = 1, + /* The "fan-in" and "fan-out" instructions are used for keeping + * track of instructions that write to multiple dst registers + * (fan-out) like texture sample instructions, or read multiple + * consecutive scalar registers (fan-in) (bary.f, texture samp) + */ + OPC_META_FO = 2, + OPC_META_FI = 3, + /* branches/flow control */ + OPC_META_FLOW = 4, + OPC_META_PHI = 5, + /* relative addressing */ + OPC_META_DEREF = 6, + + +} opc_t; + +typedef enum { + TYPE_F16 = 0, + TYPE_F32 = 1, + TYPE_U16 = 2, + TYPE_U32 = 3, + TYPE_S16 = 4, + TYPE_S32 = 5, + TYPE_U8 = 6, + TYPE_S8 = 7, // XXX I assume? 
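+
+ /* sizes follow type_size() below: F32/U32/S32 are 32 bit,
+ * F16/U16/S16 are 16 bit, U8/S8 are 8 bit. On the IR side anything
+ * narrower than 32 bits ends up flagged IR3_REG_HALF, e.g. (sketch,
+ * see type_flags() in ir3.c):
+ *
+ *    type_flags(TYPE_F16)  ->  IR3_REG_HALF
+ *    type_flags(TYPE_U32)  ->  0
+ */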
+} type_t; + +static inline uint32_t type_size(type_t type) +{ + switch (type) { + case TYPE_F32: + case TYPE_U32: + case TYPE_S32: + return 32; + case TYPE_F16: + case TYPE_U16: + case TYPE_S16: + return 16; + case TYPE_U8: + case TYPE_S8: + return 8; + default: + assert(0); /* invalid type */ + return 0; + } +} + +static inline int type_float(type_t type) +{ + return (type == TYPE_F32) || (type == TYPE_F16); +} + +static inline int type_uint(type_t type) +{ + return (type == TYPE_U32) || (type == TYPE_U16) || (type == TYPE_U8); +} + +static inline int type_sint(type_t type) +{ + return (type == TYPE_S32) || (type == TYPE_S16) || (type == TYPE_S8); +} + +typedef union PACKED { + /* normal gpr or const src register: */ + struct PACKED { + uint32_t comp : 2; + uint32_t num : 10; + }; + /* for immediate val: */ + int32_t iim_val : 11; + /* to make compiler happy: */ + uint32_t dummy32; + uint32_t dummy10 : 10; + uint32_t dummy11 : 11; + uint32_t dummy12 : 12; + uint32_t dummy13 : 13; + uint32_t dummy8 : 8; +} reg_t; + +/* special registers: */ +#define REG_A0 61 /* address register */ +#define REG_P0 62 /* predicate register */ + +static inline int reg_special(reg_t reg) +{ + return (reg.num == REG_A0) || (reg.num == REG_P0); +} + +typedef struct PACKED { + /* dword0: */ + int16_t immed : 16; + uint32_t dummy1 : 16; + + /* dword1: */ + uint32_t dummy2 : 8; + uint32_t repeat : 3; + uint32_t dummy3 : 1; + uint32_t ss : 1; + uint32_t dummy4 : 7; + uint32_t inv : 1; + uint32_t comp : 2; + uint32_t opc : 4; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat0_t; + +typedef struct PACKED { + /* dword0: */ + union PACKED { + /* for normal src register: */ + struct PACKED { + uint32_t src : 11; + /* at least low bit of pad must be zero or it will + * look like a address relative src + */ + uint32_t pad : 21; + }; + /* for address relative: */ + struct PACKED { + int32_t off : 10; + uint32_t src_rel_c : 1; + uint32_t src_rel : 1; + uint32_t unknown : 20; + }; + /* for immediate: */ + int32_t iim_val; + float fim_val; + }; + + /* dword1: */ + uint32_t dst : 8; + uint32_t repeat : 3; + uint32_t src_r : 1; + uint32_t ss : 1; + uint32_t ul : 1; + uint32_t dst_type : 3; + uint32_t dst_rel : 1; + uint32_t src_type : 3; + uint32_t src_c : 1; + uint32_t src_im : 1; + uint32_t even : 1; + uint32_t pos_inf : 1; + uint32_t must_be_0 : 2; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat1_t; + +typedef struct PACKED { + /* dword0: */ + union PACKED { + struct PACKED { + uint32_t src1 : 11; + uint32_t must_be_zero1: 2; + uint32_t src1_im : 1; /* immediate */ + uint32_t src1_neg : 1; /* negate */ + uint32_t src1_abs : 1; /* absolute value */ + }; + struct PACKED { + uint32_t src1 : 10; + uint32_t src1_c : 1; /* relative-const */ + uint32_t src1_rel : 1; /* relative address */ + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel1; + struct PACKED { + uint32_t src1 : 12; + uint32_t src1_c : 1; /* const */ + uint32_t dummy : 3; + } c1; + }; + + union PACKED { + struct PACKED { + uint32_t src2 : 11; + uint32_t must_be_zero2: 2; + uint32_t src2_im : 1; /* immediate */ + uint32_t src2_neg : 1; /* negate */ + uint32_t src2_abs : 1; /* absolute value */ + }; + struct PACKED { + uint32_t src2 : 10; + uint32_t src2_c : 1; /* relative-const */ + uint32_t src2_rel : 1; /* relative address */ + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel2; + struct PACKED { + uint32_t src2 : 12; + uint32_t src2_c : 1; /* const */ + uint32_t dummy : 3; + } c2; + }; 
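+ };
+
+ /* Each src union above has three views aliasing the same dword0
+ * bits; which one gets used mirrors emit_cat2() in ir3.c, roughly:
+ *
+ *    if (src->flags & IR3_REG_RELATIV)     -> relN view (10 bit offset)
+ *    else if (src->flags & IR3_REG_CONST)  -> cN view   (12 bit const #)
+ *    else                                  -> plain srcN (11 bit gpr/immed)
+ */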
+ + /* dword1: */ + uint32_t dst : 8; + uint32_t repeat : 3; + uint32_t src1_r : 1; + uint32_t ss : 1; + uint32_t ul : 1; /* dunno */ + uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */ + uint32_t ei : 1; + uint32_t cond : 3; + uint32_t src2_r : 1; + uint32_t full : 1; /* not half */ + uint32_t opc : 6; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat2_t; + +typedef struct PACKED { + /* dword0: */ + union PACKED { + struct PACKED { + uint32_t src1 : 11; + uint32_t must_be_zero1: 2; + uint32_t src2_c : 1; + uint32_t src1_neg : 1; + uint32_t src2_r : 1; + }; + struct PACKED { + uint32_t src1 : 10; + uint32_t src1_c : 1; + uint32_t src1_rel : 1; + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel1; + struct PACKED { + uint32_t src1 : 12; + uint32_t src1_c : 1; + uint32_t dummy : 3; + } c1; + }; + + union PACKED { + struct PACKED { + uint32_t src3 : 11; + uint32_t must_be_zero2: 2; + uint32_t src3_r : 1; + uint32_t src2_neg : 1; + uint32_t src3_neg : 1; + }; + struct PACKED { + uint32_t src3 : 10; + uint32_t src3_c : 1; + uint32_t src3_rel : 1; + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel2; + struct PACKED { + uint32_t src3 : 12; + uint32_t src3_c : 1; + uint32_t dummy : 3; + } c2; + }; + + /* dword1: */ + uint32_t dst : 8; + uint32_t repeat : 3; + uint32_t src1_r : 1; + uint32_t ss : 1; + uint32_t ul : 1; + uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */ + uint32_t src2 : 8; + uint32_t opc : 4; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat3_t; + +static inline bool instr_cat3_full(instr_cat3_t *cat3) +{ + switch (cat3->opc) { + case OPC_MAD_F16: + case OPC_MAD_U16: + case OPC_MAD_S16: + case OPC_SEL_B16: + case OPC_SEL_S16: + case OPC_SEL_F16: + case OPC_SAD_S16: + case OPC_SAD_S32: // really?? + return false; + default: + return true; + } +} + +typedef struct PACKED { + /* dword0: */ + union PACKED { + struct PACKED { + uint32_t src : 11; + uint32_t must_be_zero1: 2; + uint32_t src_im : 1; /* immediate */ + uint32_t src_neg : 1; /* negate */ + uint32_t src_abs : 1; /* absolute value */ + }; + struct PACKED { + uint32_t src : 10; + uint32_t src_c : 1; /* relative-const */ + uint32_t src_rel : 1; /* relative address */ + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel; + struct PACKED { + uint32_t src : 12; + uint32_t src_c : 1; /* const */ + uint32_t dummy : 3; + } c; + }; + uint32_t dummy1 : 16; /* seem to be ignored */ + + /* dword1: */ + uint32_t dst : 8; + uint32_t repeat : 3; + uint32_t src_r : 1; + uint32_t ss : 1; + uint32_t ul : 1; + uint32_t dst_half : 1; /* or widen/narrow.. ie. 
dst hrN <-> rN */ + uint32_t dummy2 : 5; /* seem to be ignored */ + uint32_t full : 1; /* not half */ + uint32_t opc : 6; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat4_t; + +typedef struct PACKED { + /* dword0: */ + union PACKED { + /* normal case: */ + struct PACKED { + uint32_t full : 1; /* not half */ + uint32_t src1 : 8; + uint32_t src2 : 8; + uint32_t dummy1 : 4; /* seem to be ignored */ + uint32_t samp : 4; + uint32_t tex : 7; + } norm; + /* s2en case: */ + struct PACKED { + uint32_t full : 1; /* not half */ + uint32_t src1 : 8; + uint32_t src2 : 11; + uint32_t dummy1 : 1; + uint32_t src3 : 8; + uint32_t dummy2 : 3; + } s2en; + /* same in either case: */ + // XXX I think, confirm this + struct PACKED { + uint32_t full : 1; /* not half */ + uint32_t src1 : 8; + uint32_t pad : 23; + }; + }; + + /* dword1: */ + uint32_t dst : 8; + uint32_t wrmask : 4; /* write-mask */ + uint32_t type : 3; + uint32_t dummy2 : 1; /* seems to be ignored */ + uint32_t is_3d : 1; + + uint32_t is_a : 1; + uint32_t is_s : 1; + uint32_t is_s2en : 1; + uint32_t is_o : 1; + uint32_t is_p : 1; + + uint32_t opc : 5; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat5_t; + +/* used for load instructions: */ +typedef struct PACKED { + /* dword0: */ + uint32_t must_be_one1 : 1; + int16_t off : 13; + uint32_t src : 8; + uint32_t dummy1 : 1; + uint32_t must_be_one2 : 1; + int32_t iim_val : 8; + + /* dword1: */ + uint32_t dst : 8; + uint32_t dummy2 : 9; + uint32_t type : 3; + uint32_t dummy3 : 2; + uint32_t opc : 5; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat6a_t; + +/* used for store instructions: */ +typedef struct PACKED { + /* dword0: */ + uint32_t must_be_zero1 : 1; + uint32_t src : 8; + uint32_t off_hi : 5; /* high bits of 'off'... ugly! */ + uint32_t dummy1 : 9; + uint32_t must_be_one1 : 1; + int32_t iim_val : 8; + + /* dword1: */ + uint16_t off : 8; + uint32_t must_be_one2 : 1; + uint32_t dst : 8; + uint32_t type : 3; + uint32_t dummy2 : 2; + uint32_t opc : 5; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat6b_t; + +typedef union PACKED { + instr_cat6a_t a; + instr_cat6b_t b; + struct PACKED { + /* dword0: */ + uint32_t pad1 : 24; + int32_t iim_val : 8; + + /* dword1: */ + uint32_t pad2 : 17; + uint32_t type : 3; + uint32_t pad3 : 2; + uint32_t opc : 5; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; + }; +} instr_cat6_t; + +typedef union PACKED { + instr_cat0_t cat0; + instr_cat1_t cat1; + instr_cat2_t cat2; + instr_cat3_t cat3; + instr_cat4_t cat4; + instr_cat5_t cat5; + instr_cat6_t cat6; + struct PACKED { + /* dword0: */ + uint64_t pad1 : 40; + uint32_t repeat : 3; /* cat0-cat4 */ + uint32_t pad2 : 1; + uint32_t ss : 1; /* cat1-cat4 (cat0??) */ + uint32_t ul : 1; /* cat2-cat4 (and cat1 in blob.. which may be bug??) 
*/ + uint32_t pad3 : 13; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; + + }; +} instr_t; + +static inline uint32_t instr_opc(instr_t *instr) +{ + switch (instr->opc_cat) { + case 0: return instr->cat0.opc; + case 1: return 0; + case 2: return instr->cat2.opc; + case 3: return instr->cat3.opc; + case 4: return instr->cat4.opc; + case 5: return instr->cat5.opc; + case 6: return instr->cat6.opc; + default: return 0; + } +} + +static inline bool is_mad(opc_t opc) +{ + switch (opc) { + case OPC_MAD_U16: + case OPC_MADSH_U16: + case OPC_MAD_S16: + case OPC_MADSH_M16: + case OPC_MAD_U24: + case OPC_MAD_S24: + case OPC_MAD_F16: + case OPC_MAD_F32: + return true; + default: + return false; + } +} + +#endif /* INSTR_A3XX_H_ */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c new file mode 100644 index 00000000000..ea2a9251b28 --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3.c @@ -0,0 +1,675 @@ +/* + * Copyright (c) 2012 Rob Clark + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ir3.h" + +#include +#include +#include +#include +#include +#include + +#include "freedreno_util.h" +#include "instr-a3xx.h" + +#define CHUNK_SZ 1020 + +struct ir3_heap_chunk { + struct ir3_heap_chunk *next; + uint32_t heap[CHUNK_SZ]; +}; + +static void grow_heap(struct ir3 *shader) +{ + struct ir3_heap_chunk *chunk = calloc(1, sizeof(*chunk)); + chunk->next = shader->chunk; + shader->chunk = chunk; + shader->heap_idx = 0; +} + +/* simple allocator to carve allocations out of an up-front allocated heap, + * so that we can free everything easily in one shot. 
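+ *
+ * Rough sketch of the intended usage (hypothetical caller, just to
+ * show the lifetime):
+ *
+ *    struct ir3 *ir = ir3_create();
+ *    struct ir3_register *r = ir3_alloc(ir, sizeof(*r));
+ *    ...
+ *    ir3_destroy(ir);
+ *
+ * ir3_destroy() walks the chunk list and frees every allocation in
+ * one go; there is no per-object free.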
+ */ +void * ir3_alloc(struct ir3 *shader, int sz) +{ + void *ptr; + + sz = align(sz, 4) / 4; + + if ((shader->heap_idx + sz) > CHUNK_SZ) + grow_heap(shader); + + ptr = &shader->chunk->heap[shader->heap_idx]; + shader->heap_idx += sz; + + return ptr; +} + +struct ir3 * ir3_create(void) +{ + struct ir3 *shader = + calloc(1, sizeof(struct ir3)); + grow_heap(shader); + return shader; +} + +void ir3_destroy(struct ir3 *shader) +{ + while (shader->chunk) { + struct ir3_heap_chunk *chunk = shader->chunk; + shader->chunk = chunk->next; + free(chunk); + } + free(shader); +} + +#define iassert(cond) do { \ + if (!(cond)) { \ + assert(cond); \ + return -1; \ + } } while (0) + +static uint32_t reg(struct ir3_register *reg, struct ir3_info *info, + uint32_t repeat, uint32_t valid_flags) +{ + reg_t val = { .dummy32 = 0 }; + + assert(!(reg->flags & ~valid_flags)); + + if (!(reg->flags & IR3_REG_R)) + repeat = 0; + + if (reg->flags & IR3_REG_IMMED) { + val.iim_val = reg->iim_val; + } else { + int8_t components = util_last_bit(reg->wrmask); + int8_t max = (reg->num + repeat + components - 1) >> 2; + + val.comp = reg->num & 0x3; + val.num = reg->num >> 2; + + if (reg->flags & IR3_REG_CONST) { + info->max_const = MAX2(info->max_const, max); + } else if ((max != REG_A0) && (max != REG_P0)) { + if (reg->flags & IR3_REG_HALF) { + info->max_half_reg = MAX2(info->max_half_reg, max); + } else { + info->max_reg = MAX2(info->max_reg, max); + } + } + } + + return val.dummy32; +} + +static int emit_cat0(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + instr_cat0_t *cat0 = ptr; + + cat0->immed = instr->cat0.immed; + cat0->repeat = instr->repeat; + cat0->ss = !!(instr->flags & IR3_INSTR_SS); + cat0->inv = instr->cat0.inv; + cat0->comp = instr->cat0.comp; + cat0->opc = instr->opc; + cat0->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat0->sync = !!(instr->flags & IR3_INSTR_SY); + cat0->opc_cat = 0; + + return 0; +} + +static uint32_t type_flags(type_t type) +{ + return (type_size(type) == 32) ? 
0 : IR3_REG_HALF; +} + +static int emit_cat1(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src = instr->regs[1]; + instr_cat1_t *cat1 = ptr; + + iassert(instr->regs_count == 2); + iassert(!((dst->flags ^ type_flags(instr->cat1.dst_type)) & IR3_REG_HALF)); + iassert((src->flags & IR3_REG_IMMED) || + !((src->flags ^ type_flags(instr->cat1.src_type)) & IR3_REG_HALF)); + + if (src->flags & IR3_REG_IMMED) { + cat1->iim_val = src->iim_val; + cat1->src_im = 1; + } else if (src->flags & IR3_REG_RELATIV) { + cat1->off = src->offset; + cat1->src_rel = 1; + cat1->src_rel_c = !!(src->flags & IR3_REG_CONST); + } else { + cat1->src = reg(src, info, instr->repeat, + IR3_REG_IMMED | IR3_REG_R | + IR3_REG_CONST | IR3_REG_HALF); + cat1->src_c = !!(src->flags & IR3_REG_CONST); + } + + cat1->dst = reg(dst, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_EVEN | + IR3_REG_R | IR3_REG_POS_INF | IR3_REG_HALF); + cat1->repeat = instr->repeat; + cat1->src_r = !!(src->flags & IR3_REG_R); + cat1->ss = !!(instr->flags & IR3_INSTR_SS); + cat1->ul = !!(instr->flags & IR3_INSTR_UL); + cat1->dst_type = instr->cat1.dst_type; + cat1->dst_rel = !!(dst->flags & IR3_REG_RELATIV); + cat1->src_type = instr->cat1.src_type; + cat1->even = !!(dst->flags & IR3_REG_EVEN); + cat1->pos_inf = !!(dst->flags & IR3_REG_POS_INF); + cat1->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat1->sync = !!(instr->flags & IR3_INSTR_SY); + cat1->opc_cat = 1; + + return 0; +} + +static int emit_cat2(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src1 = instr->regs[1]; + struct ir3_register *src2 = instr->regs[2]; + instr_cat2_t *cat2 = ptr; + + iassert((instr->regs_count == 2) || (instr->regs_count == 3)); + + if (src1->flags & IR3_REG_RELATIV) { + iassert(src1->num < (1 << 10)); + cat2->rel1.src1 = reg(src1, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE | + IR3_REG_ABS | IR3_REG_R | IR3_REG_HALF); + cat2->rel1.src1_c = !!(src1->flags & IR3_REG_CONST); + cat2->rel1.src1_rel = 1; + } else if (src1->flags & IR3_REG_CONST) { + iassert(src1->num < (1 << 12)); + cat2->c1.src1 = reg(src1, info, instr->repeat, + IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_ABS | + IR3_REG_R | IR3_REG_HALF); + cat2->c1.src1_c = 1; + } else { + iassert(src1->num < (1 << 11)); + cat2->src1 = reg(src1, info, instr->repeat, + IR3_REG_IMMED | IR3_REG_NEGATE | IR3_REG_ABS | + IR3_REG_R | IR3_REG_HALF); + } + cat2->src1_im = !!(src1->flags & IR3_REG_IMMED); + cat2->src1_neg = !!(src1->flags & IR3_REG_NEGATE); + cat2->src1_abs = !!(src1->flags & IR3_REG_ABS); + cat2->src1_r = !!(src1->flags & IR3_REG_R); + + if (src2) { + iassert((src2->flags & IR3_REG_IMMED) || + !((src1->flags ^ src2->flags) & IR3_REG_HALF)); + + if (src2->flags & IR3_REG_RELATIV) { + iassert(src2->num < (1 << 10)); + cat2->rel2.src2 = reg(src2, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE | + IR3_REG_ABS | IR3_REG_R | IR3_REG_HALF); + cat2->rel2.src2_c = !!(src2->flags & IR3_REG_CONST); + cat2->rel2.src2_rel = 1; + } else if (src2->flags & IR3_REG_CONST) { + iassert(src2->num < (1 << 12)); + cat2->c2.src2 = reg(src2, info, instr->repeat, + IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_ABS | + IR3_REG_R | IR3_REG_HALF); + cat2->c2.src2_c = 1; + } else { + iassert(src2->num < (1 << 11)); + cat2->src2 = reg(src2, info, instr->repeat, + IR3_REG_IMMED | IR3_REG_NEGATE | IR3_REG_ABS | + IR3_REG_R 
| IR3_REG_HALF); + } + + cat2->src2_im = !!(src2->flags & IR3_REG_IMMED); + cat2->src2_neg = !!(src2->flags & IR3_REG_NEGATE); + cat2->src2_abs = !!(src2->flags & IR3_REG_ABS); + cat2->src2_r = !!(src2->flags & IR3_REG_R); + } + + cat2->dst = reg(dst, info, instr->repeat, + IR3_REG_R | IR3_REG_EI | IR3_REG_HALF); + cat2->repeat = instr->repeat; + cat2->ss = !!(instr->flags & IR3_INSTR_SS); + cat2->ul = !!(instr->flags & IR3_INSTR_UL); + cat2->dst_half = !!((src1->flags ^ dst->flags) & IR3_REG_HALF); + cat2->ei = !!(dst->flags & IR3_REG_EI); + cat2->cond = instr->cat2.condition; + cat2->full = ! (src1->flags & IR3_REG_HALF); + cat2->opc = instr->opc; + cat2->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat2->sync = !!(instr->flags & IR3_INSTR_SY); + cat2->opc_cat = 2; + + return 0; +} + +static int emit_cat3(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src1 = instr->regs[1]; + struct ir3_register *src2 = instr->regs[2]; + struct ir3_register *src3 = instr->regs[3]; + instr_cat3_t *cat3 = ptr; + uint32_t src_flags = 0; + + switch (instr->opc) { + case OPC_MAD_F16: + case OPC_MAD_U16: + case OPC_MAD_S16: + case OPC_SEL_B16: + case OPC_SEL_S16: + case OPC_SEL_F16: + case OPC_SAD_S16: + case OPC_SAD_S32: // really?? + src_flags |= IR3_REG_HALF; + break; + default: + break; + } + + iassert(instr->regs_count == 4); + iassert(!((src1->flags ^ src_flags) & IR3_REG_HALF)); + iassert(!((src2->flags ^ src_flags) & IR3_REG_HALF)); + iassert(!((src3->flags ^ src_flags) & IR3_REG_HALF)); + + if (src1->flags & IR3_REG_RELATIV) { + iassert(src1->num < (1 << 10)); + cat3->rel1.src1 = reg(src1, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE | + IR3_REG_R | IR3_REG_HALF); + cat3->rel1.src1_c = !!(src1->flags & IR3_REG_CONST); + cat3->rel1.src1_rel = 1; + } else if (src1->flags & IR3_REG_CONST) { + iassert(src1->num < (1 << 12)); + cat3->c1.src1 = reg(src1, info, instr->repeat, + IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_R | + IR3_REG_HALF); + cat3->c1.src1_c = 1; + } else { + iassert(src1->num < (1 << 11)); + cat3->src1 = reg(src1, info, instr->repeat, + IR3_REG_NEGATE | IR3_REG_R | IR3_REG_HALF); + } + + cat3->src1_neg = !!(src1->flags & IR3_REG_NEGATE); + cat3->src1_r = !!(src1->flags & IR3_REG_R); + + cat3->src2 = reg(src2, info, instr->repeat, + IR3_REG_CONST | IR3_REG_NEGATE | + IR3_REG_R | IR3_REG_HALF); + cat3->src2_c = !!(src2->flags & IR3_REG_CONST); + cat3->src2_neg = !!(src2->flags & IR3_REG_NEGATE); + cat3->src2_r = !!(src2->flags & IR3_REG_R); + + + if (src3->flags & IR3_REG_RELATIV) { + iassert(src3->num < (1 << 10)); + cat3->rel2.src3 = reg(src3, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE | + IR3_REG_R | IR3_REG_HALF); + cat3->rel2.src3_c = !!(src3->flags & IR3_REG_CONST); + cat3->rel2.src3_rel = 1; + } else if (src3->flags & IR3_REG_CONST) { + iassert(src3->num < (1 << 12)); + cat3->c2.src3 = reg(src3, info, instr->repeat, + IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_R | + IR3_REG_HALF); + cat3->c2.src3_c = 1; + } else { + iassert(src3->num < (1 << 11)); + cat3->src3 = reg(src3, info, instr->repeat, + IR3_REG_NEGATE | IR3_REG_R | IR3_REG_HALF); + } + + cat3->src3_neg = !!(src3->flags & IR3_REG_NEGATE); + cat3->src3_r = !!(src3->flags & IR3_REG_R); + + cat3->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + cat3->repeat = instr->repeat; + cat3->ss = !!(instr->flags & IR3_INSTR_SS); + cat3->ul = !!(instr->flags & IR3_INSTR_UL); + 
cat3->dst_half = !!((src_flags ^ dst->flags) & IR3_REG_HALF); + cat3->opc = instr->opc; + cat3->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat3->sync = !!(instr->flags & IR3_INSTR_SY); + cat3->opc_cat = 3; + + return 0; +} + +static int emit_cat4(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src = instr->regs[1]; + instr_cat4_t *cat4 = ptr; + + iassert(instr->regs_count == 2); + + if (src->flags & IR3_REG_RELATIV) { + iassert(src->num < (1 << 10)); + cat4->rel.src = reg(src, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE | + IR3_REG_ABS | IR3_REG_R | IR3_REG_HALF); + cat4->rel.src_c = !!(src->flags & IR3_REG_CONST); + cat4->rel.src_rel = 1; + } else if (src->flags & IR3_REG_CONST) { + iassert(src->num < (1 << 12)); + cat4->c.src = reg(src, info, instr->repeat, + IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_ABS | + IR3_REG_R | IR3_REG_HALF); + cat4->c.src_c = 1; + } else { + iassert(src->num < (1 << 11)); + cat4->src = reg(src, info, instr->repeat, + IR3_REG_IMMED | IR3_REG_NEGATE | IR3_REG_ABS | + IR3_REG_R | IR3_REG_HALF); + } + + cat4->src_im = !!(src->flags & IR3_REG_IMMED); + cat4->src_neg = !!(src->flags & IR3_REG_NEGATE); + cat4->src_abs = !!(src->flags & IR3_REG_ABS); + cat4->src_r = !!(src->flags & IR3_REG_R); + + cat4->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + cat4->repeat = instr->repeat; + cat4->ss = !!(instr->flags & IR3_INSTR_SS); + cat4->ul = !!(instr->flags & IR3_INSTR_UL); + cat4->dst_half = !!((src->flags ^ dst->flags) & IR3_REG_HALF); + cat4->full = ! (src->flags & IR3_REG_HALF); + cat4->opc = instr->opc; + cat4->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat4->sync = !!(instr->flags & IR3_INSTR_SY); + cat4->opc_cat = 4; + + return 0; +} + +static int emit_cat5(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src1 = instr->regs[1]; + struct ir3_register *src2 = instr->regs[2]; + struct ir3_register *src3 = instr->regs[3]; + instr_cat5_t *cat5 = ptr; + + iassert(!((dst->flags ^ type_flags(instr->cat5.type)) & IR3_REG_HALF)); + + if (src1) { + cat5->full = ! 
(src1->flags & IR3_REG_HALF); + cat5->src1 = reg(src1, info, instr->repeat, IR3_REG_HALF); + } + + + if (instr->flags & IR3_INSTR_S2EN) { + if (src2) { + iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF)); + cat5->s2en.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF); + } + if (src3) { + iassert(src3->flags & IR3_REG_HALF); + cat5->s2en.src3 = reg(src3, info, instr->repeat, IR3_REG_HALF); + } + iassert(!(instr->cat5.samp | instr->cat5.tex)); + } else { + iassert(!src3); + if (src2) { + iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF)); + cat5->norm.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF); + } + cat5->norm.samp = instr->cat5.samp; + cat5->norm.tex = instr->cat5.tex; + } + + cat5->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + cat5->wrmask = dst->wrmask; + cat5->type = instr->cat5.type; + cat5->is_3d = !!(instr->flags & IR3_INSTR_3D); + cat5->is_a = !!(instr->flags & IR3_INSTR_A); + cat5->is_s = !!(instr->flags & IR3_INSTR_S); + cat5->is_s2en = !!(instr->flags & IR3_INSTR_S2EN); + cat5->is_o = !!(instr->flags & IR3_INSTR_O); + cat5->is_p = !!(instr->flags & IR3_INSTR_P); + cat5->opc = instr->opc; + cat5->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat5->sync = !!(instr->flags & IR3_INSTR_SY); + cat5->opc_cat = 5; + + return 0; +} + +static int emit_cat6(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src = instr->regs[1]; + instr_cat6_t *cat6 = ptr; + + iassert(instr->regs_count == 2); + + switch (instr->opc) { + /* load instructions: */ + case OPC_LDG: + case OPC_LDP: + case OPC_LDL: + case OPC_LDLW: + case OPC_LDLV: + case OPC_PREFETCH: { + instr_cat6a_t *cat6a = ptr; + + iassert(!((dst->flags ^ type_flags(instr->cat6.type)) & IR3_REG_HALF)); + + cat6a->must_be_one1 = 1; + cat6a->must_be_one2 = 1; + cat6a->off = instr->cat6.offset; + cat6a->src = reg(src, info, instr->repeat, 0); + cat6a->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + break; + } + /* store instructions: */ + case OPC_STG: + case OPC_STP: + case OPC_STL: + case OPC_STLW: + case OPC_STI: { + instr_cat6b_t *cat6b = ptr; + uint32_t src_flags = type_flags(instr->cat6.type); + uint32_t dst_flags = (instr->opc == OPC_STI) ? 
IR3_REG_HALF : 0; + + iassert(!((src->flags ^ src_flags) & IR3_REG_HALF)); + + cat6b->must_be_one1 = 1; + cat6b->must_be_one2 = 1; + cat6b->src = reg(src, info, instr->repeat, src_flags); + cat6b->off_hi = instr->cat6.offset >> 8; + cat6b->off = instr->cat6.offset; + cat6b->dst = reg(dst, info, instr->repeat, IR3_REG_R | dst_flags); + + break; + } + default: + // TODO + break; + } + + cat6->iim_val = instr->cat6.iim_val; + cat6->type = instr->cat6.type; + cat6->opc = instr->opc; + cat6->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat6->sync = !!(instr->flags & IR3_INSTR_SY); + cat6->opc_cat = 6; + + return 0; +} + +static int (*emit[])(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) = { + emit_cat0, emit_cat1, emit_cat2, emit_cat3, emit_cat4, emit_cat5, emit_cat6, +}; + +void * ir3_assemble(struct ir3 *shader, struct ir3_info *info) +{ + uint32_t *ptr, *dwords; + uint32_t i; + + info->max_reg = -1; + info->max_half_reg = -1; + info->max_const = -1; + info->instrs_count = 0; + + /* need a integer number of instruction "groups" (sets of four + * instructions), so pad out w/ NOPs if needed: + * (each instruction is 64bits) + */ + info->sizedwords = 2 * align(shader->instrs_count, 4); + + ptr = dwords = calloc(1, 4 * info->sizedwords); + + for (i = 0; i < shader->instrs_count; i++) { + struct ir3_instruction *instr = shader->instrs[i]; + int ret = emit[instr->category](instr, dwords, info); + if (ret) + goto fail; + info->instrs_count += 1 + instr->repeat; + dwords += 2; + } + + return ptr; + +fail: + free(ptr); + return NULL; +} + +static struct ir3_register * reg_create(struct ir3 *shader, + int num, int flags) +{ + struct ir3_register *reg = + ir3_alloc(shader, sizeof(struct ir3_register)); + reg->wrmask = 1; + reg->flags = flags; + reg->num = num; + return reg; +} + +static void insert_instr(struct ir3 *shader, + struct ir3_instruction *instr) +{ +#ifdef DEBUG + static uint32_t serialno = 0; + instr->serialno = ++serialno; +#endif + if (shader->instrs_count == shader->instrs_sz) { + shader->instrs_sz = MAX2(2 * shader->instrs_sz, 16); + shader->instrs = realloc(shader->instrs, + shader->instrs_sz * sizeof(shader->instrs[0])); + } + shader->instrs[shader->instrs_count++] = instr; +} + +struct ir3_block * ir3_block_create(struct ir3 *shader, + unsigned ntmp, unsigned nin, unsigned nout) +{ + struct ir3_block *block; + unsigned size; + char *ptr; + + size = sizeof(*block); + size += sizeof(block->temporaries[0]) * ntmp; + size += sizeof(block->inputs[0]) * nin; + size += sizeof(block->outputs[0]) * nout; + + ptr = ir3_alloc(shader, size); + + block = (void *)ptr; + ptr += sizeof(*block); + + block->temporaries = (void *)ptr; + block->ntemporaries = ntmp; + ptr += sizeof(block->temporaries[0]) * ntmp; + + block->inputs = (void *)ptr; + block->ninputs = nin; + ptr += sizeof(block->inputs[0]) * nin; + + block->outputs = (void *)ptr; + block->noutputs = nout; + ptr += sizeof(block->outputs[0]) * nout; + + block->shader = shader; + + return block; +} + +struct ir3_instruction * ir3_instr_create(struct ir3_block *block, + int category, opc_t opc) +{ + struct ir3_instruction *instr = + ir3_alloc(block->shader, sizeof(struct ir3_instruction)); + instr->block = block; + instr->category = category; + instr->opc = opc; + insert_instr(block->shader, instr); + return instr; +} + +struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr) +{ + struct ir3_instruction *new_instr = + ir3_alloc(instr->block->shader, sizeof(struct ir3_instruction)); + unsigned i; + + *new_instr 
= *instr; + insert_instr(instr->block->shader, new_instr); + + /* clone registers: */ + new_instr->regs_count = 0; + for (i = 0; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + struct ir3_register *new_reg = + ir3_reg_create(new_instr, reg->num, reg->flags); + *new_reg = *reg; + } + + return new_instr; +} + +struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, + int num, int flags) +{ + struct ir3_register *reg = reg_create(instr->block->shader, num, flags); + assert(instr->regs_count < ARRAY_SIZE(instr->regs)); + instr->regs[instr->regs_count++] = reg; + return reg; +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h new file mode 100644 index 00000000000..9ed914ba2e4 --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3.h @@ -0,0 +1,480 @@ +/* + * Copyright (c) 2013 Rob Clark + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IR3_H_ +#define IR3_H_ + +#include +#include + +#include "instr-a3xx.h" +#include "disasm.h" /* TODO move 'enum shader_t' somewhere else.. */ + +/* low level intermediate representation of an adreno shader program */ + +struct ir3; +struct ir3_instruction; +struct ir3_block; + +struct ir3 * fd_asm_parse(const char *src); + +struct ir3_info { + uint16_t sizedwords; + uint16_t instrs_count; /* expanded to account for rpt's */ + /* NOTE: max_reg, etc, does not include registers not touched + * by the shader (ie. vertex fetched via VFD_DECODE but not + * touched by shader) + */ + int8_t max_reg; /* highest GPR # used by shader */ + int8_t max_half_reg; + int8_t max_const; +}; + +struct ir3_register { + enum { + IR3_REG_CONST = 0x001, + IR3_REG_IMMED = 0x002, + IR3_REG_HALF = 0x004, + IR3_REG_RELATIV= 0x008, + IR3_REG_R = 0x010, + IR3_REG_NEGATE = 0x020, + IR3_REG_ABS = 0x040, + IR3_REG_EVEN = 0x080, + IR3_REG_POS_INF= 0x100, + /* (ei) flag, end-input? Set on last bary, presumably to signal + * that the shader needs no more input: + */ + IR3_REG_EI = 0x200, + /* meta-flags, for intermediate stages of IR, ie. 
+ * before register assignment is done: + */ + IR3_REG_SSA = 0x1000, /* 'instr' is ptr to assigning instr */ + IR3_REG_IA = 0x2000, /* meta-input dst is "assigned" */ + IR3_REG_ADDR = 0x4000, /* register is a0.x */ + } flags; + union { + /* normal registers: + * the component is in the low two bits of the reg #, so + * rN.x becomes: (N << 2) | x + */ + int num; + /* immediate: */ + int iim_val; + float fim_val; + /* relative: */ + int offset; + /* for IR3_REG_SSA, src registers contain ptr back to + * assigning instruction. + */ + struct ir3_instruction *instr; + }; + + /* used for cat5 instructions, but also for internal/IR level + * tracking of what registers are read/written by an instruction. + * wrmask may be a bad name since it is used to represent both + * src and dst that touch multiple adjacent registers. + */ + int wrmask; +}; + +struct ir3_instruction { + struct ir3_block *block; + int category; + opc_t opc; + enum { + /* (sy) flag is set on first instruction, and after sample + * instructions (probably just on RAW hazard). + */ + IR3_INSTR_SY = 0x001, + /* (ss) flag is set on first instruction, and first instruction + * to depend on the result of "long" instructions (RAW hazard): + * + * rcp, rsq, log2, exp2, sin, cos, sqrt + * + * It seems to synchronize until all in-flight instructions are + * completed, for example: + * + * rsq hr1.w, hr1.w + * add.f hr2.z, (neg)hr2.z, hc0.y + * mul.f hr2.w, (neg)hr2.y, (neg)hr2.y + * rsq hr2.x, hr2.x + * (rpt1)nop + * mad.f16 hr2.w, hr2.z, hr2.z, hr2.w + * nop + * mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w + * (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w + * (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x + * + * The last mul.f does not have (ss) set, presumably because the + * (ss) on the previous instruction does the job. + * + * The blob driver also seems to set it on WAR hazards, although + * not really clear if this is needed or just blob compiler being + * sloppy. So far I haven't found a case where removing the (ss) + * causes problems for WAR hazard, but I could just be getting + * lucky: + * + * rcp r1.y, r3.y + * (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z + * + */ + IR3_INSTR_SS = 0x002, + /* (jp) flag is set on jump targets: + */ + IR3_INSTR_JP = 0x004, + IR3_INSTR_UL = 0x008, + IR3_INSTR_3D = 0x010, + IR3_INSTR_A = 0x020, + IR3_INSTR_O = 0x040, + IR3_INSTR_P = 0x080, + IR3_INSTR_S = 0x100, + IR3_INSTR_S2EN = 0x200, + /* meta-flags, for intermediate stages of IR, ie. + * before register assignment is done: + */ + IR3_INSTR_MARK = 0x1000, + } flags; + int repeat; + unsigned regs_count; + struct ir3_register *regs[5]; + union { + struct { + char inv; + char comp; + int immed; + } cat0; + struct { + type_t src_type, dst_type; + } cat1; + struct { + enum { + IR3_COND_LT = 0, + IR3_COND_LE = 1, + IR3_COND_GT = 2, + IR3_COND_GE = 3, + IR3_COND_EQ = 4, + IR3_COND_NE = 5, + } condition; + } cat2; + struct { + unsigned samp, tex; + type_t type; + } cat5; + struct { + type_t type; + int offset; + int iim_val; + } cat6; + /* for meta-instructions, just used to hold extra data + * before instruction scheduling, etc + */ + struct { + int off; /* component/offset */ + } fo; + struct { + struct ir3_block *if_block, *else_block; + } flow; + struct { + struct ir3_block *block; + } inout; + }; + + /* transient values used during various algorithms: */ + union { + /* The instruction depth is the max dependency distance to output. + * + * You can also think of it as the "cost", if we did any sort of + * optimization for register footprint. Ie. 
a value that is just + * result of moving a const to a reg would have a low cost, so to + * it could make sense to duplicate the instruction at various + * points where the result is needed to reduce register footprint. + */ + unsigned depth; + }; + struct ir3_instruction *next; +#ifdef DEBUG + uint32_t serialno; +#endif +}; + +struct ir3_heap_chunk; + +struct ir3 { + unsigned instrs_count, instrs_sz; + struct ir3_instruction **instrs; + unsigned heap_idx; + struct ir3_heap_chunk *chunk; +}; + +struct ir3_block { + struct ir3 *shader; + unsigned ntemporaries, ninputs, noutputs; + /* maps TGSI_FILE_TEMPORARY index back to the assigning instruction: */ + struct ir3_instruction **temporaries; + struct ir3_instruction **inputs; + struct ir3_instruction **outputs; + /* only a single address register: */ + struct ir3_instruction *address; + struct ir3_block *parent; + struct ir3_instruction *head; +}; + +struct ir3 * ir3_create(void); +void ir3_destroy(struct ir3 *shader); +void * ir3_assemble(struct ir3 *shader, + struct ir3_info *info); +void * ir3_alloc(struct ir3 *shader, int sz); + +struct ir3_block * ir3_block_create(struct ir3 *shader, + unsigned ntmp, unsigned nin, unsigned nout); + +struct ir3_instruction * ir3_instr_create(struct ir3_block *block, + int category, opc_t opc); +struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr); +const char *ir3_instr_name(struct ir3_instruction *instr); + +struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, + int num, int flags); + + +static inline bool ir3_instr_check_mark(struct ir3_instruction *instr) +{ + if (instr->flags & IR3_INSTR_MARK) + return true; /* already visited */ + instr->flags ^= IR3_INSTR_MARK; + return false; +} + +static inline void ir3_clear_mark(struct ir3 *shader) +{ + /* TODO would be nice to drop the instruction array.. for + * new compiler, _clear_mark() is all we use it for, and + * we could probably manage a linked list instead.. 
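+ *
+ * Typical pass shape, as a sketch of how the mark bit is meant to
+ * be used (names illustrative, not an actual pass in this patch):
+ *
+ *    ir3_clear_mark(shader);
+ *    ...walk the IR, and in the visitor:
+ *    if (ir3_instr_check_mark(instr))
+ *        return;   // already handled this instruction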
+ */ + unsigned i; + for (i = 0; i < shader->instrs_count; i++) { + struct ir3_instruction *instr = shader->instrs[i]; + instr->flags &= ~IR3_INSTR_MARK; + } +} + +static inline int ir3_instr_regno(struct ir3_instruction *instr, + struct ir3_register *reg) +{ + unsigned i; + for (i = 0; i < instr->regs_count; i++) + if (reg == instr->regs[i]) + return i; + return -1; +} + + +/* comp: + * 0 - x + * 1 - y + * 2 - z + * 3 - w + */ +static inline uint32_t regid(int num, int comp) +{ + return (num << 2) | (comp & 0x3); +} + +static inline uint32_t reg_num(struct ir3_register *reg) +{ + return reg->num >> 2; +} + +static inline uint32_t reg_comp(struct ir3_register *reg) +{ + return reg->num & 0x3; +} + +static inline bool is_flow(struct ir3_instruction *instr) +{ + return (instr->category == 0); +} + +static inline bool is_kill(struct ir3_instruction *instr) +{ + return is_flow(instr) && (instr->opc == OPC_KILL); +} + +static inline bool is_nop(struct ir3_instruction *instr) +{ + return is_flow(instr) && (instr->opc == OPC_NOP); +} + +static inline bool is_alu(struct ir3_instruction *instr) +{ + return (1 <= instr->category) && (instr->category <= 3); +} + +static inline bool is_sfu(struct ir3_instruction *instr) +{ + return (instr->category == 4); +} + +static inline bool is_tex(struct ir3_instruction *instr) +{ + return (instr->category == 5); +} + +static inline bool is_input(struct ir3_instruction *instr) +{ + return (instr->category == 2) && (instr->opc == OPC_BARY_F); +} + +static inline bool is_meta(struct ir3_instruction *instr) +{ + /* TODO how should we count PHI (and maybe fan-in/out) which + * might actually contribute some instructions to the final + * result? + */ + return (instr->category == -1); +} + +static inline bool is_addr(struct ir3_instruction *instr) +{ + return is_meta(instr) && (instr->opc == OPC_META_DEREF); +} + +static inline bool writes_addr(struct ir3_instruction *instr) +{ + if (instr->regs_count > 0) { + struct ir3_register *dst = instr->regs[0]; + return !!(dst->flags & IR3_REG_ADDR); + } + return false; +} + +static inline bool writes_pred(struct ir3_instruction *instr) +{ + if (instr->regs_count > 0) { + struct ir3_register *dst = instr->regs[0]; + return reg_num(dst) == REG_P0; + } + return false; +} + +static inline bool reg_gpr(struct ir3_register *r) +{ + if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_SSA | IR3_REG_ADDR)) + return false; + if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0)) + return false; + return true; +} + +/* dump: */ +#include +void ir3_dump(struct ir3 *shader, const char *name, + struct ir3_block *block /* XXX maybe 'block' ptr should move to ir3? 
*/, + FILE *f); +void ir3_dump_instr_single(struct ir3_instruction *instr); +void ir3_dump_instr_list(struct ir3_instruction *instr); + +/* flatten if/else: */ +int ir3_block_flatten(struct ir3_block *block); + +/* depth calculation: */ +int ir3_delayslots(struct ir3_instruction *assigner, + struct ir3_instruction *consumer, unsigned n); +void ir3_block_depth(struct ir3_block *block); + +/* copy-propagate: */ +void ir3_block_cp(struct ir3_block *block); + +/* scheduling: */ +void ir3_block_sched(struct ir3_block *block); + +/* register assignment: */ +int ir3_block_ra(struct ir3_block *block, enum shader_t type, + bool half_precision, bool frag_coord, bool frag_face, + bool *has_samp); + +#ifndef ARRAY_SIZE +# define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) +#endif + +/* ************************************************************************* */ +/* split this out or find some helper to use.. like main/bitset.h.. */ + +#include + +#define MAX_REG 256 + +typedef uint8_t regmask_t[2 * MAX_REG / 8]; + +static inline unsigned regmask_idx(struct ir3_register *reg) +{ + unsigned num = reg->num; + assert(num < MAX_REG); + if (reg->flags & IR3_REG_HALF) + num += MAX_REG; + return num; +} + +static inline void regmask_init(regmask_t *regmask) +{ + memset(regmask, 0, sizeof(*regmask)); +} + +static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg) +{ + unsigned idx = regmask_idx(reg); + unsigned i; + for (i = 0; i < 4; i++, idx++) + if (reg->wrmask & (1 << i)) + (*regmask)[idx / 8] |= 1 << (idx % 8); +} + +/* set bits in a if not set in b, conceptually: + * a |= (reg & ~b) + */ +static inline void regmask_set_if_not(regmask_t *a, + struct ir3_register *reg, regmask_t *b) +{ + unsigned idx = regmask_idx(reg); + unsigned i; + for (i = 0; i < 4; i++, idx++) + if (reg->wrmask & (1 << i)) + if (!((*b)[idx / 8] & (1 << (idx % 8)))) + (*a)[idx / 8] |= 1 << (idx % 8); +} + +static inline unsigned regmask_get(regmask_t *regmask, + struct ir3_register *reg) +{ + unsigned idx = regmask_idx(reg); + unsigned i; + for (i = 0; i < 4; i++, idx++) + if (reg->wrmask & (1 << i)) + if ((*regmask)[idx / 8] & (1 << (idx % 8))) + return true; + return false; +} + +/* ************************************************************************* */ + +#endif /* IR3_H_ */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c new file mode 100644 index 00000000000..1fa2fd4e389 --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c @@ -0,0 +1,2639 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2013 Rob Clark + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark + */ + +#include + +#include "pipe/p_state.h" +#include "util/u_string.h" +#include "util/u_memory.h" +#include "util/u_inlines.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_ureg.h" +#include "tgsi/tgsi_info.h" +#include "tgsi/tgsi_strings.h" +#include "tgsi/tgsi_dump.h" +#include "tgsi/tgsi_scan.h" + +#include "freedreno_lowering.h" +#include "freedreno_util.h" + +#include "ir3_compiler.h" +#include "ir3_shader.h" + +#include "instr-a3xx.h" +#include "ir3.h" + +struct ir3_compile_context { + const struct tgsi_token *tokens; + bool free_tokens; + struct ir3 *ir; + struct ir3_shader_variant *so; + + struct ir3_block *block; + struct ir3_instruction *current_instr; + + /* we need to defer updates to block->outputs[] until the end + * of an instruction (so we don't see new value until *after* + * the src registers are processed) + */ + struct { + struct ir3_instruction *instr, **instrp; + } output_updates[16]; + unsigned num_output_updates; + + /* are we in a sequence of "atomic" instructions? + */ + bool atomic; + + /* For fragment shaders, from the hw perspective the only + * actual input is r0.xy position register passed to bary.f. + * But TGSI doesn't know that, it still declares things as + * IN[] registers. So we do all the input tracking normally + * and fix things up after compile_instructions() + * + * NOTE that frag_pos is the hardware position (possibly it + * is actually an index or tag or some such.. it is *not* + * values that can be directly used for gl_FragCoord..) 
+ */ + struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4]; + + struct tgsi_parse_context parser; + unsigned type; + + struct tgsi_shader_info info; + + /* for calculating input/output positions/linkages: */ + unsigned next_inloc; + + unsigned num_internal_temps; + struct tgsi_src_register internal_temps[6]; + + /* idx/slot for last compiler generated immediate */ + unsigned immediate_idx; + + /* stack of branch instructions that mark (potentially nested) + * branch if/else/loop/etc + */ + struct { + struct ir3_instruction *instr, *cond; + bool inv; /* true iff in else leg of branch */ + } branch[16]; + unsigned int branch_count; + + /* list of kill instructions: */ + struct ir3_instruction *kill[16]; + unsigned int kill_count; + + /* used when dst is same as one of the src, to avoid overwriting a + * src element before the remaining scalar instructions that make + * up the vector operation + */ + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; +}; + + +static void vectorize(struct ir3_compile_context *ctx, + struct ir3_instruction *instr, struct tgsi_dst_register *dst, + int nsrcs, ...); +static void create_mov(struct ir3_compile_context *ctx, + struct tgsi_dst_register *dst, struct tgsi_src_register *src); +static type_t get_ftype(struct ir3_compile_context *ctx); + +static unsigned +compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so, + const struct tgsi_token *tokens) +{ + unsigned ret; + struct tgsi_shader_info *info = &ctx->info; + const struct fd_lowering_config lconfig = { + .color_two_side = so->key.color_two_side, + .lower_DST = true, + .lower_XPD = true, + .lower_SCS = true, + .lower_LRP = true, + .lower_FRC = true, + .lower_POW = true, + .lower_LIT = true, + .lower_EXP = true, + .lower_LOG = true, + .lower_DP4 = true, + .lower_DP3 = true, + .lower_DPH = true, + .lower_DP2 = true, + .lower_DP2A = true, + }; + + ctx->tokens = fd_transform_lowering(&lconfig, tokens, &ctx->info); + ctx->free_tokens = !!ctx->tokens; + if (!ctx->tokens) { + /* no lowering */ + ctx->tokens = tokens; + } + ctx->ir = so->ir; + ctx->so = so; + ctx->next_inloc = 8; + ctx->num_internal_temps = 0; + ctx->branch_count = 0; + ctx->kill_count = 0; + ctx->block = NULL; + ctx->current_instr = NULL; + ctx->num_output_updates = 0; + ctx->atomic = false; + ctx->frag_pos = NULL; + ctx->frag_face = NULL; + + memset(ctx->frag_coord, 0, sizeof(ctx->frag_coord)); + +#define FM(x) (1 << TGSI_FILE_##x) + /* optimize can't deal with relative addressing: */ + if (info->indirect_files & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT))) + return TGSI_PARSE_ERROR; + + /* Immediates go after constants: */ + so->first_immediate = info->file_max[TGSI_FILE_CONSTANT] + 1; + ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1); + + ret = tgsi_parse_init(&ctx->parser, ctx->tokens); + if (ret != TGSI_PARSE_OK) + return ret; + + ctx->type = ctx->parser.FullHeader.Processor.Processor; + + return ret; +} + +static void +compile_error(struct ir3_compile_context *ctx, const char *format, ...) 
+{ + va_list ap; + va_start(ap, format); + _debug_vprintf(format, ap); + va_end(ap); + tgsi_dump(ctx->tokens, 0); + debug_assert(0); +} + +#define compile_assert(ctx, cond) do { \ + if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \ + } while (0) + +static void +compile_free(struct ir3_compile_context *ctx) +{ + if (ctx->free_tokens) + free((void *)ctx->tokens); + tgsi_parse_free(&ctx->parser); +} + +struct instr_translater { + void (*fxn)(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst); + unsigned tgsi_opc; + opc_t opc; + opc_t hopc; /* opc to use for half_precision mode, if different */ + unsigned arg; +}; + +static void +instr_finish(struct ir3_compile_context *ctx) +{ + unsigned i; + + if (ctx->atomic) + return; + + for (i = 0; i < ctx->num_output_updates; i++) + *(ctx->output_updates[i].instrp) = ctx->output_updates[i].instr; + + ctx->num_output_updates = 0; +} + +/* For "atomic" groups of instructions, for example the four scalar + * instructions to perform a vec4 operation. Basically this just + * blocks out handling of output_updates so the next scalar instruction + * still sees the result from before the start of the atomic group. + * + * NOTE: when used properly, this could probably replace get/put_dst() + * stuff. + */ +static void +instr_atomic_start(struct ir3_compile_context *ctx) +{ + ctx->atomic = true; +} + +static void +instr_atomic_end(struct ir3_compile_context *ctx) +{ + ctx->atomic = false; + instr_finish(ctx); +} + +static struct ir3_instruction * +instr_create(struct ir3_compile_context *ctx, int category, opc_t opc) +{ + instr_finish(ctx); + return (ctx->current_instr = ir3_instr_create(ctx->block, category, opc)); +} + +static struct ir3_instruction * +instr_clone(struct ir3_compile_context *ctx, struct ir3_instruction *instr) +{ + instr_finish(ctx); + return (ctx->current_instr = ir3_instr_clone(instr)); +} + +static struct ir3_block * +push_block(struct ir3_compile_context *ctx) +{ + struct ir3_block *block; + unsigned ntmp, nin, nout; + +#define SCALAR_REGS(file) (4 * (ctx->info.file_max[TGSI_FILE_ ## file] + 1)) + + /* hmm, give ourselves room to create 4 extra temporaries (vec4): + */ + ntmp = SCALAR_REGS(TEMPORARY); + ntmp += 4 * 4; + + nout = SCALAR_REGS(OUTPUT); + nin = SCALAR_REGS(INPUT); + + /* for outermost block, 'inputs' are the actual shader INPUT + * register file. Reads from INPUT registers always go back to + * top block. For nested blocks, 'inputs' is used to track any + * TEMPORARY file register from one of the enclosing blocks that + * is ready in this block. 
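+ *
+ * So for a shader with a single IF, the block tree ends up roughly
+ * (a sketch, sizes elided):
+ *
+ *    top block     inputs = INPUT file, outputs = OUTPUT file
+ *      if block    inputs = enclosing TEMPORARYs read inside
+ *      else block  likewise
+ *
+ * See block_temporary()/block_input() below, which create the
+ * meta-input instructions that stitch these together.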
+ */ + if (!ctx->block) { + /* NOTE: fragment shaders actually have two inputs (r0.xy, the + * position) + */ + if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { + int n = 2; + if (ctx->info.reads_position) + n += 4; + if (ctx->info.uses_frontface) + n += 4; + nin = MAX2(n, nin); + nout += ARRAY_SIZE(ctx->kill); + } + } else { + nin = ntmp; + } + + block = ir3_block_create(ctx->ir, ntmp, nin, nout); + + if ((ctx->type == TGSI_PROCESSOR_FRAGMENT) && !ctx->block) + block->noutputs -= ARRAY_SIZE(ctx->kill); + + block->parent = ctx->block; + ctx->block = block; + + return block; +} + +static void +pop_block(struct ir3_compile_context *ctx) +{ + ctx->block = ctx->block->parent; + compile_assert(ctx, ctx->block); +} + +static struct ir3_instruction * +create_output(struct ir3_block *block, struct ir3_instruction *instr, + unsigned n) +{ + struct ir3_instruction *out; + + out = ir3_instr_create(block, -1, OPC_META_OUTPUT); + out->inout.block = block; + ir3_reg_create(out, n, 0); + if (instr) + ir3_reg_create(out, 0, IR3_REG_SSA)->instr = instr; + + return out; +} + +static struct ir3_instruction * +create_input(struct ir3_block *block, struct ir3_instruction *instr, + unsigned n) +{ + struct ir3_instruction *in; + + in = ir3_instr_create(block, -1, OPC_META_INPUT); + in->inout.block = block; + ir3_reg_create(in, n, 0); + if (instr) + ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr; + + return in; +} + +static struct ir3_instruction * +block_input(struct ir3_block *block, unsigned n) +{ + /* references to INPUT register file always go back up to + * top level: + */ + if (block->parent) + return block_input(block->parent, n); + return block->inputs[n]; +} + +/* return temporary in scope, creating if needed meta-input node + * to track block inputs + */ +static struct ir3_instruction * +block_temporary(struct ir3_block *block, unsigned n) +{ + /* references to TEMPORARY register file, find the nearest + * enclosing block which has already assigned this temporary, + * creating meta-input instructions along the way to keep + * track of block inputs + */ + if (block->parent && !block->temporaries[n]) { + /* if already have input for this block, reuse: */ + if (!block->inputs[n]) + block->inputs[n] = block_temporary(block->parent, n); + + /* and create new input to return: */ + return create_input(block, block->inputs[n], n); + } + return block->temporaries[n]; +} + +static struct ir3_instruction * +create_immed(struct ir3_compile_context *ctx, float val) +{ + /* NOTE: *don't* use instr_create() here! + */ + struct ir3_instruction *instr; + instr = ir3_instr_create(ctx->block, 1, 0); + instr->cat1.src_type = get_ftype(ctx); + instr->cat1.dst_type = get_ftype(ctx); + ir3_reg_create(instr, 0, 0); + ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = val; + return instr; +} + +static void +ssa_dst(struct ir3_compile_context *ctx, struct ir3_instruction *instr, + const struct tgsi_dst_register *dst, unsigned chan) +{ + unsigned n = regid(dst->Index, chan); + unsigned idx = ctx->num_output_updates; + + compile_assert(ctx, idx < ARRAY_SIZE(ctx->output_updates)); + + /* NOTE: defer update of temporaries[idx] or output[idx] + * until instr_finish(), so that if the current instruction + * reads the same TEMP/OUT[] it gets the old value: + * + * bleh.. this might be a bit easier to just figure out + * in instr_finish(). But at that point we've already + * lost information about OUTPUT vs TEMPORARY register + * file.. 
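+ *
+ * e.g. for something like (made-up TGSI):
+ *
+ *    ADD TEMP[0].x, TEMP[0].x, IMM[0].x
+ *
+ * the src side must still resolve TEMP[0].x to the previous writer;
+ * only once instr_finish() runs does the new add become the tracked
+ * writer for that slot.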
+ */ + + switch (dst->File) { + case TGSI_FILE_OUTPUT: + compile_assert(ctx, n < ctx->block->noutputs); + ctx->output_updates[idx].instrp = &ctx->block->outputs[n]; + ctx->output_updates[idx].instr = instr; + ctx->num_output_updates++; + break; + case TGSI_FILE_TEMPORARY: + compile_assert(ctx, n < ctx->block->ntemporaries); + ctx->output_updates[idx].instrp = &ctx->block->temporaries[n]; + ctx->output_updates[idx].instr = instr; + ctx->num_output_updates++; + break; + case TGSI_FILE_ADDRESS: + compile_assert(ctx, n < 1); + ctx->output_updates[idx].instrp = &ctx->block->address; + ctx->output_updates[idx].instr = instr; + ctx->num_output_updates++; + break; + } +} + +static void +ssa_src(struct ir3_compile_context *ctx, struct ir3_register *reg, + const struct tgsi_src_register *src, unsigned chan) +{ + struct ir3_block *block = ctx->block; + unsigned n = regid(src->Index, chan); + + switch (src->File) { + case TGSI_FILE_INPUT: + reg->flags |= IR3_REG_SSA; + reg->instr = block_input(ctx->block, n); + break; + case TGSI_FILE_OUTPUT: + /* really this should just happen in case of 'MOV_SAT OUT[n], ..', + * for the following clamp instructions: + */ + reg->flags |= IR3_REG_SSA; + reg->instr = block->outputs[n]; + /* we don't have to worry about read from an OUTPUT that was + * assigned outside of the current block, because the _SAT + * clamp instructions will always be in the same block as + * the original instruction which wrote the OUTPUT + */ + compile_assert(ctx, reg->instr); + break; + case TGSI_FILE_TEMPORARY: + reg->flags |= IR3_REG_SSA; + reg->instr = block_temporary(ctx->block, n); + break; + } + + if ((reg->flags & IR3_REG_SSA) && !reg->instr) { + /* this can happen when registers (or components of a TGSI + * register) are used as src before they have been assigned + * (undefined contents). To avoid confusing the rest of the + * compiler, and to generally keep things peachy, substitute + * an instruction that sets the src to 0.0. Or to keep + * things undefined, I could plug in a random number? :-P + * + * NOTE: *don't* use instr_create() here! + */ + reg->instr = create_immed(ctx, 0.0); + } +} + +static struct ir3_register * +add_dst_reg_wrmask(struct ir3_compile_context *ctx, + struct ir3_instruction *instr, const struct tgsi_dst_register *dst, + unsigned chan, unsigned wrmask) +{ + unsigned flags = 0, num = 0; + struct ir3_register *reg; + + switch (dst->File) { + case TGSI_FILE_OUTPUT: + case TGSI_FILE_TEMPORARY: + /* uses SSA */ + break; + case TGSI_FILE_ADDRESS: + flags |= IR3_REG_ADDR; + /* uses SSA */ + break; + default: + compile_error(ctx, "unsupported dst register file: %s\n", + tgsi_file_name(dst->File)); + break; + } + + if (dst->Indirect) + flags |= IR3_REG_RELATIV; + + reg = ir3_reg_create(instr, regid(num, chan), flags); + + /* NOTE: do not call ssa_dst() if atomic.. vectorize() + * itself will call ssa_dst(). This is to filter out + * the (initially bogus) .x component dst which is + * created (but not necessarily used, ie. 
if the net + * result of the vector operation does not write to + * the .x component) + */ + + reg->wrmask = wrmask; + if (wrmask == 0x1) { + /* normal case */ + if (!ctx->atomic) + ssa_dst(ctx, instr, dst, chan); + } else if ((dst->File == TGSI_FILE_TEMPORARY) || + (dst->File == TGSI_FILE_OUTPUT) || + (dst->File == TGSI_FILE_ADDRESS)) { + unsigned i; + + /* if instruction writes multiple, we need to create + * some place-holder collect the registers: + */ + for (i = 0; i < 4; i++) { + if (wrmask & (1 << i)) { + struct ir3_instruction *collect = + ir3_instr_create(ctx->block, -1, OPC_META_FO); + collect->fo.off = i; + /* unused dst reg: */ + ir3_reg_create(collect, 0, 0); + /* and src reg used to hold original instr */ + ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = instr; + if (!ctx->atomic) + ssa_dst(ctx, collect, dst, chan+i); + } + } + } + + return reg; +} + +static struct ir3_register * +add_dst_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr, + const struct tgsi_dst_register *dst, unsigned chan) +{ + return add_dst_reg_wrmask(ctx, instr, dst, chan, 0x1); +} + +static struct ir3_register * +add_src_reg_wrmask(struct ir3_compile_context *ctx, + struct ir3_instruction *instr, const struct tgsi_src_register *src, + unsigned chan, unsigned wrmask) +{ + unsigned flags = 0, num = 0; + struct ir3_register *reg; + struct ir3_instruction *orig = NULL; + + /* TODO we need to use a mov to temp for const >= 64.. or maybe + * we could use relative addressing.. + */ + compile_assert(ctx, src->Index < 64); + + switch (src->File) { + case TGSI_FILE_IMMEDIATE: + /* TODO if possible, use actual immediate instead of const.. but + * TGSI has vec4 immediates, we can only embed scalar (of limited + * size, depending on instruction..) + */ + flags |= IR3_REG_CONST; + num = src->Index + ctx->so->first_immediate; + break; + case TGSI_FILE_CONSTANT: + flags |= IR3_REG_CONST; + num = src->Index; + break; + case TGSI_FILE_OUTPUT: + /* NOTE: we should only end up w/ OUTPUT file for things like + * clamp()'ing saturated dst instructions + */ + case TGSI_FILE_INPUT: + case TGSI_FILE_TEMPORARY: + /* uses SSA */ + break; + default: + compile_error(ctx, "unsupported src register file: %s\n", + tgsi_file_name(src->File)); + break; + } + + if (src->Absolute) + flags |= IR3_REG_ABS; + if (src->Negate) + flags |= IR3_REG_NEGATE; + + if (src->Indirect) { + flags |= IR3_REG_RELATIV; + + /* shouldn't happen, and we can't cope with it below: */ + compile_assert(ctx, wrmask == 0x1); + + /* wrap in a meta-deref to track both the src and address: */ + orig = instr; + + instr = ir3_instr_create(ctx->block, -1, OPC_META_DEREF); + ir3_reg_create(instr, 0, 0); + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->block->address; + } + + reg = ir3_reg_create(instr, regid(num, chan), flags); + + reg->wrmask = wrmask; + if (wrmask == 0x1) { + /* normal case */ + ssa_src(ctx, reg, src, chan); + } else if ((src->File == TGSI_FILE_TEMPORARY) || + (src->File == TGSI_FILE_OUTPUT) || + (src->File == TGSI_FILE_INPUT)) { + struct ir3_instruction *collect; + unsigned i; + + compile_assert(ctx, !src->Indirect); + + /* if instruction reads multiple, we need to create + * some place-holder collect the registers: + */ + collect = ir3_instr_create(ctx->block, -1, OPC_META_FI); + ir3_reg_create(collect, 0, 0); /* unused dst reg */ + + for (i = 0; i < 4; i++) { + if (wrmask & (1 << i)) { + /* and src reg used point to the original instr */ + ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), + src, chan + i); + } else if 
(wrmask & ~((i << i) - 1)) { + /* if any remaining components, then dummy + * placeholder src reg to fill in the blanks: + */ + ir3_reg_create(collect, 0, 0); + } + } + + reg->flags |= IR3_REG_SSA; + reg->instr = collect; + } + + if (src->Indirect) { + reg = ir3_reg_create(orig, 0, flags | IR3_REG_SSA); + reg->instr = instr; + } + return reg; +} + +static struct ir3_register * +add_src_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr, + const struct tgsi_src_register *src, unsigned chan) +{ + return add_src_reg_wrmask(ctx, instr, src, chan, 0x1); +} + +static void +src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst) +{ + src->File = dst->File; + src->Indirect = dst->Indirect; + src->Dimension = dst->Dimension; + src->Index = dst->Index; + src->Absolute = 0; + src->Negate = 0; + src->SwizzleX = TGSI_SWIZZLE_X; + src->SwizzleY = TGSI_SWIZZLE_Y; + src->SwizzleZ = TGSI_SWIZZLE_Z; + src->SwizzleW = TGSI_SWIZZLE_W; +} + +/* Get internal-temp src/dst to use for a sequence of instructions + * generated by a single TGSI op. + */ +static struct tgsi_src_register * +get_internal_temp(struct ir3_compile_context *ctx, + struct tgsi_dst_register *tmp_dst) +{ + struct tgsi_src_register *tmp_src; + int n; + + tmp_dst->File = TGSI_FILE_TEMPORARY; + tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW; + tmp_dst->Indirect = 0; + tmp_dst->Dimension = 0; + + /* assign next temporary: */ + n = ctx->num_internal_temps++; + compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps)); + tmp_src = &ctx->internal_temps[n]; + + tmp_dst->Index = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1; + + src_from_dst(tmp_src, tmp_dst); + + return tmp_src; +} + +static inline bool +is_const(struct tgsi_src_register *src) +{ + return (src->File == TGSI_FILE_CONSTANT) || + (src->File == TGSI_FILE_IMMEDIATE); +} + +static inline bool +is_relative(struct tgsi_src_register *src) +{ + return src->Indirect; +} + +static inline bool +is_rel_or_const(struct tgsi_src_register *src) +{ + return is_relative(src) || is_const(src); +} + +static type_t +get_ftype(struct ir3_compile_context *ctx) +{ + return TYPE_F32; +} + +static type_t +get_utype(struct ir3_compile_context *ctx) +{ + return TYPE_U32; +} + +static unsigned +src_swiz(struct tgsi_src_register *src, int chan) +{ + switch (chan) { + case 0: return src->SwizzleX; + case 1: return src->SwizzleY; + case 2: return src->SwizzleZ; + case 3: return src->SwizzleW; + } + assert(0); + return 0; +} + +/* for instructions that cannot take a const register as src, if needed + * generate a move to temporary gpr: + */ +static struct tgsi_src_register * +get_unconst(struct ir3_compile_context *ctx, struct tgsi_src_register *src) +{ + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + + compile_assert(ctx, is_rel_or_const(src)); + + tmp_src = get_internal_temp(ctx, &tmp_dst); + + create_mov(ctx, &tmp_dst, src); + + return tmp_src; +} + +static void +get_immediate(struct ir3_compile_context *ctx, + struct tgsi_src_register *reg, uint32_t val) +{ + unsigned neg, swiz, idx, i; + /* actually maps 1:1 currently.. 
not sure if that is safe to rely on: */ + static const unsigned swiz2tgsi[] = { + TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W, + }; + + for (i = 0; i < ctx->immediate_idx; i++) { + swiz = i % 4; + idx = i / 4; + + if (ctx->so->immediates[idx].val[swiz] == val) { + neg = 0; + break; + } + + if (ctx->so->immediates[idx].val[swiz] == -val) { + neg = 1; + break; + } + } + + if (i == ctx->immediate_idx) { + /* need to generate a new immediate: */ + swiz = i % 4; + idx = i / 4; + neg = 0; + ctx->so->immediates[idx].val[swiz] = val; + ctx->so->immediates_count = idx + 1; + ctx->immediate_idx++; + } + + reg->File = TGSI_FILE_IMMEDIATE; + reg->Indirect = 0; + reg->Dimension = 0; + reg->Index = idx; + reg->Absolute = 0; + reg->Negate = neg; + reg->SwizzleX = swiz2tgsi[swiz]; + reg->SwizzleY = swiz2tgsi[swiz]; + reg->SwizzleZ = swiz2tgsi[swiz]; + reg->SwizzleW = swiz2tgsi[swiz]; +} + +static void +create_mov(struct ir3_compile_context *ctx, struct tgsi_dst_register *dst, + struct tgsi_src_register *src) +{ + type_t type_mov = get_ftype(ctx); + unsigned i; + + for (i = 0; i < 4; i++) { + /* move to destination: */ + if (dst->WriteMask & (1 << i)) { + struct ir3_instruction *instr; + + if (src->Absolute || src->Negate) { + /* can't have abs or neg on a mov instr, so use + * absneg.f instead to handle these cases: + */ + instr = instr_create(ctx, 2, OPC_ABSNEG_F); + } else { + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = type_mov; + instr->cat1.dst_type = type_mov; + } + + add_dst_reg(ctx, instr, dst, i); + add_src_reg(ctx, instr, src, src_swiz(src, i)); + } + } +} + +static void +create_clamp(struct ir3_compile_context *ctx, + struct tgsi_dst_register *dst, struct tgsi_src_register *val, + struct tgsi_src_register *minval, struct tgsi_src_register *maxval) +{ + struct ir3_instruction *instr; + + instr = instr_create(ctx, 2, OPC_MAX_F); + vectorize(ctx, instr, dst, 2, val, 0, minval, 0); + + instr = instr_create(ctx, 2, OPC_MIN_F); + vectorize(ctx, instr, dst, 2, val, 0, maxval, 0); +} + +static void +create_clamp_imm(struct ir3_compile_context *ctx, + struct tgsi_dst_register *dst, + uint32_t minval, uint32_t maxval) +{ + struct tgsi_src_register minconst, maxconst; + struct tgsi_src_register src; + + src_from_dst(&src, dst); + + get_immediate(ctx, &minconst, minval); + get_immediate(ctx, &maxconst, maxval); + + create_clamp(ctx, dst, &src, &minconst, &maxconst); +} + +static struct tgsi_dst_register * +get_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = &inst->Dst[0].Register; + unsigned i; + for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { + struct tgsi_src_register *src = &inst->Src[i].Register; + if ((src->File == dst->File) && (src->Index == dst->Index)) { + if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) && + (src->SwizzleX == TGSI_SWIZZLE_X) && + (src->SwizzleY == TGSI_SWIZZLE_Y) && + (src->SwizzleZ == TGSI_SWIZZLE_Z) && + (src->SwizzleW == TGSI_SWIZZLE_W)) + continue; + ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst); + ctx->tmp_dst.WriteMask = dst->WriteMask; + dst = &ctx->tmp_dst; + break; + } + } + return dst; +} + +static void +put_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst, + struct tgsi_dst_register *dst) +{ + /* if necessary, add mov back into original dst: */ + if (dst != &inst->Dst[0].Register) { + create_mov(ctx, &inst->Dst[0].Register, ctx->tmp_src); + } +} + +/* helper to generate the necessary repeat and/or additional instructions + * to turn a scalar 
instruction into a vector operation: + */ +static void +vectorize(struct ir3_compile_context *ctx, struct ir3_instruction *instr, + struct tgsi_dst_register *dst, int nsrcs, ...) +{ + va_list ap; + int i, j, n = 0; + + instr_atomic_start(ctx); + + add_dst_reg(ctx, instr, dst, TGSI_SWIZZLE_X); + + va_start(ap, nsrcs); + for (j = 0; j < nsrcs; j++) { + struct tgsi_src_register *src = + va_arg(ap, struct tgsi_src_register *); + unsigned flags = va_arg(ap, unsigned); + struct ir3_register *reg; + if (flags & IR3_REG_IMMED) { + reg = ir3_reg_create(instr, 0, IR3_REG_IMMED); + /* this is an ugly cast.. should have put flags first! */ + reg->iim_val = *(int *)&src; + } else { + reg = add_src_reg(ctx, instr, src, TGSI_SWIZZLE_X); + } + reg->flags |= flags & ~IR3_REG_NEGATE; + if (flags & IR3_REG_NEGATE) + reg->flags ^= IR3_REG_NEGATE; + } + va_end(ap); + + for (i = 0; i < 4; i++) { + if (dst->WriteMask & (1 << i)) { + struct ir3_instruction *cur; + + if (n++ == 0) { + cur = instr; + } else { + cur = instr_clone(ctx, instr); + } + + ssa_dst(ctx, cur, dst, i); + + /* fix-up dst register component: */ + cur->regs[0]->num = regid(cur->regs[0]->num >> 2, i); + + /* fix-up src register component: */ + va_start(ap, nsrcs); + for (j = 0; j < nsrcs; j++) { + struct ir3_register *reg = cur->regs[j+1]; + struct tgsi_src_register *src = + va_arg(ap, struct tgsi_src_register *); + unsigned flags = va_arg(ap, unsigned); + if (reg->flags & IR3_REG_SSA) { + ssa_src(ctx, reg, src, src_swiz(src, i)); + } else if (!(flags & IR3_REG_IMMED)) { + reg->num = regid(reg->num >> 2, src_swiz(src, i)); + } + } + va_end(ap); + } + } + + instr_atomic_end(ctx); +} + +/* + * Handlers for TGSI instructions which do not have a 1:1 mapping to + * native instructions: + */ + +static void +trans_clamp(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src0 = &inst->Src[0].Register; + struct tgsi_src_register *src1 = &inst->Src[1].Register; + struct tgsi_src_register *src2 = &inst->Src[2].Register; + + create_clamp(ctx, dst, src0, src1, src2); + + put_dst(ctx, inst, dst); +} + +/* ARL(x) = x, but mova from hrN.x to a0.. */ +static void +trans_arl(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + struct tgsi_dst_register *dst = &inst->Dst[0].Register; + struct tgsi_src_register *src = &inst->Src[0].Register; + unsigned chan = src->SwizzleX; + + compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS); + + /* NOTE: we allocate a temporary from a flat register + * namespace (ignoring half vs full). It turns out + * not to really matter since registers get reassigned + * later in ir3_ra which (hopefully!) can deal a bit + * better with mixed half and full precision. 
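As a rough illustration of the ARL lowering that follows (illustrative only, not part of the patch; the source value is made up): TGSI's ARL indexes whole vec4 registers, while a0 indexes scalar slots, so the converted integer is scaled by four, which is what the shl.b by 2 in the emitted sequence appears to do:

    #include <stdio.h>

    int main(void)
    {
        float arl_src = 3.0f;       /* hypothetical ARL source value */
        int idx = (int)arl_src;     /* the cov.f32s16 step: float -> integer */
        int a0  = idx << 2;         /* the shl.b by 2: vec4 index -> scalar slot */
        printf("TGSI index %d -> a0.x = %d\n", idx, a0);
        return 0;
    }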
+ */ + tmp_src = get_internal_temp(ctx, &tmp_dst); + + /* cov.f{32,16}s16 Rtmp, Rsrc */ + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = get_ftype(ctx); + instr->cat1.dst_type = TYPE_S16; + add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF; + add_src_reg(ctx, instr, src, chan); + + /* shl.b Rtmp, Rtmp, 2 */ + instr = instr_create(ctx, 2, OPC_SHL_B); + add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF; + add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF; + ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2; + + /* mova a0, Rtmp */ + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = TYPE_S16; + instr->cat1.dst_type = TYPE_S16; + add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF; + add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF; +} + +/* + * texture fetch/sample instructions: + */ + +struct tex_info { + int8_t order[4]; + unsigned src_wrmask, flags; +}; + +static const struct tex_info * +get_tex_info(struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + static const struct tex_info tex1d = { + .order = { 0, -1, -1, -1 }, /* coord.x */ + .src_wrmask = TGSI_WRITEMASK_XY, + .flags = 0, + }; + static const struct tex_info tex1ds = { + .order = { 0, -1, 2, -1 }, /* coord.xz */ + .src_wrmask = TGSI_WRITEMASK_XYZ, + .flags = IR3_INSTR_S, + }; + static const struct tex_info tex2d = { + .order = { 0, 1, -1, -1 }, /* coord.xy */ + .src_wrmask = TGSI_WRITEMASK_XY, + .flags = 0, + }; + static const struct tex_info tex2ds = { + .order = { 0, 1, 2, -1 }, /* coord.xyz */ + .src_wrmask = TGSI_WRITEMASK_XYZ, + .flags = IR3_INSTR_S, + }; + static const struct tex_info tex3d = { + .order = { 0, 1, 2, -1 }, /* coord.xyz */ + .src_wrmask = TGSI_WRITEMASK_XYZ, + .flags = IR3_INSTR_3D, + }; + static const struct tex_info tex3ds = { + .order = { 0, 1, 2, 3 }, /* coord.xyzw */ + .src_wrmask = TGSI_WRITEMASK_XYZW, + .flags = IR3_INSTR_S | IR3_INSTR_3D, + }; + static const struct tex_info txp1d = { + .order = { 0, -1, 3, -1 }, /* coord.xw */ + .src_wrmask = TGSI_WRITEMASK_XYZ, + .flags = IR3_INSTR_P, + }; + static const struct tex_info txp1ds = { + .order = { 0, -1, 2, 3 }, /* coord.xzw */ + .src_wrmask = TGSI_WRITEMASK_XYZW, + .flags = IR3_INSTR_P | IR3_INSTR_S, + }; + static const struct tex_info txp2d = { + .order = { 0, 1, 3, -1 }, /* coord.xyw */ + .src_wrmask = TGSI_WRITEMASK_XYZ, + .flags = IR3_INSTR_P, + }; + static const struct tex_info txp2ds = { + .order = { 0, 1, 2, 3 }, /* coord.xyzw */ + .src_wrmask = TGSI_WRITEMASK_XYZW, + .flags = IR3_INSTR_P | IR3_INSTR_S, + }; + static const struct tex_info txp3d = { + .order = { 0, 1, 2, 3 }, /* coord.xyzw */ + .src_wrmask = TGSI_WRITEMASK_XYZW, + .flags = IR3_INSTR_P | IR3_INSTR_3D, + }; + + unsigned tex = inst->Texture.Texture; + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_TEX: + switch (tex) { + case TGSI_TEXTURE_1D: + return &tex1d; + case TGSI_TEXTURE_SHADOW1D: + return &tex1ds; + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_RECT: + return &tex2d; + case TGSI_TEXTURE_SHADOW2D: + case TGSI_TEXTURE_SHADOWRECT: + return &tex2ds; + case TGSI_TEXTURE_3D: + case TGSI_TEXTURE_CUBE: + return &tex3d; + case TGSI_TEXTURE_SHADOWCUBE: + return &tex3ds; + default: + compile_error(ctx, "unknown texture type: %s\n", + tgsi_texture_names[tex]); + return NULL; + } + break; + case TGSI_OPCODE_TXP: + switch (tex) { + case TGSI_TEXTURE_1D: + return &txp1d; + case TGSI_TEXTURE_SHADOW1D: + return &txp1ds; + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_RECT: + return &txp2d; + 
case TGSI_TEXTURE_SHADOW2D: + case TGSI_TEXTURE_SHADOWRECT: + return &txp2ds; + case TGSI_TEXTURE_3D: + case TGSI_TEXTURE_CUBE: + return &txp3d; + default: + compile_error(ctx, "unknown texture type: %s\n", + tgsi_texture_names[tex]); + break; + } + break; + } + compile_assert(ctx, 0); + return NULL; +} + +static struct tgsi_src_register * +get_tex_coord(struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst, + const struct tex_info *tinf) +{ + struct tgsi_src_register *coord = &inst->Src[0].Register; + struct ir3_instruction *instr; + unsigned tex = inst->Texture.Texture; + bool needs_mov = false; + unsigned i; + + /* cat5 instruction cannot seem to handle const or relative: */ + if (is_rel_or_const(coord)) + needs_mov = true; + + /* 1D textures we fix up w/ 0.0 as 2nd coord: */ + if ((tex == TGSI_TEXTURE_1D) || (tex == TGSI_TEXTURE_SHADOW1D)) + needs_mov = true; + + /* The texture sample instructions need to coord in successive + * registers/components (ie. src.xy but not src.yx). And TXP + * needs the .w component in .z for 2D.. so in some cases we + * might need to emit some mov instructions to shuffle things + * around: + */ + for (i = 1; (i < 4) && (tinf->order[i] >= 0) && !needs_mov; i++) + if (src_swiz(coord, i) != (src_swiz(coord, 0) + tinf->order[i])) + needs_mov = true; + + if (needs_mov) { + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + unsigned j; + + type_t type_mov = get_ftype(ctx); + + /* need to move things around: */ + tmp_src = get_internal_temp(ctx, &tmp_dst); + + for (j = 0; j < 4; j++) { + if (tinf->order[j] < 0) + continue; + instr = instr_create(ctx, 1, 0); /* mov */ + instr->cat1.src_type = type_mov; + instr->cat1.dst_type = type_mov; + add_dst_reg(ctx, instr, &tmp_dst, j); + add_src_reg(ctx, instr, coord, + src_swiz(coord, tinf->order[j])); + } + + /* fix up .y coord: */ + if ((tex == TGSI_TEXTURE_1D) || + (tex == TGSI_TEXTURE_SHADOW1D)) { + instr = instr_create(ctx, 1, 0); /* mov */ + instr->cat1.src_type = type_mov; + instr->cat1.dst_type = type_mov; + add_dst_reg(ctx, instr, &tmp_dst, 1); /* .y */ + ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = 0.5; + } + + coord = tmp_src; + } + + return coord; +} + +static void +trans_samp(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct tgsi_dst_register *dst = &inst->Dst[0].Register; + struct tgsi_src_register *coord; + struct tgsi_src_register *samp = &inst->Src[1].Register; + const struct tex_info *tinf; + + tinf = get_tex_info(ctx, inst); + coord = get_tex_coord(ctx, inst, tinf); + + instr = instr_create(ctx, 5, t->opc); + instr->cat5.type = get_ftype(ctx); + instr->cat5.samp = samp->Index; + instr->cat5.tex = samp->Index; + instr->flags |= tinf->flags; + + add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask); + add_src_reg_wrmask(ctx, instr, coord, coord->SwizzleX, tinf->src_wrmask); +} + +/* + * SEQ(a,b) = (a == b) ? 1.0 : 0.0 + * cmps.f.eq tmp0, a, b + * cov.u16f16 dst, tmp0 + * + * SNE(a,b) = (a != b) ? 1.0 : 0.0 + * cmps.f.ne tmp0, a, b + * cov.u16f16 dst, tmp0 + * + * SGE(a,b) = (a >= b) ? 1.0 : 0.0 + * cmps.f.ge tmp0, a, b + * cov.u16f16 dst, tmp0 + * + * SLE(a,b) = (a <= b) ? 1.0 : 0.0 + * cmps.f.le tmp0, a, b + * cov.u16f16 dst, tmp0 + * + * SGT(a,b) = (a > b) ? 1.0 : 0.0 + * cmps.f.gt tmp0, a, b + * cov.u16f16 dst, tmp0 + * + * SLT(a,b) = (a < b) ? 1.0 : 0.0 + * cmps.f.lt tmp0, a, b + * cov.u16f16 dst, tmp0 + * + * CMP(a,b,c) = (a < 0.0) ? 
b : c + * cmps.f.lt tmp0, a, {0.0} + * sel.b16 dst, b, tmp0, c + */ +static void +trans_cmp(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + struct tgsi_src_register constval0; + /* final instruction for CMP() uses orig src1 and src2: */ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *a0, *a1, *a2; + unsigned condition; + + tmp_src = get_internal_temp(ctx, &tmp_dst); + + a0 = &inst->Src[0].Register; /* a */ + a1 = &inst->Src[1].Register; /* b */ + + switch (t->tgsi_opc) { + case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_FSEQ: + condition = IR3_COND_EQ; + break; + case TGSI_OPCODE_SNE: + case TGSI_OPCODE_FSNE: + condition = IR3_COND_NE; + break; + case TGSI_OPCODE_SGE: + case TGSI_OPCODE_FSGE: + condition = IR3_COND_GE; + break; + case TGSI_OPCODE_SLT: + case TGSI_OPCODE_FSLT: + condition = IR3_COND_LT; + break; + case TGSI_OPCODE_SLE: + condition = IR3_COND_LE; + break; + case TGSI_OPCODE_SGT: + condition = IR3_COND_GT; + break; + case TGSI_OPCODE_CMP: + get_immediate(ctx, &constval0, fui(0.0)); + a0 = &inst->Src[0].Register; /* a */ + a1 = &constval0; /* {0.0} */ + condition = IR3_COND_LT; + break; + default: + compile_assert(ctx, 0); + return; + } + + if (is_const(a0) && is_const(a1)) + a0 = get_unconst(ctx, a0); + + /* cmps.f. tmp, a0, a1 */ + instr = instr_create(ctx, 2, OPC_CMPS_F); + instr->cat2.condition = condition; + vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0); + + switch (t->tgsi_opc) { + case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_FSEQ: + case TGSI_OPCODE_SGE: + case TGSI_OPCODE_FSGE: + case TGSI_OPCODE_SLE: + case TGSI_OPCODE_SNE: + case TGSI_OPCODE_FSNE: + case TGSI_OPCODE_SGT: + case TGSI_OPCODE_SLT: + case TGSI_OPCODE_FSLT: + /* cov.u16f16 dst, tmp0 */ + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = get_utype(ctx); + instr->cat1.dst_type = get_ftype(ctx); + vectorize(ctx, instr, dst, 1, tmp_src, 0); + break; + case TGSI_OPCODE_CMP: + a1 = &inst->Src[1].Register; + a2 = &inst->Src[2].Register; + /* sel.{b32,b16} dst, src2, tmp, src1 */ + instr = instr_create(ctx, 3, OPC_SEL_B32); + vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0); + + break; + } + + put_dst(ctx, inst, dst); +} + +/* + * USNE(a,b) = (a != b) ? 1 : 0 + * cmps.u32.ne dst, a, b + * + * USEQ(a,b) = (a == b) ? 1 : 0 + * cmps.u32.eq dst, a, b + * + * ISGE(a,b) = (a > b) ? 1 : 0 + * cmps.s32.ge dst, a, b + * + * USGE(a,b) = (a > b) ? 1 : 0 + * cmps.u32.ge dst, a, b + * + * ISLT(a,b) = (a < b) ? 1 : 0 + * cmps.s32.lt dst, a, b + * + * USLT(a,b) = (a < b) ? 1 : 0 + * cmps.u32.lt dst, a, b + * + * UCMP(a,b,c) = (a < 0) ? 
b : c + * cmps.u32.lt tmp0, a, {0} + * sel.b16 dst, b, tmp0, c + */ +static void +trans_icmp(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register constval0; + struct tgsi_src_register *a0, *a1, *a2; + unsigned condition; + + a0 = &inst->Src[0].Register; /* a */ + a1 = &inst->Src[1].Register; /* b */ + + switch (t->tgsi_opc) { + case TGSI_OPCODE_USNE: + condition = IR3_COND_NE; + break; + case TGSI_OPCODE_USEQ: + condition = IR3_COND_EQ; + break; + case TGSI_OPCODE_ISGE: + case TGSI_OPCODE_USGE: + condition = IR3_COND_GE; + break; + case TGSI_OPCODE_ISLT: + case TGSI_OPCODE_USLT: + condition = IR3_COND_LT; + break; + case TGSI_OPCODE_UCMP: + get_immediate(ctx, &constval0, 0); + a0 = &inst->Src[0].Register; /* a */ + a1 = &constval0; /* {0} */ + condition = IR3_COND_LT; + break; + + default: + compile_assert(ctx, 0); + return; + } + + if (is_const(a0) && is_const(a1)) + a0 = get_unconst(ctx, a0); + + if (t->tgsi_opc == TGSI_OPCODE_UCMP) { + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + tmp_src = get_internal_temp(ctx, &tmp_dst); + /* cmps.u32.lt tmp, a0, a1 */ + instr = instr_create(ctx, 2, t->opc); + instr->cat2.condition = condition; + vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0); + + a1 = &inst->Src[1].Register; + a2 = &inst->Src[2].Register; + /* sel.{b32,b16} dst, src2, tmp, src1 */ + instr = instr_create(ctx, 3, OPC_SEL_B32); + vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0); + } else { + /* cmps.{u32,s32}. dst, a0, a1 */ + instr = instr_create(ctx, 2, t->opc); + instr->cat2.condition = condition; + vectorize(ctx, instr, dst, 2, a0, 0, a1, 0); + } + put_dst(ctx, inst, dst); +} + +/* + * Conditional / Flow control + */ + +static void +push_branch(struct ir3_compile_context *ctx, bool inv, + struct ir3_instruction *instr, struct ir3_instruction *cond) +{ + unsigned int idx = ctx->branch_count++; + compile_assert(ctx, idx < ARRAY_SIZE(ctx->branch)); + ctx->branch[idx].instr = instr; + ctx->branch[idx].inv = inv; + /* else side of branch has same condition: */ + if (!inv) + ctx->branch[idx].cond = cond; +} + +static struct ir3_instruction * +pop_branch(struct ir3_compile_context *ctx) +{ + unsigned int idx = --ctx->branch_count; + return ctx->branch[idx].instr; +} + +static void +trans_if(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr, *cond; + struct tgsi_src_register *src = &inst->Src[0].Register; + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + struct tgsi_src_register constval; + + get_immediate(ctx, &constval, fui(0.0)); + tmp_src = get_internal_temp(ctx, &tmp_dst); + + if (is_const(src)) + src = get_unconst(ctx, src); + + /* cmps.f.ne tmp0, b, {0.0} */ + instr = instr_create(ctx, 2, OPC_CMPS_F); + add_dst_reg(ctx, instr, &tmp_dst, 0); + add_src_reg(ctx, instr, src, src->SwizzleX); + add_src_reg(ctx, instr, &constval, constval.SwizzleX); + instr->cat2.condition = IR3_COND_NE; + + compile_assert(ctx, instr->regs[1]->flags & IR3_REG_SSA); /* because get_unconst() */ + cond = instr->regs[1]->instr; + + /* meta:flow tmp0 */ + instr = instr_create(ctx, -1, OPC_META_FLOW); + ir3_reg_create(instr, 0, 0); /* dummy dst */ + add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X); + + push_branch(ctx, false, instr, cond); + instr->flow.if_block = push_block(ctx); +} + +static 
void +trans_else(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + + pop_block(ctx); + + instr = pop_branch(ctx); + + compile_assert(ctx, (instr->category == -1) && + (instr->opc == OPC_META_FLOW)); + + push_branch(ctx, true, instr, NULL); + instr->flow.else_block = push_block(ctx); +} + +static struct ir3_instruction * +find_temporary(struct ir3_block *block, unsigned n) +{ + if (block->parent && !block->temporaries[n]) + return find_temporary(block->parent, n); + return block->temporaries[n]; +} + +static struct ir3_instruction * +find_output(struct ir3_block *block, unsigned n) +{ + if (block->parent && !block->outputs[n]) + return find_output(block->parent, n); + return block->outputs[n]; +} + +static struct ir3_instruction * +create_phi(struct ir3_compile_context *ctx, struct ir3_instruction *cond, + struct ir3_instruction *a, struct ir3_instruction *b) +{ + struct ir3_instruction *phi; + + compile_assert(ctx, cond); + + /* Either side of the condition could be null.. which + * indicates a variable written on only one side of the + * branch. Normally this should only be variables not + * used outside of that side of the branch. So we could + * just 'return a ? a : b;' in that case. But for better + * defined undefined behavior we just stick in imm{0.0}. + * In the common case of a value only used within the + * one side of the branch, the PHI instruction will not + * get scheduled + */ + if (!a) + a = create_immed(ctx, 0.0); + if (!b) + b = create_immed(ctx, 0.0); + + phi = instr_create(ctx, -1, OPC_META_PHI); + ir3_reg_create(phi, 0, 0); /* dummy dst */ + ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = cond; + ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = a; + ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = b; + + return phi; +} + +static void +trans_endif(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct ir3_block *ifb, *elseb; + struct ir3_instruction **ifout, **elseout; + unsigned i, ifnout = 0, elsenout = 0; + + pop_block(ctx); + + instr = pop_branch(ctx); + + compile_assert(ctx, (instr->category == -1) && + (instr->opc == OPC_META_FLOW)); + + ifb = instr->flow.if_block; + elseb = instr->flow.else_block; + /* if there is no else block, the parent block is used for the + * branch-not-taken src of the PHI instructions: + */ + if (!elseb) + elseb = ifb->parent; + + /* worst case sizes: */ + ifnout = ifb->ntemporaries + ifb->noutputs; + elsenout = elseb->ntemporaries + elseb->noutputs; + + ifout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * ifnout); + if (elseb != ifb->parent) + elseout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * elsenout); + + ifnout = 0; + elsenout = 0; + + /* generate PHI instructions for any temporaries written: */ + for (i = 0; i < ifb->ntemporaries; i++) { + struct ir3_instruction *a = ifb->temporaries[i]; + struct ir3_instruction *b = elseb->temporaries[i]; + + /* if temporary written in if-block, or if else block + * is present and temporary written in else-block: + */ + if (a || ((elseb != ifb->parent) && b)) { + struct ir3_instruction *phi; + + /* if only written on one side, find the closest + * enclosing update on other side: + */ + if (!a) + a = find_temporary(ifb, i); + if (!b) + b = find_temporary(elseb, i); + + ifout[ifnout] = a; + a = create_output(ifb, a, ifnout++); + + if (elseb != ifb->parent) { + elseout[elsenout] = b; + b = create_output(elseb, b, 
elsenout++); + } + + phi = create_phi(ctx, instr, a, b); + ctx->block->temporaries[i] = phi; + } + } + + compile_assert(ctx, ifb->noutputs == elseb->noutputs); + + /* .. and any outputs written: */ + for (i = 0; i < ifb->noutputs; i++) { + struct ir3_instruction *a = ifb->outputs[i]; + struct ir3_instruction *b = elseb->outputs[i]; + + /* if output written in if-block, or if else block + * is present and output written in else-block: + */ + if (a || ((elseb != ifb->parent) && b)) { + struct ir3_instruction *phi; + + /* if only written on one side, find the closest + * enclosing update on other side: + */ + if (!a) + a = find_output(ifb, i); + if (!b) + b = find_output(elseb, i); + + ifout[ifnout] = a; + a = create_output(ifb, a, ifnout++); + + if (elseb != ifb->parent) { + elseout[elsenout] = b; + b = create_output(elseb, b, elsenout++); + } + + phi = create_phi(ctx, instr, a, b); + ctx->block->outputs[i] = phi; + } + } + + ifb->noutputs = ifnout; + ifb->outputs = ifout; + + if (elseb != ifb->parent) { + elseb->noutputs = elsenout; + elseb->outputs = elseout; + } + + // TODO maybe we want to compact block->inputs? +} + +/* + * Kill + */ + +static void +trans_kill(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr, *immed, *cond = NULL; + bool inv = false; + + switch (t->tgsi_opc) { + case TGSI_OPCODE_KILL: + /* unconditional kill, use enclosing if condition: */ + if (ctx->branch_count > 0) { + unsigned int idx = ctx->branch_count - 1; + cond = ctx->branch[idx].cond; + inv = ctx->branch[idx].inv; + } else { + cond = create_immed(ctx, 1.0); + } + + break; + } + + compile_assert(ctx, cond); + + immed = create_immed(ctx, 0.0); + + /* cmps.f.ne p0.x, cond, {0.0} */ + instr = instr_create(ctx, 2, OPC_CMPS_F); + instr->cat2.condition = IR3_COND_NE; + ir3_reg_create(instr, regid(REG_P0, 0), 0); + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond; + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed; + cond = instr; + + /* kill p0.x */ + instr = instr_create(ctx, 0, OPC_KILL); + instr->cat0.inv = inv; + ir3_reg_create(instr, 0, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond; + + ctx->kill[ctx->kill_count++] = instr; +} + +/* + * Kill-If + */ + +static void +trans_killif(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_src_register *src = &inst->Src[0].Register; + struct ir3_instruction *instr, *immed, *cond = NULL; + bool inv = false; + + immed = create_immed(ctx, 0.0); + + /* cmps.f.ne p0.x, cond, {0.0} */ + instr = instr_create(ctx, 2, OPC_CMPS_F); + instr->cat2.condition = IR3_COND_NE; + ir3_reg_create(instr, regid(REG_P0, 0), 0); + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed; + add_src_reg(ctx, instr, src, src->SwizzleX); + + cond = instr; + + /* kill p0.x */ + instr = instr_create(ctx, 0, OPC_KILL); + instr->cat0.inv = inv; + ir3_reg_create(instr, 0, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond; + + ctx->kill[ctx->kill_count++] = instr; + +} +/* + * I2F / U2F / F2I / F2U + */ + +static void +trans_cov(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src = &inst->Src[0].Register; + + // cov.f32s32 dst, tmp0 / + instr = instr_create(ctx, 1, 0); + switch (t->tgsi_opc) { + case TGSI_OPCODE_U2F: 
+ instr->cat1.src_type = TYPE_U32; + instr->cat1.dst_type = TYPE_F32; + break; + case TGSI_OPCODE_I2F: + instr->cat1.src_type = TYPE_S32; + instr->cat1.dst_type = TYPE_F32; + break; + case TGSI_OPCODE_F2U: + instr->cat1.src_type = TYPE_F32; + instr->cat1.dst_type = TYPE_U32; + break; + case TGSI_OPCODE_F2I: + instr->cat1.src_type = TYPE_F32; + instr->cat1.dst_type = TYPE_S32; + break; + + } + vectorize(ctx, instr, dst, 1, src, 0); +} + +/* + * Handlers for TGSI instructions which do have 1:1 mapping to native + * instructions: + */ + +static void +instr_cat0(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + instr_create(ctx, 0, t->opc); +} + +static void +instr_cat1(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src = &inst->Src[0].Register; + create_mov(ctx, dst, src); + put_dst(ctx, inst, dst); +} + +static void +instr_cat2(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src0 = &inst->Src[0].Register; + struct tgsi_src_register *src1 = &inst->Src[1].Register; + struct ir3_instruction *instr; + unsigned src0_flags = 0, src1_flags = 0; + + switch (t->tgsi_opc) { + case TGSI_OPCODE_ABS: + case TGSI_OPCODE_IABS: + src0_flags = IR3_REG_ABS; + break; + case TGSI_OPCODE_SUB: + case TGSI_OPCODE_INEG: + src1_flags = IR3_REG_NEGATE; + break; + } + + switch (t->opc) { + case OPC_ABSNEG_F: + case OPC_ABSNEG_S: + case OPC_CLZ_B: + case OPC_CLZ_S: + case OPC_SIGN_F: + case OPC_FLOOR_F: + case OPC_CEIL_F: + case OPC_RNDNE_F: + case OPC_RNDAZ_F: + case OPC_TRUNC_F: + case OPC_NOT_B: + case OPC_BFREV_B: + case OPC_SETRM: + case OPC_CBITS_B: + /* these only have one src reg */ + instr = instr_create(ctx, 2, t->opc); + vectorize(ctx, instr, dst, 1, src0, src0_flags); + break; + default: + if (is_const(src0) && is_const(src1)) + src0 = get_unconst(ctx, src0); + + instr = instr_create(ctx, 2, t->opc); + vectorize(ctx, instr, dst, 2, src0, src0_flags, + src1, src1_flags); + break; + } + + put_dst(ctx, inst, dst); +} + +static void +instr_cat3(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src0 = &inst->Src[0].Register; + struct tgsi_src_register *src1 = &inst->Src[1].Register; + struct ir3_instruction *instr; + + /* in particular, can't handle const for src1 for cat3.. + * for mad, we can swap first two src's if needed: + */ + if (is_rel_or_const(src1)) { + if (is_mad(t->opc) && !is_rel_or_const(src0)) { + struct tgsi_src_register *tmp; + tmp = src0; + src0 = src1; + src1 = tmp; + } else { + src1 = get_unconst(ctx, src1); + } + } + + instr = instr_create(ctx, 3, t->opc); + vectorize(ctx, instr, dst, 3, src0, 0, src1, 0, + &inst->Src[2].Register, 0); + put_dst(ctx, inst, dst); +} + +static void +instr_cat4(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src = &inst->Src[0].Register; + struct ir3_instruction *instr; + unsigned i; + + /* seems like blob compiler avoids const as src.. 
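The replication in instr_cat4() below emits one scalar instruction per enabled destination component. A small standalone sketch of that expansion (illustrative only, not part of the patch; the writemask value is made up):

    #include <stdio.h>

    int main(void)
    {
        const char comp[] = "xyzw";
        unsigned writemask = 0xb;             /* hypothetical dst.xyw */

        for (unsigned i = 0; i < 4; i++)
            if (writemask & (1u << i))
                printf("rcp dst.%c, src.x\n", comp[i]);
        return 0;
    }

With writemask 0xb this lists instructions for the .x, .y and .w components only.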
*/ + if (is_const(src)) + src = get_unconst(ctx, src); + + /* we need to replicate into each component: */ + for (i = 0; i < 4; i++) { + if (dst->WriteMask & (1 << i)) { + instr = instr_create(ctx, 4, t->opc); + add_dst_reg(ctx, instr, dst, i); + add_src_reg(ctx, instr, src, src->SwizzleX); + } + } + + put_dst(ctx, inst, dst); +} + +static const struct instr_translater translaters[TGSI_OPCODE_LAST] = { +#define INSTR(n, f, ...) \ + [TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ } + + INSTR(MOV, instr_cat1), + INSTR(RCP, instr_cat4, .opc = OPC_RCP), + INSTR(RSQ, instr_cat4, .opc = OPC_RSQ), + INSTR(SQRT, instr_cat4, .opc = OPC_SQRT), + INSTR(MUL, instr_cat2, .opc = OPC_MUL_F), + INSTR(ADD, instr_cat2, .opc = OPC_ADD_F), + INSTR(SUB, instr_cat2, .opc = OPC_ADD_F), + INSTR(MIN, instr_cat2, .opc = OPC_MIN_F), + INSTR(MAX, instr_cat2, .opc = OPC_MAX_F), + INSTR(UADD, instr_cat2, .opc = OPC_ADD_U), + INSTR(IMIN, instr_cat2, .opc = OPC_MIN_S), + INSTR(UMIN, instr_cat2, .opc = OPC_MIN_U), + INSTR(IMAX, instr_cat2, .opc = OPC_MAX_S), + INSTR(UMAX, instr_cat2, .opc = OPC_MAX_U), + INSTR(AND, instr_cat2, .opc = OPC_AND_B), + INSTR(OR, instr_cat2, .opc = OPC_OR_B), + INSTR(NOT, instr_cat2, .opc = OPC_NOT_B), + INSTR(XOR, instr_cat2, .opc = OPC_XOR_B), + INSTR(UMUL, instr_cat2, .opc = OPC_MUL_U), + INSTR(SHL, instr_cat2, .opc = OPC_SHL_B), + INSTR(USHR, instr_cat2, .opc = OPC_SHR_B), + INSTR(ISHR, instr_cat2, .opc = OPC_ASHR_B), + INSTR(IABS, instr_cat2, .opc = OPC_ABSNEG_S), + INSTR(INEG, instr_cat2, .opc = OPC_ABSNEG_S), + INSTR(AND, instr_cat2, .opc = OPC_AND_B), + INSTR(MAD, instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16), + INSTR(TRUNC, instr_cat2, .opc = OPC_TRUNC_F), + INSTR(CLAMP, trans_clamp), + INSTR(FLR, instr_cat2, .opc = OPC_FLOOR_F), + INSTR(ROUND, instr_cat2, .opc = OPC_RNDNE_F), + INSTR(SSG, instr_cat2, .opc = OPC_SIGN_F), + INSTR(CEIL, instr_cat2, .opc = OPC_CEIL_F), + INSTR(ARL, trans_arl), + INSTR(EX2, instr_cat4, .opc = OPC_EXP2), + INSTR(LG2, instr_cat4, .opc = OPC_LOG2), + INSTR(ABS, instr_cat2, .opc = OPC_ABSNEG_F), + INSTR(COS, instr_cat4, .opc = OPC_COS), + INSTR(SIN, instr_cat4, .opc = OPC_SIN), + INSTR(TEX, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TEX), + INSTR(TXP, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TXP), + INSTR(SGT, trans_cmp), + INSTR(SLT, trans_cmp), + INSTR(FSLT, trans_cmp), + INSTR(SGE, trans_cmp), + INSTR(FSGE, trans_cmp), + INSTR(SLE, trans_cmp), + INSTR(SNE, trans_cmp), + INSTR(FSNE, trans_cmp), + INSTR(SEQ, trans_cmp), + INSTR(FSEQ, trans_cmp), + INSTR(CMP, trans_cmp), + INSTR(USNE, trans_icmp, .opc = OPC_CMPS_U), + INSTR(USEQ, trans_icmp, .opc = OPC_CMPS_U), + INSTR(ISGE, trans_icmp, .opc = OPC_CMPS_S), + INSTR(USGE, trans_icmp, .opc = OPC_CMPS_U), + INSTR(ISLT, trans_icmp, .opc = OPC_CMPS_S), + INSTR(USLT, trans_icmp, .opc = OPC_CMPS_U), + INSTR(UCMP, trans_icmp, .opc = OPC_CMPS_U), + INSTR(IF, trans_if), + INSTR(UIF, trans_if), + INSTR(ELSE, trans_else), + INSTR(ENDIF, trans_endif), + INSTR(END, instr_cat0, .opc = OPC_END), + INSTR(KILL, trans_kill, .opc = OPC_KILL), + INSTR(KILL_IF, trans_killif, .opc = OPC_KILL), + INSTR(I2F, trans_cov), + INSTR(U2F, trans_cov), + INSTR(F2I, trans_cov), + INSTR(F2U, trans_cov), +}; + +static ir3_semantic +decl_semantic(const struct tgsi_declaration_semantic *sem) +{ + return ir3_semantic_name(sem->Name, sem->Index); +} + +static struct ir3_instruction * +decl_in_frag_bary(struct ir3_compile_context *ctx, unsigned regid, + unsigned j, unsigned inloc) +{ + struct 
ir3_instruction *instr; + struct ir3_register *src; + + /* bary.f dst, #inloc, r0.x */ + instr = instr_create(ctx, 2, OPC_BARY_F); + ir3_reg_create(instr, regid, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = inloc; + src = ir3_reg_create(instr, 0, IR3_REG_SSA); + src->wrmask = 0x3; + src->instr = ctx->frag_pos; + + return instr; +} + +/* TGSI_SEMANTIC_POSITION + * """""""""""""""""""""" + * + * For fragment shaders, TGSI_SEMANTIC_POSITION is used to indicate that + * fragment shader input contains the fragment's window position. The X + * component starts at zero and always increases from left to right. + * The Y component starts at zero and always increases but Y=0 may either + * indicate the top of the window or the bottom depending on the fragment + * coordinate origin convention (see TGSI_PROPERTY_FS_COORD_ORIGIN). + * The Z coordinate ranges from 0 to 1 to represent depth from the front + * to the back of the Z buffer. The W component contains the reciprocol + * of the interpolated vertex position W component. + */ +static struct ir3_instruction * +decl_in_frag_coord(struct ir3_compile_context *ctx, unsigned regid, + unsigned j) +{ + struct ir3_instruction *instr, *src; + + compile_assert(ctx, !ctx->frag_coord[j]); + + ctx->frag_coord[j] = create_input(ctx->block, NULL, 0); + + + switch (j) { + case 0: /* .x */ + case 1: /* .y */ + /* for frag_coord, we get unsigned values.. we need + * to subtract (integer) 8 and divide by 16 (right- + * shift by 4) then convert to float: + */ + + /* add.s tmp, src, -8 */ + instr = instr_create(ctx, 2, OPC_ADD_S); + ir3_reg_create(instr, regid, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_coord[j]; + ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = -8; + src = instr; + + /* shr.b tmp, tmp, 4 */ + instr = instr_create(ctx, 2, OPC_SHR_B); + ir3_reg_create(instr, regid, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; + ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 4; + src = instr; + + /* mov.u32f32 dst, tmp */ + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = TYPE_U32; + instr->cat1.dst_type = TYPE_F32; + ir3_reg_create(instr, regid, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; + + break; + case 2: /* .z */ + case 3: /* .w */ + /* seems that we can use these as-is: */ + instr = ctx->frag_coord[j]; + break; + default: + compile_error(ctx, "invalid channel\n"); + instr = create_immed(ctx, 0.0); + break; + } + + return instr; +} + +/* TGSI_SEMANTIC_FACE + * """""""""""""""""" + * + * This label applies to fragment shader inputs only and indicates that + * the register contains front/back-face information of the form (F, 0, + * 0, 1). The first component will be positive when the fragment belongs + * to a front-facing polygon, and negative when the fragment belongs to a + * back-facing polygon. + */ +static struct ir3_instruction * +decl_in_frag_face(struct ir3_compile_context *ctx, unsigned regid, + unsigned j) +{ + struct ir3_instruction *instr, *src; + + switch (j) { + case 0: /* .x */ + compile_assert(ctx, !ctx->frag_face); + + ctx->frag_face = create_input(ctx->block, NULL, 0); + + /* for faceness, we always get -1 or 0 (int).. but TGSI expects + * positive vs negative float.. 
and piglit further seems to + * expect -1.0 or 1.0: + * + * mul.s tmp, hr0.x, 2 + * add.s tmp, tmp, 1 + * mov.s16f32, dst, tmp + * + */ + + instr = instr_create(ctx, 2, OPC_MUL_S); + ir3_reg_create(instr, regid, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_face; + ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2; + src = instr; + + instr = instr_create(ctx, 2, OPC_ADD_S); + ir3_reg_create(instr, regid, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; + ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1; + src = instr; + + instr = instr_create(ctx, 1, 0); /* mov */ + instr->cat1.src_type = TYPE_S32; + instr->cat1.dst_type = TYPE_F32; + ir3_reg_create(instr, regid, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; + + break; + case 1: /* .y */ + case 2: /* .z */ + instr = create_immed(ctx, 0.0); + break; + case 3: /* .w */ + instr = create_immed(ctx, 1.0); + break; + default: + compile_error(ctx, "invalid channel\n"); + instr = create_immed(ctx, 0.0); + break; + } + + return instr; +} + +static void +decl_in(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl) +{ + struct ir3_shader_variant *so = ctx->so; + unsigned name = decl->Semantic.Name; + unsigned i; + + /* I don't think we should get frag shader input without + * semantic info? Otherwise how do inputs get linked to + * vert outputs? + */ + compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) || + decl->Declaration.Semantic); + + for (i = decl->Range.First; i <= decl->Range.Last; i++) { + unsigned n = so->inputs_count++; + unsigned r = regid(i, 0); + unsigned ncomp, j; + + /* we'll figure out the actual components used after scheduling */ + ncomp = 4; + + DBG("decl in -> r%d", i); + + compile_assert(ctx, n < ARRAY_SIZE(so->inputs)); + + so->inputs[n].semantic = decl_semantic(&decl->Semantic); + so->inputs[n].compmask = (1 << ncomp) - 1; + so->inputs[n].regid = r; + so->inputs[n].inloc = ctx->next_inloc; + + for (j = 0; j < ncomp; j++) { + struct ir3_instruction *instr = NULL; + + if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { + /* for fragment shaders, POSITION and FACE are handled + * specially, not using normal varying / bary.f + */ + if (name == TGSI_SEMANTIC_POSITION) { + so->inputs[n].bary = false; + so->frag_coord = true; + instr = decl_in_frag_coord(ctx, r + j, j); + } else if (name == TGSI_SEMANTIC_FACE) { + so->inputs[n].bary = false; + so->frag_face = true; + instr = decl_in_frag_face(ctx, r + j, j); + } else { + so->inputs[n].bary = true; + instr = decl_in_frag_bary(ctx, r + j, j, + so->inputs[n].inloc + j - 8); + } + } else { + instr = create_input(ctx->block, NULL, (i * 4) + j); + } + + ctx->block->inputs[(i * 4) + j] = instr; + } + + if (so->inputs[n].bary || (ctx->type == TGSI_PROCESSOR_VERTEX)) { + ctx->next_inloc += ncomp; + so->total_in += ncomp; + } + } +} + +static void +decl_out(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl) +{ + struct ir3_shader_variant *so = ctx->so; + unsigned comp = 0; + unsigned name = decl->Semantic.Name; + unsigned i; + + compile_assert(ctx, decl->Declaration.Semantic); + + DBG("decl out[%d] -> r%d", name, decl->Range.First); + + if (ctx->type == TGSI_PROCESSOR_VERTEX) { + switch (name) { + case TGSI_SEMANTIC_POSITION: + so->writes_pos = true; + break; + case TGSI_SEMANTIC_PSIZE: + so->writes_psize = true; + break; + case TGSI_SEMANTIC_COLOR: + case TGSI_SEMANTIC_BCOLOR: + case TGSI_SEMANTIC_GENERIC: + case TGSI_SEMANTIC_FOG: + case TGSI_SEMANTIC_TEXCOORD: + 
break; + default: + compile_error(ctx, "unknown VS semantic name: %s\n", + tgsi_semantic_names[name]); + } + } else { + switch (name) { + case TGSI_SEMANTIC_POSITION: + comp = 2; /* tgsi will write to .z component */ + so->writes_pos = true; + break; + case TGSI_SEMANTIC_COLOR: + break; + default: + compile_error(ctx, "unknown FS semantic name: %s\n", + tgsi_semantic_names[name]); + } + } + + for (i = decl->Range.First; i <= decl->Range.Last; i++) { + unsigned n = so->outputs_count++; + unsigned ncomp, j; + + ncomp = 4; + + compile_assert(ctx, n < ARRAY_SIZE(so->outputs)); + + so->outputs[n].semantic = decl_semantic(&decl->Semantic); + so->outputs[n].regid = regid(i, comp); + + /* avoid undefined outputs, stick a dummy mov from imm{0.0}, + * which if the output is actually assigned will be over- + * written + */ + for (j = 0; j < ncomp; j++) + ctx->block->outputs[(i * 4) + j] = create_immed(ctx, 0.0); + } +} + +/* from TGSI perspective, we actually have inputs. But most of the "inputs" + * for a fragment shader are just bary.f instructions. The *actual* inputs + * from the hw perspective are the frag_pos and optionally frag_coord and + * frag_face. + */ +static void +fixup_frag_inputs(struct ir3_compile_context *ctx) +{ + struct ir3_shader_variant *so = ctx->so; + struct ir3_block *block = ctx->block; + struct ir3_instruction **inputs; + struct ir3_instruction *instr; + int n, regid = 0; + + block->ninputs = 0; + + n = 4; /* always have frag_pos */ + n += COND(so->frag_face, 4); + n += COND(so->frag_coord, 4); + + inputs = ir3_alloc(ctx->ir, n * (sizeof(struct ir3_instruction *))); + + if (so->frag_face) { + /* this ultimately gets assigned to hr0.x so doesn't conflict + * with frag_coord/frag_pos.. + */ + inputs[block->ninputs++] = ctx->frag_face; + ctx->frag_face->regs[0]->num = 0; + + /* remaining channels not used, but let's avoid confusing + * other parts that expect inputs to come in groups of vec4 + */ + inputs[block->ninputs++] = NULL; + inputs[block->ninputs++] = NULL; + inputs[block->ninputs++] = NULL; + } + + /* since we don't know where to set the regid for frag_coord, + * we have to use r0.x for it. 
But we don't want to *always*
+	 * use r1.x for frag_pos as that could increase the register
+	 * footprint on simple shaders:
+	 */
+	if (so->frag_coord) {
+		ctx->frag_coord[0]->regs[0]->num = regid++;
+		ctx->frag_coord[1]->regs[0]->num = regid++;
+		ctx->frag_coord[2]->regs[0]->num = regid++;
+		ctx->frag_coord[3]->regs[0]->num = regid++;
+
+		inputs[block->ninputs++] = ctx->frag_coord[0];
+		inputs[block->ninputs++] = ctx->frag_coord[1];
+		inputs[block->ninputs++] = ctx->frag_coord[2];
+		inputs[block->ninputs++] = ctx->frag_coord[3];
+	}
+
+	/* we always have frag_pos: */
+	so->pos_regid = regid;
+
+	/* r0.x */
+	instr = create_input(block, NULL, block->ninputs);
+	instr->regs[0]->num = regid++;
+	inputs[block->ninputs++] = instr;
+	ctx->frag_pos->regs[1]->instr = instr;
+
+	/* r0.y */
+	instr = create_input(block, NULL, block->ninputs);
+	instr->regs[0]->num = regid++;
+	inputs[block->ninputs++] = instr;
+	ctx->frag_pos->regs[2]->instr = instr;
+
+	block->inputs = inputs;
+}
+
+static void
+compile_instructions(struct ir3_compile_context *ctx)
+{
+	push_block(ctx);
+
+	/* for fragment shader, we have a single input register (usually
+	 * r0.xy) which is used as the base for bary.f varying fetch instrs:
+	 */
+	if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
+		struct ir3_instruction *instr;
+		instr = ir3_instr_create(ctx->block, -1, OPC_META_FI);
+		ir3_reg_create(instr, 0, 0);
+		ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.x */
+		ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.y */
+		ctx->frag_pos = instr;
+	}
+
+	while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
+		tgsi_parse_token(&ctx->parser);
+
+		switch (ctx->parser.FullToken.Token.Type) {
+		case TGSI_TOKEN_TYPE_DECLARATION: {
+			struct tgsi_full_declaration *decl =
+					&ctx->parser.FullToken.FullDeclaration;
+			if (decl->Declaration.File == TGSI_FILE_OUTPUT) {
+				decl_out(ctx, decl);
+			} else if (decl->Declaration.File == TGSI_FILE_INPUT) {
+				decl_in(ctx, decl);
+			}
+			break;
+		}
+		case TGSI_TOKEN_TYPE_IMMEDIATE: {
+			/* TODO: if we know the immediate is small enough, and only
+			 * used with instructions that can embed an immediate, we
+			 * can skip this:
+			 */
+			struct tgsi_full_immediate *imm =
+					&ctx->parser.FullToken.FullImmediate;
+			unsigned n = ctx->so->immediates_count++;
+			compile_assert(ctx, n < ARRAY_SIZE(ctx->so->immediates));
+			memcpy(ctx->so->immediates[n].val, imm->u, 16);
+			break;
+		}
+		case TGSI_TOKEN_TYPE_INSTRUCTION: {
+			struct tgsi_full_instruction *inst =
+					&ctx->parser.FullToken.FullInstruction;
+			unsigned opc = inst->Instruction.Opcode;
+			const struct instr_translater *t = &translaters[opc];
+
+			if (t->fxn) {
+				t->fxn(t, ctx, inst);
+				ctx->num_internal_temps = 0;
+			} else {
+				compile_error(ctx, "unknown TGSI opc: %s\n",
+						tgsi_get_opcode_name(opc));
+			}
+
+			switch (inst->Instruction.Saturate) {
+			case TGSI_SAT_ZERO_ONE:
+				create_clamp_imm(ctx, &inst->Dst[0].Register,
+						fui(0.0), fui(1.0));
+				break;
+			case TGSI_SAT_MINUS_PLUS_ONE:
+				create_clamp_imm(ctx, &inst->Dst[0].Register,
+						fui(-1.0), fui(1.0));
+				break;
+			}
+
+			instr_finish(ctx);
+
+			break;
+		}
+		default:
+			break;
+		}
+	}
+}
+
+static void
+compile_dump(struct ir3_compile_context *ctx)
+{
+	const char *name = (ctx->so->type == SHADER_VERTEX) ? "vert" : "frag";
+	static unsigned n = 0;
+	char fname[16];
+	FILE *f;
+	snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++);
+	f = fopen(fname, "w");
+	if (!f)
+		return;
+	ir3_block_depth(ctx->block);
+	ir3_dump(ctx->ir, name, ctx->block, f);
+	fclose(f);
+}
+
+int
+ir3_compile_shader(struct ir3_shader_variant *so,
+		const struct tgsi_token *tokens, struct ir3_shader_key key)
+{
+	struct ir3_compile_context ctx;
+	struct ir3_block *block;
+	struct ir3_instruction **inputs;
+	unsigned i, j, actual_in;
+	int ret = 0;
+
+	assert(!so->ir);
+
+	so->ir = ir3_create();
+
+	assert(so->ir);
+
+	if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) {
+		ret = -1;
+		goto out;
+	}
+
+	compile_instructions(&ctx);
+
+	block = ctx.block;
+
+	/* keep track of the inputs from TGSI perspective.. */
+	inputs = block->inputs;
+
+	/* but fixup actual inputs for frag shader: */
+	if (ctx.type == TGSI_PROCESSOR_FRAGMENT)
+		fixup_frag_inputs(&ctx);
+
+	/* at this point, for binning pass, throw away unneeded outputs: */
+	if (key.binning_pass) {
+		for (i = 0, j = 0; i < so->outputs_count; i++) {
+			unsigned name = sem2name(so->outputs[i].semantic);
+			unsigned idx = sem2idx(so->outputs[i].semantic);
+
+			/* throw away everything but first position/psize */
+			if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) ||
+					(name == TGSI_SEMANTIC_PSIZE))) {
+				if (i != j) {
+					so->outputs[j] = so->outputs[i];
+					block->outputs[(j*4)+0] = block->outputs[(i*4)+0];
+					block->outputs[(j*4)+1] = block->outputs[(i*4)+1];
+					block->outputs[(j*4)+2] = block->outputs[(i*4)+2];
+					block->outputs[(j*4)+3] = block->outputs[(i*4)+3];
+				}
+				j++;
+			}
+		}
+		so->outputs_count = j;
+		block->noutputs = j * 4;
+	}
+
+	/* at this point, we want the kill's in the outputs array too,
+	 * so that they get scheduled (since they have no dst).. we've
+	 * already ensured that the array is big enough in push_block():
+	 */
+	if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
+		for (i = 0; i < ctx.kill_count; i++)
+			block->outputs[block->noutputs++] = ctx.kill[i];
+	}
+
+	if (fd_mesa_debug & FD_DBG_OPTDUMP)
+		compile_dump(&ctx);
+
+	ret = ir3_block_flatten(block);
+	if (ret < 0)
+		goto out;
+	if ((ret > 0) && (fd_mesa_debug & FD_DBG_OPTDUMP))
+		compile_dump(&ctx);
+
+	ir3_block_cp(block);
+
+	if (fd_mesa_debug & FD_DBG_OPTDUMP)
+		compile_dump(&ctx);
+
+	ir3_block_depth(block);
+
+	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
+		printf("AFTER DEPTH:\n");
+		ir3_dump_instr_list(block->head);
+	}
+
+	ir3_block_sched(block);
+
+	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
+		printf("AFTER SCHED:\n");
+		ir3_dump_instr_list(block->head);
+	}
+
+	ret = ir3_block_ra(block, so->type, key.half_precision,
+			so->frag_coord, so->frag_face, &so->has_samp);
+	if (ret)
+		goto out;
+
+	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
+		printf("AFTER RA:\n");
+		ir3_dump_instr_list(block->head);
+	}
+
+	/* fixup input/outputs: */
+	for (i = 0; i < so->outputs_count; i++) {
+		so->outputs[i].regid = block->outputs[i*4]->regs[0]->num;
+		/* preserve hack for depth output.. tgsi writes depth to .z,
+		 * but what we give the hw is the scalar register:
+		 */
+		if ((ctx.type == TGSI_PROCESSOR_FRAGMENT) &&
+			(sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION))
+			so->outputs[i].regid += 2;
+	}
+
+	/* Note that some or all channels of an input may be unused: */
+	actual_in = 0;
+	for (i = 0; i < so->inputs_count; i++) {
+		unsigned j, regid = ~0, compmask = 0;
+		so->inputs[i].ncomp = 0;
+		for (j = 0; j < 4; j++) {
+			struct ir3_instruction *in = inputs[(i*4) + j];
+			if (in) {
+				compmask |= (1 << j);
+				regid = in->regs[0]->num - j;
+				actual_in++;
+				so->inputs[i].ncomp++;
+			}
+		}
+		so->inputs[i].regid = regid;
+		so->inputs[i].compmask = compmask;
+	}
+
+	/* fragment shader always gets full vec4's even if it doesn't
+	 * fetch all components, but for vertex shader we need to update
+	 * with the actual number of components fetched, otherwise things
+	 * will hang due to a mismatch between VFD_DECODE's and
+	 * TOTALATTRTOVS
+	 */
+	if (so->type == SHADER_VERTEX)
+		so->total_in = actual_in;
+
+out:
+	if (ret) {
+		ir3_destroy(so->ir);
+		so->ir = NULL;
+	}
+	compile_free(&ctx);
+
+	return ret;
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
new file mode 100644
index 00000000000..9b11b3d8abf
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
@@ -0,0 +1,42 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2013 Rob Clark
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + * Authors: + * Rob Clark + */ + +#ifndef FD3_COMPILER_H_ +#define FD3_COMPILER_H_ + +#include "ir3_shader.h" + + +int ir3_compile_shader(struct ir3_shader_variant *so, + const struct tgsi_token *tokens, + struct ir3_shader_key key); +int ir3_compile_shader_old(struct ir3_shader_variant *so, + const struct tgsi_token *tokens, + struct ir3_shader_key key); + +#endif /* FD3_COMPILER_H_ */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_old.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_old.c new file mode 100644 index 00000000000..1e1ca7ad813 --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_old.c @@ -0,0 +1,1524 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2013 Rob Clark + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark + */ + +#include + +#include "pipe/p_state.h" +#include "util/u_string.h" +#include "util/u_memory.h" +#include "util/u_inlines.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_ureg.h" +#include "tgsi/tgsi_info.h" +#include "tgsi/tgsi_strings.h" +#include "tgsi/tgsi_dump.h" +#include "tgsi/tgsi_scan.h" + +#include "freedreno_lowering.h" +#include "freedreno_util.h" + +#include "ir3_compiler.h" +#include "ir3_shader.h" + +#include "instr-a3xx.h" +#include "ir3.h" + + +struct ir3_compile_context { + const struct tgsi_token *tokens; + bool free_tokens; + struct ir3 *ir; + struct ir3_block *block; + struct ir3_shader_variant *so; + + struct tgsi_parse_context parser; + unsigned type; + + struct tgsi_shader_info info; + + /* last input dst (for setting (ei) flag): */ + struct ir3_register *last_input; + + /* last instruction with relative addressing: */ + struct ir3_instruction *last_rel; + + /* for calculating input/output positions/linkages: */ + unsigned next_inloc; + + unsigned num_internal_temps; + struct tgsi_src_register internal_temps[6]; + + /* track registers which need to synchronize w/ "complex alu" cat3 + * instruction pipeline: + */ + regmask_t needs_ss; + + /* track registers which need to synchronize with texture fetch + * pipeline: + */ + regmask_t needs_sy; + + /* inputs start at r0, temporaries start after last input, and + * outputs start after last temporary. 
+	 *
+	 * We could be more clever, because this is not a hw restriction,
+	 * but probably best just to implement an optimizing pass to
+	 * reduce the # of registers used and get rid of redundant mov's
+	 * (to output register).
+	 */
+	unsigned base_reg[TGSI_FILE_COUNT];
+
+	/* idx/slot for last compiler generated immediate */
+	unsigned immediate_idx;
+
+	/* stack of branch instructions that start (potentially nested)
+	 * branches, so that we can fix up the branch target when we
+	 * reach the corresponding END instruction:
+	 */
+	struct ir3_instruction *branch[16];
+	unsigned int branch_count;
+
+	/* used when dst is the same as one of the srcs, to avoid
+	 * overwriting a src element before the remaining scalar
+	 * instructions that make up the vector operation:
+	 */
+	struct tgsi_dst_register tmp_dst;
+	struct tgsi_src_register *tmp_src;
+};
+
+
+static void vectorize(struct ir3_compile_context *ctx,
+		struct ir3_instruction *instr, struct tgsi_dst_register *dst,
+		int nsrcs, ...);
+static void create_mov(struct ir3_compile_context *ctx,
+		struct tgsi_dst_register *dst, struct tgsi_src_register *src);
+
+static unsigned
+compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
+		const struct tgsi_token *tokens)
+{
+	unsigned ret, base = 0;
+	struct tgsi_shader_info *info = &ctx->info;
+	const struct fd_lowering_config lconfig = {
+			.color_two_side = so->key.color_two_side,
+			.lower_DST = true,
+			.lower_XPD = true,
+			.lower_SCS = true,
+			.lower_LRP = true,
+			.lower_FRC = true,
+			.lower_POW = true,
+			.lower_LIT = true,
+			.lower_EXP = true,
+			.lower_LOG = true,
+			.lower_DP4 = true,
+			.lower_DP3 = true,
+			.lower_DPH = true,
+			.lower_DP2 = true,
+			.lower_DP2A = true,
+	};
+
+	ctx->tokens = fd_transform_lowering(&lconfig, tokens, &ctx->info);
+	ctx->free_tokens = !!ctx->tokens;
+	if (!ctx->tokens) {
+		/* no lowering */
+		ctx->tokens = tokens;
+	}
+	ctx->ir = so->ir;
+	ctx->block = ir3_block_create(ctx->ir, 0, 0, 0);
+	ctx->so = so;
+	ctx->last_input = NULL;
+	ctx->last_rel = NULL;
+	ctx->next_inloc = 8;
+	ctx->num_internal_temps = 0;
+	ctx->branch_count = 0;
+
+	regmask_init(&ctx->needs_ss);
+	regmask_init(&ctx->needs_sy);
+	memset(ctx->base_reg, 0, sizeof(ctx->base_reg));
+
+	/* Immediates go after constants: */
+	ctx->base_reg[TGSI_FILE_CONSTANT] = 0;
+	ctx->base_reg[TGSI_FILE_IMMEDIATE] =
+			info->file_max[TGSI_FILE_CONSTANT] + 1;
+
+	/* if full precision and fragment shader, don't clobber
+	 * r0.x w/ bary fetch:
+	 */
+	if ((so->type == SHADER_FRAGMENT) && !so->key.half_precision)
+		base = 1;
+
+	/* Temporaries after outputs after inputs: */
+	ctx->base_reg[TGSI_FILE_INPUT] = base;
+	ctx->base_reg[TGSI_FILE_OUTPUT] = base +
+			info->file_max[TGSI_FILE_INPUT] + 1;
+	ctx->base_reg[TGSI_FILE_TEMPORARY] = base +
+			info->file_max[TGSI_FILE_INPUT] + 1 +
+			info->file_max[TGSI_FILE_OUTPUT] + 1;
+
+	so->first_immediate = ctx->base_reg[TGSI_FILE_IMMEDIATE];
+	ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1);
+
+	ret = tgsi_parse_init(&ctx->parser, ctx->tokens);
+	if (ret != TGSI_PARSE_OK)
+		return ret;
+
+	ctx->type = ctx->parser.FullHeader.Processor.Processor;
+
+	return ret;
+}
+
+static void
+compile_error(struct ir3_compile_context *ctx, const char *format, ...)
+{ + va_list ap; + va_start(ap, format); + _debug_vprintf(format, ap); + va_end(ap); + tgsi_dump(ctx->tokens, 0); + debug_assert(0); +} + +#define compile_assert(ctx, cond) do { \ + if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \ + } while (0) + +static void +compile_free(struct ir3_compile_context *ctx) +{ + if (ctx->free_tokens) + free((void *)ctx->tokens); + tgsi_parse_free(&ctx->parser); +} + +struct instr_translater { + void (*fxn)(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst); + unsigned tgsi_opc; + opc_t opc; + opc_t hopc; /* opc to use for half_precision mode, if different */ + unsigned arg; +}; + +static void +handle_last_rel(struct ir3_compile_context *ctx) +{ + if (ctx->last_rel) { + ctx->last_rel->flags |= IR3_INSTR_UL; + ctx->last_rel = NULL; + } +} + +static struct ir3_instruction * +instr_create(struct ir3_compile_context *ctx, int category, opc_t opc) +{ + return ir3_instr_create(ctx->block, category, opc); +} + +static void +add_nop(struct ir3_compile_context *ctx, unsigned count) +{ + while (count-- > 0) + instr_create(ctx, 0, OPC_NOP); +} + +static unsigned +src_flags(struct ir3_compile_context *ctx, struct ir3_register *reg) +{ + unsigned flags = 0; + + if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED)) + return flags; + + if (regmask_get(&ctx->needs_ss, reg)) { + flags |= IR3_INSTR_SS; + regmask_init(&ctx->needs_ss); + } + + if (regmask_get(&ctx->needs_sy, reg)) { + flags |= IR3_INSTR_SY; + regmask_init(&ctx->needs_sy); + } + + return flags; +} + +static struct ir3_register * +add_dst_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr, + const struct tgsi_dst_register *dst, unsigned chan) +{ + unsigned flags = 0, num = 0; + struct ir3_register *reg; + + switch (dst->File) { + case TGSI_FILE_OUTPUT: + case TGSI_FILE_TEMPORARY: + num = dst->Index + ctx->base_reg[dst->File]; + break; + case TGSI_FILE_ADDRESS: + num = REG_A0; + break; + default: + compile_error(ctx, "unsupported dst register file: %s\n", + tgsi_file_name(dst->File)); + break; + } + + if (dst->Indirect) + flags |= IR3_REG_RELATIV; + if (ctx->so->key.half_precision) + flags |= IR3_REG_HALF; + + reg = ir3_reg_create(instr, regid(num, chan), flags); + + if (dst->Indirect) + ctx->last_rel = instr; + + return reg; +} + +static struct ir3_register * +add_src_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr, + const struct tgsi_src_register *src, unsigned chan) +{ + unsigned flags = 0, num = 0; + struct ir3_register *reg; + + /* TODO we need to use a mov to temp for const >= 64.. or maybe + * we could use relative addressing.. + */ + compile_assert(ctx, src->Index < 64); + + switch (src->File) { + case TGSI_FILE_IMMEDIATE: + /* TODO if possible, use actual immediate instead of const.. but + * TGSI has vec4 immediates, we can only embed scalar (of limited + * size, depending on instruction..) 
+ */ + case TGSI_FILE_CONSTANT: + flags |= IR3_REG_CONST; + num = src->Index + ctx->base_reg[src->File]; + break; + case TGSI_FILE_OUTPUT: + /* NOTE: we should only end up w/ OUTPUT file for things like + * clamp()'ing saturated dst instructions + */ + case TGSI_FILE_INPUT: + case TGSI_FILE_TEMPORARY: + num = src->Index + ctx->base_reg[src->File]; + break; + default: + compile_error(ctx, "unsupported src register file: %s\n", + tgsi_file_name(src->File)); + break; + } + + if (src->Absolute) + flags |= IR3_REG_ABS; + if (src->Negate) + flags |= IR3_REG_NEGATE; + if (src->Indirect) + flags |= IR3_REG_RELATIV; + if (ctx->so->key.half_precision) + flags |= IR3_REG_HALF; + + reg = ir3_reg_create(instr, regid(num, chan), flags); + + if (src->Indirect) + ctx->last_rel = instr; + + instr->flags |= src_flags(ctx, reg); + + return reg; +} + +static void +src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst) +{ + src->File = dst->File; + src->Indirect = dst->Indirect; + src->Dimension = dst->Dimension; + src->Index = dst->Index; + src->Absolute = 0; + src->Negate = 0; + src->SwizzleX = TGSI_SWIZZLE_X; + src->SwizzleY = TGSI_SWIZZLE_Y; + src->SwizzleZ = TGSI_SWIZZLE_Z; + src->SwizzleW = TGSI_SWIZZLE_W; +} + +/* Get internal-temp src/dst to use for a sequence of instructions + * generated by a single TGSI op. + */ +static struct tgsi_src_register * +get_internal_temp(struct ir3_compile_context *ctx, + struct tgsi_dst_register *tmp_dst) +{ + struct tgsi_src_register *tmp_src; + int n; + + tmp_dst->File = TGSI_FILE_TEMPORARY; + tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW; + tmp_dst->Indirect = 0; + tmp_dst->Dimension = 0; + + /* assign next temporary: */ + n = ctx->num_internal_temps++; + compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps)); + tmp_src = &ctx->internal_temps[n]; + + tmp_dst->Index = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1; + + src_from_dst(tmp_src, tmp_dst); + + return tmp_src; +} + +/* Get internal half-precision temp src/dst to use for a sequence of + * instructions generated by a single TGSI op. + */ +static struct tgsi_src_register * +get_internal_temp_hr(struct ir3_compile_context *ctx, + struct tgsi_dst_register *tmp_dst) +{ + struct tgsi_src_register *tmp_src; + int n; + + if (ctx->so->key.half_precision) + return get_internal_temp(ctx, tmp_dst); + + tmp_dst->File = TGSI_FILE_TEMPORARY; + tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW; + tmp_dst->Indirect = 0; + tmp_dst->Dimension = 0; + + /* assign next temporary: */ + n = ctx->num_internal_temps++; + compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps)); + tmp_src = &ctx->internal_temps[n]; + + /* just use hr0 because no one else should be using half- + * precision regs: + */ + tmp_dst->Index = 0; + + src_from_dst(tmp_src, tmp_dst); + + return tmp_src; +} + +static inline bool +is_const(struct tgsi_src_register *src) +{ + return (src->File == TGSI_FILE_CONSTANT) || + (src->File == TGSI_FILE_IMMEDIATE); +} + +static inline bool +is_relative(struct tgsi_src_register *src) +{ + return src->Indirect; +} + +static inline bool +is_rel_or_const(struct tgsi_src_register *src) +{ + return is_relative(src) || is_const(src); +} + +static type_t +get_ftype(struct ir3_compile_context *ctx) +{ + return ctx->so->key.half_precision ? TYPE_F16 : TYPE_F32; +} + +static type_t +get_utype(struct ir3_compile_context *ctx) +{ + return ctx->so->key.half_precision ? 
TYPE_U16 : TYPE_U32; +} + +static unsigned +src_swiz(struct tgsi_src_register *src, int chan) +{ + switch (chan) { + case 0: return src->SwizzleX; + case 1: return src->SwizzleY; + case 2: return src->SwizzleZ; + case 3: return src->SwizzleW; + } + assert(0); + return 0; +} + +/* for instructions that cannot take a const register as src, if needed + * generate a move to temporary gpr: + */ +static struct tgsi_src_register * +get_unconst(struct ir3_compile_context *ctx, struct tgsi_src_register *src) +{ + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + + compile_assert(ctx, is_rel_or_const(src)); + + tmp_src = get_internal_temp(ctx, &tmp_dst); + + create_mov(ctx, &tmp_dst, src); + + return tmp_src; +} + +static void +get_immediate(struct ir3_compile_context *ctx, + struct tgsi_src_register *reg, uint32_t val) +{ + unsigned neg, swiz, idx, i; + /* actually maps 1:1 currently.. not sure if that is safe to rely on: */ + static const unsigned swiz2tgsi[] = { + TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W, + }; + + for (i = 0; i < ctx->immediate_idx; i++) { + swiz = i % 4; + idx = i / 4; + + if (ctx->so->immediates[idx].val[swiz] == val) { + neg = 0; + break; + } + + if (ctx->so->immediates[idx].val[swiz] == -val) { + neg = 1; + break; + } + } + + if (i == ctx->immediate_idx) { + /* need to generate a new immediate: */ + swiz = i % 4; + idx = i / 4; + neg = 0; + ctx->so->immediates[idx].val[swiz] = val; + ctx->so->immediates_count = idx + 1; + ctx->immediate_idx++; + } + + reg->File = TGSI_FILE_IMMEDIATE; + reg->Indirect = 0; + reg->Dimension = 0; + reg->Index = idx; + reg->Absolute = 0; + reg->Negate = neg; + reg->SwizzleX = swiz2tgsi[swiz]; + reg->SwizzleY = swiz2tgsi[swiz]; + reg->SwizzleZ = swiz2tgsi[swiz]; + reg->SwizzleW = swiz2tgsi[swiz]; +} + +static void +create_mov(struct ir3_compile_context *ctx, struct tgsi_dst_register *dst, + struct tgsi_src_register *src) +{ + type_t type_mov = get_ftype(ctx); + unsigned i; + + for (i = 0; i < 4; i++) { + /* move to destination: */ + if (dst->WriteMask & (1 << i)) { + struct ir3_instruction *instr; + + if (src->Absolute || src->Negate) { + /* can't have abs or neg on a mov instr, so use + * absneg.f instead to handle these cases: + */ + instr = instr_create(ctx, 2, OPC_ABSNEG_F); + } else { + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = type_mov; + instr->cat1.dst_type = type_mov; + } + + add_dst_reg(ctx, instr, dst, i); + add_src_reg(ctx, instr, src, src_swiz(src, i)); + } else { + add_nop(ctx, 1); + } + } +} + +static void +create_clamp(struct ir3_compile_context *ctx, + struct tgsi_dst_register *dst, struct tgsi_src_register *val, + struct tgsi_src_register *minval, struct tgsi_src_register *maxval) +{ + struct ir3_instruction *instr; + + instr = instr_create(ctx, 2, OPC_MAX_F); + vectorize(ctx, instr, dst, 2, val, 0, minval, 0); + + instr = instr_create(ctx, 2, OPC_MIN_F); + vectorize(ctx, instr, dst, 2, val, 0, maxval, 0); +} + +static void +create_clamp_imm(struct ir3_compile_context *ctx, + struct tgsi_dst_register *dst, + uint32_t minval, uint32_t maxval) +{ + struct tgsi_src_register minconst, maxconst; + struct tgsi_src_register src; + + src_from_dst(&src, dst); + + get_immediate(ctx, &minconst, minval); + get_immediate(ctx, &maxconst, maxval); + + create_clamp(ctx, dst, &src, &minconst, &maxconst); +} + +static struct tgsi_dst_register * +get_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = &inst->Dst[0].Register; + 
unsigned i; + for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { + struct tgsi_src_register *src = &inst->Src[i].Register; + if ((src->File == dst->File) && (src->Index == dst->Index)) { + if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) && + (src->SwizzleX == TGSI_SWIZZLE_X) && + (src->SwizzleY == TGSI_SWIZZLE_Y) && + (src->SwizzleZ == TGSI_SWIZZLE_Z) && + (src->SwizzleW == TGSI_SWIZZLE_W)) + continue; + ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst); + ctx->tmp_dst.WriteMask = dst->WriteMask; + dst = &ctx->tmp_dst; + break; + } + } + return dst; +} + +static void +put_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst, + struct tgsi_dst_register *dst) +{ + /* if necessary, add mov back into original dst: */ + if (dst != &inst->Dst[0].Register) { + create_mov(ctx, &inst->Dst[0].Register, ctx->tmp_src); + } +} + +/* helper to generate the necessary repeat and/or additional instructions + * to turn a scalar instruction into a vector operation: + */ +static void +vectorize(struct ir3_compile_context *ctx, struct ir3_instruction *instr, + struct tgsi_dst_register *dst, int nsrcs, ...) +{ + va_list ap; + int i, j, n = 0; + bool indirect = dst->Indirect; + + add_dst_reg(ctx, instr, dst, TGSI_SWIZZLE_X); + + va_start(ap, nsrcs); + for (j = 0; j < nsrcs; j++) { + struct tgsi_src_register *src = + va_arg(ap, struct tgsi_src_register *); + unsigned flags = va_arg(ap, unsigned); + struct ir3_register *reg; + if (flags & IR3_REG_IMMED) { + reg = ir3_reg_create(instr, 0, IR3_REG_IMMED); + /* this is an ugly cast.. should have put flags first! */ + reg->iim_val = *(int *)&src; + } else { + reg = add_src_reg(ctx, instr, src, TGSI_SWIZZLE_X); + indirect |= src->Indirect; + } + reg->flags |= flags & ~IR3_REG_NEGATE; + if (flags & IR3_REG_NEGATE) + reg->flags ^= IR3_REG_NEGATE; + } + va_end(ap); + + for (i = 0; i < 4; i++) { + if (dst->WriteMask & (1 << i)) { + struct ir3_instruction *cur; + + if (n++ == 0) { + cur = instr; + } else { + cur = ir3_instr_clone(instr); + cur->flags &= ~(IR3_INSTR_SY | IR3_INSTR_SS | IR3_INSTR_JP); + } + + /* fix-up dst register component: */ + cur->regs[0]->num = regid(cur->regs[0]->num >> 2, i); + + /* fix-up src register component: */ + va_start(ap, nsrcs); + for (j = 0; j < nsrcs; j++) { + struct tgsi_src_register *src = + va_arg(ap, struct tgsi_src_register *); + unsigned flags = va_arg(ap, unsigned); + if (!(flags & IR3_REG_IMMED)) { + cur->regs[j+1]->num = + regid(cur->regs[j+1]->num >> 2, + src_swiz(src, i)); + cur->flags |= src_flags(ctx, cur->regs[j+1]); + } + } + va_end(ap); + + if (indirect) + ctx->last_rel = cur; + } + } + + /* pad w/ nop's.. at least until we are clever enough to + * figure out if we really need to.. + */ + add_nop(ctx, 4 - n); +} + +/* + * Handlers for TGSI instructions which do not have a 1:1 mapping to + * native instructions: + */ + +static void +trans_clamp(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src0 = &inst->Src[0].Register; + struct tgsi_src_register *src1 = &inst->Src[1].Register; + struct tgsi_src_register *src2 = &inst->Src[2].Register; + + create_clamp(ctx, dst, src0, src1, src2); + + put_dst(ctx, inst, dst); +} + +/* ARL(x) = x, but mova from hrN.x to a0.. 
*/ +static void +trans_arl(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + struct tgsi_dst_register *dst = &inst->Dst[0].Register; + struct tgsi_src_register *src = &inst->Src[0].Register; + unsigned chan = src->SwizzleX; + compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS); + + handle_last_rel(ctx); + + tmp_src = get_internal_temp_hr(ctx, &tmp_dst); + + /* cov.{f32,f16}s16 Rtmp, Rsrc */ + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = get_ftype(ctx); + instr->cat1.dst_type = TYPE_S16; + add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF; + add_src_reg(ctx, instr, src, chan); + + add_nop(ctx, 3); + + /* shl.b Rtmp, Rtmp, 2 */ + instr = instr_create(ctx, 2, OPC_SHL_B); + add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF; + add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF; + ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2; + + add_nop(ctx, 3); + + /* mova a0, Rtmp */ + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = TYPE_S16; + instr->cat1.dst_type = TYPE_S16; + add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF; + add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF; + + /* need to ensure 5 instr slots before a0 is used: */ + add_nop(ctx, 6); +} + +/* texture fetch/sample instructions: */ +static void +trans_samp(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_register *r; + struct ir3_instruction *instr; + struct tgsi_src_register *coord = &inst->Src[0].Register; + struct tgsi_src_register *samp = &inst->Src[1].Register; + unsigned tex = inst->Texture.Texture; + int8_t *order; + unsigned i, flags = 0, src_wrmask; + bool needs_mov = false; + + switch (t->arg) { + case TGSI_OPCODE_TEX: + if (tex == TGSI_TEXTURE_2D) { + order = (int8_t[4]){ 0, 1, -1, -1 }; + src_wrmask = TGSI_WRITEMASK_XY; + } else { + order = (int8_t[4]){ 0, 1, 2, -1 }; + src_wrmask = TGSI_WRITEMASK_XYZ; + } + break; + case TGSI_OPCODE_TXP: + if (tex == TGSI_TEXTURE_2D) { + order = (int8_t[4]){ 0, 1, 3, -1 }; + src_wrmask = TGSI_WRITEMASK_XYZ; + } else { + order = (int8_t[4]){ 0, 1, 2, 3 }; + src_wrmask = TGSI_WRITEMASK_XYZW; + } + flags |= IR3_INSTR_P; + break; + default: + compile_assert(ctx, 0); + break; + } + + if ((tex == TGSI_TEXTURE_3D) || (tex == TGSI_TEXTURE_CUBE)) { + add_nop(ctx, 3); + flags |= IR3_INSTR_3D; + } + + /* cat5 instruction cannot seem to handle const or relative: */ + if (is_rel_or_const(coord)) + needs_mov = true; + + /* The texture sample instructions need to coord in successive + * registers/components (ie. src.xy but not src.yx). And TXP + * needs the .w component in .z for 2D.. 
so in some cases we + * might need to emit some mov instructions to shuffle things + * around: + */ + for (i = 1; (i < 4) && (order[i] >= 0) && !needs_mov; i++) + if (src_swiz(coord, i) != (src_swiz(coord, 0) + order[i])) + needs_mov = true; + + if (needs_mov) { + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + unsigned j; + + type_t type_mov = get_ftype(ctx); + + /* need to move things around: */ + tmp_src = get_internal_temp(ctx, &tmp_dst); + + for (j = 0; (j < 4) && (order[j] >= 0); j++) { + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = type_mov; + instr->cat1.dst_type = type_mov; + add_dst_reg(ctx, instr, &tmp_dst, j); + add_src_reg(ctx, instr, coord, + src_swiz(coord, order[j])); + } + + coord = tmp_src; + + add_nop(ctx, 4 - j); + } + + instr = instr_create(ctx, 5, t->opc); + instr->cat5.type = get_ftype(ctx); + instr->cat5.samp = samp->Index; + instr->cat5.tex = samp->Index; + instr->flags |= flags; + + r = add_dst_reg(ctx, instr, &inst->Dst[0].Register, 0); + r->wrmask = inst->Dst[0].Register.WriteMask; + + add_src_reg(ctx, instr, coord, coord->SwizzleX)->wrmask = src_wrmask; + + /* after add_src_reg() so we don't set (sy) on sam instr itself! */ + regmask_set(&ctx->needs_sy, r); +} + +/* + * SEQ(a,b) = (a == b) ? 1.0 : 0.0 + * cmps.f.eq tmp0, b, a + * cov.u16f16 dst, tmp0 + * + * SNE(a,b) = (a != b) ? 1.0 : 0.0 + * cmps.f.eq tmp0, b, a + * add.s tmp0, tmp0, -1 + * sel.f16 dst, {0.0}, tmp0, {1.0} + * + * SGE(a,b) = (a >= b) ? 1.0 : 0.0 + * cmps.f.ge tmp0, a, b + * cov.u16f16 dst, tmp0 + * + * SLE(a,b) = (a <= b) ? 1.0 : 0.0 + * cmps.f.ge tmp0, b, a + * cov.u16f16 dst, tmp0 + * + * SGT(a,b) = (a > b) ? 1.0 : 0.0 + * cmps.f.ge tmp0, b, a + * add.s tmp0, tmp0, -1 + * sel.f16 dst, {0.0}, tmp0, {1.0} + * + * SLT(a,b) = (a < b) ? 1.0 : 0.0 + * cmps.f.ge tmp0, a, b + * add.s tmp0, tmp0, -1 + * sel.f16 dst, {0.0}, tmp0, {1.0} + * + * CMP(a,b,c) = (a < 0.0) ? 
b : c + * cmps.f.ge tmp0, a, {0.0} + * add.s tmp0, tmp0, -1 + * sel.f16 dst, c, tmp0, b + */ +static void +trans_cmp(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + struct tgsi_src_register constval0, constval1; + /* final instruction for CMP() uses orig src1 and src2: */ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *a0, *a1; + unsigned condition; + + tmp_src = get_internal_temp(ctx, &tmp_dst); + + switch (t->tgsi_opc) { + case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_SNE: + a0 = &inst->Src[1].Register; /* b */ + a1 = &inst->Src[0].Register; /* a */ + condition = IR3_COND_EQ; + break; + case TGSI_OPCODE_SGE: + case TGSI_OPCODE_SLT: + a0 = &inst->Src[0].Register; /* a */ + a1 = &inst->Src[1].Register; /* b */ + condition = IR3_COND_GE; + break; + case TGSI_OPCODE_SLE: + case TGSI_OPCODE_SGT: + a0 = &inst->Src[1].Register; /* b */ + a1 = &inst->Src[0].Register; /* a */ + condition = IR3_COND_GE; + break; + case TGSI_OPCODE_CMP: + get_immediate(ctx, &constval0, fui(0.0)); + a0 = &inst->Src[0].Register; /* a */ + a1 = &constval0; /* {0.0} */ + condition = IR3_COND_GE; + break; + default: + compile_assert(ctx, 0); + return; + } + + if (is_const(a0) && is_const(a1)) + a0 = get_unconst(ctx, a0); + + /* cmps.f.ge tmp, a0, a1 */ + instr = instr_create(ctx, 2, OPC_CMPS_F); + instr->cat2.condition = condition; + vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0); + + switch (t->tgsi_opc) { + case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_SGE: + case TGSI_OPCODE_SLE: + /* cov.u16f16 dst, tmp0 */ + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = get_utype(ctx); + instr->cat1.dst_type = get_ftype(ctx); + vectorize(ctx, instr, dst, 1, tmp_src, 0); + break; + case TGSI_OPCODE_SNE: + case TGSI_OPCODE_SGT: + case TGSI_OPCODE_SLT: + case TGSI_OPCODE_CMP: + /* add.s tmp, tmp, -1 */ + instr = instr_create(ctx, 2, OPC_ADD_S); + vectorize(ctx, instr, &tmp_dst, 2, tmp_src, 0, -1, IR3_REG_IMMED); + + if (t->tgsi_opc == TGSI_OPCODE_CMP) { + /* sel.{f32,f16} dst, src2, tmp, src1 */ + instr = instr_create(ctx, 3, + ctx->so->key.half_precision ? OPC_SEL_F16 : OPC_SEL_F32); + vectorize(ctx, instr, dst, 3, + &inst->Src[2].Register, 0, + tmp_src, 0, + &inst->Src[1].Register, 0); + } else { + get_immediate(ctx, &constval0, fui(0.0)); + get_immediate(ctx, &constval1, fui(1.0)); + /* sel.{f32,f16} dst, {0.0}, tmp0, {1.0} */ + instr = instr_create(ctx, 3, + ctx->so->key.half_precision ? OPC_SEL_F16 : OPC_SEL_F32); + vectorize(ctx, instr, dst, 3, + &constval0, 0, tmp_src, 0, &constval1, 0); + } + + break; + } + + put_dst(ctx, inst, dst); +} + +/* + * Conditional / Flow control + */ + +static unsigned +find_instruction(struct ir3_compile_context *ctx, struct ir3_instruction *instr) +{ + unsigned i; + for (i = 0; i < ctx->ir->instrs_count; i++) + if (ctx->ir->instrs[i] == instr) + return i; + return ~0; +} + +static void +push_branch(struct ir3_compile_context *ctx, struct ir3_instruction *instr) +{ + ctx->branch[ctx->branch_count++] = instr; +} + +static void +pop_branch(struct ir3_compile_context *ctx) +{ + struct ir3_instruction *instr; + + /* if we were clever enough, we'd patch this up after the fact, + * and set (jp) flag on whatever the next instruction was, rather + * than inserting an extra nop.. 
+ */ + instr = instr_create(ctx, 0, OPC_NOP); + instr->flags |= IR3_INSTR_JP; + + /* pop the branch instruction from the stack and fix up branch target: */ + instr = ctx->branch[--ctx->branch_count]; + instr->cat0.immed = ctx->ir->instrs_count - find_instruction(ctx, instr) - 1; +} + +/* We probably don't really want to translate if/else/endif into branches.. + * the blob driver evaluates both legs of the if and then uses the sel + * instruction to pick which sides of the branch to "keep".. but figuring + * that out will take somewhat more compiler smarts. So hopefully branches + * don't kill performance too badly. + */ +static void +trans_if(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct tgsi_src_register *src = &inst->Src[0].Register; + struct tgsi_src_register constval; + + get_immediate(ctx, &constval, fui(0.0)); + + if (is_const(src)) + src = get_unconst(ctx, src); + + instr = instr_create(ctx, 2, OPC_CMPS_F); + ir3_reg_create(instr, regid(REG_P0, 0), 0); + add_src_reg(ctx, instr, src, src->SwizzleX); + add_src_reg(ctx, instr, &constval, constval.SwizzleX); + instr->cat2.condition = IR3_COND_EQ; + + instr = instr_create(ctx, 0, OPC_BR); + push_branch(ctx, instr); +} + +static void +trans_else(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + + /* for first half of if/else/endif, generate a jump past the else: */ + instr = instr_create(ctx, 0, OPC_JUMP); + + pop_branch(ctx); + push_branch(ctx, instr); +} + +static void +trans_endif(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + pop_branch(ctx); +} + +/* + * Handlers for TGSI instructions which do have 1:1 mapping to native + * instructions: + */ + +static void +instr_cat0(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + instr_create(ctx, 0, t->opc); +} + +static void +instr_cat1(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src = &inst->Src[0].Register; + + /* mov instructions can't handle a negate on src: */ + if (src->Negate) { + struct tgsi_src_register constval; + struct ir3_instruction *instr; + + /* since right now, we are using uniformly either TYPE_F16 or + * TYPE_F32, and we don't utilize the conversion possibilities + * of mov instructions, we can get away with substituting an + * add.f which can handle negate. Might need to revisit this + * in the future if we start supporting widening/narrowing or + * conversion to/from integer.. 
+ */ + instr = instr_create(ctx, 2, OPC_ADD_F); + get_immediate(ctx, &constval, fui(0.0)); + vectorize(ctx, instr, dst, 2, src, 0, &constval, 0); + } else { + create_mov(ctx, dst, src); + /* create_mov() generates vector sequence, so no vectorize() */ + } + put_dst(ctx, inst, dst); +} + +static void +instr_cat2(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src0 = &inst->Src[0].Register; + struct tgsi_src_register *src1 = &inst->Src[1].Register; + struct ir3_instruction *instr; + unsigned src0_flags = 0, src1_flags = 0; + + switch (t->tgsi_opc) { + case TGSI_OPCODE_ABS: + src0_flags = IR3_REG_ABS; + break; + case TGSI_OPCODE_SUB: + src1_flags = IR3_REG_NEGATE; + break; + } + + switch (t->opc) { + case OPC_ABSNEG_F: + case OPC_ABSNEG_S: + case OPC_CLZ_B: + case OPC_CLZ_S: + case OPC_SIGN_F: + case OPC_FLOOR_F: + case OPC_CEIL_F: + case OPC_RNDNE_F: + case OPC_RNDAZ_F: + case OPC_TRUNC_F: + case OPC_NOT_B: + case OPC_BFREV_B: + case OPC_SETRM: + case OPC_CBITS_B: + /* these only have one src reg */ + instr = instr_create(ctx, 2, t->opc); + vectorize(ctx, instr, dst, 1, src0, src0_flags); + break; + default: + if (is_const(src0) && is_const(src1)) + src0 = get_unconst(ctx, src0); + + instr = instr_create(ctx, 2, t->opc); + vectorize(ctx, instr, dst, 2, src0, src0_flags, + src1, src1_flags); + break; + } + + put_dst(ctx, inst, dst); +} + +static void +instr_cat3(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src0 = &inst->Src[0].Register; + struct tgsi_src_register *src1 = &inst->Src[1].Register; + struct ir3_instruction *instr; + + /* in particular, can't handle const for src1 for cat3.. + * for mad, we can swap first two src's if needed: + */ + if (is_rel_or_const(src1)) { + if (is_mad(t->opc) && !is_rel_or_const(src0)) { + struct tgsi_src_register *tmp; + tmp = src0; + src0 = src1; + src1 = tmp; + } else { + src1 = get_unconst(ctx, src1); + } + } + + instr = instr_create(ctx, 3, + ctx->so->key.half_precision ? t->hopc : t->opc); + vectorize(ctx, instr, dst, 3, src0, 0, src1, 0, + &inst->Src[2].Register, 0); + put_dst(ctx, inst, dst); +} + +static void +instr_cat4(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src = &inst->Src[0].Register; + struct ir3_instruction *instr; + unsigned i, n; + + /* seems like blob compiler avoids const as src.. */ + if (is_const(src)) + src = get_unconst(ctx, src); + + /* worst case: */ + add_nop(ctx, 6); + + /* we need to replicate into each component: */ + for (i = 0, n = 0; i < 4; i++) { + if (dst->WriteMask & (1 << i)) { + if (n++) + add_nop(ctx, 1); + instr = instr_create(ctx, 4, t->opc); + add_dst_reg(ctx, instr, dst, i); + add_src_reg(ctx, instr, src, src->SwizzleX); + } + } + + regmask_set(&ctx->needs_ss, instr->regs[0]); + put_dst(ctx, inst, dst); +} + +static const struct instr_translater translaters[TGSI_OPCODE_LAST] = { +#define INSTR(n, f, ...) 
\ + [TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ } + + INSTR(MOV, instr_cat1), + INSTR(RCP, instr_cat4, .opc = OPC_RCP), + INSTR(RSQ, instr_cat4, .opc = OPC_RSQ), + INSTR(SQRT, instr_cat4, .opc = OPC_SQRT), + INSTR(MUL, instr_cat2, .opc = OPC_MUL_F), + INSTR(ADD, instr_cat2, .opc = OPC_ADD_F), + INSTR(SUB, instr_cat2, .opc = OPC_ADD_F), + INSTR(MIN, instr_cat2, .opc = OPC_MIN_F), + INSTR(MAX, instr_cat2, .opc = OPC_MAX_F), + INSTR(MAD, instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16), + INSTR(TRUNC, instr_cat2, .opc = OPC_TRUNC_F), + INSTR(CLAMP, trans_clamp), + INSTR(FLR, instr_cat2, .opc = OPC_FLOOR_F), + INSTR(ROUND, instr_cat2, .opc = OPC_RNDNE_F), + INSTR(SSG, instr_cat2, .opc = OPC_SIGN_F), + INSTR(ARL, trans_arl), + INSTR(EX2, instr_cat4, .opc = OPC_EXP2), + INSTR(LG2, instr_cat4, .opc = OPC_LOG2), + INSTR(ABS, instr_cat2, .opc = OPC_ABSNEG_F), + INSTR(COS, instr_cat4, .opc = OPC_COS), + INSTR(SIN, instr_cat4, .opc = OPC_SIN), + INSTR(TEX, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TEX), + INSTR(TXP, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TXP), + INSTR(SGT, trans_cmp), + INSTR(SLT, trans_cmp), + INSTR(SGE, trans_cmp), + INSTR(SLE, trans_cmp), + INSTR(SNE, trans_cmp), + INSTR(SEQ, trans_cmp), + INSTR(CMP, trans_cmp), + INSTR(IF, trans_if), + INSTR(ELSE, trans_else), + INSTR(ENDIF, trans_endif), + INSTR(END, instr_cat0, .opc = OPC_END), + INSTR(KILL, instr_cat0, .opc = OPC_KILL), +}; + +static ir3_semantic +decl_semantic(const struct tgsi_declaration_semantic *sem) +{ + return ir3_semantic_name(sem->Name, sem->Index); +} + +static int +decl_in(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl) +{ + struct ir3_shader_variant *so = ctx->so; + unsigned base = ctx->base_reg[TGSI_FILE_INPUT]; + unsigned i, flags = 0; + int nop = 0; + + /* I don't think we should get frag shader input without + * semantic info? Otherwise how do inputs get linked to + * vert outputs? 
+ */ + compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) || + decl->Declaration.Semantic); + + if (ctx->so->key.half_precision) + flags |= IR3_REG_HALF; + + for (i = decl->Range.First; i <= decl->Range.Last; i++) { + unsigned n = so->inputs_count++; + unsigned r = regid(i + base, 0); + unsigned ncomp; + + /* TODO use ctx->info.input_usage_mask[decl->Range.n] to figure out ncomp: */ + ncomp = 4; + + DBG("decl in -> r%d", i + base); // XXX + + compile_assert(ctx, n < ARRAY_SIZE(so->inputs)); + + so->inputs[n].semantic = decl_semantic(&decl->Semantic); + so->inputs[n].compmask = (1 << ncomp) - 1; + so->inputs[n].ncomp = ncomp; + so->inputs[n].regid = r; + so->inputs[n].inloc = ctx->next_inloc; + so->inputs[n].bary = true; /* all that is supported */ + ctx->next_inloc += ncomp; + + so->total_in += ncomp; + + /* for frag shaders, we need to generate the corresponding bary instr: */ + if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { + unsigned j; + + for (j = 0; j < ncomp; j++) { + struct ir3_instruction *instr; + struct ir3_register *dst; + + instr = instr_create(ctx, 2, OPC_BARY_F); + + /* dst register: */ + dst = ir3_reg_create(instr, r + j, flags); + ctx->last_input = dst; + + /* input position: */ + ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = + so->inputs[n].inloc + j - 8; + + /* input base (always r0.xy): */ + ir3_reg_create(instr, regid(0,0), 0)->wrmask = 0x3; + } + + nop = 6; + } + } + + return nop; +} + +static void +decl_out(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl) +{ + struct ir3_shader_variant *so = ctx->so; + unsigned base = ctx->base_reg[TGSI_FILE_OUTPUT]; + unsigned comp = 0; + unsigned name = decl->Semantic.Name; + unsigned i; + + compile_assert(ctx, decl->Declaration.Semantic); // TODO is this ever not true? 
+ + DBG("decl out[%d] -> r%d", name, decl->Range.First + base); // XXX + + if (ctx->type == TGSI_PROCESSOR_VERTEX) { + switch (name) { + case TGSI_SEMANTIC_POSITION: + so->writes_pos = true; + break; + case TGSI_SEMANTIC_PSIZE: + so->writes_psize = true; + break; + case TGSI_SEMANTIC_COLOR: + case TGSI_SEMANTIC_BCOLOR: + case TGSI_SEMANTIC_GENERIC: + case TGSI_SEMANTIC_FOG: + case TGSI_SEMANTIC_TEXCOORD: + break; + default: + compile_error(ctx, "unknown VS semantic name: %s\n", + tgsi_semantic_names[name]); + } + } else { + switch (name) { + case TGSI_SEMANTIC_POSITION: + comp = 2; /* tgsi will write to .z component */ + so->writes_pos = true; + break; + case TGSI_SEMANTIC_COLOR: + break; + default: + compile_error(ctx, "unknown FS semantic name: %s\n", + tgsi_semantic_names[name]); + } + } + + for (i = decl->Range.First; i <= decl->Range.Last; i++) { + unsigned n = so->outputs_count++; + compile_assert(ctx, n < ARRAY_SIZE(so->outputs)); + so->outputs[n].semantic = decl_semantic(&decl->Semantic); + so->outputs[n].regid = regid(i + base, comp); + } +} + +static void +decl_samp(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl) +{ + ctx->so->has_samp = true; +} + +static void +compile_instructions(struct ir3_compile_context *ctx) +{ + struct ir3 *ir = ctx->ir; + int nop = 0; + + while (!tgsi_parse_end_of_tokens(&ctx->parser)) { + tgsi_parse_token(&ctx->parser); + + switch (ctx->parser.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_DECLARATION: { + struct tgsi_full_declaration *decl = + &ctx->parser.FullToken.FullDeclaration; + if (decl->Declaration.File == TGSI_FILE_OUTPUT) { + decl_out(ctx, decl); + } else if (decl->Declaration.File == TGSI_FILE_INPUT) { + nop = decl_in(ctx, decl); + } else if (decl->Declaration.File == TGSI_FILE_SAMPLER) { + decl_samp(ctx, decl); + } + break; + } + case TGSI_TOKEN_TYPE_IMMEDIATE: { + /* TODO: if we know the immediate is small enough, and only + * used with instructions that can embed an immediate, we + * can skip this: + */ + struct tgsi_full_immediate *imm = + &ctx->parser.FullToken.FullImmediate; + unsigned n = ctx->so->immediates_count++; + memcpy(ctx->so->immediates[n].val, imm->u, 16); + break; + } + case TGSI_TOKEN_TYPE_INSTRUCTION: { + struct tgsi_full_instruction *inst = + &ctx->parser.FullToken.FullInstruction; + unsigned opc = inst->Instruction.Opcode; + const struct instr_translater *t = &translaters[opc]; + + add_nop(ctx, nop); + nop = 0; + + if (t->fxn) { + t->fxn(t, ctx, inst); + ctx->num_internal_temps = 0; + } else { + compile_error(ctx, "unknown TGSI opc: %s\n", + tgsi_get_opcode_name(opc)); + } + + switch (inst->Instruction.Saturate) { + case TGSI_SAT_ZERO_ONE: + create_clamp_imm(ctx, &inst->Dst[0].Register, + fui(0.0), fui(1.0)); + break; + case TGSI_SAT_MINUS_PLUS_ONE: + create_clamp_imm(ctx, &inst->Dst[0].Register, + fui(-1.0), fui(1.0)); + break; + } + + break; + } + default: + break; + } + } + + if (ir->instrs_count > 0) + ir->instrs[0]->flags |= IR3_INSTR_SS | IR3_INSTR_SY; + + if (ctx->last_input) + ctx->last_input->flags |= IR3_REG_EI; + + handle_last_rel(ctx); +} + +int +ir3_compile_shader_old(struct ir3_shader_variant *so, + const struct tgsi_token *tokens, struct ir3_shader_key key) +{ + struct ir3_compile_context ctx; + + assert(!so->ir); + + so->ir = ir3_create(); + + assert(so->ir); + + if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) + return -1; + + compile_instructions(&ctx); + + compile_free(&ctx); + + return 0; +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c 
b/src/gallium/drivers/freedreno/ir3/ir3_cp.c new file mode 100644 index 00000000000..73c2a27c6eb --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c @@ -0,0 +1,158 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2014 Rob Clark + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark + */ + +#include "ir3.h" + +/* + * Copy Propagate: + * + * TODO probably want some sort of visitor sort of interface to + * avoid duplicating the same graph traversal logic everywhere.. + * + */ + +static void block_cp(struct ir3_block *block); +static struct ir3_instruction * instr_cp(struct ir3_instruction *instr, bool keep); + +static bool is_eligible_mov(struct ir3_instruction *instr) +{ + if ((instr->category == 1) && + (instr->cat1.src_type == instr->cat1.dst_type)) { + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src = instr->regs[1]; + if (dst->flags & IR3_REG_ADDR) + return false; + if ((src->flags & IR3_REG_SSA) && + /* TODO: propagate abs/neg modifiers if possible */ + !(src->flags & (IR3_REG_ABS | IR3_REG_NEGATE | IR3_REG_RELATIV))) + return true; + } + return false; +} + +static void walk_children(struct ir3_instruction *instr, bool keep) +{ + unsigned i; + + /* walk down the graph from each src: */ + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *src = instr->regs[i]; + if (src->flags & IR3_REG_SSA) + src->instr = instr_cp(src->instr, keep); + } +} + +static struct ir3_instruction * +instr_cp_fanin(struct ir3_instruction *instr) +{ + unsigned i; + + /* we need to handle fanin specially, to detect cases + * when we need to keep a mov + */ + + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *src = instr->regs[i]; + if (src->flags & IR3_REG_SSA) { + struct ir3_instruction *cand = + instr_cp(src->instr, false); + + /* if the candidate is a fanout, then keep + * the move. + * + * This is a bit, um, fragile, but it should + * catch the extra mov's that the front-end + * puts in for us already in these cases. 
+ */ + if (is_meta(cand) && (cand->opc == OPC_META_FO)) + cand = instr_cp(src->instr, true); + + src->instr = cand; + } + } + + walk_children(instr, false); + + return instr; + +} + +static struct ir3_instruction * +instr_cp(struct ir3_instruction *instr, bool keep) +{ + /* if we've already visited this instruction, bail now: */ + if (ir3_instr_check_mark(instr)) + return instr; + + if (is_meta(instr) && (instr->opc == OPC_META_FI)) + return instr_cp_fanin(instr); + + if (is_eligible_mov(instr) && !keep) { + struct ir3_register *src = instr->regs[1]; + return instr_cp(src->instr, false); + } + + walk_children(instr, false); + + return instr; +} + +static void block_cp(struct ir3_block *block) +{ + unsigned i, j; + + for (i = 0; i < block->noutputs; i++) { + if (block->outputs[i]) { + struct ir3_instruction *out = + instr_cp(block->outputs[i], false); + + /* To deal with things like this: + * + * 43: MOV OUT[2], TEMP[5] + * 44: MOV OUT[0], TEMP[5] + * + * we need to ensure that no two outputs point to + * the same instruction + */ + for (j = 0; j < i; j++) { + if (block->outputs[j] == out) { + out = instr_cp(block->outputs[i], true); + break; + } + } + + block->outputs[i] = out; + } + } +} + +void ir3_block_cp(struct ir3_block *block) +{ + ir3_clear_mark(block->shader); + block_cp(block); +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c new file mode 100644 index 00000000000..dcc0362f0c8 --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c @@ -0,0 +1,159 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2014 Rob Clark + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark + */ + +#include "util/u_math.h" + +#include "ir3.h" + +/* + * Instruction Depth: + * + * Calculates weighted instruction depth, ie. the sum of # of needed + * instructions plus delay slots back to original input (ie INPUT or + * CONST). That is to say, an instructions depth is: + * + * depth(instr) { + * d = 0; + * // for each src register: + * foreach (src in instr->regs[1..n]) + * d = max(d, delayslots(src->instr, n) + depth(src->instr)); + * return d + 1; + * } + * + * After an instruction's depth is calculated, it is inserted into the + * blocks depth sorted list, which is used by the scheduling pass. 
+ */ + +/* calculate required # of delay slots between the instruction that + * assigns a value and the one that consumes + */ +int ir3_delayslots(struct ir3_instruction *assigner, + struct ir3_instruction *consumer, unsigned n) +{ + /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal + * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch + * handled with sync bits + */ + + if (is_meta(assigner)) + return 0; + + if (writes_addr(assigner)) + return 6; + + /* handled via sync flags: */ + if (is_sfu(assigner) || is_tex(assigner)) + return 0; + + /* assigner must be alu: */ + if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer)) { + return 6; + } else if ((consumer->category == 3) && + is_mad(consumer->opc) && (n == 2)) { + /* special case, 3rd src to cat3 not required on first cycle */ + return 1; + } else { + return 3; + } +} + +static void insert_by_depth(struct ir3_instruction *instr) +{ + struct ir3_block *block = instr->block; + struct ir3_instruction *n = block->head; + struct ir3_instruction *p = NULL; + + while (n && (n != instr) && (n->depth > instr->depth)) { + p = n; + n = n->next; + } + + instr->next = n; + if (p) + p->next = instr; + else + block->head = instr; +} + +static void ir3_instr_depth(struct ir3_instruction *instr) +{ + unsigned i; + + /* if we've already visited this instruction, bail now: */ + if (ir3_instr_check_mark(instr)) + return; + + instr->depth = 0; + + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *src = instr->regs[i]; + if (src->flags & IR3_REG_SSA) { + unsigned sd; + + /* visit child to compute it's depth: */ + ir3_instr_depth(src->instr); + + sd = ir3_delayslots(src->instr, instr, i-1) + + src->instr->depth; + + instr->depth = MAX2(instr->depth, sd); + } + } + + /* meta-instructions don't add cycles, other than PHI.. which + * might translate to a real instruction.. + * + * well, not entirely true, fan-in/out, etc might need to need + * to generate some extra mov's in edge cases, etc.. probably + * we might want to do depth calculation considering the worst + * case for these?? 
+ */ + if (!is_meta(instr)) + instr->depth++; + + insert_by_depth(instr); +} + +void ir3_block_depth(struct ir3_block *block) +{ + unsigned i; + + block->head = NULL; + + ir3_clear_mark(block->shader); + for (i = 0; i < block->noutputs; i++) + if (block->outputs[i]) + ir3_instr_depth(block->outputs[i]); + + /* at this point, any unvisited input is unused: */ + for (i = 0; i < block->ninputs; i++) { + struct ir3_instruction *in = block->inputs[i]; + if (in && !ir3_instr_check_mark(in)) + block->inputs[i] = NULL; + } +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_dump.c b/src/gallium/drivers/freedreno/ir3/ir3_dump.c new file mode 100644 index 00000000000..1a6f49d51cd --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3_dump.c @@ -0,0 +1,425 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2014 Rob Clark + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark + */ + +#include + +#include "ir3.h" + +#define PTRID(x) ((unsigned long)(x)) + +struct ir3_dump_ctx { + FILE *f; + bool verbose; +}; + +static void dump_instr_name(struct ir3_dump_ctx *ctx, + struct ir3_instruction *instr) +{ + /* for debugging: */ + if (ctx->verbose) { +#ifdef DEBUG + fprintf(ctx->f, "%04u:", instr->serialno); +#endif + fprintf(ctx->f, "%03u: ", instr->depth); + } + + if (instr->flags & IR3_INSTR_SY) + fprintf(ctx->f, "(sy)"); + if (instr->flags & IR3_INSTR_SS) + fprintf(ctx->f, "(ss)"); + + if (is_meta(instr)) { + switch(instr->opc) { + case OPC_META_PHI: + fprintf(ctx->f, "Φ"); + break; + case OPC_META_DEREF: + fprintf(ctx->f, "(*)"); + break; + default: + /* shouldn't hit here.. 
just for debugging: */ + switch (instr->opc) { + case OPC_META_INPUT: fprintf(ctx->f, "_meta:in"); break; + case OPC_META_OUTPUT: fprintf(ctx->f, "_meta:out"); break; + case OPC_META_FO: fprintf(ctx->f, "_meta:fo"); break; + case OPC_META_FI: fprintf(ctx->f, "_meta:fi"); break; + case OPC_META_FLOW: fprintf(ctx->f, "_meta:flow"); break; + + default: fprintf(ctx->f, "_meta:%d", instr->opc); break; + } + break; + } + } else if (instr->category == 1) { + static const char *type[] = { + [TYPE_F16] = "f16", + [TYPE_F32] = "f32", + [TYPE_U16] = "u16", + [TYPE_U32] = "u32", + [TYPE_S16] = "s16", + [TYPE_S32] = "s32", + [TYPE_U8] = "u8", + [TYPE_S8] = "s8", + }; + if (instr->cat1.src_type == instr->cat1.dst_type) + fprintf(ctx->f, "mov"); + else + fprintf(ctx->f, "cov"); + fprintf(ctx->f, ".%s%s", type[instr->cat1.src_type], type[instr->cat1.dst_type]); + } else { + fprintf(ctx->f, "%s", ir3_instr_name(instr)); + if (instr->flags & IR3_INSTR_3D) + fprintf(ctx->f, ".3d"); + if (instr->flags & IR3_INSTR_A) + fprintf(ctx->f, ".a"); + if (instr->flags & IR3_INSTR_O) + fprintf(ctx->f, ".o"); + if (instr->flags & IR3_INSTR_P) + fprintf(ctx->f, ".p"); + if (instr->flags & IR3_INSTR_S) + fprintf(ctx->f, ".s"); + if (instr->flags & IR3_INSTR_S2EN) + fprintf(ctx->f, ".s2en"); + } +} + +static void dump_reg_name(struct ir3_dump_ctx *ctx, + struct ir3_register *reg) +{ + if ((reg->flags & IR3_REG_ABS) && (reg->flags & IR3_REG_NEGATE)) + fprintf(ctx->f, "(absneg)"); + else if (reg->flags & IR3_REG_NEGATE) + fprintf(ctx->f, "(neg)"); + else if (reg->flags & IR3_REG_ABS) + fprintf(ctx->f, "(abs)"); + + if (reg->flags & IR3_REG_IMMED) { + fprintf(ctx->f, "imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val); + } else if (reg->flags & IR3_REG_SSA) { + if (ctx->verbose) { + fprintf(ctx->f, "_["); + dump_instr_name(ctx, reg->instr); + fprintf(ctx->f, "]"); + } + } else { + if (reg->flags & IR3_REG_HALF) + fprintf(ctx->f, "h"); + if (reg->flags & IR3_REG_CONST) + fprintf(ctx->f, "c%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]); + else + fprintf(ctx->f, "r%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]); + } +} + +static void ir3_instr_dump(struct ir3_dump_ctx *ctx, + struct ir3_instruction *instr); +static void ir3_block_dump(struct ir3_dump_ctx *ctx, + struct ir3_block *block, const char *name); + +static void dump_instr(struct ir3_dump_ctx *ctx, + struct ir3_instruction *instr) +{ + /* if we've already visited this instruction, bail now: */ + if (ir3_instr_check_mark(instr)) + return; + + /* some meta-instructions need to be handled specially: */ + if (is_meta(instr)) { + if ((instr->opc == OPC_META_FO) || + (instr->opc == OPC_META_FI)) { + unsigned i; + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + if (reg->flags & IR3_REG_SSA) + dump_instr(ctx, reg->instr); + } + } else if (instr->opc == OPC_META_FLOW) { + struct ir3_register *reg = instr->regs[1]; + ir3_block_dump(ctx, instr->flow.if_block, "if"); + if (instr->flow.else_block) + ir3_block_dump(ctx, instr->flow.else_block, "else"); + if (reg->flags & IR3_REG_SSA) + dump_instr(ctx, reg->instr); + } else if ((instr->opc == OPC_META_PHI) || + (instr->opc == OPC_META_DEREF)) { + /* treat like a normal instruction: */ + ir3_instr_dump(ctx, instr); + } + } else { + ir3_instr_dump(ctx, instr); + } +} + +/* arrarraggh! 
if link is to something outside of the current block, we + * need to defer emitting the link until the end of the block, since the + * edge triggers pre-creation of the node it links to inside the cluster, + * even though it is meant to be outside.. + */ +static struct { + char buf[40960]; + unsigned n; +} edge_buf; + +/* helper to print or defer: */ +static void printdef(struct ir3_dump_ctx *ctx, + bool defer, const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + if (defer) { + unsigned n = edge_buf.n; + n += vsnprintf(&edge_buf.buf[n], sizeof(edge_buf.buf) - n, + fmt, ap); + edge_buf.n = n; + } else { + vfprintf(ctx->f, fmt, ap); + } + va_end(ap); +} + +static void dump_link2(struct ir3_dump_ctx *ctx, + struct ir3_instruction *instr, const char *target, bool defer) +{ + /* some meta-instructions need to be handled specially: */ + if (is_meta(instr)) { + if (instr->opc == OPC_META_INPUT) { + printdef(ctx, defer, "input%lx::w -> %s", + PTRID(instr->inout.block), + instr->regs[0]->num, target); + } else if (instr->opc == OPC_META_FO) { + struct ir3_register *reg = instr->regs[1]; + dump_link2(ctx, reg->instr, target, defer); + printdef(ctx, defer, "[label=\".%c\"]", + "xyzw"[instr->fo.off & 0x3]); + } else if (instr->opc == OPC_META_FI) { + unsigned i; + + /* recursively dump all parents and links */ + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + if (reg->flags & IR3_REG_SSA) { + dump_link2(ctx, reg->instr, target, defer); + printdef(ctx, defer, "[label=\".%c\"]", + "xyzw"[(i - 1) & 0x3]); + } + } + } else if (instr->opc == OPC_META_OUTPUT) { + printdef(ctx, defer, "output%lx::w -> %s", + PTRID(instr->inout.block), + instr->regs[0]->num, target); + } else if ((instr->opc == OPC_META_PHI) || + (instr->opc == OPC_META_DEREF)) { + /* treat like a normal instruction: */ + printdef(ctx, defer, "instr%lx: -> %s", PTRID(instr), target); + } + } else { + printdef(ctx, defer, "instr%lx: -> %s", PTRID(instr), target); + } +} + +static void dump_link(struct ir3_dump_ctx *ctx, + struct ir3_instruction *instr, + struct ir3_block *block, const char *target) +{ + bool defer = instr->block != block; + dump_link2(ctx, instr, target, defer); + printdef(ctx, defer, "\n"); +} + +static struct ir3_register *follow_flow(struct ir3_register *reg) +{ + if (reg->flags & IR3_REG_SSA) { + struct ir3_instruction *instr = reg->instr; + /* go with the flow.. 
*/ + if (is_meta(instr) && (instr->opc == OPC_META_FLOW)) + return instr->regs[1]; + } + return reg; +} + +static void ir3_instr_dump(struct ir3_dump_ctx *ctx, + struct ir3_instruction *instr) +{ + unsigned i; + + fprintf(ctx->f, "instr%lx [shape=record,style=filled,fillcolor=lightgrey,label=\"{", + PTRID(instr)); + dump_instr_name(ctx, instr); + + /* destination register: */ + fprintf(ctx->f, "|"); + + /* source register(s): */ + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = follow_flow(instr->regs[i]); + + fprintf(ctx->f, "|"); + + if (reg->flags & IR3_REG_SSA) + fprintf(ctx->f, " ", (i - 1)); + + dump_reg_name(ctx, reg); + } + + fprintf(ctx->f, "}\"];\n"); + + /* and recursively dump dependent instructions: */ + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + char target[32]; /* link target */ + + if (!(reg->flags & IR3_REG_SSA)) + continue; + + snprintf(target, sizeof(target), "instr%lx:", + PTRID(instr), (i - 1)); + + dump_instr(ctx, reg->instr); + dump_link(ctx, follow_flow(reg)->instr, instr->block, target); + } +} + +static void ir3_block_dump(struct ir3_dump_ctx *ctx, + struct ir3_block *block, const char *name) +{ + unsigned i, n; + + n = edge_buf.n; + + fprintf(ctx->f, "subgraph cluster%lx {\n", PTRID(block)); + fprintf(ctx->f, "label=\"%s\";\n", name); + + /* draw inputs: */ + fprintf(ctx->f, "input%lx [shape=record,label=\"inputs", PTRID(block)); + for (i = 0; i < block->ninputs; i++) + if (block->inputs[i]) + fprintf(ctx->f, "| i%u.%c", i, (i >> 2), "xyzw"[i & 0x3]); + fprintf(ctx->f, "\"];\n"); + + /* draw instruction graph: */ + for (i = 0; i < block->noutputs; i++) + dump_instr(ctx, block->outputs[i]); + + /* draw outputs: */ + fprintf(ctx->f, "output%lx [shape=record,label=\"outputs", PTRID(block)); + for (i = 0; i < block->noutputs; i++) + fprintf(ctx->f, "| o%u.%c", i, (i >> 2), "xyzw"[i & 0x3]); + fprintf(ctx->f, "\"];\n"); + + /* and links to outputs: */ + for (i = 0; i < block->noutputs; i++) { + char target[32]; /* link target */ + + /* NOTE: there could be outputs that are never assigned, + * so skip them + */ + if (!block->outputs[i]) + continue; + + snprintf(target, sizeof(target), "output%lx::e", + PTRID(block), i); + + dump_link(ctx, block->outputs[i], block, target); + } + + fprintf(ctx->f, "}\n"); + + /* and links to inputs: */ + if (block->parent) { + for (i = 0; i < block->ninputs; i++) { + char target[32]; /* link target */ + + if (!block->inputs[i]) + continue; + + dump_instr(ctx, block->inputs[i]); + + snprintf(target, sizeof(target), "input%lx::e", + PTRID(block), i); + + dump_link(ctx, block->inputs[i], block, target); + } + } + + /* dump deferred edges: */ + if (edge_buf.n > n) { + fprintf(ctx->f, "%*s", edge_buf.n - n, &edge_buf.buf[n]); + edge_buf.n = n; + } +} + +void ir3_dump(struct ir3 *shader, const char *name, + struct ir3_block *block /* XXX maybe 'block' ptr should move to ir3? */, + FILE *f) +{ + struct ir3_dump_ctx ctx = { + .f = f, + }; + ir3_clear_mark(shader); + fprintf(ctx.f, "digraph G {\n"); + fprintf(ctx.f, "rankdir=RL;\n"); + fprintf(ctx.f, "nodesep=0.25;\n"); + fprintf(ctx.f, "ranksep=1.5;\n"); + ir3_block_dump(&ctx, block, name); + fprintf(ctx.f, "}\n"); +} + +/* + * For Debugging: + */ + +void +ir3_dump_instr_single(struct ir3_instruction *instr) +{ + struct ir3_dump_ctx ctx = { + .f = stdout, + .verbose = true, + }; + unsigned i; + + dump_instr_name(&ctx, instr); + for (i = 0; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + printf(i ? 
", " : " "); + dump_reg_name(&ctx, reg); + } + printf("\n"); +} + +void +ir3_dump_instr_list(struct ir3_instruction *instr) +{ + unsigned n = 0; + + while (instr) { + ir3_dump_instr_single(instr); + if (!is_meta(instr)) + n++; + instr = instr->next; + } + printf("%u instructions\n", n); +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_flatten.c b/src/gallium/drivers/freedreno/ir3/ir3_flatten.c new file mode 100644 index 00000000000..9389227034c --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3_flatten.c @@ -0,0 +1,155 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2014 Rob Clark + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark + */ + +#include + +#include "ir3.h" + +/* + * Flatten: flatten out legs of if/else, etc + * + * TODO probably should use some heuristic to decide to not flatten + * if one side of the other is too large / deeply nested / whatever? + */ + +struct ir3_flatten_ctx { + struct ir3_block *block; + unsigned cnt; +}; + +static struct ir3_register *unwrap(struct ir3_register *reg) +{ + + if (reg->flags & IR3_REG_SSA) { + struct ir3_instruction *instr = reg->instr; + if (is_meta(instr)) { + switch (instr->opc) { + case OPC_META_OUTPUT: + case OPC_META_FLOW: + if (instr->regs_count > 1) + return instr->regs[1]; + return NULL; + default: + break; + } + } + } + return reg; +} + +static void ir3_instr_flatten(struct ir3_flatten_ctx *ctx, + struct ir3_instruction *instr) +{ + unsigned i; + + /* if we've already visited this instruction, bail now: */ + if (ir3_instr_check_mark(instr)) + return; + + instr->block = ctx->block; + + /* TODO: maybe some threshold to decide whether to + * flatten or not?? + */ + if (is_meta(instr)) { + if (instr->opc == OPC_META_PHI) { + struct ir3_register *cond, *t, *f; + + cond = unwrap(instr->regs[1]); + t = unwrap(instr->regs[2]); /* true val */ + f = unwrap(instr->regs[3]); /* false val */ + + /* must have cond, but t or f may be null if only written + * one one side of the if/else (in which case we can just + * convert the PHI to a simple move). 
+ */ + assert(cond); + assert(t || f); + + if (t && f) { + /* convert the PHI instruction to sel.{b16,b32} */ + instr->category = 3; + + /* instruction type based on dst size: */ + if (instr->regs[0]->flags & IR3_REG_HALF) + instr->opc = OPC_SEL_B16; + else + instr->opc = OPC_SEL_B32; + + instr->regs[1] = t; + instr->regs[2] = cond; + instr->regs[3] = f; + } else { + /* convert to simple mov: */ + instr->category = 1; + instr->cat1.dst_type = TYPE_F32; + instr->cat1.src_type = TYPE_F32; + instr->regs_count = 2; + instr->regs[1] = t ? t : f; + } + + ctx->cnt++; + } else if ((instr->opc == OPC_META_INPUT) && + (instr->regs_count == 2)) { + type_t ftype; + + if (instr->regs[0]->flags & IR3_REG_HALF) + ftype = TYPE_F16; + else + ftype = TYPE_F32; + + /* convert meta:input to mov: */ + instr->category = 1; + instr->cat1.src_type = ftype; + instr->cat1.dst_type = ftype; + } + } + + /* recursively visit children: */ + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *src = instr->regs[i]; + if (src->flags & IR3_REG_SSA) + ir3_instr_flatten(ctx, src->instr); + } +} + +/* return >= 0 is # of phi's flattened, < 0 is error */ +int ir3_block_flatten(struct ir3_block *block) +{ + struct ir3_flatten_ctx ctx = { + .block = block, + }; + unsigned i; + + ir3_clear_mark(block->shader); + for(i = 0; i < block->noutputs; i++) + if (block->outputs[i]) + ir3_instr_flatten(&ctx, block->outputs[i]); + + return ctx.cnt; +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c new file mode 100644 index 00000000000..b916dd51393 --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c @@ -0,0 +1,790 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2014 Rob Clark + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark + */ + +#include "pipe/p_shader_tokens.h" +#include "util/u_math.h" + +#include "ir3.h" +#include "ir3_visitor.h" + +/* + * Register Assignment: + * + * NOTE: currently only works on a single basic block.. need to think + * about how multiple basic blocks are going to get scheduled. But + * I think I want to re-arrange how blocks work, ie. get rid of the + * block nesting thing.. + * + * NOTE: we could do register coalescing (eliminate moves) as part of + * the RA step.. OTOH I think we need to do scheduling before register + * assignment. 
And if we remove a mov that effects scheduling (unless + * we leave a placeholder nop, which seems lame), so I'm not really + * sure how practical this is to do both in a single stage. But OTOH + * I'm not really sure a sane way for the CP stage to realize when it + * cannot remove a mov due to multi-register constraints.. + * + */ + +struct ir3_ra_ctx { + struct ir3_block *block; + enum shader_t type; + bool half_precision; + bool frag_coord; + bool frag_face; + bool has_samp; + int cnt; + bool error; +}; + +/* sorta ugly way to retrofit half-precision support.. rather than + * passing extra param around, just OR in a high bit. All the low + * value arithmetic (ie. +/- offset within a contiguous vec4, etc) + * will continue to work as long as you don't underflow (and that + * would go badly anyways). + */ +#define REG_HALF 0x8000 + +struct ir3_ra_assignment { + int8_t off; /* offset of instruction dst within range */ + uint8_t num; /* number of components for the range */ +}; + +static void ra_assign(struct ir3_ra_ctx *ctx, + struct ir3_instruction *assigner, int num); +static struct ir3_ra_assignment ra_calc(struct ir3_instruction *instr); + +/* + * Register Allocation: + */ + +#define REG(n, wm, f) (struct ir3_register){ \ + .flags = (f), \ + .num = (n), \ + .wrmask = TGSI_WRITEMASK_ ## wm, \ + } + +/* check that the register exists, is a GPR and is not special (a0/p0) */ +static struct ir3_register * reg_check(struct ir3_instruction *instr, unsigned n) +{ + if ((n < instr->regs_count) && reg_gpr(instr->regs[n])) + return instr->regs[n]; + return NULL; +} + +static int output_base(struct ir3_ra_ctx *ctx) +{ + /* ugg, for fragment shader we need to have input at r0.x + * (or at least if there is a way to configure it, I can't + * see how because the blob driver always uses r0.x (ie. + * all zeros) + */ + if (ctx->type == SHADER_FRAGMENT) { + if (ctx->half_precision) + return ctx->frag_face ? 4 : 3; + return ctx->frag_coord ? 8 : 4; + } + return 0; +} + +/* live means read before written */ +static void compute_liveregs(struct ir3_ra_ctx *ctx, + struct ir3_instruction *instr, regmask_t *liveregs) +{ + struct ir3_block *block = instr->block; + regmask_t written; + unsigned i, j; + + regmask_init(liveregs); + regmask_init(&written); + + for (instr = instr->next; instr; instr = instr->next) { + struct ir3_register *r; + + if (is_meta(instr)) + continue; + + /* check first src's read: */ + for (j = 1; j < instr->regs_count; j++) { + r = reg_check(instr, j); + if (r) + regmask_set_if_not(liveregs, r, &written); + } + + /* then dst written (if assigned already): */ + if (instr->flags & IR3_INSTR_MARK) { + r = reg_check(instr, 0); + if (r) + regmask_set(&written, r); + } + } + + /* be sure to account for output registers too: */ + for (i = 0; i < block->noutputs; i++) { + struct ir3_register reg = REG(output_base(ctx) + i, X, 0); + regmask_set_if_not(liveregs, ®, &written); + } +} + +/* calculate registers that are clobbered before last use of 'assigner'. + * This needs to be done backwards, although it could possibly be + * combined into compute_liveregs(). (Ie. compute_liveregs() could + * reverse the list, then do this part backwards reversing the list + * again back to original order.) Otoh, probably I should try to + * construct a proper interference graph instead. + * + * XXX this need to follow the same recursion path that is used for + * to rename/assign registers (ie. ra_assign_src()).. 
this is a bit + * ugly right now, maybe refactor into node iterator sort of things + * that iterates nodes in the correct order? + */ +static bool compute_clobbers(struct ir3_ra_ctx *ctx, + struct ir3_instruction *instr, struct ir3_instruction *assigner, + regmask_t *liveregs) +{ + unsigned i; + bool live = false, was_live = false; + + if (instr == NULL) { + struct ir3_block *block = ctx->block; + + /* if at the end, check outputs: */ + for (i = 0; i < block->noutputs; i++) + if (block->outputs[i] == assigner) + return true; + return false; + } + + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + if ((reg->flags & IR3_REG_SSA) && (reg->instr == assigner)) { + if (is_meta(instr)) { + switch (instr->opc) { + case OPC_META_INPUT: + // TODO + assert(0); + break; + case OPC_META_FO: + case OPC_META_FI: + was_live |= compute_clobbers(ctx, instr->next, + instr, liveregs); + break; + default: + break; + } + } + live = true; + break; + } + } + + was_live |= compute_clobbers(ctx, instr->next, assigner, liveregs); + + if (was_live && (instr->regs_count > 0) && + (instr->flags & IR3_INSTR_MARK) && + !is_meta(instr)) + regmask_set(liveregs, instr->regs[0]); + + return live || was_live; +} + +static int find_available(regmask_t *liveregs, int size, bool half) +{ + unsigned i; + unsigned f = half ? IR3_REG_HALF : 0; + for (i = 0; i < MAX_REG - size; i++) { + if (!regmask_get(liveregs, ®(i, X, f))) { + unsigned start = i++; + for (; (i < MAX_REG) && ((i - start) < size); i++) + if (regmask_get(liveregs, ®(i, X, f))) + break; + if ((i - start) >= size) + return start; + } + } + assert(0); + return -1; +} + +static int alloc_block(struct ir3_ra_ctx *ctx, + struct ir3_instruction *instr, int size) +{ + if (!instr) { + /* special case, allocating shader outputs. At this + * point, nothing is allocated, just start the shader + * outputs at r0.x and let compute_liveregs() take + * care of the rest from here: + */ + return 0; + } else { + struct ir3_register *dst = instr->regs[0]; + regmask_t liveregs; + + compute_liveregs(ctx, instr, &liveregs); + + // XXX XXX XXX XXX XXX XXX XXX XXX XXX + // XXX hack.. maybe ra_calc should give us a list of + // instrs to compute_clobbers() on? + if (is_meta(instr) && (instr->opc == OPC_META_INPUT) && + (instr->regs_count == 1)) { + unsigned i, base = instr->regs[0]->num & ~0x3; + for (i = 0; i < 4; i++) { + struct ir3_instruction *in = ctx->block->inputs[base + i]; + if (in) + compute_clobbers(ctx, in->next, in, &liveregs); + } + } else + // XXX XXX XXX XXX XXX XXX XXX XXX XXX + compute_clobbers(ctx, instr->next, instr, &liveregs); + + return find_available(&liveregs, size, + !!(dst->flags & IR3_REG_HALF)); + } +} + +/* + * Constraint Calculation: + */ + +struct ra_calc_visitor { + struct ir3_visitor base; + struct ir3_ra_assignment a; +}; + +static inline struct ra_calc_visitor *ra_calc_visitor(struct ir3_visitor *v) +{ + return (struct ra_calc_visitor *)v; +} + +/* calculate register assignment for the instruction. If the register + * written by this instruction is required to be part of a range, to + * handle other (input/output/sam/bary.f/etc) contiguous register range + * constraints, that is calculated handled here. 
+ */ +static void ra_calc_dst(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_calc_visitor *c = ra_calc_visitor(v); + if (is_tex(instr)) { + c->a.off = 0; + c->a.num = 4; + } else { + c->a.off = 0; + c->a.num = 1; + } +} + +static void +ra_calc_dst_shader_input(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_calc_visitor *c = ra_calc_visitor(v); + struct ir3_block *block = instr->block; + struct ir3_register *dst = instr->regs[0]; + unsigned base = dst->num & ~0x3; + unsigned i, num = 0; + + assert(!(dst->flags & IR3_REG_IA)); + + /* check what input components we need: */ + for (i = 0; i < 4; i++) { + unsigned idx = base + i; + if ((idx < block->ninputs) && block->inputs[idx]) + num = i + 1; + } + + c->a.off = dst->num - base; + c->a.num = num; +} + +static void ra_calc_src_fanin(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_calc_visitor *c = ra_calc_visitor(v); + unsigned srcn = ir3_instr_regno(instr, reg) - 1; + c->a.off += srcn; + c->a.num += srcn; + c->a.num = MAX2(c->a.num, instr->regs_count - 1); +} + +static const struct ir3_visitor_funcs calc_visitor_funcs = { + .instr = ir3_visit_instr, + .dst_shader_input = ra_calc_dst_shader_input, + .dst_fanout = ra_calc_dst, + .dst_fanin = ra_calc_dst, + .dst = ra_calc_dst, + .src_fanout = ir3_visit_reg, + .src_fanin = ra_calc_src_fanin, + .src = ir3_visit_reg, +}; + +static struct ir3_ra_assignment ra_calc(struct ir3_instruction *assigner) +{ + struct ra_calc_visitor v = { + .base.funcs = &calc_visitor_funcs, + }; + + ir3_visit_instr(&v.base, assigner); + + return v.a; +} + +/* + * Register Assignment: + */ + +struct ra_assign_visitor { + struct ir3_visitor base; + struct ir3_ra_ctx *ctx; + int num; +}; + +static inline struct ra_assign_visitor *ra_assign_visitor(struct ir3_visitor *v) +{ + return (struct ra_assign_visitor *)v; +} + +static type_t half_type(type_t type) +{ + switch (type) { + case TYPE_F32: return TYPE_F16; + case TYPE_U32: return TYPE_U16; + case TYPE_S32: return TYPE_S16; + /* instructions may already be fixed up: */ + case TYPE_F16: + case TYPE_U16: + case TYPE_S16: + return type; + default: + assert(0); + return ~0; + } +} + +/* some instructions need fix-up if dst register is half precision: */ +static void fixup_half_instr_dst(struct ir3_instruction *instr) +{ + switch (instr->category) { + case 1: /* move instructions */ + instr->cat1.dst_type = half_type(instr->cat1.dst_type); + break; + case 3: + switch (instr->opc) { + case OPC_MAD_F32: + instr->opc = OPC_MAD_F16; + break; + case OPC_SEL_B32: + instr->opc = OPC_SEL_B16; + break; + case OPC_SEL_S32: + instr->opc = OPC_SEL_S16; + break; + case OPC_SEL_F32: + instr->opc = OPC_SEL_F16; + break; + case OPC_SAD_S32: + instr->opc = OPC_SAD_S16; + break; + /* instructions may already be fixed up: */ + case OPC_MAD_F16: + case OPC_SEL_B16: + case OPC_SEL_S16: + case OPC_SEL_F16: + case OPC_SAD_S16: + break; + default: + assert(0); + break; + } + break; + case 5: + instr->cat5.type = half_type(instr->cat5.type); + break; + } +} +/* some instructions need fix-up if src register is half precision: */ +static void fixup_half_instr_src(struct ir3_instruction *instr) +{ + switch (instr->category) { + case 1: /* move instructions */ + instr->cat1.src_type = half_type(instr->cat1.src_type); + break; + } +} + +static void ra_assign_reg(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct 
ra_assign_visitor *a = ra_assign_visitor(v); + + if (is_flow(instr) && (instr->opc == OPC_KILL)) + return; + + reg->flags &= ~IR3_REG_SSA; + reg->num = a->num & ~REG_HALF; + + assert(reg->num >= 0); + + if (a->num & REG_HALF) { + reg->flags |= IR3_REG_HALF; + /* if dst reg being assigned, patch up the instr: */ + if (reg == instr->regs[0]) + fixup_half_instr_dst(instr); + else + fixup_half_instr_src(instr); + } +} + +static void ra_assign_dst_shader_input(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_assign_visitor *a = ra_assign_visitor(v); + unsigned i, base = reg->num & ~0x3; + int off = base - reg->num; + + ra_assign_reg(v, instr, reg); + reg->flags |= IR3_REG_IA; + + /* trigger assignment of all our companion input components: */ + for (i = 0; i < 4; i++) { + struct ir3_instruction *in = instr->block->inputs[i+base]; + if (in && is_meta(in) && (in->opc == OPC_META_INPUT)) + ra_assign(a->ctx, in, a->num + off + i); + } +} + +static void ra_assign_dst_fanout(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_assign_visitor *a = ra_assign_visitor(v); + struct ir3_register *src = instr->regs[1]; + ra_assign_reg(v, instr, reg); + if (src->flags & IR3_REG_SSA) + ra_assign(a->ctx, src->instr, a->num - instr->fo.off); +} + +static void ra_assign_src_fanout(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_assign_visitor *a = ra_assign_visitor(v); + ra_assign_reg(v, instr, reg); + ra_assign(a->ctx, instr, a->num + instr->fo.off); +} + + +static void ra_assign_src_fanin(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_assign_visitor *a = ra_assign_visitor(v); + unsigned j, srcn = ir3_instr_regno(instr, reg) - 1; + ra_assign_reg(v, instr, reg); + ra_assign(a->ctx, instr, a->num - srcn); + for (j = 1; j < instr->regs_count; j++) { + struct ir3_register *reg = instr->regs[j]; + if (reg->flags & IR3_REG_SSA) /* could be renamed already */ + ra_assign(a->ctx, reg->instr, a->num - srcn + j - 1); + } +} + +static const struct ir3_visitor_funcs assign_visitor_funcs = { + .instr = ir3_visit_instr, + .dst_shader_input = ra_assign_dst_shader_input, + .dst_fanout = ra_assign_dst_fanout, + .dst_fanin = ra_assign_reg, + .dst = ra_assign_reg, + .src_fanout = ra_assign_src_fanout, + .src_fanin = ra_assign_src_fanin, + .src = ra_assign_reg, +}; + +static void ra_assign(struct ir3_ra_ctx *ctx, + struct ir3_instruction *assigner, int num) +{ + struct ra_assign_visitor v = { + .base.funcs = &assign_visitor_funcs, + .ctx = ctx, + .num = num, + }; + + /* if we've already visited this instruction, bail now: */ + if (ir3_instr_check_mark(assigner)) { + debug_assert(assigner->regs[0]->num == (num & ~REG_HALF)); + if (assigner->regs[0]->num != (num & ~REG_HALF)) { + /* impossible situation, should have been resolved + * at an earlier stage by inserting extra mov's: + */ + ctx->error = true; + } + return; + } + + ir3_visit_instr(&v.base, assigner); +} + +/* + * + */ + +static void ir3_instr_ra(struct ir3_ra_ctx *ctx, + struct ir3_instruction *instr) +{ + struct ir3_register *dst; + unsigned num; + + /* skip over nop's */ + if (instr->regs_count == 0) + return; + + dst = instr->regs[0]; + + /* if we've already visited this instruction, bail now: */ + if (instr->flags & IR3_INSTR_MARK) + return; + + /* allocate register(s): */ + if (is_addr(instr)) { + num = instr->regs[2]->num; + } else if (reg_gpr(dst)) { + struct ir3_ra_assignment 
a; + a = ra_calc(instr); + num = alloc_block(ctx, instr, a.num) + a.off; + } else if (dst->flags & IR3_REG_ADDR) { + dst->flags &= ~IR3_REG_ADDR; + num = regid(REG_A0, 0) | REG_HALF; + } else { + /* predicate register (p0).. etc */ + return; + } + + ra_assign(ctx, instr, num); +} + +/* flatten into shader: */ +// XXX this should probably be somewhere else: +static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block) +{ + struct ir3_instruction *n; + struct ir3 *shader = block->shader; + struct ir3_instruction *end = + ir3_instr_create(block, 0, OPC_END); + struct ir3_instruction *last_input = NULL; + struct ir3_instruction *last_rel = NULL; + regmask_t needs_ss_war; /* write after read */ + regmask_t needs_ss; + regmask_t needs_sy; + + regmask_init(&needs_ss_war); + regmask_init(&needs_ss); + regmask_init(&needs_sy); + + shader->instrs_count = 0; + + for (n = block->head; n; n = n->next) { + struct ir3_register *reg; + unsigned i; + + if (is_meta(n)) + continue; + + for (i = 1; i < n->regs_count; i++) { + reg = n->regs[i]; + + if (reg_gpr(reg)) { + + /* TODO: we probably only need (ss) for alu + * instr consuming sfu result.. need to make + * some tests for both this and (sy).. + */ + if (regmask_get(&needs_ss, reg)) { + n->flags |= IR3_INSTR_SS; + regmask_init(&needs_ss); + } + + if (regmask_get(&needs_sy, reg)) { + n->flags |= IR3_INSTR_SY; + regmask_init(&needs_sy); + } + } + + /* TODO: is it valid to have address reg loaded from a + * relative src (ie. mova a0, c)? If so, the + * last_rel check below should be moved ahead of this: + */ + if (reg->flags & IR3_REG_RELATIV) + last_rel = n; + } + + if (n->regs_count > 0) { + reg = n->regs[0]; + if (regmask_get(&needs_ss_war, reg)) { + n->flags |= IR3_INSTR_SS; + regmask_init(&needs_ss_war); // ??? I assume? + } + + if (last_rel && (reg->num == regid(REG_A0, 0))) { + last_rel->flags |= IR3_INSTR_UL; + last_rel = NULL; + } + } + + /* cat5+ does not have an (ss) bit, if needed we need to + * insert a nop to carry the sync flag. Would be kinda + * clever if we were aware of this during scheduling, but + * this should be a pretty rare case: + */ + if ((n->flags & IR3_INSTR_SS) && (n->category >= 5)) { + struct ir3_instruction *nop; + nop = ir3_instr_create(block, 0, OPC_NOP); + nop->flags |= IR3_INSTR_SS; + n->flags &= ~IR3_INSTR_SS; + } + + /* need to be able to set (ss) on first instruction: */ + if ((shader->instrs_count == 0) && (n->category >= 5)) + ir3_instr_create(block, 0, OPC_NOP); + + if (is_nop(n) && shader->instrs_count) { + struct ir3_instruction *last = + shader->instrs[shader->instrs_count-1]; + if (is_nop(last) && (last->repeat < 5)) { + last->repeat++; + last->flags |= n->flags; + continue; + } + } + + shader->instrs[shader->instrs_count++] = n; + + if (is_sfu(n)) + regmask_set(&needs_ss, n->regs[0]); + + if (is_tex(n)) { + /* this ends up being the # of samp instructions.. but that + * is ok, everything else only cares whether it is zero or + * not. We do this here, rather than when we encounter a + * SAMP decl, because (especially in binning pass shader) + * the samp instruction(s) could get eliminated if the + * result is not used. 
+ */ + ctx->has_samp = true; + regmask_set(&needs_sy, n->regs[0]); + } + + /* both tex/sfu appear to not always immediately consume + * their src register(s): + */ + if (is_tex(n) || is_sfu(n)) { + for (i = 1; i < n->regs_count; i++) { + reg = n->regs[i]; + if (reg_gpr(reg)) + regmask_set(&needs_ss_war, reg); + } + } + + if (is_input(n)) + last_input = n; + } + + if (last_input) + last_input->regs[0]->flags |= IR3_REG_EI; + + if (last_rel) + last_rel->flags |= IR3_INSTR_UL; + + shader->instrs[shader->instrs_count++] = end; + + shader->instrs[0]->flags |= IR3_INSTR_SS | IR3_INSTR_SY; +} + +static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block) +{ + struct ir3_instruction *n; + + if (!block->parent) { + unsigned i, j; + int base, off = output_base(ctx); + + base = alloc_block(ctx, NULL, block->noutputs + off); + + if (ctx->half_precision) + base |= REG_HALF; + + for (i = 0; i < block->noutputs; i++) + if (block->outputs[i] && !is_kill(block->outputs[i])) + ra_assign(ctx, block->outputs[i], base + i + off); + + if (ctx->type == SHADER_FRAGMENT) { + i = 0; + if (ctx->frag_face) { + /* if we have frag_face, it gets hr0.x */ + ra_assign(ctx, block->inputs[i], REG_HALF | 0); + i += 4; + } + for (j = 0; i < block->ninputs; i++, j++) + if (block->inputs[i]) + ra_assign(ctx, block->inputs[i], (base & ~REG_HALF) + j); + } else { + for (i = 0; i < block->ninputs; i++) + if (block->inputs[i]) + ir3_instr_ra(ctx, block->inputs[i]); + } + } + + /* then loop over instruction list and assign registers: + */ + n = block->head; + while (n) { + ir3_instr_ra(ctx, n); + if (ctx->error) + return -1; + n = n->next; + } + + legalize(ctx, block); + + return 0; +} + +int ir3_block_ra(struct ir3_block *block, enum shader_t type, + bool half_precision, bool frag_coord, bool frag_face, + bool *has_samp) +{ + struct ir3_ra_ctx ctx = { + .block = block, + .type = type, + .half_precision = half_precision, + .frag_coord = frag_coord, + .frag_face = frag_face, + }; + int ret; + + ir3_clear_mark(block->shader); + ret = block_ra(&ctx, block); + *has_samp = ctx.has_samp; + + return ret; +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c new file mode 100644 index 00000000000..3ef67731926 --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c @@ -0,0 +1,401 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2014 Rob Clark + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark + */ + + +#include "util/u_math.h" + +#include "ir3.h" + +enum { + SCHEDULED = -1, + DELAYED = -2, +}; + +/* + * Instruction Scheduling: + * + * Using the depth sorted list from depth pass, attempt to recursively + * schedule deepest unscheduled path. The first instruction that cannot + * be scheduled, returns the required delay slots it needs, at which + * point we return back up to the top and attempt to schedule by next + * highest depth. After a sufficient number of instructions have been + * scheduled, return back to beginning of list and start again. If you + * reach the end of depth sorted list without being able to insert any + * instruction, insert nop's. Repeat until no more unscheduled + * instructions. + * + * There are a few special cases that need to be handled, since sched + * is currently independent of register allocation. Usages of address + * register (a0.x) or predicate register (p0.x) must be serialized. Ie. + * if you have two pairs of instructions that write the same special + * register and then read it, then those pairs cannot be interleaved. + * To solve this, when we are in such a scheduling "critical section", + * and we encounter a conflicting write to a special register, we try + * to schedule any remaining instructions that use that value first. + */ + +struct ir3_sched_ctx { + struct ir3_instruction *scheduled; /* last scheduled instr */ + struct ir3_instruction *addr; /* current a0.x user, if any */ + struct ir3_instruction *pred; /* current p0.x user, if any */ + unsigned cnt; +}; + +static struct ir3_instruction * +deepest(struct ir3_instruction **srcs, unsigned nsrcs) +{ + struct ir3_instruction *d = NULL; + unsigned i = 0, id = 0; + + while ((i < nsrcs) && !(d = srcs[id = i])) + i++; + + if (!d) + return NULL; + + for (; i < nsrcs; i++) + if (srcs[i] && (srcs[i]->depth > d->depth)) + d = srcs[id = i]; + + srcs[id] = NULL; + + return d; +} + +static unsigned distance(struct ir3_sched_ctx *ctx, + struct ir3_instruction *instr, unsigned maxd) +{ + struct ir3_instruction *n = ctx->scheduled; + unsigned d = 0; + while (n && (n != instr) && (d < maxd)) { + if (is_alu(n) || is_flow(n)) + d++; + n = n->next; + } + return d; +} + +/* TODO maybe we want double linked list? */ +static struct ir3_instruction * prev(struct ir3_instruction *instr) +{ + struct ir3_instruction *p = instr->block->head; + while (p && (p->next != instr)) + p = p->next; + return p; +} + +static void schedule(struct ir3_sched_ctx *ctx, + struct ir3_instruction *instr, bool remove) +{ + struct ir3_block *block = instr->block; + + /* maybe there is a better way to handle this than just stuffing + * a nop.. ideally we'd know about this constraint in the + * scheduling and depth calculation.. + */ + if (ctx->scheduled && is_sfu(ctx->scheduled) && is_sfu(instr)) + schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false); + + /* remove from depth list: + */ + if (remove) { + struct ir3_instruction *p = prev(instr); + + /* NOTE: this can happen for inputs which are not + * read.. in that case there is no need to schedule + * the input, so just bail: + */ + if (instr != (p ? 
p->next : block->head)) + return; + + if (p) + p->next = instr->next; + else + block->head = instr->next; + } + + if (writes_addr(instr)) { + assert(ctx->addr == NULL); + ctx->addr = instr; + } + + if (writes_pred(instr)) { + assert(ctx->pred == NULL); + ctx->pred = instr; + } + + instr->flags |= IR3_INSTR_MARK; + + instr->next = ctx->scheduled; + ctx->scheduled = instr; + + ctx->cnt++; +} + +/* + * Delay-slot calculation. Follows fanin/fanout. + */ + +static unsigned delay_calc2(struct ir3_sched_ctx *ctx, + struct ir3_instruction *assigner, + struct ir3_instruction *consumer, unsigned srcn) +{ + unsigned delay = 0; + + if (is_meta(assigner)) { + unsigned i; + for (i = 1; i < assigner->regs_count; i++) { + struct ir3_register *reg = assigner->regs[i]; + if (reg->flags & IR3_REG_SSA) { + unsigned d = delay_calc2(ctx, reg->instr, + consumer, srcn); + delay = MAX2(delay, d); + } + } + } else { + delay = ir3_delayslots(assigner, consumer, srcn); + delay -= distance(ctx, assigner, delay); + } + + return delay; +} + +static unsigned delay_calc(struct ir3_sched_ctx *ctx, + struct ir3_instruction *instr) +{ + unsigned i, delay = 0; + + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + if (reg->flags & IR3_REG_SSA) { + unsigned d = delay_calc2(ctx, reg->instr, + instr, i - 1); + delay = MAX2(delay, d); + } + } + + return delay; +} + +/* A negative return value signals that an instruction has been newly + * scheduled, return back up to the top of the stack (to block_sched()) + */ +static int trysched(struct ir3_sched_ctx *ctx, + struct ir3_instruction *instr) +{ + struct ir3_instruction *srcs[ARRAY_SIZE(instr->regs) - 1]; + struct ir3_instruction *src; + unsigned i, delay, nsrcs = 0; + + /* if already scheduled: */ + if (instr->flags & IR3_INSTR_MARK) + return 0; + + /* figure out our src's: */ + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + if (reg->flags & IR3_REG_SSA) + srcs[nsrcs++] = reg->instr; + } + + /* for each src register in sorted order: + */ + delay = 0; + while ((src = deepest(srcs, nsrcs))) { + delay = trysched(ctx, src); + if (delay) + return delay; + } + + /* all our dependents are scheduled, figure out if + * we have enough delay slots to schedule ourself: + */ + delay = delay_calc(ctx, instr); + if (delay) + return delay; + + /* if this is a write to address/predicate register, and that + * register is currently in use, we need to defer until it is + * free: + */ + if (writes_addr(instr) && ctx->addr) { + assert(ctx->addr != instr); + return DELAYED; + } + if (writes_pred(instr) && ctx->pred) { + assert(ctx->pred != instr); + return DELAYED; + } + + schedule(ctx, instr, true); + return SCHEDULED; +} + +static struct ir3_instruction * reverse(struct ir3_instruction *instr) +{ + struct ir3_instruction *reversed = NULL; + while (instr) { + struct ir3_instruction *next = instr->next; + instr->next = reversed; + reversed = instr; + instr = next; + } + return reversed; +} + +static bool uses_current_addr(struct ir3_sched_ctx *ctx, + struct ir3_instruction *instr) +{ + unsigned i; + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + if (reg->flags & IR3_REG_SSA) { + if (is_addr(reg->instr)) { + struct ir3_instruction *addr; + addr = reg->instr->regs[1]->instr; /* the mova */ + if (ctx->addr == addr) + return true; + } + } + } + return false; +} + +static bool uses_current_pred(struct ir3_sched_ctx *ctx, + struct ir3_instruction *instr) +{ + unsigned i; + for (i = 1; i < 
instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + if ((reg->flags & IR3_REG_SSA) && (ctx->pred == reg->instr)) + return true; + } + return false; +} + +/* when we encounter an instruction that writes to the address register + * when it is in use, we delay that instruction and try to schedule all + * other instructions using the current address register: + */ +static int block_sched_undelayed(struct ir3_sched_ctx *ctx, + struct ir3_block *block) +{ + struct ir3_instruction *instr = block->head; + bool addr_in_use = false; + bool pred_in_use = false; + unsigned cnt = ~0; + + while (instr) { + struct ir3_instruction *next = instr->next; + bool addr = uses_current_addr(ctx, instr); + bool pred = uses_current_pred(ctx, instr); + + if (addr || pred) { + int ret = trysched(ctx, instr); + if (ret == SCHEDULED) + cnt = 0; + else if (ret > 0) + cnt = MIN2(cnt, ret); + if (addr) + addr_in_use = true; + if (pred) + pred_in_use = true; + } + + instr = next; + } + + if (!addr_in_use) + ctx->addr = NULL; + + if (!pred_in_use) + ctx->pred = NULL; + + return cnt; +} + +static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block) +{ + struct ir3_instruction *instr; + + /* schedule all the shader input's (meta-instr) first so that + * the RA step sees that the input registers contain a value + * from the start of the shader: + */ + if (!block->parent) { + unsigned i; + for (i = 0; i < block->ninputs; i++) { + struct ir3_instruction *in = block->inputs[i]; + if (in) + schedule(ctx, in, true); + } + } + + while ((instr = block->head)) { + /* NOTE: always grab next *before* trysched(), in case the + * instruction is actually scheduled (and therefore moved + * from depth list into scheduled list) + */ + struct ir3_instruction *next = instr->next; + int cnt = trysched(ctx, instr); + + if (cnt == DELAYED) + cnt = block_sched_undelayed(ctx, block); + + /* -1 is signal to return up stack, but to us means same as 0: */ + cnt = MAX2(0, cnt); + cnt += ctx->cnt; + instr = next; + + /* if deepest remaining instruction cannot be scheduled, try + * the increasingly more shallow instructions until needed + * number of delay slots is filled: + */ + while (instr && (cnt > ctx->cnt)) { + next = instr->next; + trysched(ctx, instr); + instr = next; + } + + /* and if we run out of instructions that can be scheduled, + * then it is time for nop's: + */ + while (cnt > ctx->cnt) + schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false); + } + + /* at this point, scheduled list is in reverse order, so fix that: */ + block->head = reverse(ctx->scheduled); +} + +void ir3_block_sched(struct ir3_block *block) +{ + struct ir3_sched_ctx ctx = {0}; + ir3_clear_mark(block->shader); + block_sched(&ctx, block); +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c new file mode 100644 index 00000000000..ddf99dbc46e --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c @@ -0,0 +1,211 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2014 Rob Clark + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the 
following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark + */ + +#include "pipe/p_state.h" +#include "util/u_string.h" +#include "util/u_memory.h" +#include "util/u_inlines.h" +#include "util/u_format.h" +#include "tgsi/tgsi_dump.h" +#include "tgsi/tgsi_parse.h" + +#include "freedreno_context.h" +#include "freedreno_lowering.h" +#include "freedreno_util.h" + +#include "ir3_shader.h" +#include "ir3_compiler.h" + + +static void +delete_variant(struct ir3_shader_variant *v) +{ + ir3_destroy(v->ir); + fd_bo_del(v->bo); + free(v); +} + +static void +assemble_variant(struct ir3_shader_variant *v) +{ + struct fd_context *ctx = fd_context(v->shader->pctx); + uint32_t sz, *bin; + + bin = ir3_assemble(v->ir, &v->info); + sz = v->info.sizedwords * 4; + + v->bo = fd_bo_new(ctx->dev, sz, + DRM_FREEDRENO_GEM_CACHE_WCOMBINE | + DRM_FREEDRENO_GEM_TYPE_KMEM); + + memcpy(fd_bo_map(v->bo), bin, sz); + + free(bin); + + v->instrlen = v->info.sizedwords / 8; + v->constlen = v->info.max_const + 1; +} + +/* for vertex shader, the inputs are loaded into registers before the shader + * is executed, so max_regs from the shader instructions might not properly + * reflect the # of registers actually used: + */ +static void +fixup_vp_regfootprint(struct ir3_shader_variant *v) +{ + unsigned i; + for (i = 0; i < v->inputs_count; i++) { + if (v->inputs[i].compmask) { + uint32_t regid = (v->inputs[i].regid + 3) >> 2; + v->info.max_reg = MAX2(v->info.max_reg, regid); + } + } + for (i = 0; i < v->outputs_count; i++) { + uint32_t regid = (v->outputs[i].regid + 3) >> 2; + v->info.max_reg = MAX2(v->info.max_reg, regid); + } +} + +static struct ir3_shader_variant * +create_variant(struct ir3_shader *shader, struct ir3_shader_key key) +{ + struct ir3_shader_variant *v = CALLOC_STRUCT(ir3_shader_variant); + const struct tgsi_token *tokens = shader->tokens; + int ret; + + if (!v) + return NULL; + + v->shader = shader; + v->key = key; + v->type = shader->type; + + if (fd_mesa_debug & FD_DBG_DISASM) { + DBG("dump tgsi: type=%d, k={bp=%u,cts=%u,hp=%u}", shader->type, + key.binning_pass, key.color_two_side, key.half_precision); + tgsi_dump(tokens, 0); + } + + if (!(fd_mesa_debug & FD_DBG_NOOPT)) { + ret = ir3_compile_shader(v, tokens, key); + if (ret) { + debug_error("new compiler failed, trying fallback!"); + + v->inputs_count = 0; + v->outputs_count = 0; + v->total_in = 0; + v->has_samp = false; + v->immediates_count = 0; + } + } else { + ret = -1; /* force fallback to old compiler */ + } + + if (ret) + ret = ir3_compile_shader_old(v, tokens, key); + + if (ret) { + debug_error("compile failed!"); + goto fail; + } + + assemble_variant(v); + if (!v->bo) { + debug_error("assemble failed!"); + goto fail; + } + + if (shader->type == SHADER_VERTEX) + fixup_vp_regfootprint(v); + + if (fd_mesa_debug & FD_DBG_DISASM) { + DBG("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type, + 
key.binning_pass, key.color_two_side, key.half_precision); + disasm_a3xx(fd_bo_map(v->bo), v->info.sizedwords, 0, v->type); + } + + return v; + +fail: + delete_variant(v); + return NULL; +} + +struct ir3_shader_variant * +ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key) +{ + struct ir3_shader_variant *v; + + /* some shader key values only apply to vertex or frag shader, + * so normalize the key to avoid constructing multiple identical + * variants: + */ + if (shader->type == SHADER_FRAGMENT) { + key.binning_pass = false; + } + if (shader->type == SHADER_VERTEX) { + key.color_two_side = false; + key.half_precision = false; + } + + for (v = shader->variants; v; v = v->next) + if (!memcmp(&key, &v->key, sizeof(key))) + return v; + + /* compile new variant if it doesn't exist already: */ + v = create_variant(shader, key); + v->next = shader->variants; + shader->variants = v; + + return v; +} + + +void +ir3_shader_destroy(struct ir3_shader *shader) +{ + struct ir3_shader_variant *v, *t; + for (v = shader->variants; v; ) { + t = v; + v = v->next; + delete_variant(t); + } + free((void *)shader->tokens); + free(shader); +} + +struct ir3_shader * +ir3_shader_create(struct pipe_context *pctx, const struct tgsi_token *tokens, + enum shader_t type) +{ + struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader); + shader->pctx = pctx; + shader->type = type; + shader->tokens = tgsi_dup_tokens(tokens); + return shader; +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h new file mode 100644 index 00000000000..1a91fcbcb13 --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h @@ -0,0 +1,163 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2014 Rob Clark + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark + */ + +#ifndef IR3_SHADER_H_ +#define IR3_SHADER_H_ + +#include "ir3.h" +#include "disasm.h" + +typedef uint16_t ir3_semantic; /* semantic name + index */ +static inline ir3_semantic +ir3_semantic_name(uint8_t name, uint16_t index) +{ + return (name << 8) | (index & 0xff); +} + +static inline uint8_t sem2name(ir3_semantic sem) +{ + return sem >> 8; +} + +static inline uint16_t sem2idx(ir3_semantic sem) +{ + return sem & 0xff; +} + +/* Configuration key used to identify a shader variant.. 
different + * shader variants can be used to implement features not supported + * in hw (two sided color), binning-pass vertex shader, etc. + */ +struct ir3_shader_key { + /* vertex shader variant parameters: */ + unsigned binning_pass : 1; + + /* fragment shader variant parameters: */ + unsigned color_two_side : 1; + unsigned half_precision : 1; +}; + +struct ir3_shader_variant { + struct fd_bo *bo; + + struct ir3_shader_key key; + + struct ir3_info info; + struct ir3 *ir; + + /* the instructions length is in units of instruction groups + * (4 instructions, 8 dwords): + */ + unsigned instrlen; + + /* the constants length is in units of vec4's, and is the sum of + * the uniforms and the built-in compiler constants + */ + unsigned constlen; + + /* About Linkage: + * + Let the frag shader determine the position/compmask for the + * varyings, since it is the place where we know if the varying + * is actually used, and if so, which components are used. So + * what the hw calls "outloc" is taken from the "inloc" of the + * frag shader. + * + From the vert shader, we only need the output regid + */ + + /* for frag shader, pos_regid holds the frag_pos, ie. what is passed + * to bary.f instructions + */ + uint8_t pos_regid; + bool frag_coord, frag_face; + + /* varyings/outputs: */ + unsigned outputs_count; + struct { + ir3_semantic semantic; + uint8_t regid; + } outputs[16 + 2]; /* +POSITION +PSIZE */ + bool writes_pos, writes_psize; + + /* vertices/inputs: */ + unsigned inputs_count; + struct { + ir3_semantic semantic; + uint8_t regid; + uint8_t compmask; + uint8_t ncomp; + /* in theory inloc of fs should match outloc of vs: */ + uint8_t inloc; + uint8_t bary; + } inputs[16 + 2]; /* +POSITION +FACE */ + + unsigned total_in; /* sum of inputs (scalar) */ + + /* do we have one or more texture sample instructions: */ + bool has_samp; + + /* const reg # of first immediate, ie. 1 == c1 + * (not regid, because TGSI thinks in terms of vec4 registers, + * not scalar registers) + */ + unsigned first_immediate; + unsigned immediates_count; + struct { + uint32_t val[4]; + } immediates[64]; + + /* shader variants form a linked list: */ + struct ir3_shader_variant *next; + + /* replicated here to avoid passing extra ptrs everywhere: */ + enum shader_t type; + struct ir3_shader *shader; +}; + +struct ir3_shader { + enum shader_t type; + + struct pipe_context *pctx; + const struct tgsi_token *tokens; + + struct ir3_shader_variant *variants; + + /* so far, only used for blit_prog shader.. 
values for + * VPC_VARYING_INTERP[i].MODE and VPC_VARYING_PS_REPL[i].MODE + */ + uint32_t vinterp[4], vpsrepl[4]; +}; + + +struct ir3_shader * ir3_shader_create(struct pipe_context *pctx, + const struct tgsi_token *tokens, enum shader_t type); +void ir3_shader_destroy(struct ir3_shader *shader); + +struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader, + struct ir3_shader_key key); + +#endif /* IR3_SHADER_H_ */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3_visitor.h b/src/gallium/drivers/freedreno/ir3/ir3_visitor.h new file mode 100644 index 00000000000..1c60d1620ca --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3_visitor.h @@ -0,0 +1,154 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2014 Rob Clark + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark + */ + +#ifndef IR3_VISITOR_H_ +#define IR3_VISITOR_H_ + +/** + * Visitor which follows dst to src relationships between instructions, + * first visiting the dst (writer) instruction, followed by src (reader) + * instruction(s). + * + * TODO maybe we want multiple different visitors to walk the + * graph in different ways? + */ + +struct ir3_visitor; + +typedef void (*ir3_visit_instr_func)(struct ir3_visitor *v, + struct ir3_instruction *instr); + +typedef void (*ir3_visit_reg_func)(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg); + +struct ir3_visitor_funcs { + ir3_visit_instr_func instr; // TODO do we need?? 
+ + ir3_visit_reg_func dst_shader_input; + ir3_visit_reg_func dst_block_input; + ir3_visit_reg_func dst_fanout; + ir3_visit_reg_func dst_fanin; + ir3_visit_reg_func dst; + + ir3_visit_reg_func src_block_input; + ir3_visit_reg_func src_fanout; + ir3_visit_reg_func src_fanin; + ir3_visit_reg_func src; +}; + +struct ir3_visitor { + const struct ir3_visitor_funcs *funcs; + bool error; +}; + +#include "util/u_debug.h" + +static void visit_instr_dst(struct ir3_visitor *v, + struct ir3_instruction *instr) +{ + struct ir3_register *reg = instr->regs[0]; + + if (is_meta(instr)) { + switch (instr->opc) { + case OPC_META_INPUT: + if (instr->regs_count == 1) + v->funcs->dst_shader_input(v, instr, reg); + else + v->funcs->dst_block_input(v, instr, reg); + return; + case OPC_META_FO: + v->funcs->dst_fanout(v, instr, reg); + return; + case OPC_META_FI: + v->funcs->dst_fanin(v, instr, reg); + return; + default: + break; + + } + } + + v->funcs->dst(v, instr, reg); +} + +static void visit_instr_src(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + if (is_meta(instr)) { + switch (instr->opc) { + case OPC_META_INPUT: + /* shader-input does not have a src, only block input: */ + debug_assert(instr->regs_count == 2); + v->funcs->src_block_input(v, instr, reg); + return; + case OPC_META_FO: + v->funcs->src_fanout(v, instr, reg); + return; + case OPC_META_FI: + v->funcs->src_fanin(v, instr, reg); + return; + default: + break; + + } + } + + v->funcs->src(v, instr, reg); +} + +static void ir3_visit_instr(struct ir3_visitor *v, + struct ir3_instruction *instr) +{ + struct ir3_instruction *n; + + /* visit instruction that assigns value: */ + if (instr->regs_count > 0) + visit_instr_dst(v, instr); + + /* and of any following instructions which read that value: */ + n = instr->next; + while (n && !v->error) { + unsigned i; + + for (i = 1; i < n->regs_count; i++) { + struct ir3_register *reg = n->regs[i]; + if ((reg->flags & IR3_REG_SSA) && (reg->instr == instr)) + visit_instr_src(v, n, reg); + } + + n = n->next; + } +} + +static void ir3_visit_reg(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + /* no-op */ +} + +#endif /* IR3_VISITOR_H_ */