freedreno/ir3: split out shader compiler from a3xx
author Rob Clark <robclark@freedesktop.org>
Fri, 25 Jul 2014 15:15:59 +0000 (11:15 -0400)
committer Rob Clark <robclark@freedesktop.org>
Fri, 25 Jul 2014 17:29:28 +0000 (13:29 -0400)
Move the bits we want to share between generations from fd3_program to
ir3_shader.  So overall structure is:

  fdN_shader_stateobj -> ir3_shader -> ir3_shader_variant -> ir3
                                    |- ...
                                    \- ir3_shader_variant -> ir3

So the ir3_shader becomes the topmost generation-neutral object, which
manages the set of variants, each of which generates, compiles, and
assembles its own ir.

There is a bit of additional renaming to s/fd3_compiler/ir3_compiler/,
etc.

Keep the split between the gallium level stateobj and the shader helper
object because it might be a good idea to pre-compute some
generation-specific register values (i.e. anything that is independent
of linking).

Signed-off-by: Rob Clark <robclark@freedesktop.org>
39 files changed:
src/gallium/drivers/freedreno/Makefile.am
src/gallium/drivers/freedreno/Makefile.sources
src/gallium/drivers/freedreno/a3xx/disasm-a3xx.c [deleted file]
src/gallium/drivers/freedreno/a3xx/fd3_compiler.c [deleted file]
src/gallium/drivers/freedreno/a3xx/fd3_compiler.h [deleted file]
src/gallium/drivers/freedreno/a3xx/fd3_compiler_old.c [deleted file]
src/gallium/drivers/freedreno/a3xx/fd3_draw.c
src/gallium/drivers/freedreno/a3xx/fd3_emit.c
src/gallium/drivers/freedreno/a3xx/fd3_emit.h
src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
src/gallium/drivers/freedreno/a3xx/fd3_program.c
src/gallium/drivers/freedreno/a3xx/fd3_program.h
src/gallium/drivers/freedreno/a3xx/fd3_util.h
src/gallium/drivers/freedreno/a3xx/instr-a3xx.h [deleted file]
src/gallium/drivers/freedreno/a3xx/ir3.c [deleted file]
src/gallium/drivers/freedreno/a3xx/ir3.h [deleted file]
src/gallium/drivers/freedreno/a3xx/ir3_cp.c [deleted file]
src/gallium/drivers/freedreno/a3xx/ir3_depth.c [deleted file]
src/gallium/drivers/freedreno/a3xx/ir3_dump.c [deleted file]
src/gallium/drivers/freedreno/a3xx/ir3_flatten.c [deleted file]
src/gallium/drivers/freedreno/a3xx/ir3_ra.c [deleted file]
src/gallium/drivers/freedreno/a3xx/ir3_sched.c [deleted file]
src/gallium/drivers/freedreno/a3xx/ir3_visitor.h [deleted file]
src/gallium/drivers/freedreno/ir3/disasm-a3xx.c [new file with mode: 0644]
src/gallium/drivers/freedreno/ir3/instr-a3xx.h [new file with mode: 0644]
src/gallium/drivers/freedreno/ir3/ir3.c [new file with mode: 0644]
src/gallium/drivers/freedreno/ir3/ir3.h [new file with mode: 0644]
src/gallium/drivers/freedreno/ir3/ir3_compiler.c [new file with mode: 0644]
src/gallium/drivers/freedreno/ir3/ir3_compiler.h [new file with mode: 0644]
src/gallium/drivers/freedreno/ir3/ir3_compiler_old.c [new file with mode: 0644]
src/gallium/drivers/freedreno/ir3/ir3_cp.c [new file with mode: 0644]
src/gallium/drivers/freedreno/ir3/ir3_depth.c [new file with mode: 0644]
src/gallium/drivers/freedreno/ir3/ir3_dump.c [new file with mode: 0644]
src/gallium/drivers/freedreno/ir3/ir3_flatten.c [new file with mode: 0644]
src/gallium/drivers/freedreno/ir3/ir3_ra.c [new file with mode: 0644]
src/gallium/drivers/freedreno/ir3/ir3_sched.c [new file with mode: 0644]
src/gallium/drivers/freedreno/ir3/ir3_shader.c [new file with mode: 0644]
src/gallium/drivers/freedreno/ir3/ir3_shader.h [new file with mode: 0644]
src/gallium/drivers/freedreno/ir3/ir3_visitor.h [new file with mode: 0644]

index 7947dd1a56e909307306237a6651290f6192ddc7..7d9c6e4933a64bc9fde58a3d28b803e1132a3106 100644 (file)
@@ -7,6 +7,7 @@ AM_CFLAGS = \
        -Wno-packed-bitfield-compat \
        -I$(top_srcdir)/src/gallium/drivers/freedreno/a3xx \
        -I$(top_srcdir)/src/gallium/drivers/freedreno/a2xx \
+       -I$(top_srcdir)/src/gallium/drivers/freedreno/ir3 \
        $(GALLIUM_DRIVER_CFLAGS) \
        $(FREEDRENO_CFLAGS)
 
@@ -15,4 +16,5 @@ noinst_LTLIBRARIES = libfreedreno.la
 libfreedreno_la_SOURCES = \
        $(C_SOURCES) \
        $(a2xx_SOURCES) \
-       $(a3xx_SOURCES)
+       $(a3xx_SOURCES) \
+       $(ir3_SOURCES)
index 0dc7fc08512a4840dee6d81d415b808cfe3950d8..85e0b7eda6fbca84a5298a81324a26cda6b96795 100644 (file)
@@ -33,8 +33,6 @@ a2xx_SOURCES := \
 
 a3xx_SOURCES := \
        a3xx/fd3_blend.c \
-       a3xx/fd3_compiler.c \
-       a3xx/fd3_compiler_old.c \
        a3xx/fd3_context.c \
        a3xx/fd3_draw.c \
        a3xx/fd3_emit.c \
@@ -45,12 +43,17 @@ a3xx_SOURCES := \
        a3xx/fd3_screen.c \
        a3xx/fd3_texture.c \
        a3xx/fd3_util.c \
-       a3xx/fd3_zsa.c \
-       a3xx/disasm-a3xx.c \
-       a3xx/ir3_cp.c \
-       a3xx/ir3_depth.c \
-       a3xx/ir3_dump.c \
-       a3xx/ir3_flatten.c \
-       a3xx/ir3_ra.c \
-       a3xx/ir3_sched.c \
-       a3xx/ir3.c
+       a3xx/fd3_zsa.c
+
+ir3_SOURCES := \
+       ir3/disasm-a3xx.c \
+       ir3/ir3_compiler.c \
+       ir3/ir3_compiler_old.c \
+       ir3/ir3_shader.c \
+       ir3/ir3_cp.c \
+       ir3/ir3_depth.c \
+       ir3/ir3_dump.c \
+       ir3/ir3_flatten.c \
+       ir3/ir3_ra.c \
+       ir3/ir3_sched.c \
+       ir3/ir3.c
diff --git a/src/gallium/drivers/freedreno/a3xx/disasm-a3xx.c b/src/gallium/drivers/freedreno/a3xx/disasm-a3xx.c
deleted file mode 100644 (file)
index 8c3704b..0000000
+++ /dev/null
@@ -1,805 +0,0 @@
-/*
- * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include <assert.h>
-
-#include <util/u_debug.h>
-
-#include "disasm.h"
-#include "instr-a3xx.h"
-
-static enum debug_t debug;
-
-#define printf debug_printf
-
-static const char *levels[] = {
-               "",
-               "\t",
-               "\t\t",
-               "\t\t\t",
-               "\t\t\t\t",
-               "\t\t\t\t\t",
-               "\t\t\t\t\t\t",
-               "\t\t\t\t\t\t\t",
-               "\t\t\t\t\t\t\t\t",
-               "\t\t\t\t\t\t\t\t\t",
-               "x",
-               "x",
-               "x",
-               "x",
-               "x",
-               "x",
-};
-
-static const char *component = "xyzw";
-
-static const char *type[] = {
-               [TYPE_F16] = "f16",
-               [TYPE_F32] = "f32",
-               [TYPE_U16] = "u16",
-               [TYPE_U32] = "u32",
-               [TYPE_S16] = "s16",
-               [TYPE_S32] = "s32",
-               [TYPE_U8]  = "u8",
-               [TYPE_S8]  = "s8",
-};
-
-static void print_reg(reg_t reg, bool full, bool r, bool c, bool im,
-               bool neg, bool abs, bool addr_rel)
-{
-       const char type = c ? 'c' : 'r';
-
-       // XXX I prefer - and || for neg/abs, but preserving format used
-       // by libllvm-a3xx for easy diffing..
-
-       if (abs && neg)
-               printf("(absneg)");
-       else if (neg)
-               printf("(neg)");
-       else if (abs)
-               printf("(abs)");
-
-       if (r)
-               printf("(r)");
-
-       if (im) {
-               printf("%d", reg.iim_val);
-       } else if (addr_rel) {
-               /* I would just use %+d but trying to make it diff'able with
-                * libllvm-a3xx...
-                */
-               if (reg.iim_val < 0)
-                       printf("%s%c<a0.x - %d>", full ? "" : "h", type, -reg.iim_val);
-               else if (reg.iim_val > 0)
-                       printf("%s%c<a0.x + %d>", full ? "" : "h", type, reg.iim_val);
-               else
-                       printf("%s%c<a0.x>", full ? "" : "h", type);
-       } else if ((reg.num == REG_A0) && !c) {
-               printf("a0.%c", component[reg.comp]);
-       } else if ((reg.num == REG_P0) && !c) {
-               printf("p0.%c", component[reg.comp]);
-       } else {
-               printf("%s%c%d.%c", full ? "" : "h", type, reg.num, component[reg.comp]);
-       }
-}
-
-
-/* current instruction repeat flag: */
-static unsigned repeat;
-
-static void print_reg_dst(reg_t reg, bool full, bool addr_rel)
-{
-       print_reg(reg, full, false, false, false, false, false, addr_rel);
-}
-
-static void print_reg_src(reg_t reg, bool full, bool r, bool c, bool im,
-               bool neg, bool abs, bool addr_rel)
-{
-       print_reg(reg, full, r, c, im, neg, abs, addr_rel);
-}
-
-static void print_instr_cat0(instr_t *instr)
-{
-       instr_cat0_t *cat0 = &instr->cat0;
-
-       switch (cat0->opc) {
-       case OPC_KILL:
-               printf(" %sp0.%c", cat0->inv ? "!" : "",
-                               component[cat0->comp]);
-               break;
-       case OPC_BR:
-               printf(" %sp0.%c, #%d", cat0->inv ? "!" : "",
-                               component[cat0->comp], cat0->immed);
-               break;
-       case OPC_JUMP:
-       case OPC_CALL:
-               printf(" #%d", cat0->immed);
-               break;
-       }
-
-       if ((debug & PRINT_VERBOSE) && (cat0->dummy1|cat0->dummy2|cat0->dummy3|cat0->dummy4))
-               printf("\t{0: %x,%x,%x,%x}", cat0->dummy1, cat0->dummy2, cat0->dummy3, cat0->dummy4);
-}
-
-static void print_instr_cat1(instr_t *instr)
-{
-       instr_cat1_t *cat1 = &instr->cat1;
-
-       if (cat1->ul)
-               printf("(ul)");
-
-       if (cat1->src_type == cat1->dst_type) {
-               if ((cat1->src_type == TYPE_S16) && (((reg_t)cat1->dst).num == REG_A0)) {
-                       /* special case (nmemonic?): */
-                       printf("mova");
-               } else {
-                       printf("mov.%s%s", type[cat1->src_type], type[cat1->dst_type]);
-               }
-       } else {
-               printf("cov.%s%s", type[cat1->src_type], type[cat1->dst_type]);
-       }
-
-       printf(" ");
-
-       if (cat1->even)
-               printf("(even)");
-
-       if (cat1->pos_inf)
-               printf("(pos_infinity)");
-
-       print_reg_dst((reg_t)(cat1->dst), type_size(cat1->dst_type) == 32,
-                       cat1->dst_rel);
-
-       printf(", ");
-
-       /* ugg, have to special case this.. vs print_reg().. */
-       if (cat1->src_im) {
-               if (type_float(cat1->src_type))
-                       printf("(%f)", cat1->fim_val);
-               else
-                       printf("%d", cat1->iim_val);
-       } else if (cat1->src_rel && !cat1->src_c) {
-               /* I would just use %+d but trying to make it diff'able with
-                * libllvm-a3xx...
-                */
-               char type = cat1->src_rel_c ? 'c' : 'r';
-               if (cat1->off < 0)
-                       printf("%c<a0.x - %d>", type, -cat1->off);
-               else if (cat1->off > 0)
-                       printf("%c<a0.x + %d>", type, cat1->off);
-               else
-                       printf("c<a0.x>");
-       } else {
-               print_reg_src((reg_t)(cat1->src), type_size(cat1->src_type) == 32,
-                               cat1->src_r, cat1->src_c, cat1->src_im, false, false, false);
-       }
-
-       if ((debug & PRINT_VERBOSE) && (cat1->must_be_0))
-               printf("\t{1: %x}", cat1->must_be_0);
-}
-
-static void print_instr_cat2(instr_t *instr)
-{
-       instr_cat2_t *cat2 = &instr->cat2;
-       static const char *cond[] = {
-                       "lt",
-                       "le",
-                       "gt",
-                       "ge",
-                       "eq",
-                       "ne",
-                       "?6?",
-       };
-
-       switch (cat2->opc) {
-       case OPC_CMPS_F:
-       case OPC_CMPS_U:
-       case OPC_CMPS_S:
-       case OPC_CMPV_F:
-       case OPC_CMPV_U:
-       case OPC_CMPV_S:
-               printf(".%s", cond[cat2->cond]);
-               break;
-       }
-
-       printf(" ");
-       if (cat2->ei)
-               printf("(ei)");
-       print_reg_dst((reg_t)(cat2->dst), cat2->full ^ cat2->dst_half, false);
-       printf(", ");
-
-       if (cat2->c1.src1_c) {
-               print_reg_src((reg_t)(cat2->c1.src1), cat2->full, cat2->src1_r,
-                               cat2->c1.src1_c, cat2->src1_im, cat2->src1_neg,
-                               cat2->src1_abs, false);
-       } else if (cat2->rel1.src1_rel) {
-               print_reg_src((reg_t)(cat2->rel1.src1), cat2->full, cat2->src1_r,
-                               cat2->rel1.src1_c, cat2->src1_im, cat2->src1_neg,
-                               cat2->src1_abs, cat2->rel1.src1_rel);
-       } else {
-               print_reg_src((reg_t)(cat2->src1), cat2->full, cat2->src1_r,
-                               false, cat2->src1_im, cat2->src1_neg,
-                               cat2->src1_abs, false);
-       }
-
-       switch (cat2->opc) {
-       case OPC_ABSNEG_F:
-       case OPC_ABSNEG_S:
-       case OPC_CLZ_B:
-       case OPC_CLZ_S:
-       case OPC_SIGN_F:
-       case OPC_FLOOR_F:
-       case OPC_CEIL_F:
-       case OPC_RNDNE_F:
-       case OPC_RNDAZ_F:
-       case OPC_TRUNC_F:
-       case OPC_NOT_B:
-       case OPC_BFREV_B:
-       case OPC_SETRM:
-       case OPC_CBITS_B:
-               /* these only have one src reg */
-               break;
-       default:
-               printf(", ");
-               if (cat2->c2.src2_c) {
-                       print_reg_src((reg_t)(cat2->c2.src2), cat2->full, cat2->src2_r,
-                                       cat2->c2.src2_c, cat2->src2_im, cat2->src2_neg,
-                                       cat2->src2_abs, false);
-               } else if (cat2->rel2.src2_rel) {
-                       print_reg_src((reg_t)(cat2->rel2.src2), cat2->full, cat2->src2_r,
-                                       cat2->rel2.src2_c, cat2->src2_im, cat2->src2_neg,
-                                       cat2->src2_abs, cat2->rel2.src2_rel);
-               } else {
-                       print_reg_src((reg_t)(cat2->src2), cat2->full, cat2->src2_r,
-                                       false, cat2->src2_im, cat2->src2_neg,
-                                       cat2->src2_abs, false);
-               }
-               break;
-       }
-}
-
-static void print_instr_cat3(instr_t *instr)
-{
-       instr_cat3_t *cat3 = &instr->cat3;
-       bool full = instr_cat3_full(cat3);
-
-       printf(" ");
-       print_reg_dst((reg_t)(cat3->dst), full ^ cat3->dst_half, false);
-       printf(", ");
-       if (cat3->c1.src1_c) {
-               print_reg_src((reg_t)(cat3->c1.src1), full,
-                               cat3->src1_r, cat3->c1.src1_c, false, cat3->src1_neg,
-                               false, false);
-       } else if (cat3->rel1.src1_rel) {
-               print_reg_src((reg_t)(cat3->rel1.src1), full,
-                               cat3->src1_r, cat3->rel1.src1_c, false, cat3->src1_neg,
-                               false, cat3->rel1.src1_rel);
-       } else {
-               print_reg_src((reg_t)(cat3->src1), full,
-                               cat3->src1_r, false, false, cat3->src1_neg,
-                               false, false);
-       }
-       printf(", ");
-       print_reg_src((reg_t)cat3->src2, full,
-                       cat3->src2_r, cat3->src2_c, false, cat3->src2_neg,
-                       false, false);
-       printf(", ");
-       if (cat3->c2.src3_c) {
-               print_reg_src((reg_t)(cat3->c2.src3), full,
-                               cat3->src3_r, cat3->c2.src3_c, false, cat3->src3_neg,
-                               false, false);
-       } else if (cat3->rel2.src3_rel) {
-               print_reg_src((reg_t)(cat3->rel2.src3), full,
-                               cat3->src3_r, cat3->rel2.src3_c, false, cat3->src3_neg,
-                               false, cat3->rel2.src3_rel);
-       } else {
-               print_reg_src((reg_t)(cat3->src3), full,
-                               cat3->src3_r, false, false, cat3->src3_neg,
-                               false, false);
-       }
-}
-
-static void print_instr_cat4(instr_t *instr)
-{
-       instr_cat4_t *cat4 = &instr->cat4;
-
-       printf(" ");
-       print_reg_dst((reg_t)(cat4->dst), cat4->full ^ cat4->dst_half, false);
-       printf(", ");
-
-       if (cat4->c.src_c) {
-               print_reg_src((reg_t)(cat4->c.src), cat4->full,
-                               cat4->src_r, cat4->c.src_c, cat4->src_im,
-                               cat4->src_neg, cat4->src_abs, false);
-       } else if (cat4->rel.src_rel) {
-               print_reg_src((reg_t)(cat4->rel.src), cat4->full,
-                               cat4->src_r, cat4->rel.src_c, cat4->src_im,
-                               cat4->src_neg, cat4->src_abs, cat4->rel.src_rel);
-       } else {
-               print_reg_src((reg_t)(cat4->src), cat4->full,
-                               cat4->src_r, false, cat4->src_im,
-                               cat4->src_neg, cat4->src_abs, false);
-       }
-
-       if ((debug & PRINT_VERBOSE) && (cat4->dummy1|cat4->dummy2))
-               printf("\t{4: %x,%x}", cat4->dummy1, cat4->dummy2);
-}
-
-static void print_instr_cat5(instr_t *instr)
-{
-       static const struct {
-               bool src1, src2, samp, tex;
-       } info[0x1f] = {
-                       [OPC_ISAM]     = { true,  false, true,  true,  },
-                       [OPC_ISAML]    = { true,  true,  true,  true,  },
-                       [OPC_ISAMM]    = { true,  false, true,  true,  },
-                       [OPC_SAM]      = { true,  false, true,  true,  },
-                       [OPC_SAMB]     = { true,  true,  true,  true,  },
-                       [OPC_SAML]     = { true,  true,  true,  true,  },
-                       [OPC_SAMGQ]    = { true,  false, true,  true,  },
-                       [OPC_GETLOD]   = { true,  false, true,  true,  },
-                       [OPC_CONV]     = { true,  true,  true,  true,  },
-                       [OPC_CONVM]    = { true,  true,  true,  true,  },
-                       [OPC_GETSIZE]  = { true,  false, false, true,  },
-                       [OPC_GETBUF]   = { false, false, false, true,  },
-                       [OPC_GETPOS]   = { true,  false, false, true,  },
-                       [OPC_GETINFO]  = { false, false, false, true,  },
-                       [OPC_DSX]      = { true,  false, false, false, },
-                       [OPC_DSY]      = { true,  false, false, false, },
-                       [OPC_GATHER4R] = { true,  false, true,  true,  },
-                       [OPC_GATHER4G] = { true,  false, true,  true,  },
-                       [OPC_GATHER4B] = { true,  false, true,  true,  },
-                       [OPC_GATHER4A] = { true,  false, true,  true,  },
-                       [OPC_SAMGP0]   = { true,  false, true,  true,  },
-                       [OPC_SAMGP1]   = { true,  false, true,  true,  },
-                       [OPC_SAMGP2]   = { true,  false, true,  true,  },
-                       [OPC_SAMGP3]   = { true,  false, true,  true,  },
-                       [OPC_DSXPP_1]  = { true,  false, false, false, },
-                       [OPC_DSYPP_1]  = { true,  false, false, false, },
-                       [OPC_RGETPOS]  = { false, false, false, false, },
-                       [OPC_RGETINFO] = { false, false, false, false, },
-       };
-       instr_cat5_t *cat5 = &instr->cat5;
-       int i;
-
-       if (cat5->is_3d)   printf(".3d");
-       if (cat5->is_a)    printf(".a");
-       if (cat5->is_o)    printf(".o");
-       if (cat5->is_p)    printf(".p");
-       if (cat5->is_s)    printf(".s");
-       if (cat5->is_s2en) printf(".s2en");
-
-       printf(" ");
-
-       switch (cat5->opc) {
-       case OPC_DSXPP_1:
-       case OPC_DSYPP_1:
-               break;
-       default:
-               printf("(%s)", type[cat5->type]);
-               break;
-       }
-
-       printf("(");
-       for (i = 0; i < 4; i++)
-               if (cat5->wrmask & (1 << i))
-                       printf("%c", "xyzw"[i]);
-       printf(")");
-
-       print_reg_dst((reg_t)(cat5->dst), type_size(cat5->type) == 32, false);
-
-       if (info[cat5->opc].src1) {
-               printf(", ");
-               print_reg_src((reg_t)(cat5->src1), cat5->full, false, false, false,
-                               false, false, false);
-       }
-
-       if (cat5->is_s2en) {
-               printf(", ");
-               print_reg_src((reg_t)(cat5->s2en.src2), cat5->full, false, false, false,
-                               false, false, false);
-               printf(", ");
-               print_reg_src((reg_t)(cat5->s2en.src3), false, false, false, false,
-                               false, false, false);
-       } else {
-               if (cat5->is_o || info[cat5->opc].src2) {
-                       printf(", ");
-                       print_reg_src((reg_t)(cat5->norm.src2), cat5->full,
-                                       false, false, false, false, false, false);
-               }
-               if (info[cat5->opc].samp)
-                       printf(", s#%d", cat5->norm.samp);
-               if (info[cat5->opc].tex)
-                       printf(", t#%d", cat5->norm.tex);
-       }
-
-       if (debug & PRINT_VERBOSE) {
-               if (cat5->is_s2en) {
-                       if ((debug & PRINT_VERBOSE) && (cat5->s2en.dummy1|cat5->s2en.dummy2|cat5->dummy2))
-                               printf("\t{5: %x,%x,%x}", cat5->s2en.dummy1, cat5->s2en.dummy2, cat5->dummy2);
-               } else {
-                       if ((debug & PRINT_VERBOSE) && (cat5->norm.dummy1|cat5->dummy2))
-                               printf("\t{5: %x,%x}", cat5->norm.dummy1, cat5->dummy2);
-               }
-       }
-}
-
-static int32_t u2i(uint32_t val, int nbits)
-{
-       return ((val >> (nbits-1)) * ~((1 << nbits) - 1)) | val;
-}
-
-static void print_instr_cat6(instr_t *instr)
-{
-       instr_cat6_t *cat6 = &instr->cat6;
-
-       printf(".%s ", type[cat6->type]);
-
-       switch (cat6->opc) {
-       case OPC_LDG:
-       case OPC_LDP:
-       case OPC_LDL:
-       case OPC_LDLW:
-       case OPC_LDLV:
-               /* load instructions: */
-               print_reg_dst((reg_t)(cat6->a.dst), type_size(cat6->type) == 32, false);
-               printf(",");
-               switch (cat6->opc) {
-               case OPC_LDG:
-                       printf("g");
-                       break;
-               case OPC_LDP:
-                       printf("p");
-                       break;
-               case OPC_LDL:
-               case OPC_LDLW:
-               case OPC_LDLV:
-                       printf("l");
-                       break;
-               }
-               printf("[");
-               print_reg_src((reg_t)(cat6->a.src), true,
-                               false, false, false, false, false, false);
-               if (cat6->a.off)
-                       printf("%+d", cat6->a.off);
-               printf("]");
-               break;
-       case OPC_PREFETCH:
-               /* similar to load instructions: */
-               printf("g[");
-               print_reg_src((reg_t)(cat6->a.src), true,
-                               false, false, false, false, false, false);
-               if (cat6->a.off)
-                       printf("%+d", cat6->a.off);
-               printf("]");
-               break;
-       case OPC_STG:
-       case OPC_STP:
-       case OPC_STL:
-       case OPC_STLW:
-               /* store instructions: */
-               switch (cat6->opc) {
-               case OPC_STG:
-                       printf("g");
-                       break;
-               case OPC_STP:
-                       printf("p");
-                       break;
-               case OPC_STL:
-               case OPC_STLW:
-                       printf("l");
-                       break;
-               }
-               printf("[");
-               print_reg_dst((reg_t)(cat6->b.dst), true, false);
-               if (cat6->b.off || cat6->b.off_hi)
-                       printf("%+d", u2i((cat6->b.off_hi << 8) | cat6->b.off, 13));
-               printf("]");
-               printf(",");
-               print_reg_src((reg_t)(cat6->b.src), type_size(cat6->type) == 32,
-                               false, false, false, false, false, false);
-
-               break;
-       case OPC_STI:
-               /* sti has same encoding as other store instructions, but
-                * slightly different syntax:
-                */
-               print_reg_dst((reg_t)(cat6->b.dst), false /* XXX is it always half? */, false);
-               if (cat6->b.off || cat6->b.off_hi)
-                       printf("%+d", u2i((cat6->b.off_hi << 8) | cat6->b.off, 13));
-               printf(",");
-               print_reg_src((reg_t)(cat6->b.src), type_size(cat6->type) == 32,
-                               false, false, false, false, false, false);
-               break;
-       }
-
-       printf(", %d", cat6->iim_val);
-
-       if (debug & PRINT_VERBOSE) {
-               switch (cat6->opc) {
-               case OPC_LDG:
-               case OPC_LDP:
-                       /* load instructions: */
-                       if (cat6->a.dummy1|cat6->a.dummy2|cat6->a.dummy3)
-                               printf("\t{6: %x,%x,%x}", cat6->a.dummy1, cat6->a.dummy2, cat6->a.dummy3);
-                       if ((cat6->a.must_be_one1 != 1) || (cat6->a.must_be_one2 != 1))
-                               printf("{?? %d,%d ??}", cat6->a.must_be_one1, cat6->a.must_be_one2);
-                       break;
-               case OPC_STG:
-               case OPC_STP:
-               case OPC_STI:
-                       /* store instructions: */
-                       if (cat6->b.dummy1|cat6->b.dummy2)
-                               printf("\t{6: %x,%x}", cat6->b.dummy1, cat6->b.dummy2);
-                       if ((cat6->b.must_be_one1 != 1) || (cat6->b.must_be_one2 != 1) ||
-                                       (cat6->b.must_be_zero1 != 0))
-                               printf("{?? %d,%d,%d ??}", cat6->b.must_be_one1, cat6->b.must_be_one2,
-                                               cat6->b.must_be_zero1);
-                       break;
-               }
-       }
-}
-
-/* size of largest OPC field of all the instruction categories: */
-#define NOPC_BITS 6
-
-struct opc_info {
-       uint16_t cat;
-       uint16_t opc;
-       const char *name;
-       void (*print)(instr_t *instr);
-} opcs[1 << (3+NOPC_BITS)] = {
-#define OPC(cat, opc, name) [((cat) << NOPC_BITS) | (opc)] = { (cat), (opc), #name, print_instr_cat##cat }
-       /* category 0: */
-       OPC(0, OPC_NOP,          nop),
-       OPC(0, OPC_BR,           br),
-       OPC(0, OPC_JUMP,         jump),
-       OPC(0, OPC_CALL,         call),
-       OPC(0, OPC_RET,          ret),
-       OPC(0, OPC_KILL,         kill),
-       OPC(0, OPC_END,          end),
-       OPC(0, OPC_EMIT,         emit),
-       OPC(0, OPC_CUT,          cut),
-       OPC(0, OPC_CHMASK,       chmask),
-       OPC(0, OPC_CHSH,         chsh),
-       OPC(0, OPC_FLOW_REV,     flow_rev),
-
-       /* category 1: */
-       OPC(1, 0, ),
-
-       /* category 2: */
-       OPC(2, OPC_ADD_F,        add.f),
-       OPC(2, OPC_MIN_F,        min.f),
-       OPC(2, OPC_MAX_F,        max.f),
-       OPC(2, OPC_MUL_F,        mul.f),
-       OPC(2, OPC_SIGN_F,       sign.f),
-       OPC(2, OPC_CMPS_F,       cmps.f),
-       OPC(2, OPC_ABSNEG_F,     absneg.f),
-       OPC(2, OPC_CMPV_F,       cmpv.f),
-       OPC(2, OPC_FLOOR_F,      floor.f),
-       OPC(2, OPC_CEIL_F,       ceil.f),
-       OPC(2, OPC_RNDNE_F,      rndne.f),
-       OPC(2, OPC_RNDAZ_F,      rndaz.f),
-       OPC(2, OPC_TRUNC_F,      trunc.f),
-       OPC(2, OPC_ADD_U,        add.u),
-       OPC(2, OPC_ADD_S,        add.s),
-       OPC(2, OPC_SUB_U,        sub.u),
-       OPC(2, OPC_SUB_S,        sub.s),
-       OPC(2, OPC_CMPS_U,       cmps.u),
-       OPC(2, OPC_CMPS_S,       cmps.s),
-       OPC(2, OPC_MIN_U,        min.u),
-       OPC(2, OPC_MIN_S,        min.s),
-       OPC(2, OPC_MAX_U,        max.u),
-       OPC(2, OPC_MAX_S,        max.s),
-       OPC(2, OPC_ABSNEG_S,     absneg.s),
-       OPC(2, OPC_AND_B,        and.b),
-       OPC(2, OPC_OR_B,         or.b),
-       OPC(2, OPC_NOT_B,        not.b),
-       OPC(2, OPC_XOR_B,        xor.b),
-       OPC(2, OPC_CMPV_U,       cmpv.u),
-       OPC(2, OPC_CMPV_S,       cmpv.s),
-       OPC(2, OPC_MUL_U,        mul.u),
-       OPC(2, OPC_MUL_S,        mul.s),
-       OPC(2, OPC_MULL_U,       mull.u),
-       OPC(2, OPC_BFREV_B,      bfrev.b),
-       OPC(2, OPC_CLZ_S,        clz.s),
-       OPC(2, OPC_CLZ_B,        clz.b),
-       OPC(2, OPC_SHL_B,        shl.b),
-       OPC(2, OPC_SHR_B,        shr.b),
-       OPC(2, OPC_ASHR_B,       ashr.b),
-       OPC(2, OPC_BARY_F,       bary.f),
-       OPC(2, OPC_MGEN_B,       mgen.b),
-       OPC(2, OPC_GETBIT_B,     getbit.b),
-       OPC(2, OPC_SETRM,        setrm),
-       OPC(2, OPC_CBITS_B,      cbits.b),
-       OPC(2, OPC_SHB,          shb),
-       OPC(2, OPC_MSAD,         msad),
-
-       /* category 3: */
-       OPC(3, OPC_MAD_U16,      mad.u16),
-       OPC(3, OPC_MADSH_U16,    madsh.u16),
-       OPC(3, OPC_MAD_S16,      mad.s16),
-       OPC(3, OPC_MADSH_M16,    madsh.m16),
-       OPC(3, OPC_MAD_U24,      mad.u24),
-       OPC(3, OPC_MAD_S24,      mad.s24),
-       OPC(3, OPC_MAD_F16,      mad.f16),
-       OPC(3, OPC_MAD_F32,      mad.f32),
-       OPC(3, OPC_SEL_B16,      sel.b16),
-       OPC(3, OPC_SEL_B32,      sel.b32),
-       OPC(3, OPC_SEL_S16,      sel.s16),
-       OPC(3, OPC_SEL_S32,      sel.s32),
-       OPC(3, OPC_SEL_F16,      sel.f16),
-       OPC(3, OPC_SEL_F32,      sel.f32),
-       OPC(3, OPC_SAD_S16,      sad.s16),
-       OPC(3, OPC_SAD_S32,      sad.s32),
-
-       /* category 4: */
-       OPC(4, OPC_RCP,          rcp),
-       OPC(4, OPC_RSQ,          rsq),
-       OPC(4, OPC_LOG2,         log2),
-       OPC(4, OPC_EXP2,         exp2),
-       OPC(4, OPC_SIN,          sin),
-       OPC(4, OPC_COS,          cos),
-       OPC(4, OPC_SQRT,         sqrt),
-
-       /* category 5: */
-       OPC(5, OPC_ISAM,         isam),
-       OPC(5, OPC_ISAML,        isaml),
-       OPC(5, OPC_ISAMM,        isamm),
-       OPC(5, OPC_SAM,          sam),
-       OPC(5, OPC_SAMB,         samb),
-       OPC(5, OPC_SAML,         saml),
-       OPC(5, OPC_SAMGQ,        samgq),
-       OPC(5, OPC_GETLOD,       getlod),
-       OPC(5, OPC_CONV,         conv),
-       OPC(5, OPC_CONVM,        convm),
-       OPC(5, OPC_GETSIZE,      getsize),
-       OPC(5, OPC_GETBUF,       getbuf),
-       OPC(5, OPC_GETPOS,       getpos),
-       OPC(5, OPC_GETINFO,      getinfo),
-       OPC(5, OPC_DSX,          dsx),
-       OPC(5, OPC_DSY,          dsy),
-       OPC(5, OPC_GATHER4R,     gather4r),
-       OPC(5, OPC_GATHER4G,     gather4g),
-       OPC(5, OPC_GATHER4B,     gather4b),
-       OPC(5, OPC_GATHER4A,     gather4a),
-       OPC(5, OPC_SAMGP0,       samgp0),
-       OPC(5, OPC_SAMGP1,       samgp1),
-       OPC(5, OPC_SAMGP2,       samgp2),
-       OPC(5, OPC_SAMGP3,       samgp3),
-       OPC(5, OPC_DSXPP_1,      dsxpp.1),
-       OPC(5, OPC_DSYPP_1,      dsypp.1),
-       OPC(5, OPC_RGETPOS,      rgetpos),
-       OPC(5, OPC_RGETINFO,     rgetinfo),
-
-
-       /* category 6: */
-       OPC(6, OPC_LDG,          ldg),
-       OPC(6, OPC_LDL,          ldl),
-       OPC(6, OPC_LDP,          ldp),
-       OPC(6, OPC_STG,          stg),
-       OPC(6, OPC_STL,          stl),
-       OPC(6, OPC_STP,          stp),
-       OPC(6, OPC_STI,          sti),
-       OPC(6, OPC_G2L,          g2l),
-       OPC(6, OPC_L2G,          l2g),
-       OPC(6, OPC_PREFETCH,     prefetch),
-       OPC(6, OPC_LDLW,         ldlw),
-       OPC(6, OPC_STLW,         stlw),
-       OPC(6, OPC_RESFMT,       resfmt),
-       OPC(6, OPC_RESINFO,      resinf),
-       OPC(6, OPC_ATOMIC_ADD_L,     atomic.add.l),
-       OPC(6, OPC_ATOMIC_SUB_L,     atomic.sub.l),
-       OPC(6, OPC_ATOMIC_XCHG_L,    atomic.xchg.l),
-       OPC(6, OPC_ATOMIC_INC_L,     atomic.inc.l),
-       OPC(6, OPC_ATOMIC_DEC_L,     atomic.dec.l),
-       OPC(6, OPC_ATOMIC_CMPXCHG_L, atomic.cmpxchg.l),
-       OPC(6, OPC_ATOMIC_MIN_L,     atomic.min.l),
-       OPC(6, OPC_ATOMIC_MAX_L,     atomic.max.l),
-       OPC(6, OPC_ATOMIC_AND_L,     atomic.and.l),
-       OPC(6, OPC_ATOMIC_OR_L,      atomic.or.l),
-       OPC(6, OPC_ATOMIC_XOR_L,     atomic.xor.l),
-       OPC(6, OPC_LDGB_TYPED_4D,    ldgb.typed.4d),
-       OPC(6, OPC_STGB_4D_4,    stgb.4d.4),
-       OPC(6, OPC_STIB,         stib),
-       OPC(6, OPC_LDC_4,        ldc.4),
-       OPC(6, OPC_LDLV,         ldlv),
-
-
-#undef OPC
-};
-
-/* look up the opcs[] entry for a raw instruction, indexed by its category and opcode: */
-#define GETINFO(instr) (&(opcs[((instr)->opc_cat << NOPC_BITS) | instr_opc(instr)]))
-
-// XXX hack.. probably should move this table somewhere common:
-#include "ir3.h"
-/* Return the mnemonic of an ir3 instruction, taken from the
- * disassembler's opcs[] table (indexed by category and opc).
- */
-const char *ir3_instr_name(struct ir3_instruction *instr)
-{
-       /* category -1 has no slot in opcs[], so it gets a fixed placeholder: */
-       if (instr->category == -1)
-               return "??meta??";
-
-       return opcs[(instr->category << NOPC_BITS) | instr->opc].name;
-}
-
-/* Print the instruction stored in the two dwords at 'dwords' as one line
- * on stdout: indent string for 'level', instruction index 'n', the raw
- * dwords, the flag prefixes, and finally the mnemonic plus operands (or
- * "unknown(cat,opc)" when opcs[] has no name for it).
- */
-static void print_instr(uint32_t *dwords, int level, int n)
-{
-       instr_t *instr = (instr_t *)dwords;
-       uint32_t opc = instr_opc(instr);
-       const char *name;
-
-       printf("%s%04d[%08xx_%08xx] ", levels[level], n, dwords[1], dwords[0]);
-
-#if 0
-       /* print unknown bits: */
-       if (debug & PRINT_RAW)
-               printf("[%08xx_%08xx] ", dwords[1] & 0x001ff800, dwords[0] & 0x00000000);
-
-       if (debug & PRINT_VERBOSE)
-               printf("%d,%02d ", instr->opc_cat, opc);
-#endif
-
-       /* NOTE: the order the flags are printed in is a bit ugly, but it
-        * is kept matching the llvm-a3xx disassembler so the two outputs
-        * can be diff'd easily.
-        */
-
-       if (instr->sync)
-               printf("(sy)");
-
-       if ((instr->opc_cat <= 4) && instr->ss)
-               printf("(ss)");
-
-       if (instr->jmp_tgt)
-               printf("(jp)");
-
-       /* 'repeat' is not a local of this function; it is reset for every
-        * instruction and only set for categories 0..4:
-        */
-       repeat = 0;
-       if ((instr->opc_cat <= 4) && instr->repeat) {
-               printf("(rpt%d)", instr->repeat);
-               repeat = instr->repeat;
-       }
-
-       if (instr->ul && (instr->opc_cat >= 2) && (instr->opc_cat <= 4))
-               printf("(ul)");
-
-       name = GETINFO(instr)->name;
-
-       if (!name) {
-               printf("unknown(%d,%d)", instr->opc_cat, opc);
-       } else {
-               printf("%s", name);
-               GETINFO(instr)->print(instr);
-       }
-
-       printf("\n");
-}
-
-/* Disassemble 'sizedwords' dwords of shader code to stdout, one
- * instruction per line, indented according to 'level'.  'type' is not
- * used here.  Always returns 0.
- */
-int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, enum shader_t type)
-{
-       int n;
-
-       /* every instruction occupies two dwords: */
-       assert((sizedwords % 2) == 0);
-
-       for (n = 0; (2 * n) < sizedwords; n++)
-               print_instr(&dwords[2 * n], level, n);
-
-       return 0;
-}
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
deleted file mode 100644 (file)
index 0c22e55..0000000
+++ /dev/null
@@ -1,2638 +0,0 @@
-/* -*- mode: C; c-file-style: "k&r"; tab-width: 4; indent-tabs-mode: t; -*- */
-
-/*
- * Copyright (C) 2013 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#include <stdarg.h>
-
-#include "pipe/p_state.h"
-#include "util/u_string.h"
-#include "util/u_memory.h"
-#include "util/u_inlines.h"
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_ureg.h"
-#include "tgsi/tgsi_info.h"
-#include "tgsi/tgsi_strings.h"
-#include "tgsi/tgsi_dump.h"
-#include "tgsi/tgsi_scan.h"
-
-#include "freedreno_lowering.h"
-
-#include "fd3_compiler.h"
-#include "fd3_program.h"
-
-#include "instr-a3xx.h"
-#include "ir3.h"
-
-/* Per-compile state used while translating the TGSI tokens of one shader
- * variant into ir3.
- */
-struct fd3_compile_context {
-       /* tokens being compiled: either the caller's, or the lowered copy */
-       const struct tgsi_token *tokens;
-       /* true when 'tokens' came from the lowering pass and must be freed */
-       bool free_tokens;
-       struct ir3 *ir;
-       struct fd3_shader_variant *so;
-
-       /* innermost block currently being filled, and the instruction most
-        * recently created through instr_create()/instr_clone():
-        */
-       struct ir3_block *block;
-       struct ir3_instruction *current_instr;
-
-       /* we need to defer updates to block->outputs[] until the end
-        * of an instruction (so we don't see new value until *after*
-        * the src registers are processed)
-        */
-       struct {
-               struct ir3_instruction *instr, **instrp;
-       } output_updates[16];
-       unsigned num_output_updates;
-
-       /* are we in a sequence of "atomic" instructions?
-        */
-       bool atomic;
-
-       /* For fragment shaders, from the hw perspective the only
-        * actual input is r0.xy position register passed to bary.f.
-        * But TGSI doesn't know that, it still declares things as
-        * IN[] registers.  So we do all the input tracking normally
-        * and fix things up after compile_instructions()
-        *
-        * NOTE that frag_pos is the hardware position (possibly it
-        * is actually an index or tag or some such.. it is *not*
-        * values that can be directly used for gl_FragCoord..)
-        */
-       struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4];
-
-       struct tgsi_parse_context parser;
-       /* TGSI processor type, read from the parsed header */
-       unsigned type;
-
-       struct tgsi_shader_info info;
-
-       /* for calculating input/output positions/linkages: */
-       unsigned next_inloc;
-
-       /* internal temporaries handed out by get_internal_temp(): */
-       unsigned num_internal_temps;
-       struct tgsi_src_register internal_temps[6];
-
-       /* idx/slot for last compiler generated immediate */
-       unsigned immediate_idx;
-
-       /* stack of branch instructions that mark (potentially nested)
-        * branch if/else/loop/etc
-        */
-       struct {
-               struct ir3_instruction *instr, *cond;
-               bool inv;   /* true iff in else leg of branch */
-       } branch[16];
-       unsigned int branch_count;
-
-       /* list of kill instructions: */
-       struct ir3_instruction *kill[16];
-       unsigned int kill_count;
-
-       /* used when dst is same as one of the src, to avoid overwriting a
-        * src element before the remaining scalar instructions that make
-        * up the vector operation
-        */
-       struct tgsi_dst_register tmp_dst;
-       struct tgsi_src_register *tmp_src;
-};
-
-
-/* forward declarations for helpers that are used before they are defined: */
-static void vectorize(struct fd3_compile_context *ctx,
-               struct ir3_instruction *instr, struct tgsi_dst_register *dst,
-               int nsrcs, ...);
-static void create_mov(struct fd3_compile_context *ctx,
-               struct tgsi_dst_register *dst, struct tgsi_src_register *src);
-static type_t get_ftype(struct fd3_compile_context *ctx);
-
-static unsigned
-compile_init(struct fd3_compile_context *ctx, struct fd3_shader_variant *so,
-               const struct tgsi_token *tokens)
-{
-       unsigned ret;
-       struct tgsi_shader_info *info = &ctx->info;
-       const struct fd_lowering_config lconfig = {
-                       .color_two_side = so->key.color_two_side,
-                       .lower_DST  = true,
-                       .lower_XPD  = true,
-                       .lower_SCS  = true,
-                       .lower_LRP  = true,
-                       .lower_FRC  = true,
-                       .lower_POW  = true,
-                       .lower_LIT  = true,
-                       .lower_EXP  = true,
-                       .lower_LOG  = true,
-                       .lower_DP4  = true,
-                       .lower_DP3  = true,
-                       .lower_DPH  = true,
-                       .lower_DP2  = true,
-                       .lower_DP2A = true,
-       };
-
-       ctx->tokens = fd_transform_lowering(&lconfig, tokens, &ctx->info);
-       ctx->free_tokens = !!ctx->tokens;
-       if (!ctx->tokens) {
-               /* no lowering */
-               ctx->tokens = tokens;
-       }
-       ctx->ir = so->ir;
-       ctx->so = so;
-       ctx->next_inloc = 8;
-       ctx->num_internal_temps = 0;
-       ctx->branch_count = 0;
-       ctx->kill_count = 0;
-       ctx->block = NULL;
-       ctx->current_instr = NULL;
-       ctx->num_output_updates = 0;
-       ctx->atomic = false;
-       ctx->frag_pos = NULL;
-       ctx->frag_face = NULL;
-
-       memset(ctx->frag_coord, 0, sizeof(ctx->frag_coord));
-
-#define FM(x) (1 << TGSI_FILE_##x)
-       /* optimize can't deal with relative addressing: */
-       if (info->indirect_files & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT)))
-               return TGSI_PARSE_ERROR;
-
-       /* Immediates go after constants: */
-       so->first_immediate = info->file_max[TGSI_FILE_CONSTANT] + 1;
-       ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1);
-
-       ret = tgsi_parse_init(&ctx->parser, ctx->tokens);
-       if (ret != TGSI_PARSE_OK)
-               return ret;
-
-       ctx->type = ctx->parser.FullHeader.Processor.Processor;
-
-       return ret;
-}
-
-/* Report a compile failure: print the printf-style message, dump the
- * TGSI tokens being compiled, then hit debug_assert(0).
- */
-static void
-compile_error(struct fd3_compile_context *ctx, const char *format, ...)
-{
-       va_list args;
-
-       va_start(args, format);
-       _debug_vprintf(format, args);
-       va_end(args);
-
-       tgsi_dump(ctx->tokens, 0);
-
-       debug_assert(0);
-}
-
-/* if 'cond' is false, report it through compile_error() using the
- * stringified condition as the message:
- */
-#define compile_assert(ctx, cond) do { \
-               if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
-       } while (0)
-
-/* Release per-compile resources: the token stream (only when it was
- * produced by lowering) and the TGSI parser state.
- */
-static void
-compile_free(struct fd3_compile_context *ctx)
-{
-       /* ctx->tokens is only ours to free if free_tokens is set: */
-       if (ctx->free_tokens) {
-               free((void *)ctx->tokens);
-       }
-
-       tgsi_parse_free(&ctx->parser);
-}
-
-/* Describes a translation handler for a TGSI opcode together with the
- * values it is parameterized by.
- * NOTE(review): presumably entries are looked up by tgsi_opc; the table
- * that uses this struct is not visible here -- confirm.
- */
-struct instr_translater {
-       /* handler, called with this entry, the compile context and the
-        * TGSI instruction:
-        */
-       void (*fxn)(const struct instr_translater *t,
-                       struct fd3_compile_context *ctx,
-                       struct tgsi_full_instruction *inst);
-       unsigned tgsi_opc;
-       opc_t opc;
-       opc_t hopc;    /* opc to use for half_precision mode, if different */
-       unsigned arg;
-};
-
-/* Apply all queued output/temporary/address updates and empty the
- * queue.  Does nothing while inside an "atomic" group.
- */
-static void
-instr_finish(struct fd3_compile_context *ctx)
-{
-       unsigned n = 0;
-
-       /* inside an atomic group the pending updates stay queued: */
-       if (ctx->atomic)
-               return;
-
-       while (n < ctx->num_output_updates) {
-               *(ctx->output_updates[n].instrp) = ctx->output_updates[n].instr;
-               n++;
-       }
-
-       ctx->num_output_updates = 0;
-}
-
-/* Begin an "atomic" group of instructions, e.g. the four scalar
- * instructions that together implement one vec4 operation.  While the
- * group is open, instr_finish() leaves the queued output_updates alone,
- * so each scalar instruction in the group still reads the values from
- * before the group started.
- *
- * NOTE: when used properly, this could probably replace get/put_dst()
- * stuff.
- */
-static void
-instr_atomic_start(struct fd3_compile_context *ctx)
-{
-       ctx->atomic = true;
-}
-
-/* Close an "atomic" group and apply the updates queued during it. */
-static void
-instr_atomic_end(struct fd3_compile_context *ctx)
-{
-       /* clear the flag first, otherwise instr_finish() would bail out: */
-       ctx->atomic = false;
-
-       instr_finish(ctx);
-}
-
-/* Finish the previous instruction (applying its queued updates), then
- * create a new instruction in the current block and remember it as
- * ctx->current_instr.
- */
-static struct ir3_instruction *
-instr_create(struct fd3_compile_context *ctx, int category, opc_t opc)
-{
-       instr_finish(ctx);
-
-       ctx->current_instr = ir3_instr_create(ctx->block, category, opc);
-
-       return ctx->current_instr;
-}
-
-/* Finish the previous instruction (applying its queued updates), then
- * clone 'instr' and remember the clone as ctx->current_instr.
- */
-static struct ir3_instruction *
-instr_clone(struct fd3_compile_context *ctx, struct ir3_instruction *instr)
-{
-       instr_finish(ctx);
-
-       ctx->current_instr = ir3_instr_clone(instr);
-
-       return ctx->current_instr;
-}
-
-/* Create a new block whose parent is the current block (or the
- * outermost block if there is no current one), make it the current
- * block and return it.
- */
-static struct ir3_block *
-push_block(struct fd3_compile_context *ctx)
-{
-       struct ir3_block *parent = ctx->block;
-       struct ir3_block *block;
-       unsigned ntmp, nin, nout;
-
-#define SCALAR_REGS(file) (4 * (ctx->info.file_max[TGSI_FILE_ ## file] + 1))
-
-       /* one scalar slot per TEMPORARY component, plus room to create
-        * 4 extra temporaries (vec4):
-        */
-       ntmp = SCALAR_REGS(TEMPORARY);
-       ntmp += 4 * 4;
-
-       nout = SCALAR_REGS(OUTPUT);
-
-       if (parent) {
-               /* nested block: 'inputs' tracks TEMPORARY file registers
-                * of the enclosing blocks that are read in this block, so
-                * it is sized like the temporaries.
-                */
-               nin = ntmp;
-       } else {
-               /* outermost block: 'inputs' is the actual shader INPUT
-                * register file (reads of INPUT registers always go back
-                * to the top block).
-                */
-               nin = SCALAR_REGS(INPUT);
-
-               if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
-                       /* NOTE: fragment shaders actually have two inputs
-                        * (r0.xy, the position)
-                        */
-                       int n = 2;
-                       if (ctx->info.reads_position)
-                               n += 4;
-                       if (ctx->info.uses_frontface)
-                               n += 4;
-                       nin = MAX2(n, nin);
-
-                       /* allocate ARRAY_SIZE(ctx->kill) extra output
-                        * slots; noutputs is shrunk back again below:
-                        */
-                       nout += ARRAY_SIZE(ctx->kill);
-               }
-       }
-
-       block = ir3_block_create(ctx->ir, ntmp, nin, nout);
-
-       if (!parent && (ctx->type == TGSI_PROCESSOR_FRAGMENT))
-               block->noutputs -= ARRAY_SIZE(ctx->kill);
-
-       block->parent = parent;
-       ctx->block = block;
-
-       return block;
-}
-
-/* Leave the current block and make its parent current again.  Popping
- * the outermost block (which has no parent) is reported as an error.
- */
-static void
-pop_block(struct fd3_compile_context *ctx)
-{
-       struct ir3_block *parent = ctx->block->parent;
-
-       ctx->block = parent;
-       compile_assert(ctx, ctx->block);
-}
-
-/* Create a meta-output instruction for slot 'n' of 'block'.  If 'instr'
- * is non-NULL it is attached as the (SSA) source of the new node.
- */
-static struct ir3_instruction *
-create_output(struct ir3_block *block, struct ir3_instruction *instr,
-               unsigned n)
-{
-       struct ir3_instruction *meta =
-                       ir3_instr_create(block, -1, OPC_META_OUTPUT);
-
-       meta->inout.block = block;
-
-       /* first register carries the slot number: */
-       ir3_reg_create(meta, n, 0);
-
-       if (instr) {
-               struct ir3_register *src = ir3_reg_create(meta, 0, IR3_REG_SSA);
-               src->instr = instr;
-       }
-
-       return meta;
-}
-
-/* Create a meta-input instruction for slot 'n' of 'block'.  If 'instr'
- * is non-NULL it is attached as the (SSA) source of the new node.
- */
-static struct ir3_instruction *
-create_input(struct ir3_block *block, struct ir3_instruction *instr,
-               unsigned n)
-{
-       struct ir3_instruction *meta =
-                       ir3_instr_create(block, -1, OPC_META_INPUT);
-
-       meta->inout.block = block;
-
-       /* first register carries the slot number: */
-       ir3_reg_create(meta, n, 0);
-
-       if (instr) {
-               struct ir3_register *src = ir3_reg_create(meta, 0, IR3_REG_SSA);
-               src->instr = instr;
-       }
-
-       return meta;
-}
-
-/* Return input 'n' of the outermost block enclosing 'block'. */
-static struct ir3_instruction *
-block_input(struct ir3_block *block, unsigned n)
-{
-       /* references to the INPUT register file always resolve in the
-        * top level block, so walk up until there is no parent left:
-        */
-       while (block->parent)
-               block = block->parent;
-
-       return block->inputs[n];
-}
-
-/* Return the instruction currently providing temporary 'n' as seen from
- * 'block'.  If this block has not assigned it, the value is fetched from
- * the nearest enclosing block and wrapped in a meta-input node so the
- * block's inputs are tracked.
- */
-static struct ir3_instruction *
-block_temporary(struct ir3_block *block, unsigned n)
-{
-       /* assigned in this block, or nowhere further up to look: */
-       if (!block->parent || block->temporaries[n])
-               return block->temporaries[n];
-
-       /* resolve through the parent only once per block, an already
-        * recorded input is reused:
-        */
-       if (!block->inputs[n])
-               block->inputs[n] = block_temporary(block->parent, n);
-
-       /* each read still gets its own meta-input node: */
-       return create_input(block, block->inputs[n], n);
-}
-
-/* Create a category 1 (mov) instruction in the current block that loads
- * the float immediate 'val'.
- */
-static struct ir3_instruction *
-create_immed(struct fd3_compile_context *ctx, float val)
-{
-       const type_t ftype = get_ftype(ctx);
-       struct ir3_instruction *mov;
-       struct ir3_register *src;
-
-       /* NOTE: *don't* use instr_create() here!  It calls instr_finish()
-        * and would apply the queued output updates.
-        */
-       mov = ir3_instr_create(ctx->block, 1, 0);
-       mov->cat1.src_type = ftype;
-       mov->cat1.dst_type = ftype;
-
-       ir3_reg_create(mov, 0, 0);
-
-       src = ir3_reg_create(mov, 0, IR3_REG_IMMED);
-       src->fim_val = val;
-
-       return mov;
-}
-
-/* Queue 'instr' as the new producer of component 'chan' of 'dst'.
- * Only the OUTPUT, TEMPORARY and ADDRESS files are tracked; writes to
- * any other file are ignored here.
- */
-static void
-ssa_dst(struct fd3_compile_context *ctx, struct ir3_instruction *instr,
-               const struct tgsi_dst_register *dst, unsigned chan)
-{
-       struct ir3_instruction **instrp;
-       unsigned n = regid(dst->Index, chan);
-       unsigned idx = ctx->num_output_updates;
-
-       compile_assert(ctx, idx < ARRAY_SIZE(ctx->output_updates));
-
-       /* NOTE: the update of temporaries[n] / outputs[n] / address is
-        * deferred until instr_finish(), so that if the current
-        * instruction reads the same TEMP/OUT[] it still gets the old
-        * value.
-        *
-        * It might be simpler to work this out in instr_finish() itself,
-        * but by then the information about which register file was
-        * written (OUTPUT vs TEMPORARY) is gone, so the destination slot
-        * is resolved here.
-        */
-       switch (dst->File) {
-       case TGSI_FILE_OUTPUT:
-               compile_assert(ctx, n < ctx->block->noutputs);
-               instrp = &ctx->block->outputs[n];
-               break;
-       case TGSI_FILE_TEMPORARY:
-               compile_assert(ctx, n < ctx->block->ntemporaries);
-               instrp = &ctx->block->temporaries[n];
-               break;
-       case TGSI_FILE_ADDRESS:
-               compile_assert(ctx, n < 1);
-               instrp = &ctx->block->address;
-               break;
-       default:
-               /* untracked register file, nothing to queue: */
-               return;
-       }
-
-       ctx->output_updates[idx].instrp = instrp;
-       ctx->output_updates[idx].instr = instr;
-       ctx->num_output_updates++;
-}
-
-/* Resolve component 'chan' of 'src' to the instruction that produces it
- * and store that in 'reg'.  INPUT, OUTPUT and TEMPORARY sources become
- * SSA references; other files leave 'reg' untouched.
- */
-static void
-ssa_src(struct fd3_compile_context *ctx, struct ir3_register *reg,
-               const struct tgsi_src_register *src, unsigned chan)
-{
-       unsigned n = regid(src->Index, chan);
-
-       switch (src->File) {
-       case TGSI_FILE_INPUT:
-               reg->instr = block_input(ctx->block, n);
-               reg->flags |= IR3_REG_SSA;
-               break;
-       case TGSI_FILE_OUTPUT:
-               /* reading an OUTPUT should really only happen for
-                * 'MOV_SAT OUT[n], ..', for the clamp instructions that
-                * follow it.
-                */
-               reg->instr = ctx->block->outputs[n];
-               reg->flags |= IR3_REG_SSA;
-               /* an OUTPUT assigned outside of the current block is not
-                * a concern: the _SAT clamp instructions are always in the
-                * same block as the instruction which wrote the OUTPUT
-                */
-               compile_assert(ctx, reg->instr);
-               break;
-       case TGSI_FILE_TEMPORARY:
-               reg->instr = block_temporary(ctx->block, n);
-               reg->flags |= IR3_REG_SSA;
-               break;
-       }
-
-       if (!(reg->flags & IR3_REG_SSA))
-               return;
-
-       if (reg->instr)
-               return;
-
-       /* An SSA src without a producer happens when a register (or some
-        * component of a TGSI register) is used before it was ever
-        * assigned, ie. its contents are undefined.  To keep the rest of
-        * the compiler happy, substitute an instruction that sets the
-        * src to 0.0.
-        *
-        * NOTE: *don't* use instr_create() here!
-        */
-       reg->instr = create_immed(ctx, 0.0);
-}
-
-/* Add the dst register for 'dst' (starting at component 'chan') to
- * 'instr', with write mask 'wrmask', and return it.
- *
- * For a single component write (wrmask 0x1) the instruction itself is
- * recorded as the new producer.  For a multi component write to
- * TEMPORARY/OUTPUT/ADDRESS, one meta "fanout" instruction per written
- * component is created and recorded instead.
- */
-static struct ir3_register *
-add_dst_reg_wrmask(struct fd3_compile_context *ctx,
-               struct ir3_instruction *instr, const struct tgsi_dst_register *dst,
-               unsigned chan, unsigned wrmask)
-{
-       struct ir3_register *reg;
-       unsigned flags = 0;
-       unsigned i;
-
-       switch (dst->File) {
-       case TGSI_FILE_ADDRESS:
-               flags |= IR3_REG_ADDR;
-               /* uses SSA */
-               break;
-       case TGSI_FILE_OUTPUT:
-       case TGSI_FILE_TEMPORARY:
-               /* uses SSA */
-               break;
-       default:
-               compile_error(ctx, "unsupported dst register file: %s\n",
-                       tgsi_file_name(dst->File));
-               break;
-       }
-
-       if (dst->Indirect)
-               flags |= IR3_REG_RELATIV;
-
-       /* the register number is always 0 here, only the component is set: */
-       reg = ir3_reg_create(instr, regid(0, chan), flags);
-       reg->wrmask = wrmask;
-
-       /* NOTE: ssa_dst() is not called while atomic.. vectorize() will
-        * call ssa_dst() itself.  This filters out the (initially bogus)
-        * .x component dst which is created but not necessarily used (ie.
-        * when the net result of the vector operation does not write .x)
-        */
-
-       if (wrmask == 0x1) {
-               /* normal case */
-               if (!ctx->atomic)
-                       ssa_dst(ctx, instr, dst, chan);
-               return reg;
-       }
-
-       switch (dst->File) {
-       case TGSI_FILE_TEMPORARY:
-       case TGSI_FILE_OUTPUT:
-       case TGSI_FILE_ADDRESS:
-               break;
-       default:
-               return reg;
-       }
-
-       /* the instruction writes multiple components, so create a
-        * place-holder per written component to collect the registers:
-        */
-       for (i = 0; i < 4; i++) {
-               struct ir3_instruction *collect;
-
-               if (!(wrmask & (1 << i)))
-                       continue;
-
-               collect = ir3_instr_create(ctx->block, -1, OPC_META_FO);
-               collect->fo.off = i;
-               /* unused dst reg: */
-               ir3_reg_create(collect, 0, 0);
-               /* and src reg used to hold original instr */
-               ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = instr;
-               if (!ctx->atomic)
-                       ssa_dst(ctx, collect, dst, chan+i);
-       }
-
-       return reg;
-}
-
-/* Add a single component dst register: add_dst_reg_wrmask() with a
- * write mask of 0x1.
- */
-static struct ir3_register *
-add_dst_reg(struct fd3_compile_context *ctx, struct ir3_instruction *instr,
-               const struct tgsi_dst_register *dst, unsigned chan)
-{
-       const unsigned wrmask = 0x1;
-
-       return add_dst_reg_wrmask(ctx, instr, dst, chan, wrmask);
-}
-
-/* Add a src register for 'src' (starting at component 'chan') to
- * 'instr' and return it.
- *
- * 'wrmask' selects which of the (up to 4) consecutive components are
- * read.  With wrmask 0x1 the register refers directly to the producer
- * of that component.  With a wider mask on TEMPORARY/OUTPUT/INPUT, a
- * meta "fanin" instruction is created which collects the producers of
- * the selected components; components that are skipped but followed by
- * a selected one get a dummy placeholder register, so every collected
- * source stays at the position of its component.
- *
- * Relative (Indirect) sources are only supported with wrmask 0x1; they
- * are wrapped in a meta-deref instruction that references both the src
- * and the current address register value.
- */
-static struct ir3_register *
-add_src_reg_wrmask(struct fd3_compile_context *ctx,
-               struct ir3_instruction *instr, const struct tgsi_src_register *src,
-               unsigned chan, unsigned wrmask)
-{
-       unsigned flags = 0, num = 0;
-       struct ir3_register *reg;
-       struct ir3_instruction *orig = NULL;
-
-       /* TODO we need to use a mov to temp for const >= 64.. or maybe
-        * we could use relative addressing..
-        */
-       compile_assert(ctx, src->Index < 64);
-
-       switch (src->File) {
-       case TGSI_FILE_IMMEDIATE:
-               /* TODO if possible, use actual immediate instead of const.. but
-                * TGSI has vec4 immediates, we can only embed scalar (of limited
-                * size, depending on instruction..)
-                */
-               flags |= IR3_REG_CONST;
-               num = src->Index + ctx->so->first_immediate;
-               break;
-       case TGSI_FILE_CONSTANT:
-               flags |= IR3_REG_CONST;
-               num = src->Index;
-               break;
-       case TGSI_FILE_OUTPUT:
-               /* NOTE: we should only end up w/ OUTPUT file for things like
-                * clamp()'ing saturated dst instructions
-                */
-       case TGSI_FILE_INPUT:
-       case TGSI_FILE_TEMPORARY:
-               /* uses SSA */
-               break;
-       default:
-               compile_error(ctx, "unsupported src register file: %s\n",
-                       tgsi_file_name(src->File));
-               break;
-       }
-
-       if (src->Absolute)
-               flags |= IR3_REG_ABS;
-       if (src->Negate)
-               flags |= IR3_REG_NEGATE;
-
-       if (src->Indirect) {
-               flags |= IR3_REG_RELATIV;
-
-               /* shouldn't happen, and we can't cope with it below: */
-               compile_assert(ctx, wrmask == 0x1);
-
-               /* wrap in a meta-deref to track both the src and address: */
-               orig = instr;
-
-               instr = ir3_instr_create(ctx->block, -1, OPC_META_DEREF);
-               ir3_reg_create(instr, 0, 0);
-               ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->block->address;
-       }
-
-       reg = ir3_reg_create(instr, regid(num, chan), flags);
-
-       reg->wrmask = wrmask;
-       if (wrmask == 0x1) {
-               /* normal case */
-               ssa_src(ctx, reg, src, chan);
-       } else if ((src->File == TGSI_FILE_TEMPORARY) ||
-                       (src->File == TGSI_FILE_OUTPUT) ||
-                       (src->File == TGSI_FILE_INPUT)) {
-               struct ir3_instruction *collect;
-               unsigned i;
-
-               compile_assert(ctx, !src->Indirect);
-
-               /* if instruction reads multiple, we need to create
-                * some place-holder to collect the registers:
-                */
-               collect = ir3_instr_create(ctx->block, -1, OPC_META_FI);
-               ir3_reg_create(collect, 0, 0);   /* unused dst reg */
-
-               for (i = 0; i < 4; i++) {
-                       if (wrmask & (1 << i)) {
-                               /* and src reg used to point to the original instr */
-                               ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
-                                               src, chan + i);
-                       } else if (wrmask & ~((1 << i) - 1)) {
-                               /* component i is skipped, but a higher
-                                * component is still to come: add a dummy
-                                * placeholder src reg to fill in the blank.
-                                * The mask of components >= i has to be
-                                * built from (1 << i); shifting i itself
-                                * yields an empty mask for i == 0 and so
-                                * dropped the placeholder for a skipped
-                                * first component.
-                                */
-                               ir3_reg_create(collect, 0, 0);
-                       }
-               }
-
-               reg->flags |= IR3_REG_SSA;
-               reg->instr = collect;
-       }
-
-       if (src->Indirect) {
-               /* the original instruction gets an SSA src which points at
-                * the meta-deref created above:
-                */
-               reg = ir3_reg_create(orig, 0, flags | IR3_REG_SSA);
-               reg->instr = instr;
-       }
-       return reg;
-}
-
-/* Add a single component src register: add_src_reg_wrmask() with a
- * mask of 0x1.
- */
-static struct ir3_register *
-add_src_reg(struct fd3_compile_context *ctx, struct ir3_instruction *instr,
-               const struct tgsi_src_register *src, unsigned chan)
-{
-       const unsigned wrmask = 0x1;
-
-       return add_src_reg_wrmask(ctx, instr, src, chan, wrmask);
-}
-
-/* Fill in 'src' so that it reads back the register described by 'dst':
- * same file/index/addressing, no modifiers, identity swizzle.
- */
-static void
-src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst)
-{
-       /* which register: */
-       src->File      = dst->File;
-       src->Index     = dst->Index;
-       src->Indirect  = dst->Indirect;
-       src->Dimension = dst->Dimension;
-
-       /* no source modifiers: */
-       src->Negate    = 0;
-       src->Absolute  = 0;
-
-       /* identity swizzle: */
-       src->SwizzleX  = TGSI_SWIZZLE_X;
-       src->SwizzleY  = TGSI_SWIZZLE_Y;
-       src->SwizzleZ  = TGSI_SWIZZLE_Z;
-       src->SwizzleW  = TGSI_SWIZZLE_W;
-}
-
-/* Allocate an internal temporary for use by the sequence of
- * instructions generated for a single TGSI op.  'tmp_dst' is filled in
- * as the dst view of the temporary, the matching src view is returned.
- */
-static struct tgsi_src_register *
-get_internal_temp(struct fd3_compile_context *ctx,
-               struct tgsi_dst_register *tmp_dst)
-{
-       struct tgsi_src_register *tmp_src;
-       int n;
-
-       /* claim the next internal temp slot: */
-       n = ctx->num_internal_temps++;
-       compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps));
-       tmp_src = &ctx->internal_temps[n];
-
-       /* internal temps are numbered after the last TEMPORARY that the
-        * shader itself uses:
-        */
-       tmp_dst->File      = TGSI_FILE_TEMPORARY;
-       tmp_dst->Index     = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1;
-       tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
-       tmp_dst->Indirect  = 0;
-       tmp_dst->Dimension = 0;
-
-       src_from_dst(tmp_src, tmp_dst);
-
-       return tmp_src;
-}
-
-/* true if 'src' reads from the CONSTANT or IMMEDIATE register file */
-static inline bool
-is_const(struct tgsi_src_register *src)
-{
-       switch (src->File) {
-       case TGSI_FILE_CONSTANT:
-       case TGSI_FILE_IMMEDIATE:
-               return true;
-       default:
-               return false;
-       }
-}
-
-/* true if 'src' uses relative (indirect) addressing */
-static inline bool
-is_relative(struct tgsi_src_register *src)
-{
-       return src->Indirect != 0;
-}
-
-/* true if 'src' is relatively addressed, or a constant/immediate */
-static inline bool
-is_rel_or_const(struct tgsi_src_register *src)
-{
-       if (is_relative(src))
-               return true;
-
-       return is_const(src);
-}
-
-/* Float type used for generated instructions; always TYPE_F32. */
-static type_t
-get_ftype(struct fd3_compile_context *ctx)
-{
-       (void)ctx;   /* the result does not depend on the context */
-
-       return TYPE_F32;
-}
-
-/* Unsigned integer type used for generated instructions; always TYPE_U32. */
-static type_t
-get_utype(struct fd3_compile_context *ctx)
-{
-       (void)ctx;   /* the result does not depend on the context */
-
-       return TYPE_U32;
-}
-
-/* Return the swizzle of 'src' for component 'chan' (0..3 == x..w).
- * Any other 'chan' asserts, and yields 0 when asserts are compiled out.
- */
-static unsigned
-src_swiz(struct tgsi_src_register *src, int chan)
-{
-       if (chan == 0)
-               return src->SwizzleX;
-       if (chan == 1)
-               return src->SwizzleY;
-       if (chan == 2)
-               return src->SwizzleZ;
-       if (chan == 3)
-               return src->SwizzleW;
-
-       assert(0);
-       return 0;
-}
-
-/* For instructions that cannot take a const (or relatively addressed)
- * register as src: copy 'src' into a fresh internal temporary with a
- * mov, and return the src view of that temporary.
- */
-static struct tgsi_src_register *
-get_unconst(struct fd3_compile_context *ctx, struct tgsi_src_register *src)
-{
-       struct tgsi_src_register *gpr_src;
-       struct tgsi_dst_register gpr_dst;
-
-       /* only meant to be called when a copy is actually required: */
-       compile_assert(ctx, is_rel_or_const(src));
-
-       gpr_src = get_internal_temp(ctx, &gpr_dst);
-       create_mov(ctx, &gpr_dst, src);
-
-       return gpr_src;
-}
-
-/* Fill in 'reg' as an IMMEDIATE src that supplies 'val', replicated to
- * all four components through the swizzle.  An existing immediate slot
- * holding 'val' (or '-val', combined with the Negate modifier) is reused,
- * otherwise 'val' is appended as a new compiler generated immediate.
- *
- * NOTE(review): '-val' is an integer negation of a uint32_t.  If callers
- * pass float bit patterns, the "negated" match compares against an
- * unrelated float value -- confirm against the callers.
- * NOTE(review): no bounds check against the size of so->immediates[] is
- * visible here -- confirm the array is large enough.
- */
-static void
-get_immediate(struct fd3_compile_context *ctx,
-               struct tgsi_src_register *reg, uint32_t val)
-{
-       unsigned neg, swiz, idx, i;
-       /* actually maps 1:1 currently.. not sure if that is safe to rely on: */
-       static const unsigned swiz2tgsi[] = {
-                       TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W,
-       };
-
-       /* scan all scalar slots in use; slot i is component (i % 4) of
-        * immediate (i / 4).  On a match the loop is left with neg, swiz
-        * and idx describing that slot:
-        */
-       for (i = 0; i < ctx->immediate_idx; i++) {
-               swiz = i % 4;
-               idx  = i / 4;
-
-               if (ctx->so->immediates[idx].val[swiz] == val) {
-                       neg = 0;
-                       break;
-               }
-
-               if (ctx->so->immediates[idx].val[swiz] == -val) {
-                       neg = 1;
-                       break;
-               }
-       }
-
-       /* loop ran to completion, ie. no match was found: */
-       if (i == ctx->immediate_idx) {
-               /* need to generate a new immediate: */
-               swiz = i % 4;
-               idx  = i / 4;
-               neg  = 0;
-               ctx->so->immediates[idx].val[swiz] = val;
-               ctx->so->immediates_count = idx + 1;
-               ctx->immediate_idx++;
-       }
-
-       reg->File      = TGSI_FILE_IMMEDIATE;
-       reg->Indirect  = 0;
-       reg->Dimension = 0;
-       reg->Index     = idx;
-       reg->Absolute  = 0;
-       reg->Negate    = neg;
-       /* every component reads the same scalar slot: */
-       reg->SwizzleX  = swiz2tgsi[swiz];
-       reg->SwizzleY  = swiz2tgsi[swiz];
-       reg->SwizzleZ  = swiz2tgsi[swiz];
-       reg->SwizzleW  = swiz2tgsi[swiz];
-}
-
-/* Emit one scalar move per component enabled in dst->WriteMask, copying
- * the correspondingly swizzled component of 'src' into 'dst'.
- */
-static void
-create_mov(struct fd3_compile_context *ctx, struct tgsi_dst_register *dst,
-               struct tgsi_src_register *src)
-{
-       type_t ftype = get_ftype(ctx);
-       unsigned chan;
-
-       for (chan = 0; chan < 4; chan++) {
-               struct ir3_instruction *mov;
-
-               /* skip components that are not written: */
-               if (!(dst->WriteMask & (1 << chan)))
-                       continue;
-
-               if (!src->Absolute && !src->Negate) {
-                       /* plain cat1 mov: */
-                       mov = instr_create(ctx, 1, 0);
-                       mov->cat1.src_type = ftype;
-                       mov->cat1.dst_type = ftype;
-               } else {
-                       /* a mov instr can't carry abs or neg, so go
-                        * through absneg.f for those cases:
-                        */
-                       mov = instr_create(ctx, 2, OPC_ABSNEG_F);
-               }
-
-               add_dst_reg(ctx, mov, dst, chan);
-               add_src_reg(ctx, mov, src, src_swiz(src, chan));
-       }
-}
-
-/* Clamp 'val' into [minval, maxval] for the components enabled in
- * dst->WriteMask:
- *
- *   max.f dst, val, minval
- *   min.f dst, dst, maxval
- *
- * The min.f has to consume the result of the max.f (read back from dst).
- * Feeding it 'val' a second time would throw away the lower bound whenever
- * 'val' is not the same register as 'dst' (as is the case for the CLAMP
- * opcode, where 'val' is the instruction's first source).
- */
-static void
-create_clamp(struct fd3_compile_context *ctx,
-               struct tgsi_dst_register *dst, struct tgsi_src_register *val,
-               struct tgsi_src_register *minval, struct tgsi_src_register *maxval)
-{
-       struct ir3_instruction *instr;
-       struct tgsi_src_register lower_bounded;
-
-       instr = instr_create(ctx, 2, OPC_MAX_F);
-       vectorize(ctx, instr, dst, 2, val, 0, minval, 0);
-
-       /* re-read what the max.f just wrote: */
-       src_from_dst(&lower_bounded, dst);
-
-       instr = instr_create(ctx, 2, OPC_MIN_F);
-       vectorize(ctx, instr, dst, 2, &lower_bounded, 0, maxval, 0);
-}
-
-/* Clamp 'dst' in place against two immediate bounds, each given as a raw
- * 32-bit pattern.
- */
-static void
-create_clamp_imm(struct fd3_compile_context *ctx,
-               struct tgsi_dst_register *dst,
-               uint32_t minval, uint32_t maxval)
-{
-       struct tgsi_src_register self, lo, hi;
-
-       /* the value being clamped is whatever dst currently holds: */
-       src_from_dst(&self, dst);
-
-       get_immediate(ctx, &lo, minval);
-       get_immediate(ctx, &hi, maxval);
-
-       create_clamp(ctx, dst, &self, &lo, &hi);
-}
-
-/* Pick the register an instruction should write.  Normally that is the
- * instruction's own Dst[0].  But if some source reads the same register
- * (same File and Index) in any way other than full-writemask plus identity
- * swizzle, an internal temporary is handed out instead (kept in
- * ctx->tmp_dst / ctx->tmp_src, with the original writemask).  Pair with
- * put_dst(), which copies the temporary back.
- */
-static struct tgsi_dst_register *
-get_dst(struct fd3_compile_context *ctx, struct tgsi_full_instruction *inst)
-{
-       struct tgsi_dst_register *orig = &inst->Dst[0].Register;
-       unsigned n;
-
-       for (n = 0; n < inst->Instruction.NumSrcRegs; n++) {
-               struct tgsi_src_register *s = &inst->Src[n].Register;
-               bool identity;
-
-               /* different register, no overlap with dst: */
-               if ((s->File != orig->File) || (s->Index != orig->Index))
-                       continue;
-
-               identity = (orig->WriteMask == TGSI_WRITEMASK_XYZW) &&
-                               (s->SwizzleX == TGSI_SWIZZLE_X) &&
-                               (s->SwizzleY == TGSI_SWIZZLE_Y) &&
-                               (s->SwizzleZ == TGSI_SWIZZLE_Z) &&
-                               (s->SwizzleW == TGSI_SWIZZLE_W);
-               if (identity)
-                       continue;
-
-               /* overlapping access, redirect the write to a temporary: */
-               ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst);
-               ctx->tmp_dst.WriteMask = orig->WriteMask;
-               return &ctx->tmp_dst;
-       }
-
-       return orig;
-}
-
-/* Counterpart of get_dst(): when 'dst' is not the instruction's own
- * destination, copy ctx->tmp_src back into the real Dst[0].
- */
-static void
-put_dst(struct fd3_compile_context *ctx, struct tgsi_full_instruction *inst,
-               struct tgsi_dst_register *dst)
-{
-       struct tgsi_dst_register *real_dst = &inst->Dst[0].Register;
-
-       /* wrote straight to the real destination, nothing to do: */
-       if (dst == real_dst)
-               return;
-
-       create_mov(ctx, real_dst, ctx->tmp_src);
-}
-
-/* helper to generate the necessary repeat and/or additional instructions
- * to turn a scalar instruction into a vector operation:
- *
- * 'instr' is a freshly created scalar instruction which has no registers
- * attached yet.  The variadic tail consists of 'nsrcs' pairs of
- *
- *     (struct tgsi_src_register *src, unsigned flags)
- *
- * If 'flags' contains IR3_REG_IMMED, the 'src' slot carries an integer
- * immediate value rather than a pointer.  IR3_REG_NEGATE in 'flags'
- * toggles (rather than sets) negate on the resulting register; all other
- * flag bits are OR'd in.
- *
- * One instruction is produced per component enabled in dst->WriteMask:
- * the first one is 'instr' itself, the others are clones of it, each with
- * its dst/src components patched up.  The whole group is wrapped in
- * instr_atomic_start()/instr_atomic_end().
- */
-static void
-vectorize(struct fd3_compile_context *ctx, struct ir3_instruction *instr,
-               struct tgsi_dst_register *dst, int nsrcs, ...)
-{
-       va_list ap;
-       int i, j, n = 0;
-
-       instr_atomic_start(ctx);
-
-       add_dst_reg(ctx, instr, dst, TGSI_SWIZZLE_X);
-
-       va_start(ap, nsrcs);
-       for (j = 0; j < nsrcs; j++) {
-               struct tgsi_src_register *src =
-                               va_arg(ap, struct tgsi_src_register *);
-               unsigned flags = va_arg(ap, unsigned);
-               struct ir3_register *reg;
-               if (flags & IR3_REG_IMMED) {
-                       reg = ir3_reg_create(instr, 0, IR3_REG_IMMED);
-                       /* the "pointer" slot really holds the immediate
-                        * (should have put flags first!).  Convert it by
-                        * value instead of reinterpreting the pointer's
-                        * storage, which violates strict aliasing and reads
-                        * the wrong half on 64-bit big-endian:
-                        */
-                       reg->iim_val = (int)(intptr_t)src;
-               } else {
-                       reg = add_src_reg(ctx, instr, src, TGSI_SWIZZLE_X);
-               }
-               reg->flags |= flags & ~IR3_REG_NEGATE;
-               /* negate requested by the caller flips whatever the
-                * source register itself already asked for:
-                */
-               if (flags & IR3_REG_NEGATE)
-                       reg->flags ^= IR3_REG_NEGATE;
-       }
-       va_end(ap);
-
-       for (i = 0; i < 4; i++) {
-               if (dst->WriteMask & (1 << i)) {
-                       struct ir3_instruction *cur;
-
-                       /* first enabled component re-uses the template: */
-                       if (n++ == 0) {
-                               cur = instr;
-                       } else {
-                               cur = instr_clone(ctx, instr);
-                       }
-
-                       ssa_dst(ctx, cur, dst, i);
-
-                       /* fix-up dst register component: */
-                       cur->regs[0]->num = regid(cur->regs[0]->num >> 2, i);
-
-                       /* fix-up src register component: */
-                       va_start(ap, nsrcs);
-                       for (j = 0; j < nsrcs; j++) {
-                               struct ir3_register *reg = cur->regs[j+1];
-                               struct tgsi_src_register *src =
-                                               va_arg(ap, struct tgsi_src_register *);
-                               unsigned flags = va_arg(ap, unsigned);
-                               if (reg->flags & IR3_REG_SSA) {
-                                       ssa_src(ctx, reg, src, src_swiz(src, i));
-                               } else if (!(flags & IR3_REG_IMMED)) {
-                                       reg->num = regid(reg->num >> 2, src_swiz(src, i));
-                               }
-                       }
-                       va_end(ap);
-               }
-       }
-
-       instr_atomic_end(ctx);
-}
-
-/*
- * Handlers for TGSI instructions which do not have a 1:1 mapping to
- * native instructions:
- */
-
-/* CLAMP: clamp Src[0] between Src[1] (min) and Src[2] (max). */
-static void
-trans_clamp(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       struct tgsi_dst_register *dst = get_dst(ctx, inst);
-
-       create_clamp(ctx, dst,
-                       &inst->Src[0].Register,   /* value */
-                       &inst->Src[1].Register,   /* min */
-                       &inst->Src[2].Register);  /* max */
-
-       put_dst(ctx, inst, dst);
-}
-
-/* ARL(x) = x, but mova from hrN.x to a0..
- *
- * Loads the TGSI_FILE_ADDRESS destination from the component of Src[0]
- * selected by its X swizzle.  Three instructions are emitted: a convert
- * to s16 into a half-precision temporary, a shift left by two, and the
- * final move into the address register.
- */
-static void
-trans_arl(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       struct ir3_instruction *instr;
-       struct tgsi_dst_register tmp_dst;
-       struct tgsi_src_register *tmp_src;
-       struct tgsi_dst_register *dst = &inst->Dst[0].Register;
-       struct tgsi_src_register *src = &inst->Src[0].Register;
-       unsigned chan = src->SwizzleX;
-
-       compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS);
-
-       /* NOTE: we allocate a temporary from a flat register
-        * namespace (ignoring half vs full).  It turns out
-        * not to really matter since registers get reassigned
-        * later in ir3_ra which (hopefully!) can deal a bit
-        * better with mixed half and full precision.
-        */
-       tmp_src = get_internal_temp(ctx, &tmp_dst);
-
-       /* cov.f{32,16}s16 Rtmp, Rsrc */
-       instr = instr_create(ctx, 1, 0);
-       instr->cat1.src_type = get_ftype(ctx);
-       instr->cat1.dst_type = TYPE_S16;
-       add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
-       add_src_reg(ctx, instr, src, chan);
-
-       /* shl.b Rtmp, Rtmp, 2
-        * NOTE(review): presumably scales the index to a scalar
-        * component offset (4 components per register) - confirm.
-        */
-       instr = instr_create(ctx, 2, OPC_SHL_B);
-       add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
-       add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
-       ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;
-
-       /* mova a0, Rtmp */
-       instr = instr_create(ctx, 1, 0);
-       instr->cat1.src_type = TYPE_S16;
-       instr->cat1.dst_type = TYPE_S16;
-       add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF;
-       add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
-}
-
-/*
- * texture fetch/sample instructions:
- */
-
-/* Describes how the coordinate source of a texture instruction has to be
- * laid out for a given opcode / texture target combination.
- */
-struct tex_info {
-       /* for each component of the coordinate as consumed by the sample
-        * instruction: which component of the TGSI coord it comes from,
-        * or a negative value if that component is not taken from it:
-        */
-       int8_t order[4];
-       /* src_wrmask: writemask passed along with the coord source reg,
-        * flags: IR3_INSTR_x bits OR'd into the sample instruction:
-        */
-       unsigned src_wrmask, flags;
-};
-
-/* Look up the static tex_info describing the coordinate layout for a TEX
- * or TXP instruction, keyed on the opcode and the TGSI texture target.
- * For a target without an entry a compile error is reported and NULL is
- * returned; any other opcode ends up at the compile_assert() at the bottom
- * and also yields NULL.
- *
- * NOTE(review): the two 'default' arms differ: TEX returns NULL right
- * after compile_error(), while TXP breaks out of the switch and in
- * addition hits the trailing compile_assert(ctx, 0).
- */
-static const struct tex_info *
-get_tex_info(struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       static const struct tex_info tex1d = {
-               .order = { 0, -1, -1, -1 },  /* coord.x */
-               .src_wrmask = TGSI_WRITEMASK_XY,
-               .flags = 0,
-       };
-       static const struct tex_info tex1ds = {
-               .order = { 0, -1,  2, -1 },  /* coord.xz */
-               .src_wrmask = TGSI_WRITEMASK_XYZ,
-               .flags = IR3_INSTR_S,
-       };
-       static const struct tex_info tex2d = {
-               .order = { 0,  1, -1, -1 },  /* coord.xy */
-               .src_wrmask = TGSI_WRITEMASK_XY,
-               .flags = 0,
-       };
-       static const struct tex_info tex2ds = {
-               .order = { 0,  1,  2, -1 },  /* coord.xyz */
-               .src_wrmask = TGSI_WRITEMASK_XYZ,
-               .flags = IR3_INSTR_S,
-       };
-       static const struct tex_info tex3d = {
-               .order = { 0,  1,  2, -1 },  /* coord.xyz */
-               .src_wrmask = TGSI_WRITEMASK_XYZ,
-               .flags = IR3_INSTR_3D,
-       };
-       static const struct tex_info tex3ds = {
-               .order = { 0,  1,  2,  3 },  /* coord.xyzw */
-               .src_wrmask = TGSI_WRITEMASK_XYZW,
-               .flags = IR3_INSTR_S | IR3_INSTR_3D,
-       };
-       static const struct tex_info txp1d = {
-               .order = { 0, -1,  3, -1 },  /* coord.xw */
-               .src_wrmask = TGSI_WRITEMASK_XYZ,
-               .flags = IR3_INSTR_P,
-       };
-       static const struct tex_info txp1ds = {
-               .order = { 0, -1,  2,  3 },  /* coord.xzw */
-               .src_wrmask = TGSI_WRITEMASK_XYZW,
-               .flags = IR3_INSTR_P | IR3_INSTR_S,
-       };
-       static const struct tex_info txp2d = {
-               .order = { 0,  1,  3, -1 },  /* coord.xyw */
-               .src_wrmask = TGSI_WRITEMASK_XYZ,
-               .flags = IR3_INSTR_P,
-       };
-       static const struct tex_info txp2ds = {
-               .order = { 0,  1,  2,  3 },  /* coord.xyzw */
-               .src_wrmask = TGSI_WRITEMASK_XYZW,
-               .flags = IR3_INSTR_P | IR3_INSTR_S,
-       };
-       static const struct tex_info txp3d = {
-               .order = { 0,  1,  2,  3 },  /* coord.xyzw */
-               .src_wrmask = TGSI_WRITEMASK_XYZW,
-               .flags = IR3_INSTR_P | IR3_INSTR_3D,
-       };
-
-       unsigned tex = inst->Texture.Texture;
-
-       switch (inst->Instruction.Opcode) {
-       case TGSI_OPCODE_TEX:
-               switch (tex) {
-               case TGSI_TEXTURE_1D:
-                       return &tex1d;
-               case TGSI_TEXTURE_SHADOW1D:
-                       return &tex1ds;
-               case TGSI_TEXTURE_2D:
-               case TGSI_TEXTURE_RECT:
-                       return &tex2d;
-               case TGSI_TEXTURE_SHADOW2D:
-               case TGSI_TEXTURE_SHADOWRECT:
-                       return &tex2ds;
-               case TGSI_TEXTURE_3D:
-               case TGSI_TEXTURE_CUBE:
-                       /* cube maps share the 3-component layout of 3D: */
-                       return &tex3d;
-               case TGSI_TEXTURE_SHADOWCUBE:
-                       return &tex3ds;
-               default:
-                       compile_error(ctx, "unknown texture type: %s\n",
-                                       tgsi_texture_names[tex]);
-                       return NULL;
-               }
-               break;
-       case TGSI_OPCODE_TXP:
-               /* note: there is no SHADOWCUBE entry for TXP */
-               switch (tex) {
-               case TGSI_TEXTURE_1D:
-                       return &txp1d;
-               case TGSI_TEXTURE_SHADOW1D:
-                       return &txp1ds;
-               case TGSI_TEXTURE_2D:
-               case TGSI_TEXTURE_RECT:
-                       return &txp2d;
-               case TGSI_TEXTURE_SHADOW2D:
-               case TGSI_TEXTURE_SHADOWRECT:
-                       return &txp2ds;
-               case TGSI_TEXTURE_3D:
-               case TGSI_TEXTURE_CUBE:
-                       return &txp3d;
-               default:
-                       compile_error(ctx, "unknown texture type: %s\n",
-                                       tgsi_texture_names[tex]);
-                       break;
-               }
-               break;
-       }
-       /* only reached for an unhandled opcode or an unknown TXP target: */
-       compile_assert(ctx, 0);
-       return NULL;
-}
-
-/* Return a source register holding the texture coordinate of 'inst' in
- * the layout described by 'tinf'.  If Src[0] can be used as it is, it is
- * returned directly; otherwise the needed components are copied (in
- * tinf->order) into an internal temporary and that temporary is returned.
- * 'tinf' is dereferenced unconditionally, so it must not be NULL.
- */
-static struct tgsi_src_register *
-get_tex_coord(struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst,
-               const struct tex_info *tinf)
-{
-       struct tgsi_src_register *coord = &inst->Src[0].Register;
-       struct ir3_instruction *instr;
-       unsigned tex = inst->Texture.Texture;
-       bool needs_mov = false;
-       unsigned i;
-
-       /* cat5 instruction cannot seem to handle const or relative: */
-       if (is_rel_or_const(coord))
-               needs_mov = true;
-
-       /* 1D textures we fix up w/ 0.5 as 2nd coord: */
-       if ((tex == TGSI_TEXTURE_1D) || (tex == TGSI_TEXTURE_SHADOW1D))
-               needs_mov = true;
-
-       /* The texture sample instructions need the coord in successive
-        * registers/components (ie. src.xy but not src.yx).  And TXP
-        * needs the .w component in .z for 2D..  so in some cases we
-        * might need to emit some mov instructions to shuffle things
-        * around:
-        */
-       for (i = 1; (i < 4) && (tinf->order[i] >= 0) && !needs_mov; i++)
-               if (src_swiz(coord, i) != (src_swiz(coord, 0) + tinf->order[i]))
-                       needs_mov = true;
-
-       if (needs_mov) {
-               struct tgsi_dst_register tmp_dst;
-               struct tgsi_src_register *tmp_src;
-               unsigned j;
-
-               type_t type_mov = get_ftype(ctx);
-
-               /* need to move things around: */
-               tmp_src = get_internal_temp(ctx, &tmp_dst);
-
-               for (j = 0; j < 4; j++) {
-                       /* component not taken from the coord: */
-                       if (tinf->order[j] < 0)
-                               continue;
-                       instr = instr_create(ctx, 1, 0);  /* mov */
-                       instr->cat1.src_type = type_mov;
-                       instr->cat1.dst_type = type_mov;
-                       add_dst_reg(ctx, instr, &tmp_dst, j);
-                       add_src_reg(ctx, instr, coord,
-                                       src_swiz(coord, tinf->order[j]));
-               }
-
-               /* fix up .y coord (immediate 0.5): */
-               if ((tex == TGSI_TEXTURE_1D) ||
-                               (tex == TGSI_TEXTURE_SHADOW1D)) {
-                       instr = instr_create(ctx, 1, 0);  /* mov */
-                       instr->cat1.src_type = type_mov;
-                       instr->cat1.dst_type = type_mov;
-                       add_dst_reg(ctx, instr, &tmp_dst, 1);  /* .y */
-                       ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = 0.5;
-               }
-
-               coord = tmp_src;
-       }
-
-       return coord;
-}
-
-/* Translate a texture sample instruction.  The native opcode comes from
- * t->opc, the sampler (also used as the texture index) from Src[1], and
- * the coordinate layout / instruction flags from get_tex_info().
- *
- * Nothing is emitted if get_tex_info() cannot describe the instruction;
- * in that case the error has already been reported there.
- */
-static void
-trans_samp(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       struct ir3_instruction *instr;
-       struct tgsi_dst_register *dst = &inst->Dst[0].Register;
-       struct tgsi_src_register *coord;
-       struct tgsi_src_register *samp  = &inst->Src[1].Register;
-       const struct tex_info *tinf;
-
-       tinf = get_tex_info(ctx, inst);
-       /* get_tex_info() returns NULL for an unsupported opcode/target;
-        * bail out rather than dereferencing it below:
-        */
-       if (!tinf)
-               return;
-
-       coord = get_tex_coord(ctx, inst, tinf);
-
-       instr = instr_create(ctx, 5, t->opc);
-       instr->cat5.type = get_ftype(ctx);
-       instr->cat5.samp = samp->Index;
-       instr->cat5.tex  = samp->Index;
-       instr->flags |= tinf->flags;
-
-       add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask);
-       add_src_reg_wrmask(ctx, instr, coord, coord->SwizzleX, tinf->src_wrmask);
-}
-
-/*
- * Float compare translater, shared by the set-on-compare opcodes and CMP.
- * A cmps.f with the matching condition is emitted into a temporary; the
- * set-on-compare opcodes then convert that result to float in dst, while
- * CMP uses it to select between Src[1] and Src[2].
- *
- * SEQ(a,b) = (a == b) ? 1.0 : 0.0
- *   cmps.f.eq tmp0, a, b
- *   cov.u16f16 dst, tmp0
- *
- * SNE(a,b) = (a != b) ? 1.0 : 0.0
- *   cmps.f.ne tmp0, a, b
- *   cov.u16f16 dst, tmp0
- *
- * SGE(a,b) = (a >= b) ? 1.0 : 0.0
- *   cmps.f.ge tmp0, a, b
- *   cov.u16f16 dst, tmp0
- *
- * SLE(a,b) = (a <= b) ? 1.0 : 0.0
- *   cmps.f.le tmp0, a, b
- *   cov.u16f16 dst, tmp0
- *
- * SGT(a,b) = (a > b)  ? 1.0 : 0.0
- *   cmps.f.gt tmp0, a, b
- *   cov.u16f16 dst, tmp0
- *
- * SLT(a,b) = (a < b)  ? 1.0 : 0.0
- *   cmps.f.lt tmp0, a, b
- *   cov.u16f16 dst, tmp0
- *
- * CMP(a,b,c) = (a < 0.0) ? b : c
- *   cmps.f.lt tmp0, a, {0.0}
- *   sel.b16 dst, b, tmp0, c
- *
- * (The mnemonics above are written with 16-bit types; the code takes the
- * convert types from get_utype()/get_ftype() and emits OPC_SEL_B32.)
- */
-static void
-trans_cmp(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       struct ir3_instruction *instr;
-       struct tgsi_dst_register tmp_dst;
-       struct tgsi_src_register *tmp_src;
-       struct tgsi_src_register constval0;
-       /* final instruction for CMP() uses orig src1 and src2: */
-       struct tgsi_dst_register *dst = get_dst(ctx, inst);
-       struct tgsi_src_register *a0, *a1, *a2;
-       unsigned condition;
-
-       /* temporary receiving the result of the compare: */
-       tmp_src = get_internal_temp(ctx, &tmp_dst);
-
-       a0 = &inst->Src[0].Register;  /* a */
-       a1 = &inst->Src[1].Register;  /* b */
-
-       switch (t->tgsi_opc) {
-       case TGSI_OPCODE_SEQ:
-       case TGSI_OPCODE_FSEQ:
-               condition = IR3_COND_EQ;
-               break;
-       case TGSI_OPCODE_SNE:
-       case TGSI_OPCODE_FSNE:
-               condition = IR3_COND_NE;
-               break;
-       case TGSI_OPCODE_SGE:
-       case TGSI_OPCODE_FSGE:
-               condition = IR3_COND_GE;
-               break;
-       case TGSI_OPCODE_SLT:
-       case TGSI_OPCODE_FSLT:
-               condition = IR3_COND_LT;
-               break;
-       case TGSI_OPCODE_SLE:
-               condition = IR3_COND_LE;
-               break;
-       case TGSI_OPCODE_SGT:
-               condition = IR3_COND_GT;
-               break;
-       case TGSI_OPCODE_CMP:
-               /* CMP compares its first source against 0.0: */
-               get_immediate(ctx, &constval0, fui(0.0));
-               a0 = &inst->Src[0].Register;  /* a */
-               a1 = &constval0;              /* {0.0} */
-               condition = IR3_COND_LT;
-               break;
-       default:
-               compile_assert(ctx, 0);
-               return;
-       }
-
-       /* if both compare operands are const, move the first one into a
-        * temporary register:
-        */
-       if (is_const(a0) && is_const(a1))
-               a0 = get_unconst(ctx, a0);
-
-       /* cmps.f.<cond> tmp, a0, a1 */
-       instr = instr_create(ctx, 2, OPC_CMPS_F);
-       instr->cat2.condition = condition;
-       vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);
-
-       switch (t->tgsi_opc) {
-       case TGSI_OPCODE_SEQ:
-       case TGSI_OPCODE_FSEQ:
-       case TGSI_OPCODE_SGE:
-       case TGSI_OPCODE_FSGE:
-       case TGSI_OPCODE_SLE:
-       case TGSI_OPCODE_SNE:
-       case TGSI_OPCODE_FSNE:
-       case TGSI_OPCODE_SGT:
-       case TGSI_OPCODE_SLT:
-       case TGSI_OPCODE_FSLT:
-               /* cov.u16f16 dst, tmp0 */
-               instr = instr_create(ctx, 1, 0);
-               instr->cat1.src_type = get_utype(ctx);
-               instr->cat1.dst_type = get_ftype(ctx);
-               vectorize(ctx, instr, dst, 1, tmp_src, 0);
-               break;
-       case TGSI_OPCODE_CMP:
-               a1 = &inst->Src[1].Register;
-               a2 = &inst->Src[2].Register;
-               /* sel.{b32,b16} dst, src2, tmp, src1
-                * NOTE(review): the operands are actually added in the
-                * order src1, tmp, src2 (matching the header comment).
-                */
-               instr = instr_create(ctx, 3, OPC_SEL_B32);
-               vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0);
-
-               break;
-       }
-
-       put_dst(ctx, inst, dst);
-}
-
-/*
- * Integer compare translater.  The compare opcode (and with it the
- * signedness) comes from t->opc, the condition from the TGSI opcode.
- *
- * USNE(a,b) = (a != b) ? 1 : 0
- *   cmps.u32.ne dst, a, b
- *
- * USEQ(a,b) = (a == b) ? 1 : 0
- *   cmps.u32.eq dst, a, b
- *
- * ISGE(a,b) = (a >= b) ? 1 : 0
- *   cmps.s32.ge dst, a, b
- *
- * USGE(a,b) = (a >= b) ? 1 : 0
- *   cmps.u32.ge dst, a, b
- *
- * ISLT(a,b) = (a < b) ? 1 : 0
- *   cmps.s32.lt dst, a, b
- *
- * USLT(a,b) = (a < b) ? 1 : 0
- *   cmps.u32.lt dst, a, b
- *
- * UCMP(a,b,c) = (a != 0) ? b : c
- *   cmps.u32.ne tmp0, a, {0}
- *   sel.b32 dst, b, tmp0, c
- */
-static void
-trans_icmp(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       struct ir3_instruction *instr;
-       struct tgsi_dst_register *dst = get_dst(ctx, inst);
-       struct tgsi_src_register constval0;
-       struct tgsi_src_register *a0, *a1, *a2;
-       unsigned condition;
-
-       a0 = &inst->Src[0].Register;  /* a */
-       a1 = &inst->Src[1].Register;  /* b */
-
-       switch (t->tgsi_opc) {
-       case TGSI_OPCODE_USNE:
-               condition = IR3_COND_NE;
-               break;
-       case TGSI_OPCODE_USEQ:
-               condition = IR3_COND_EQ;
-               break;
-       case TGSI_OPCODE_ISGE:
-       case TGSI_OPCODE_USGE:
-               condition = IR3_COND_GE;
-               break;
-       case TGSI_OPCODE_ISLT:
-       case TGSI_OPCODE_USLT:
-               condition = IR3_COND_LT;
-               break;
-       case TGSI_OPCODE_UCMP:
-               get_immediate(ctx, &constval0, 0);
-               a0 = &inst->Src[0].Register;  /* a */
-               a1 = &constval0;              /* {0} */
-               /* UCMP selects on "a is non-zero".  A less-than-zero
-                * test would never be true for an unsigned compare:
-                */
-               condition = IR3_COND_NE;
-               break;
-
-       default:
-               compile_assert(ctx, 0);
-               return;
-       }
-
-       /* if both compare operands are const, move the first one into a
-        * temporary register:
-        */
-       if (is_const(a0) && is_const(a1))
-               a0 = get_unconst(ctx, a0);
-
-       if (t->tgsi_opc == TGSI_OPCODE_UCMP) {
-               struct tgsi_dst_register tmp_dst;
-               struct tgsi_src_register *tmp_src;
-               tmp_src = get_internal_temp(ctx, &tmp_dst);
-               /* cmps.u32.ne tmp, a0, a1 */
-               instr = instr_create(ctx, 2, t->opc);
-               instr->cat2.condition = condition;
-               vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);
-
-               a1 = &inst->Src[1].Register;
-               a2 = &inst->Src[2].Register;
-               /* sel.b32 dst, src1, tmp, src2 */
-               instr = instr_create(ctx, 3, OPC_SEL_B32);
-               vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0);
-       } else {
-               /* cmps.{u32,s32}.<cond> dst, a0, a1 */
-               instr = instr_create(ctx, 2, t->opc);
-               instr->cat2.condition = condition;
-               vectorize(ctx, instr, dst, 2, a0, 0, a1, 0);
-       }
-       put_dst(ctx, inst, dst);
-}
-
-/*
- * Conditional / Flow control
- */
-
-/* Record a new innermost branch: its flow instruction, whether we are on
- * the inverted (else) side, and - for the non-inverted side - the
- * condition it was taken on.
- */
-static void
-push_branch(struct fd3_compile_context *ctx, bool inv,
-               struct ir3_instruction *instr, struct ir3_instruction *cond)
-{
-       unsigned int slot = ctx->branch_count;
-
-       ctx->branch_count = slot + 1;
-       compile_assert(ctx, slot < ARRAY_SIZE(ctx->branch));
-
-       ctx->branch[slot].instr = instr;
-       ctx->branch[slot].inv = inv;
-
-       /* the else side has the same condition as its 'if', so whatever
-        * is already stored in the slot is left alone:
-        */
-       if (inv)
-               return;
-
-       ctx->branch[slot].cond = cond;
-}
-
-/* Drop the innermost branch and hand back its flow instruction. */
-static struct ir3_instruction *
-pop_branch(struct fd3_compile_context *ctx)
-{
-       unsigned int top;
-
-       ctx->branch_count--;
-       top = ctx->branch_count;
-
-       return ctx->branch[top].instr;
-}
-
-/* IF: compare the X-swizzled component of Src[0] against 0.0 (not-equal),
- * emit a meta:flow instruction consuming the result, and open the
- * if-block.  The instruction feeding the compare is remembered as the
- * branch condition (see push_branch()).
- */
-static void
-trans_if(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       struct ir3_instruction *instr, *cond;
-       struct tgsi_src_register *src = &inst->Src[0].Register;
-       struct tgsi_dst_register tmp_dst;
-       struct tgsi_src_register *tmp_src;
-       struct tgsi_src_register constval;
-
-       get_immediate(ctx, &constval, fui(0.0));
-       tmp_src = get_internal_temp(ctx, &tmp_dst);
-
-       /* the compare's other operand is an immediate, so a const
-        * condition is first moved into a temporary:
-        */
-       if (is_const(src))
-               src = get_unconst(ctx, src);
-
-       /* cmps.f.ne tmp0, b, {0.0} */
-       instr = instr_create(ctx, 2, OPC_CMPS_F);
-       add_dst_reg(ctx, instr, &tmp_dst, 0);
-       add_src_reg(ctx, instr, src, src->SwizzleX);
-       add_src_reg(ctx, instr, &constval, constval.SwizzleX);
-       instr->cat2.condition = IR3_COND_NE;
-
-       compile_assert(ctx, instr->regs[1]->flags & IR3_REG_SSA); /* because get_unconst() */
-       /* the instruction producing the tested value: */
-       cond = instr->regs[1]->instr;
-
-       /* meta:flow tmp0 */
-       instr = instr_create(ctx, -1, OPC_META_FLOW);
-       ir3_reg_create(instr, 0, 0);  /* dummy dst */
-       add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X);
-
-       push_branch(ctx, false, instr, cond);
-       instr->flow.if_block = push_block(ctx);
-}
-
-/* ELSE: close the if-block and open the else-block of the same meta:flow
- * instruction, re-pushing the branch as inverted.
- */
-static void
-trans_else(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       struct ir3_instruction *flow;
-
-       /* leave the if side: */
-       pop_block(ctx);
-       flow = pop_branch(ctx);
-
-       compile_assert(ctx, (flow->category == -1) &&
-                       (flow->opc == OPC_META_FLOW));
-
-       /* enter the else side; no new condition is recorded for it: */
-       push_branch(ctx, true, flow, NULL);
-       flow->flow.else_block = push_block(ctx);
-}
-
-/* Walk from 'block' up through its parents until one has temporary 'n'
- * set (or the outermost block is reached) and return that entry, which
- * may still be NULL.
- */
-static struct ir3_instruction *
-find_temporary(struct ir3_block *block, unsigned n)
-{
-       while (block->parent && !block->temporaries[n])
-               block = block->parent;
-
-       return block->temporaries[n];
-}
-
-/* Walk from 'block' up through its parents until one has output 'n' set
- * (or the outermost block is reached) and return that entry, which may
- * still be NULL.
- */
-static struct ir3_instruction *
-find_output(struct ir3_block *block, unsigned n)
-{
-       while (block->parent && !block->outputs[n])
-               block = block->parent;
-
-       return block->outputs[n];
-}
-
-/* Build a meta:phi with a dummy dst and three SSA sources, in this order:
- * the branch ('cond'), the if-side value ('a') and the else-side value
- * ('b').
- */
-static struct ir3_instruction *
-create_phi(struct fd3_compile_context *ctx, struct ir3_instruction *cond,
-               struct ir3_instruction *a, struct ir3_instruction *b)
-{
-       struct ir3_instruction *phi;
-       struct ir3_instruction *srcs[3];
-       unsigned n;
-
-       compile_assert(ctx, cond);
-
-       /* A NULL on either side means the variable was written on only
-        * one side of the branch, which normally should only happen for
-        * variables that are not used outside of that side.  Returning
-        * the non-NULL side would do, but to get predictable results for
-        * otherwise undefined behavior an imm{0.0} is substituted.  In the
-        * common case of a value only used within the one side of the
-        * branch, the PHI instruction will not get scheduled.
-        */
-       srcs[0] = cond;
-       srcs[1] = a ? a : create_immed(ctx, 0.0);
-       srcs[2] = b ? b : create_immed(ctx, 0.0);
-
-       phi = instr_create(ctx, -1, OPC_META_PHI);
-       ir3_reg_create(phi, 0, 0);  /* dummy dst */
-       for (n = 0; n < 3; n++)
-               ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = srcs[n];
-
-       return phi;
-}
-
-/* ENDIF: close the innermost if (or else) block and merge its results
- * back into the enclosing block.  For every temporary and every output
- * written on either side of the branch a PHI instruction is created that
- * selects between the if-side and else-side value, and it is installed in
- * the current block.  The values feeding the PHIs become the new (compact)
- * outputs arrays of the if/else blocks.
- *
- * NOTE(review): 'elseout' is only assigned when a real else block exists;
- * every use below is guarded by the same (elseb != ifb->parent) test.
- */
-static void
-trans_endif(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       struct ir3_instruction *instr;
-       struct ir3_block *ifb, *elseb;
-       struct ir3_instruction **ifout, **elseout;
-       unsigned i, ifnout = 0, elsenout = 0;
-
-       pop_block(ctx);
-
-       instr = pop_branch(ctx);
-
-       compile_assert(ctx, (instr->category == -1) &&
-                       (instr->opc == OPC_META_FLOW));
-
-       ifb = instr->flow.if_block;
-       elseb = instr->flow.else_block;
-       /* if there is no else block, the parent block is used for the
-        * branch-not-taken src of the PHI instructions:
-        */
-       if (!elseb)
-               elseb = ifb->parent;
-
-       /* worst case sizes: */
-       ifnout = ifb->ntemporaries + ifb->noutputs;
-       elsenout = elseb->ntemporaries + elseb->noutputs;
-
-       ifout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * ifnout);
-       if (elseb != ifb->parent)
-               elseout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * elsenout);
-
-       /* from here on the counters track how many slots are in use: */
-       ifnout = 0;
-       elsenout = 0;
-
-       /* generate PHI instructions for any temporaries written: */
-       for (i = 0; i < ifb->ntemporaries; i++) {
-               struct ir3_instruction *a = ifb->temporaries[i];
-               struct ir3_instruction *b = elseb->temporaries[i];
-
-               /* if temporary written in if-block, or if else block
-                * is present and temporary written in else-block:
-                */
-               if (a || ((elseb != ifb->parent) && b)) {
-                       struct ir3_instruction *phi;
-
-                       /* if only written on one side, find the closest
-                        * enclosing update on other side:
-                        */
-                       if (!a)
-                               a = find_temporary(ifb, i);
-                       if (!b)
-                               b = find_temporary(elseb, i);
-
-                       ifout[ifnout] = a;
-                       a = create_output(ifb, a, ifnout++);
-
-                       if (elseb != ifb->parent) {
-                               elseout[elsenout] = b;
-                               b = create_output(elseb, b, elsenout++);
-                       }
-
-                       phi = create_phi(ctx, instr, a, b);
-                       ctx->block->temporaries[i] = phi;
-               }
-       }
-
-       compile_assert(ctx, ifb->noutputs == elseb->noutputs);
-
-       /* .. and any outputs written: */
-       for (i = 0; i < ifb->noutputs; i++) {
-               struct ir3_instruction *a = ifb->outputs[i];
-               struct ir3_instruction *b = elseb->outputs[i];
-
-               /* if output written in if-block, or if else block
-                * is present and output written in else-block:
-                */
-               if (a || ((elseb != ifb->parent) && b)) {
-                       struct ir3_instruction *phi;
-
-                       /* if only written on one side, find the closest
-                        * enclosing update on other side:
-                        */
-                       if (!a)
-                               a = find_output(ifb, i);
-                       if (!b)
-                               b = find_output(elseb, i);
-
-                       ifout[ifnout] = a;
-                       a = create_output(ifb, a, ifnout++);
-
-                       if (elseb != ifb->parent) {
-                               elseout[elsenout] = b;
-                               b = create_output(elseb, b, elsenout++);
-                       }
-
-                       phi = create_phi(ctx, instr, a, b);
-                       ctx->block->outputs[i] = phi;
-               }
-       }
-
-       /* swap in the compacted arrays built above: */
-       ifb->noutputs = ifnout;
-       ifb->outputs = ifout;
-
-       if (elseb != ifb->parent) {
-               elseb->noutputs = elsenout;
-               elseb->outputs = elseout;
-       }
-
-       // TODO maybe we want to compact block->inputs?
-}
-
-/*
- * Kill
- */
-
-/* KILL: unconditional in TGSI terms, so the kill is predicated on the
- * condition of the enclosing branch (inverted when on its else side), or
- * on a constant 1.0 when not inside any branch.  The emitted kill
- * instruction is appended to ctx->kill[].
- */
-static void
-trans_kill(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       struct ir3_instruction *cond = NULL;
-       struct ir3_instruction *zero, *cmps, *kill;
-       bool inv = false;
-
-       if (t->tgsi_opc == TGSI_OPCODE_KILL) {
-               /* unconditional kill, use enclosing if condition: */
-               if (ctx->branch_count > 0) {
-                       unsigned int top = ctx->branch_count - 1;
-                       cond = ctx->branch[top].cond;
-                       inv = ctx->branch[top].inv;
-               } else {
-                       cond = create_immed(ctx, 1.0);
-               }
-       }
-
-       compile_assert(ctx, cond);
-
-       zero = create_immed(ctx, 0.0);
-
-       /* cmps.f.ne p0.x, cond, {0.0} */
-       cmps = instr_create(ctx, 2, OPC_CMPS_F);
-       cmps->cat2.condition = IR3_COND_NE;
-       ir3_reg_create(cmps, regid(REG_P0, 0), 0);
-       ir3_reg_create(cmps, 0, IR3_REG_SSA)->instr = cond;
-       ir3_reg_create(cmps, 0, IR3_REG_SSA)->instr = zero;
-
-       /* kill p0.x */
-       kill = instr_create(ctx, 0, OPC_KILL);
-       kill->cat0.inv = inv;
-       ir3_reg_create(kill, 0, 0);  /* dummy dst */
-       ir3_reg_create(kill, 0, IR3_REG_SSA)->instr = cmps;
-
-       ctx->kill[ctx->kill_count++] = kill;
-}
-
-/*
- * Kill-If
- *
- * Compares the channel selected by the first source's X swizzle against
- * an immediate 0.0 (writing p0.x) and emits a non-inverted kill on the
- * result.  The kill is appended to ctx->kill[] as well.
- */
-
-static void
-trans_killif(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       struct tgsi_src_register *src = &inst->Src[0].Register;
-       struct ir3_instruction *zero, *cmp, *kill_instr;
-
-       zero = create_immed(ctx, 0.0);
-
-       /* cmps.f.ne p0.x, {0.0}, src */
-       cmp = instr_create(ctx, 2, OPC_CMPS_F);
-       cmp->cat2.condition = IR3_COND_NE;
-       ir3_reg_create(cmp, regid(REG_P0, 0), 0);
-       ir3_reg_create(cmp, 0, IR3_REG_SSA)->instr = zero;
-       add_src_reg(ctx, cmp, src, src->SwizzleX);
-
-       /* kill p0.x */
-       kill_instr = instr_create(ctx, 0, OPC_KILL);
-       kill_instr->cat0.inv = false;
-       ir3_reg_create(kill_instr, 0, 0);  /* dummy dst */
-       ir3_reg_create(kill_instr, 0, IR3_REG_SSA)->instr = cmp;
-
-       ctx->kill[ctx->kill_count++] = kill_instr;
-}
-/*
- * I2F / U2F / F2I / F2U
- *
- * Emits a category-1 instruction whose cat1 source/destination types are
- * selected from the TGSI opcode, vectorized over the destination.
- */
-
-static void
-trans_cov(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       struct ir3_instruction *instr;
-       struct tgsi_dst_register *dst = get_dst(ctx, inst);
-       struct tgsi_src_register *src = &inst->Src[0].Register;
-
-       /* cov.<src_type><dst_type> dst, src */
-       instr = instr_create(ctx, 1, 0);
-       switch (t->tgsi_opc) {
-       case TGSI_OPCODE_U2F:
-               instr->cat1.src_type = TYPE_U32;
-               instr->cat1.dst_type = TYPE_F32;
-               break;
-       case TGSI_OPCODE_I2F:
-               instr->cat1.src_type = TYPE_S32;
-               instr->cat1.dst_type = TYPE_F32;
-               break;
-       case TGSI_OPCODE_F2U:
-               instr->cat1.src_type = TYPE_F32;
-               instr->cat1.dst_type = TYPE_U32;
-               break;
-       case TGSI_OPCODE_F2I:
-               instr->cat1.src_type = TYPE_F32;
-               instr->cat1.dst_type = TYPE_S32;
-               break;
-
-       }
-       vectorize(ctx, instr, dst, 1, src, 0);
-
-       /* pair the get_dst() above with put_dst(), the same way the
-        * instr_cat1..instr_cat4 handlers do:
-        */
-       put_dst(ctx, inst, dst);
-}
-
-/*
- * Handlers for TGSI instructions which do have 1:1 mapping to native
- * instructions:
- */
-
-/* Emit a bare category-0 instruction using the opcode of the table entry. */
-static void
-instr_cat0(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       (void)inst;  /* not consulted by this handler */
-
-       instr_create(ctx, 0, t->opc);
-}
-
-/* Move the instruction's first source operand into its destination. */
-static void
-instr_cat1(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       struct tgsi_dst_register *out = get_dst(ctx, inst);
-
-       create_mov(ctx, out, &inst->Src[0].Register);
-       put_dst(ctx, inst, out);
-}
-
-/*
- * Handler for TGSI opcodes that map onto a single category-2
- * instruction.  The TGSI opcode selects optional abs/negate source
- * modifiers; the native opcode (t->opc) decides whether one or two
- * sources are emitted.
- */
-static void
-instr_cat2(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       struct tgsi_dst_register *dst = get_dst(ctx, inst);
-       struct tgsi_src_register *src0 = &inst->Src[0].Register;
-       struct tgsi_src_register *src1 = &inst->Src[1].Register;
-       struct ir3_instruction *instr;
-       unsigned src0_flags = 0, src1_flags = 0;
-
-       switch (t->tgsi_opc) {
-       case TGSI_OPCODE_ABS:
-       case TGSI_OPCODE_IABS:
-               src0_flags = IR3_REG_ABS;
-               break;
-       case TGSI_OPCODE_INEG:
-               /* INEG is translated to the single-source OPC_ABSNEG_S
-                * below, so the negate must be applied to src0 (src1 is
-                * never emitted on that path):
-                */
-               src0_flags = IR3_REG_NEGATE;
-               break;
-       case TGSI_OPCODE_SUB:
-               /* a - b is emitted as an add with src1 negated */
-               src1_flags = IR3_REG_NEGATE;
-               break;
-       }
-
-       switch (t->opc) {
-       case OPC_ABSNEG_F:
-       case OPC_ABSNEG_S:
-       case OPC_CLZ_B:
-       case OPC_CLZ_S:
-       case OPC_SIGN_F:
-       case OPC_FLOOR_F:
-       case OPC_CEIL_F:
-       case OPC_RNDNE_F:
-       case OPC_RNDAZ_F:
-       case OPC_TRUNC_F:
-       case OPC_NOT_B:
-       case OPC_BFREV_B:
-       case OPC_SETRM:
-       case OPC_CBITS_B:
-               /* these only have one src reg */
-               instr = instr_create(ctx, 2, t->opc);
-               vectorize(ctx, instr, dst, 1, src0, src0_flags);
-               break;
-       default:
-               /* when both sources are const, src0 is replaced by
-                * whatever get_unconst() returns for it:
-                */
-               if (is_const(src0) && is_const(src1))
-                       src0 = get_unconst(ctx, src0);
-
-               instr = instr_create(ctx, 2, t->opc);
-               vectorize(ctx, instr, dst, 2, src0, src0_flags,
-                               src1, src1_flags);
-               break;
-       }
-
-       put_dst(ctx, inst, dst);
-}
-
-/*
- * Handler for TGSI opcodes that map onto a single three-source
- * category-3 instruction.
- */
-static void
-instr_cat3(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       struct tgsi_dst_register *dst = get_dst(ctx, inst);
-       struct tgsi_src_register *first = &inst->Src[0].Register;
-       struct tgsi_src_register *second = &inst->Src[1].Register;
-       struct ir3_instruction *instr;
-
-       /* cat3 can't take a const (or relative) second source.  For mad
-        * the first two sources may simply trade places when the first
-        * one is acceptable; otherwise go through get_unconst():
-        */
-       if (is_rel_or_const(second)) {
-               if (is_mad(t->opc) && !is_rel_or_const(first)) {
-                       struct tgsi_src_register *old_first = first;
-
-                       first = second;
-                       second = old_first;
-               } else {
-                       second = get_unconst(ctx, second);
-               }
-       }
-
-       instr = instr_create(ctx, 3, t->opc);
-       vectorize(ctx, instr, dst, 3, first, 0, second, 0,
-                       &inst->Src[2].Register, 0);
-       put_dst(ctx, inst, dst);
-}
-
-/*
- * Handler for TGSI opcodes that map onto a category-4 instruction.  One
- * instruction is emitted per enabled destination channel, each reading
- * the channel selected by the source's X swizzle.
- */
-static void
-instr_cat4(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       struct tgsi_dst_register *dst = get_dst(ctx, inst);
-       struct tgsi_src_register *src = &inst->Src[0].Register;
-       unsigned chan;
-
-       /* seems like blob compiler avoids const as src.. */
-       if (is_const(src))
-               src = get_unconst(ctx, src);
-
-       /* replicate the result into every channel in the write mask: */
-       for (chan = 0; chan < 4; chan++) {
-               struct ir3_instruction *instr;
-
-               if (!(dst->WriteMask & (1 << chan)))
-                       continue;
-
-               instr = instr_create(ctx, 4, t->opc);
-               add_dst_reg(ctx, instr, dst, chan);
-               add_src_reg(ctx, instr, src, src->SwizzleX);
-       }
-
-       put_dst(ctx, inst, dst);
-}
-
-/*
- * Dispatch table indexed by TGSI opcode.  Each populated entry names the
- * handler (.fxn), records the TGSI opcode it was registered for
- * (.tgsi_opc) and optionally carries handler-specific fields such as the
- * native opcode (.opc).  Opcodes without an entry are zero-initialized,
- * i.e. have a NULL .fxn.
- *
- * Each opcode must be listed only once: a repeated designated
- * initializer silently overrides the earlier one.
- */
-static const struct instr_translater translaters[TGSI_OPCODE_LAST] = {
-#define INSTR(n, f, ...) \
-       [TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ }
-
-       INSTR(MOV,          instr_cat1),
-       INSTR(RCP,          instr_cat4, .opc = OPC_RCP),
-       INSTR(RSQ,          instr_cat4, .opc = OPC_RSQ),
-       INSTR(SQRT,         instr_cat4, .opc = OPC_SQRT),
-       INSTR(MUL,          instr_cat2, .opc = OPC_MUL_F),
-       INSTR(ADD,          instr_cat2, .opc = OPC_ADD_F),
-       INSTR(SUB,          instr_cat2, .opc = OPC_ADD_F),
-       INSTR(MIN,          instr_cat2, .opc = OPC_MIN_F),
-       INSTR(MAX,          instr_cat2, .opc = OPC_MAX_F),
-       INSTR(UADD,         instr_cat2, .opc = OPC_ADD_U),
-       INSTR(IMIN,         instr_cat2, .opc = OPC_MIN_S),
-       INSTR(UMIN,         instr_cat2, .opc = OPC_MIN_U),
-       INSTR(IMAX,         instr_cat2, .opc = OPC_MAX_S),
-       INSTR(UMAX,         instr_cat2, .opc = OPC_MAX_U),
-       INSTR(AND,          instr_cat2, .opc = OPC_AND_B),
-       INSTR(OR,           instr_cat2, .opc = OPC_OR_B),
-       INSTR(NOT,          instr_cat2, .opc = OPC_NOT_B),
-       INSTR(XOR,          instr_cat2, .opc = OPC_XOR_B),
-       INSTR(UMUL,         instr_cat2, .opc = OPC_MUL_U),
-       INSTR(SHL,          instr_cat2, .opc = OPC_SHL_B),
-       INSTR(USHR,         instr_cat2, .opc = OPC_SHR_B),
-       INSTR(ISHR,         instr_cat2, .opc = OPC_ASHR_B),
-       INSTR(IABS,         instr_cat2, .opc = OPC_ABSNEG_S),
-       INSTR(INEG,         instr_cat2, .opc = OPC_ABSNEG_S),
-       INSTR(MAD,          instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16),
-       INSTR(TRUNC,        instr_cat2, .opc = OPC_TRUNC_F),
-       INSTR(CLAMP,        trans_clamp),
-       INSTR(FLR,          instr_cat2, .opc = OPC_FLOOR_F),
-       INSTR(ROUND,        instr_cat2, .opc = OPC_RNDNE_F),
-       INSTR(SSG,          instr_cat2, .opc = OPC_SIGN_F),
-       INSTR(CEIL,         instr_cat2, .opc = OPC_CEIL_F),
-       INSTR(ARL,          trans_arl),
-       INSTR(EX2,          instr_cat4, .opc = OPC_EXP2),
-       INSTR(LG2,          instr_cat4, .opc = OPC_LOG2),
-       INSTR(ABS,          instr_cat2, .opc = OPC_ABSNEG_F),
-       INSTR(COS,          instr_cat4, .opc = OPC_COS),
-       INSTR(SIN,          instr_cat4, .opc = OPC_SIN),
-       INSTR(TEX,          trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TEX),
-       INSTR(TXP,          trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TXP),
-       INSTR(SGT,          trans_cmp),
-       INSTR(SLT,          trans_cmp),
-       INSTR(FSLT,         trans_cmp),
-       INSTR(SGE,          trans_cmp),
-       INSTR(FSGE,         trans_cmp),
-       INSTR(SLE,          trans_cmp),
-       INSTR(SNE,          trans_cmp),
-       INSTR(FSNE,         trans_cmp),
-       INSTR(SEQ,          trans_cmp),
-       INSTR(FSEQ,         trans_cmp),
-       INSTR(CMP,          trans_cmp),
-       INSTR(USNE,         trans_icmp, .opc = OPC_CMPS_U),
-       INSTR(USEQ,         trans_icmp, .opc = OPC_CMPS_U),
-       INSTR(ISGE,         trans_icmp, .opc = OPC_CMPS_S),
-       INSTR(USGE,         trans_icmp, .opc = OPC_CMPS_U),
-       INSTR(ISLT,         trans_icmp, .opc = OPC_CMPS_S),
-       INSTR(USLT,         trans_icmp, .opc = OPC_CMPS_U),
-       INSTR(UCMP,         trans_icmp, .opc = OPC_CMPS_U),
-       INSTR(IF,           trans_if),
-       INSTR(UIF,          trans_if),
-       INSTR(ELSE,         trans_else),
-       INSTR(ENDIF,        trans_endif),
-       INSTR(END,          instr_cat0, .opc = OPC_END),
-       INSTR(KILL,         trans_kill, .opc = OPC_KILL),
-       INSTR(KILL_IF,      trans_killif, .opc = OPC_KILL),
-       INSTR(I2F,          trans_cov),
-       INSTR(U2F,          trans_cov),
-       INSTR(F2I,          trans_cov),
-       INSTR(F2U,          trans_cov),
-};
-
-/* Pack a TGSI declaration's semantic name and index into an fd3_semantic. */
-static fd3_semantic
-decl_semantic(const struct tgsi_declaration_semantic *sem)
-{
-       fd3_semantic packed = fd3_semantic_name(sem->Name, sem->Index);
-
-       return packed;
-}
-
-/*
- * Build the bary.f instruction for one fragment shader input component:
- *
- *    bary.f dst, #inloc, frag_pos
- *
- * The last source is an SSA reference to ctx->frag_pos with a
- * two-component write mask.  The 'j' argument is not used here.
- */
-static struct ir3_instruction *
-decl_in_frag_bary(struct fd3_compile_context *ctx, unsigned regid,
-               unsigned j, unsigned inloc)
-{
-       struct ir3_instruction *bary = instr_create(ctx, 2, OPC_BARY_F);
-       struct ir3_register *pos;
-
-       ir3_reg_create(bary, regid, 0);   /* dummy dst */
-       ir3_reg_create(bary, 0, IR3_REG_IMMED)->iim_val = inloc;
-
-       pos = ir3_reg_create(bary, 0, IR3_REG_SSA);
-       pos->wrmask = 0x3;
-       pos->instr = ctx->frag_pos;
-
-       return bary;
-}
-
-/* TGSI_SEMANTIC_POSITION
- * """"""""""""""""""""""
- *
- * For fragment shaders, TGSI_SEMANTIC_POSITION is used to indicate that
- * fragment shader input contains the fragment's window position.  The X
- * component starts at zero and always increases from left to right.
- * The Y component starts at zero and always increases but Y=0 may either
- * indicate the top of the window or the bottom depending on the fragment
- * coordinate origin convention (see TGSI_PROPERTY_FS_COORD_ORIGIN).
- * The Z coordinate ranges from 0 to 1 to represent depth from the front
- * to the back of the Z buffer.  The W component contains the reciprocal
- * of the interpolated vertex position W component.
- *
- * Creates the input for channel 'j', remembers it in ctx->frag_coord[j]
- * and returns the instruction producing the value for that channel.
- */
-static struct ir3_instruction *
-decl_in_frag_coord(struct fd3_compile_context *ctx, unsigned regid,
-               unsigned j)
-{
-       struct ir3_instruction *raw, *biased, *shifted, *conv;
-
-       compile_assert(ctx, !ctx->frag_coord[j]);
-
-       raw = create_input(ctx->block, NULL, 0);
-       ctx->frag_coord[j] = raw;
-
-       if (j > 3) {
-               compile_error(ctx, "invalid channel\n");
-               return create_immed(ctx, 0.0);
-       }
-
-       /* .z and .w: seems that we can use these as-is */
-       if (j >= 2)
-               return raw;
-
-       /* .x and .y arrive as unsigned values: subtract (integer) 8,
-        * divide by 16 (right-shift by 4), then convert to float.
-        */
-
-       /* add.s tmp, src, -8 */
-       biased = instr_create(ctx, 2, OPC_ADD_S);
-       ir3_reg_create(biased, regid, 0);    /* dummy dst */
-       ir3_reg_create(biased, 0, IR3_REG_SSA)->instr = raw;
-       ir3_reg_create(biased, 0, IR3_REG_IMMED)->iim_val = -8;
-
-       /* shr.b tmp, tmp, 4 */
-       shifted = instr_create(ctx, 2, OPC_SHR_B);
-       ir3_reg_create(shifted, regid, 0);    /* dummy dst */
-       ir3_reg_create(shifted, 0, IR3_REG_SSA)->instr = biased;
-       ir3_reg_create(shifted, 0, IR3_REG_IMMED)->iim_val = 4;
-
-       /* mov.u32f32 dst, tmp */
-       conv = instr_create(ctx, 1, 0);
-       conv->cat1.src_type = TYPE_U32;
-       conv->cat1.dst_type = TYPE_F32;
-       ir3_reg_create(conv, regid, 0);    /* dummy dst */
-       ir3_reg_create(conv, 0, IR3_REG_SSA)->instr = shifted;
-
-       return conv;
-}
-
-/* TGSI_SEMANTIC_FACE
- * """"""""""""""""""
- *
- * This label applies to fragment shader inputs only and indicates that
- * the register contains front/back-face information of the form (F, 0,
- * 0, 1).  The first component will be positive when the fragment belongs
- * to a front-facing polygon, and negative when the fragment belongs to a
- * back-facing polygon.
- *
- * Returns the instruction producing channel 'j' of that vector.  Only
- * channel 0 creates the actual input (stored in ctx->frag_face); the
- * remaining channels are immediates.
- */
-static struct ir3_instruction *
-decl_in_frag_face(struct fd3_compile_context *ctx, unsigned regid,
-               unsigned j)
-{
-       struct ir3_instruction *scaled, *biased, *conv;
-
-       if (j == 1 || j == 2)   /* .y, .z */
-               return create_immed(ctx, 0.0);
-
-       if (j == 3)             /* .w */
-               return create_immed(ctx, 1.0);
-
-       if (j != 0) {
-               compile_error(ctx, "invalid channel\n");
-               return create_immed(ctx, 0.0);
-       }
-
-       /* .x */
-       compile_assert(ctx, !ctx->frag_face);
-
-       ctx->frag_face = create_input(ctx->block, NULL, 0);
-
-       /* for faceness, we always get -1 or 0 (int).. but TGSI expects
-        * positive vs negative float.. and piglit further seems to
-        * expect -1.0 or 1.0, so compute (face * 2) + 1 and convert the
-        * signed result to float:
-        */
-
-       /* mul.s tmp, face, 2 */
-       scaled = instr_create(ctx, 2, OPC_MUL_S);
-       ir3_reg_create(scaled, regid, 0);    /* dummy dst */
-       ir3_reg_create(scaled, 0, IR3_REG_SSA)->instr = ctx->frag_face;
-       ir3_reg_create(scaled, 0, IR3_REG_IMMED)->iim_val = 2;
-
-       /* add.s tmp, tmp, 1 */
-       biased = instr_create(ctx, 2, OPC_ADD_S);
-       ir3_reg_create(biased, regid, 0);    /* dummy dst */
-       ir3_reg_create(biased, 0, IR3_REG_SSA)->instr = scaled;
-       ir3_reg_create(biased, 0, IR3_REG_IMMED)->iim_val = 1;
-
-       /* mov (TYPE_S32 -> TYPE_F32) dst, tmp */
-       conv = instr_create(ctx, 1, 0);
-       conv->cat1.src_type = TYPE_S32;
-       conv->cat1.dst_type = TYPE_F32;
-       ir3_reg_create(conv, regid, 0);    /* dummy dst */
-       ir3_reg_create(conv, 0, IR3_REG_SSA)->instr = biased;
-
-       return conv;
-}
-
-/*
- * Handle a TGSI input declaration.
- *
- * For every register in the declared range this appends an entry to
- * so->inputs[] and fills the four per-component slots of
- * ctx->block->inputs[] starting at (register * 4).  In fragment shaders
- * POSITION and FACE inputs are built by decl_in_frag_coord() /
- * decl_in_frag_face(), everything else becomes a bary.f fetch; for other
- * shader types each component is a plain create_input().
- */
-static void
-decl_in(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl)
-{
-       struct fd3_shader_variant *so = ctx->so;
-       unsigned name = decl->Semantic.Name;
-       unsigned i;
-
-       /* I don't think we should get frag shader input without
-        * semantic info?  Otherwise how do inputs get linked to
-        * vert outputs?
-        */
-       compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) ||
-                       decl->Declaration.Semantic);
-
-       for (i = decl->Range.First; i <= decl->Range.Last; i++) {
-               /* n: slot in so->inputs[], r: register id of component 0 */
-               unsigned n = so->inputs_count++;
-               unsigned r = regid(i, 0);
-               unsigned ncomp, j;
-
-               /* we'll figure out the actual components used after scheduling */
-               ncomp = 4;
-
-               DBG("decl in -> r%d", i);
-
-               compile_assert(ctx, n < ARRAY_SIZE(so->inputs));
-
-               so->inputs[n].semantic = decl_semantic(&decl->Semantic);
-               so->inputs[n].compmask = (1 << ncomp) - 1;
-               so->inputs[n].regid = r;
-               so->inputs[n].inloc = ctx->next_inloc;
-
-               for (j = 0; j < ncomp; j++) {
-                       struct ir3_instruction *instr = NULL;
-
-                       if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
-                               /* for fragment shaders, POSITION and FACE are handled
-                                * specially, not using normal varying / bary.f
-                                */
-                               if (name == TGSI_SEMANTIC_POSITION) {
-                                       so->inputs[n].bary = false;
-                                       so->frag_coord = true;
-                                       instr = decl_in_frag_coord(ctx, r + j, j);
-                               } else if (name == TGSI_SEMANTIC_FACE) {
-                                       so->inputs[n].bary = false;
-                                       so->frag_face = true;
-                                       instr = decl_in_frag_face(ctx, r + j, j);
-                               } else {
-                                       /* NOTE(review): the "- 8" bias applied to inloc
-                                        * below is not explained in this function;
-                                        * presumably it relates to the initial value of
-                                        * ctx->next_inloc -- confirm.
-                                        */
-                                       so->inputs[n].bary = true;
-                                       instr = decl_in_frag_bary(ctx, r + j, j,
-                                                       so->inputs[n].inloc + j - 8);
-                               }
-                       } else {
-                               instr = create_input(ctx->block, NULL, (i * 4) + j);
-                       }
-
-                       ctx->block->inputs[(i * 4) + j] = instr;
-               }
-
-               /* only bary.f inputs and vertex shader inputs advance the
-                * input location and count towards total_in:
-                */
-               if (so->inputs[n].bary || (ctx->type == TGSI_PROCESSOR_VERTEX)) {
-                       ctx->next_inloc += ncomp;
-                       so->total_in += ncomp;
-               }
-       }
-}
-
-/*
- * Handle a TGSI output declaration.
- *
- * Validates the semantic name for the current shader type (noting
- * position/psize writes on the variant), then appends one so->outputs[]
- * entry per declared register and seeds the four per-component slots of
- * ctx->block->outputs[] with immediate 0.0 values.
- */
-static void
-decl_out(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl)
-{
-       struct fd3_shader_variant *so = ctx->so;
-       unsigned name = decl->Semantic.Name;
-       unsigned comp = 0;
-       unsigned reg;
-
-       compile_assert(ctx, decl->Declaration.Semantic);
-
-       DBG("decl out[%d] -> r%d", name, decl->Range.First);
-
-       if (ctx->type == TGSI_PROCESSOR_VERTEX) {
-               if (name == TGSI_SEMANTIC_POSITION) {
-                       so->writes_pos = true;
-               } else if (name == TGSI_SEMANTIC_PSIZE) {
-                       so->writes_psize = true;
-               } else if ((name != TGSI_SEMANTIC_COLOR) &&
-                               (name != TGSI_SEMANTIC_BCOLOR) &&
-                               (name != TGSI_SEMANTIC_GENERIC) &&
-                               (name != TGSI_SEMANTIC_FOG) &&
-                               (name != TGSI_SEMANTIC_TEXCOORD)) {
-                       compile_error(ctx, "unknown VS semantic name: %s\n",
-                                       tgsi_semantic_names[name]);
-               }
-       } else {
-               if (name == TGSI_SEMANTIC_POSITION) {
-                       comp = 2;  /* tgsi will write to .z component */
-                       so->writes_pos = true;
-               } else if (name != TGSI_SEMANTIC_COLOR) {
-                       compile_error(ctx, "unknown FS semantic name: %s\n",
-                                       tgsi_semantic_names[name]);
-               }
-       }
-
-       for (reg = decl->Range.First; reg <= decl->Range.Last; reg++) {
-               unsigned slot = so->outputs_count++;
-               unsigned base = reg * 4;
-               unsigned c;
-
-               compile_assert(ctx, slot < ARRAY_SIZE(so->outputs));
-
-               so->outputs[slot].semantic = decl_semantic(&decl->Semantic);
-               so->outputs[slot].regid = regid(reg, comp);
-
-               /* avoid undefined outputs: park a dummy imm{0.0} in each
-                * component, which gets overwritten if the output is
-                * actually assigned
-                */
-               for (c = 0; c < 4; c++)
-                       ctx->block->outputs[base + c] = create_immed(ctx, 0.0);
-       }
-}
-
-/* From the TGSI perspective a fragment shader has inputs, but most of
- * them are just bary.f instructions.  The *actual* inputs from the hw
- * perspective are frag_pos and, optionally, frag_coord and frag_face.
- *
- * This replaces block->inputs with a freshly allocated array holding
- * only those hw inputs and assigns their register numbers.
- */
-static void
-fixup_frag_inputs(struct fd3_compile_context *ctx)
-{
-       struct fd3_shader_variant *so = ctx->so;
-       struct ir3_block *block = ctx->block;
-       struct ir3_instruction **hw_inputs;
-       int count, next_reg = 0;
-       unsigned k;
-
-       block->ninputs = 0;
-
-       /* frag_pos is always present; face and coord add a vec4 each */
-       count = 4 + COND(so->frag_face, 4) + COND(so->frag_coord, 4);
-
-       hw_inputs = ir3_alloc(ctx->ir, count * (sizeof(struct ir3_instruction *)));
-
-       if (so->frag_face) {
-               /* this ultimately gets assigned to hr0.x so doesn't conflict
-                * with frag_coord/frag_pos..
-                */
-               hw_inputs[block->ninputs++] = ctx->frag_face;
-               ctx->frag_face->regs[0]->num = 0;
-
-               /* remaining channels not used, but pad with NULL since other
-                * parts expect inputs to come in groups of vec4
-                */
-               for (k = 0; k < 3; k++)
-                       hw_inputs[block->ninputs++] = NULL;
-       }
-
-       /* since we don't know where to set the regid for frag_coord,
-        * we have to use r0.x for it.  But we don't want to *always*
-        * use r1.x for frag_pos as that could increase the register
-        * footprint on simple shaders:
-        */
-       if (so->frag_coord) {
-               for (k = 0; k < 4; k++)
-                       ctx->frag_coord[k]->regs[0]->num = next_reg++;
-
-               for (k = 0; k < 4; k++)
-                       hw_inputs[block->ninputs++] = ctx->frag_coord[k];
-       }
-
-       /* we always have frag_pos: */
-       so->pos_regid = next_reg;
-
-       /* two inputs (.x then .y), wired to srcs 1 and 2 of frag_pos */
-       for (k = 0; k < 2; k++) {
-               struct ir3_instruction *in;
-
-               in = create_input(block, NULL, block->ninputs);
-               in->regs[0]->num = next_reg++;
-               hw_inputs[block->ninputs++] = in;
-               ctx->frag_pos->regs[1 + k]->instr = in;
-       }
-
-       block->inputs = hw_inputs;
-}
-
-/*
- * Walk the TGSI token stream held by ctx->parser and translate it:
- * declarations go to decl_in()/decl_out(), immediates are copied into
- * ctx->so->immediates[], and instructions are dispatched through the
- * translaters[] table.
- */
-static void
-compile_instructions(struct fd3_compile_context *ctx)
-{
-       push_block(ctx);
-
-       /* for fragment shader, we have a single input register (usually
-        * r0.xy) which is used as the base for bary.f varying fetch instrs:
-        */
-       if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
-               struct ir3_instruction *instr;
-               instr = ir3_instr_create(ctx->block, -1, OPC_META_FI);
-               ir3_reg_create(instr, 0, 0);
-               ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.x */
-               ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.y */
-               ctx->frag_pos = instr;
-       }
-
-       while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
-               tgsi_parse_token(&ctx->parser);
-
-               switch (ctx->parser.FullToken.Token.Type) {
-               case TGSI_TOKEN_TYPE_DECLARATION: {
-                       /* only INPUT and OUTPUT file declarations are acted on */
-                       struct tgsi_full_declaration *decl =
-                                       &ctx->parser.FullToken.FullDeclaration;
-                       if (decl->Declaration.File == TGSI_FILE_OUTPUT) {
-                               decl_out(ctx, decl);
-                       } else if (decl->Declaration.File == TGSI_FILE_INPUT) {
-                               decl_in(ctx, decl);
-                       }
-                       break;
-               }
-               case TGSI_TOKEN_TYPE_IMMEDIATE: {
-                       /* TODO: if we know the immediate is small enough, and only
-                        * used with instructions that can embed an immediate, we
-                        * can skip this:
-                        */
-                       struct tgsi_full_immediate *imm =
-                                       &ctx->parser.FullToken.FullImmediate;
-                       unsigned n = ctx->so->immediates_count++;
-                       compile_assert(ctx, n < ARRAY_SIZE(ctx->so->immediates));
-                       /* NOTE(review): 16 bytes is presumably four 32-bit
-                        * channels -- confirm against the declaration of
-                        * immediates[].val.
-                        */
-                       memcpy(ctx->so->immediates[n].val, imm->u, 16);
-                       break;
-               }
-               case TGSI_TOKEN_TYPE_INSTRUCTION: {
-                       struct tgsi_full_instruction *inst =
-                                       &ctx->parser.FullToken.FullInstruction;
-                       unsigned opc = inst->Instruction.Opcode;
-                       const struct instr_translater *t = &translaters[opc];
-
-                       /* entries without a handler have a NULL fxn */
-                       if (t->fxn) {
-                               t->fxn(t, ctx, inst);
-                               ctx->num_internal_temps = 0;
-                       } else {
-                               compile_error(ctx, "unknown TGSI opc: %s\n",
-                                               tgsi_get_opcode_name(opc));
-                       }
-
-                       /* note: saturate handling and instr_finish() below run
-                        * whether or not a handler was found above
-                        */
-                       switch (inst->Instruction.Saturate) {
-                       case TGSI_SAT_ZERO_ONE:
-                               create_clamp_imm(ctx, &inst->Dst[0].Register,
-                                               fui(0.0), fui(1.0));
-                               break;
-                       case TGSI_SAT_MINUS_PLUS_ONE:
-                               create_clamp_imm(ctx, &inst->Dst[0].Register,
-                                               fui(-1.0), fui(1.0));
-                               break;
-                       }
-
-                       instr_finish(ctx);
-
-                       break;
-               }
-               default:
-                       break;
-               }
-       }
-}
-
-/*
- * Debug helper: write the current block to "<vert|frag>-NNNN.dot", where
- * NNNN is a counter shared by all calls.  Runs ir3_block_depth() on the
- * block before dumping.  Silently does nothing if the file can't be
- * opened.
- */
-static void
-compile_dump(struct fd3_compile_context *ctx)
-{
-       const char *name = (ctx->so->type == SHADER_VERTEX) ? "vert" : "frag";
-       static unsigned n = 0;
-       /* "vert-" + up to 10 digits of a 32-bit counter + ".dot" + NUL
-        * needs 20 bytes; leave headroom so the name is never truncated:
-        */
-       char fname[32];
-       FILE *f;
-       snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++);
-       f = fopen(fname, "w");
-       if (!f)
-               return;
-       ir3_block_depth(ctx->block);
-       ir3_dump(ctx->ir, name, ctx->block, f);
-       fclose(f);
-}
-
-/*
- * Compile a TGSI token stream into so->ir.
- *
- * Translates the tokens into a block, then runs the flatten, cp, depth,
- * sched and ra passes over it, and finally updates the variant's input
- * and output tables from the resulting register assignment.
- *
- * Returns 0 on success.  On failure a non-zero value is returned and
- * so->ir is destroyed and reset to NULL.
- */
-int
-fd3_compile_shader(struct fd3_shader_variant *so,
-               const struct tgsi_token *tokens, struct fd3_shader_key key)
-{
-       struct fd3_compile_context ctx;
-       struct ir3_block *block;
-       struct ir3_instruction **inputs;
-       unsigned i, j, actual_in;
-       int ret = 0;
-
-       assert(!so->ir);
-
-       so->ir = ir3_create();
-
-       assert(so->ir);
-
-       if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) {
-               ret = -1;
-               goto out;
-       }
-
-       compile_instructions(&ctx);
-
-       block = ctx.block;
-
-       /* keep track of the inputs from TGSI perspective.. */
-       inputs = block->inputs;
-
-       /* but fixup actual inputs for frag shader: */
-       if (ctx.type == TGSI_PROCESSOR_FRAGMENT)
-               fixup_frag_inputs(&ctx);
-
-       /* at this point, for binning pass, throw away unneeded outputs: */
-       if (key.binning_pass) {
-               for (i = 0, j = 0; i < so->outputs_count; i++) {
-                       unsigned name = sem2name(so->outputs[i].semantic);
-                       /* the semantic *index*, not the name again: */
-                       unsigned idx = sem2idx(so->outputs[i].semantic);
-
-                       /* throw away everything but first position/psize */
-                       if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) ||
-                                       (name == TGSI_SEMANTIC_PSIZE))) {
-                               /* compact kept outputs (and their four
-                                * components) towards the front:
-                                */
-                               if (i != j) {
-                                       so->outputs[j] = so->outputs[i];
-                                       block->outputs[(j*4)+0] = block->outputs[(i*4)+0];
-                                       block->outputs[(j*4)+1] = block->outputs[(i*4)+1];
-                                       block->outputs[(j*4)+2] = block->outputs[(i*4)+2];
-                                       block->outputs[(j*4)+3] = block->outputs[(i*4)+3];
-                               }
-                               j++;
-                       }
-               }
-               so->outputs_count = j;
-               block->noutputs = j * 4;
-       }
-
-       /* at this point, we want the kill's in the outputs array too,
-        * so that they get scheduled (since they have no dst).. we've
-        * already ensured that the array is big enough in push_block():
-        */
-       if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
-               for (i = 0; i < ctx.kill_count; i++)
-                       block->outputs[block->noutputs++] = ctx.kill[i];
-       }
-
-       if (fd_mesa_debug & FD_DBG_OPTDUMP)
-               compile_dump(&ctx);
-
-       ret = ir3_block_flatten(block);
-       if (ret < 0)
-               goto out;
-       /* a positive return triggers another dump when dumping is enabled */
-       if ((ret > 0) && (fd_mesa_debug & FD_DBG_OPTDUMP))
-               compile_dump(&ctx);
-
-       ir3_block_cp(block);
-
-       if (fd_mesa_debug & FD_DBG_OPTDUMP)
-               compile_dump(&ctx);
-
-       ir3_block_depth(block);
-
-       if (fd_mesa_debug & FD_DBG_OPTMSGS) {
-               printf("AFTER DEPTH:\n");
-               ir3_dump_instr_list(block->head);
-       }
-
-       ir3_block_sched(block);
-
-       if (fd_mesa_debug & FD_DBG_OPTMSGS) {
-               printf("AFTER SCHED:\n");
-               ir3_dump_instr_list(block->head);
-       }
-
-       ret = ir3_block_ra(block, so->type, key.half_precision,
-                       so->frag_coord, so->frag_face, &so->has_samp);
-       if (ret)
-               goto out;
-
-       if (fd_mesa_debug & FD_DBG_OPTMSGS) {
-               printf("AFTER RA:\n");
-               ir3_dump_instr_list(block->head);
-       }
-
-       /* fixup input/outputs: */
-       for (i = 0; i < so->outputs_count; i++) {
-               so->outputs[i].regid = block->outputs[i*4]->regs[0]->num;
-               /* preserve hack for depth output.. tgsi writes depth to .z,
-                * but what we give the hw is the scalar register:
-                */
-               if ((ctx.type == TGSI_PROCESSOR_FRAGMENT) &&
-                       (sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION))
-                       so->outputs[i].regid += 2;
-       }
-       /* Note that some or all channels of an input may be unused: */
-       actual_in = 0;
-       for (i = 0; i < so->inputs_count; i++) {
-               unsigned regid = ~0, compmask = 0;
-               so->inputs[i].ncomp = 0;
-               for (j = 0; j < 4; j++) {
-                       struct ir3_instruction *in = inputs[(i*4) + j];
-                       if (in) {
-                               compmask |= (1 << j);
-                               /* register of component 0 of this input */
-                               regid = in->regs[0]->num - j;
-                               actual_in++;
-                               so->inputs[i].ncomp++;
-                       }
-               }
-               so->inputs[i].regid = regid;
-               so->inputs[i].compmask = compmask;
-       }
-
-       /* fragment shader always gets full vec4's even if it doesn't
-        * fetch all components, but for vertex shader we need to update
-        * with the actual number of components fetched, otherwise things
-        * will hang due to mismatch between VFD_DECODE's and
-        * TOTALATTRTOVS
-        */
-       if (so->type == SHADER_VERTEX)
-               so->total_in = actual_in;
-
-out:
-       if (ret) {
-               ir3_destroy(so->ir);
-               so->ir = NULL;
-       }
-       compile_free(&ctx);
-
-       return ret;
-}
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.h b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.h
deleted file mode 100644 (file)
index a53bb3e..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
-
-/*
- * Copyright (C) 2013 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#ifndef FD3_COMPILER_H_
-#define FD3_COMPILER_H_
-
-#include "fd3_program.h"
-#include "fd3_util.h"
-
-
-int fd3_compile_shader(struct fd3_shader_variant *so,
-               const struct tgsi_token *tokens,
-               struct fd3_shader_key key);
-int fd3_compile_shader_old(struct fd3_shader_variant *so,
-               const struct tgsi_token *tokens,
-               struct fd3_shader_key key);
-
-#endif /* FD3_COMPILER_H_ */
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler_old.c b/src/gallium/drivers/freedreno/a3xx/fd3_compiler_old.c
deleted file mode 100644 (file)
index 66f724b..0000000
+++ /dev/null
@@ -1,1524 +0,0 @@
-/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
-
-/*
- * Copyright (C) 2013 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#include <stdarg.h>
-
-#include "pipe/p_state.h"
-#include "util/u_string.h"
-#include "util/u_memory.h"
-#include "util/u_inlines.h"
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_ureg.h"
-#include "tgsi/tgsi_info.h"
-#include "tgsi/tgsi_strings.h"
-#include "tgsi/tgsi_dump.h"
-#include "tgsi/tgsi_scan.h"
-
-#include "freedreno_lowering.h"
-
-#include "fd3_compiler.h"
-#include "fd3_program.h"
-#include "fd3_util.h"
-
-#include "instr-a3xx.h"
-#include "ir3.h"
-
-
-struct fd3_compile_context {   /* working state for one run of this TGSI -> ir3 compiler */
-       const struct tgsi_token *tokens;   /* tokens being compiled: lowered copy, or the caller's */
-       bool free_tokens;                  /* set when tokens was returned by fd_transform_lowering() */
-       struct ir3 *ir;                    /* copied from so->ir */
-       struct ir3_block *block;           /* block handed to ir3_instr_create() by instr_create() */
-       struct fd3_shader_variant *so;     /* shader variant being compiled */
-
-       struct tgsi_parse_context parser;
-       unsigned type;                     /* TGSI processor type, read from the parser header */
-
-       struct tgsi_shader_info info;      /* passed to fd_transform_lowering(); file_max[] drives register layout */
-
-       /* last input dst (for setting (ei) flag): */
-       struct ir3_register *last_input;
-
-       /* last instruction with relative addressing: */
-       struct ir3_instruction *last_rel;
-
-       /* for calculating input/output positions/linkages: */
-       unsigned next_inloc;
-
-       unsigned num_internal_temps;       /* number of internal_temps[] entries handed out so far */
-       struct tgsi_src_register internal_temps[6];
-
-       /* track registers which need to synchronize w/ "complex alu" cat3
-        * instruction pipeline:
-        */
-       regmask_t needs_ss;
-
-       /* track registers which need to synchronize with texture fetch
-        * pipeline:
-        */
-       regmask_t needs_sy;
-
-       /* inputs start at r0, temporaries start after last input, and
-        * outputs start after last temporary.
-        *
-        * NOTE(review): compile_init() actually lays the files out as
-        * inputs, then outputs, then temporaries -- confirm which
-        * description is intended.
-        *
-        * We could be more clever, because this is not a hw restriction,
-        * but probably best just to implement an optimizing pass to
-        * reduce the # of registers used and get rid of redundant mov's
-        * (to output register).
-        */
-       unsigned base_reg[TGSI_FILE_COUNT];
-
-       /* idx/slot for last compiler generated immediate */
-       unsigned immediate_idx;
-
-       /* stack of instructions that start (potentially nested)
-        * branches, so that we can fix up the branch target on the
-        * corresponding END instruction
-        */
-       struct ir3_instruction *branch[16];
-       unsigned int branch_count;
-
-       /* used when dst is same as one of the src, to avoid overwriting a
-        * src element before the remaining scalar instructions that make
-        * up the vector operation
-        */
-       struct tgsi_dst_register tmp_dst;
-       struct tgsi_src_register *tmp_src;
-};
-
-
-static void vectorize(struct fd3_compile_context *ctx,
-               struct ir3_instruction *instr, struct tgsi_dst_register *dst,
-               int nsrcs, ...);
-static void create_mov(struct fd3_compile_context *ctx,
-               struct tgsi_dst_register *dst, struct tgsi_src_register *src);
-
-/* Prepare 'ctx' for translating 'tokens' into so->ir: run the TGSI
- * lowering pass, reset the per-compile bookkeeping, lay out the
- * register files and start the TGSI parser.
- *
- * Returns the result of tgsi_parse_init(); ctx->type is only filled in
- * when that result is TGSI_PARSE_OK.
- */
-static unsigned
-compile_init(struct fd3_compile_context *ctx, struct fd3_shader_variant *so,
-               const struct tgsi_token *tokens)
-{
-       const struct fd_lowering_config lconfig = {
-                       .color_two_side = so->key.color_two_side,
-                       .lower_DST  = true,
-                       .lower_XPD  = true,
-                       .lower_SCS  = true,
-                       .lower_LRP  = true,
-                       .lower_FRC  = true,
-                       .lower_POW  = true,
-                       .lower_LIT  = true,
-                       .lower_EXP  = true,
-                       .lower_LOG  = true,
-                       .lower_DP4  = true,
-                       .lower_DP3  = true,
-                       .lower_DPH  = true,
-                       .lower_DP2  = true,
-                       .lower_DP2A = true,
-       };
-       struct tgsi_shader_info *info = &ctx->info;
-       unsigned first_gpr = 0;
-       unsigned status;
-
-       ctx->tokens = fd_transform_lowering(&lconfig, tokens, &ctx->info);
-       if (ctx->tokens) {
-               /* we own the lowered copy and must free it later */
-               ctx->free_tokens = true;
-       } else {
-               /* no lowering */
-               ctx->free_tokens = false;
-               ctx->tokens = tokens;
-       }
-
-       ctx->so = so;
-       ctx->ir = so->ir;
-       ctx->block = ir3_block_create(ctx->ir, 0, 0, 0);
-
-       ctx->last_input = NULL;
-       ctx->last_rel = NULL;
-       ctx->next_inloc = 8;
-       ctx->num_internal_temps = 0;
-       ctx->branch_count = 0;
-
-       regmask_init(&ctx->needs_ss);
-       regmask_init(&ctx->needs_sy);
-       memset(ctx->base_reg, 0, sizeof(ctx->base_reg));
-
-       /* Immediates go after constants: */
-       ctx->base_reg[TGSI_FILE_CONSTANT]  = 0;
-       ctx->base_reg[TGSI_FILE_IMMEDIATE] =
-                       info->file_max[TGSI_FILE_CONSTANT] + 1;
-
-       /* if full precision and fragment shader, don't clobber
-        * r0.x w/ bary fetch:
-        */
-       if ((so->type == SHADER_FRAGMENT) && !so->key.half_precision)
-               first_gpr = 1;
-
-       /* Inputs first, then outputs, then temporaries; each file
-        * starts right after the last register of the previous one:
-        */
-       ctx->base_reg[TGSI_FILE_INPUT]     = first_gpr;
-       ctx->base_reg[TGSI_FILE_OUTPUT]    = ctx->base_reg[TGSI_FILE_INPUT] +
-                       info->file_max[TGSI_FILE_INPUT] + 1;
-       ctx->base_reg[TGSI_FILE_TEMPORARY] = ctx->base_reg[TGSI_FILE_OUTPUT] +
-                       info->file_max[TGSI_FILE_OUTPUT] + 1;
-
-       so->first_immediate = ctx->base_reg[TGSI_FILE_IMMEDIATE];
-       ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1);
-
-       status = tgsi_parse_init(&ctx->parser, ctx->tokens);
-       if (status == TGSI_PARSE_OK)
-               ctx->type = ctx->parser.FullHeader.Processor.Processor;
-
-       return status;
-}
-
-static void
-compile_error(struct fd3_compile_context *ctx, const char *format, ...)
-{
-       va_list ap;
-       va_start(ap, format);
-       _debug_vprintf(format, ap);
-       va_end(ap);
-       tgsi_dump(ctx->tokens, 0);
-       debug_assert(0);
-}
-
-#define compile_assert(ctx, cond) do { \
-               if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
-       } while (0)
-
-/* Release what compile_init() acquired: the lowered token copy (only
- * when we own it) and the TGSI parser state.
- */
-static void
-compile_free(struct fd3_compile_context *ctx)
-{
-       if (ctx->free_tokens) {
-               /* tokens is const-qualified in the context, hence the cast */
-               free((void *)ctx->tokens);
-       }
-
-       tgsi_parse_free(&ctx->parser);
-}
-
-struct instr_translater {      /* describes how one TGSI opcode is translated */
-       void (*fxn)(const struct instr_translater *t,
-                       struct fd3_compile_context *ctx,
-                       struct tgsi_full_instruction *inst);   /* handler that emits the native instruction(s) */
-       unsigned tgsi_opc;   /* TGSI opcode; handlers such as trans_cmp() switch on it */
-       opc_t opc;           /* native opcode, e.g. passed to instr_create() by trans_samp() */
-       opc_t hopc;    /* opc to use for half_precision mode, if different */
-       unsigned arg;        /* handler-specific extra value; trans_samp() switches on it */
-};
-
-/* If an instruction using relative addressing is pending, tag it with
- * IR3_INSTR_UL and clear the pending marker.
- */
-static void
-handle_last_rel(struct fd3_compile_context *ctx)
-{
-       struct ir3_instruction *pending = ctx->last_rel;
-
-       if (!pending)
-               return;
-
-       pending->flags |= IR3_INSTR_UL;
-       ctx->last_rel = NULL;
-}
-
-/* Create a new instruction of the given category/opcode in the block
- * currently being built.
- */
-static struct ir3_instruction *
-instr_create(struct fd3_compile_context *ctx, int category, opc_t opc)
-{
-       struct ir3_instruction *instr;
-
-       instr = ir3_instr_create(ctx->block, category, opc);
-
-       return instr;
-}
-
-/* Emit 'count' nop instructions. */
-static void
-add_nop(struct fd3_compile_context *ctx, unsigned count)
-{
-       unsigned emitted;
-
-       for (emitted = 0; emitted < count; emitted++)
-               instr_create(ctx, 0, OPC_NOP);
-}
-
-/* Work out which sync flags (IR3_INSTR_SS / IR3_INSTR_SY) an
- * instruction needs in order to read 'reg'.  When a flag is returned
- * the corresponding tracking mask is reset.
- */
-static unsigned
-src_flags(struct fd3_compile_context *ctx, struct ir3_register *reg)
-{
-       unsigned sync = 0;
-
-       /* no sync flags are produced for const/immediate sources: */
-       if (!(reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))) {
-               if (regmask_get(&ctx->needs_ss, reg)) {
-                       sync |= IR3_INSTR_SS;
-                       regmask_init(&ctx->needs_ss);
-               }
-
-               if (regmask_get(&ctx->needs_sy, reg)) {
-                       sync |= IR3_INSTR_SY;
-                       regmask_init(&ctx->needs_sy);
-               }
-       }
-
-       return sync;
-}
-
-/* Append the destination register for component 'chan' of TGSI dst
- * 'dst' to 'instr'.  OUTPUT/TEMPORARY registers are offset by the
- * file's base register, ADDRESS maps to REG_A0, and any other file is
- * reported as a compile error.  An indirect dst also records 'instr'
- * as the last relatively-addressed instruction.
- */
-static struct ir3_register *
-add_dst_reg(struct fd3_compile_context *ctx, struct ir3_instruction *instr,
-               const struct tgsi_dst_register *dst, unsigned chan)
-{
-       struct ir3_register *reg;
-       unsigned num = 0;
-       unsigned flags = 0;
-
-       if ((dst->File == TGSI_FILE_OUTPUT) ||
-                       (dst->File == TGSI_FILE_TEMPORARY)) {
-               num = dst->Index + ctx->base_reg[dst->File];
-       } else if (dst->File == TGSI_FILE_ADDRESS) {
-               num = REG_A0;
-       } else {
-               compile_error(ctx, "unsupported dst register file: %s\n",
-                       tgsi_file_name(dst->File));
-       }
-
-       if (ctx->so->key.half_precision)
-               flags |= IR3_REG_HALF;
-       if (dst->Indirect)
-               flags |= IR3_REG_RELATIV;
-
-       reg = ir3_reg_create(instr, regid(num, chan), flags);
-
-       if (dst->Indirect)
-               ctx->last_rel = instr;
-
-       return reg;
-}
-
-/* Append the source register for component 'chan' of TGSI src 'src'
- * to 'instr', translating the abs/negate/indirect modifiers into ir3
- * register flags and OR-ing any needed sync flags into the
- * instruction.  An indirect src also records 'instr' as the last
- * relatively-addressed instruction.
- */
-static struct ir3_register *
-add_src_reg(struct fd3_compile_context *ctx, struct ir3_instruction *instr,
-               const struct tgsi_src_register *src, unsigned chan)
-{
-       struct ir3_register *reg;
-       unsigned num = 0;
-       unsigned flags = 0;
-
-       /* TODO we need to use a mov to temp for const >= 64.. or maybe
-        * we could use relative addressing..
-        */
-       compile_assert(ctx, src->Index < 64);
-
-       if ((src->File == TGSI_FILE_IMMEDIATE) ||
-                       (src->File == TGSI_FILE_CONSTANT)) {
-               /* TODO if possible, use actual immediate instead of const.. but
-                * TGSI has vec4 immediates, we can only embed scalar (of limited
-                * size, depending on instruction..)
-                */
-               flags |= IR3_REG_CONST;
-               num = src->Index + ctx->base_reg[src->File];
-       } else if ((src->File == TGSI_FILE_OUTPUT) ||
-                       (src->File == TGSI_FILE_INPUT) ||
-                       (src->File == TGSI_FILE_TEMPORARY)) {
-               /* NOTE: we should only end up w/ OUTPUT file for things like
-                * clamp()'ing saturated dst instructions
-                */
-               num = src->Index + ctx->base_reg[src->File];
-       } else {
-               compile_error(ctx, "unsupported src register file: %s\n",
-                       tgsi_file_name(src->File));
-       }
-
-       if (src->Absolute)
-               flags |= IR3_REG_ABS;
-       if (src->Negate)
-               flags |= IR3_REG_NEGATE;
-       if (src->Indirect)
-               flags |= IR3_REG_RELATIV;
-       if (ctx->so->key.half_precision)
-               flags |= IR3_REG_HALF;
-
-       reg = ir3_reg_create(instr, regid(num, chan), flags);
-
-       if (src->Indirect)
-               ctx->last_rel = instr;
-
-       /* pick up (ss)/(sy) if this register is still pending: */
-       instr->flags |= src_flags(ctx, reg);
-
-       return reg;
-}
-
-/* Fill 'src' so that it reads back the register written by 'dst':
- * same file/index/addressing, no abs/negate, identity swizzle.
- */
-static void
-src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst)
-{
-       /* which register: */
-       src->File      = dst->File;
-       src->Index     = dst->Index;
-       src->Indirect  = dst->Indirect;
-       src->Dimension = dst->Dimension;
-
-       /* no modifiers: */
-       src->Negate    = 0;
-       src->Absolute  = 0;
-
-       /* .xyzw */
-       src->SwizzleX  = TGSI_SWIZZLE_X;
-       src->SwizzleY  = TGSI_SWIZZLE_Y;
-       src->SwizzleZ  = TGSI_SWIZZLE_Z;
-       src->SwizzleW  = TGSI_SWIZZLE_W;
-}
-
-/* Hand out the next internal temporary for the instruction sequence
- * generated from a single TGSI op.  'tmp_dst' is filled in as a
- * full-writemask TEMPORARY indexed past the shader's own temporaries;
- * the returned src (a slot of ctx->internal_temps[]) reads that same
- * register with an identity swizzle.
- */
-static struct tgsi_src_register *
-get_internal_temp(struct fd3_compile_context *ctx,
-               struct tgsi_dst_register *tmp_dst)
-{
-       /* assign next temporary: */
-       int n = ctx->num_internal_temps++;
-       struct tgsi_src_register *tmp_src;
-
-       compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps));
-       tmp_src = &ctx->internal_temps[n];
-
-       tmp_dst->File      = TGSI_FILE_TEMPORARY;
-       tmp_dst->Index     = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1;
-       tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
-       tmp_dst->Dimension = 0;
-       tmp_dst->Indirect  = 0;
-
-       src_from_dst(tmp_src, tmp_dst);
-
-       return tmp_src;
-}
-
-/* Like get_internal_temp(), but for a half-precision temporary.  In
- * half_precision mode this simply defers to get_internal_temp();
- * otherwise the temp is always register index 0 (callers add
- * IR3_REG_HALF themselves).
- */
-static struct tgsi_src_register *
-get_internal_temp_hr(struct fd3_compile_context *ctx,
-               struct tgsi_dst_register *tmp_dst)
-{
-       struct tgsi_src_register *tmp_src;
-       int n;
-
-       if (ctx->so->key.half_precision)
-               return get_internal_temp(ctx, tmp_dst);
-
-       /* assign next temporary: */
-       n = ctx->num_internal_temps++;
-       compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps));
-       tmp_src = &ctx->internal_temps[n];
-
-       tmp_dst->File      = TGSI_FILE_TEMPORARY;
-       /* just use hr0 because no one else should be using half-
-        * precision regs:
-        */
-       tmp_dst->Index     = 0;
-       tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
-       tmp_dst->Dimension = 0;
-       tmp_dst->Indirect  = 0;
-
-       src_from_dst(tmp_src, tmp_dst);
-
-       return tmp_src;
-}
-
-/* True when 'src' reads the CONSTANT or IMMEDIATE register file. */
-static inline bool
-is_const(struct tgsi_src_register *src)
-{
-       switch (src->File) {
-       case TGSI_FILE_CONSTANT:
-       case TGSI_FILE_IMMEDIATE:
-               return true;
-       default:
-               return false;
-       }
-}
-
-/* True when 'src' uses indirect (relative) addressing. */
-static inline bool
-is_relative(struct tgsi_src_register *src)
-{
-       return src->Indirect ? true : false;
-}
-
-/* True when 'src' is relatively addressed or lives in a const file. */
-static inline bool
-is_rel_or_const(struct tgsi_src_register *src)
-{
-       if (is_relative(src))
-               return true;
-
-       return is_const(src);
-}
-
-/* Float type matching the variant's precision: f16 or f32. */
-static type_t
-get_ftype(struct fd3_compile_context *ctx)
-{
-       if (ctx->so->key.half_precision)
-               return TYPE_F16;
-
-       return TYPE_F32;
-}
-
-/* Unsigned type matching the variant's precision: u16 or u32. */
-static type_t
-get_utype(struct fd3_compile_context *ctx)
-{
-       if (ctx->so->key.half_precision)
-               return TYPE_U16;
-
-       return TYPE_U32;
-}
-
-/* Return the swizzle 'src' applies to channel 'chan' (0..3 = x..w).
- * Any other channel number trips an assert and yields 0.
- */
-static unsigned
-src_swiz(struct tgsi_src_register *src, int chan)
-{
-       if (chan == 0)
-               return src->SwizzleX;
-       if (chan == 1)
-               return src->SwizzleY;
-       if (chan == 2)
-               return src->SwizzleZ;
-       if (chan == 3)
-               return src->SwizzleW;
-
-       assert(0);
-       return 0;
-}
-
-/* For instructions that cannot take a const (or relatively addressed)
- * register as src: copy 'src' into a fresh internal temporary and
- * return a src register that reads the copy.
- */
-static struct tgsi_src_register *
-get_unconst(struct fd3_compile_context *ctx, struct tgsi_src_register *src)
-{
-       struct tgsi_src_register *copy_src;
-       struct tgsi_dst_register copy_dst;
-
-       compile_assert(ctx, is_rel_or_const(src));
-
-       copy_src = get_internal_temp(ctx, &copy_dst);
-       create_mov(ctx, &copy_dst, src);
-
-       return copy_src;
-}
-
-static void
-get_immediate(struct fd3_compile_context *ctx,
-               struct tgsi_src_register *reg, uint32_t val)
-{      /* Fill 'reg' in as a scalar TGSI_FILE_IMMEDIATE source (all four
-        * swizzles select the same component) referring to a slot of
-        * ctx->so->immediates[] that holds 'val'.  An existing slot is
-        * reused when it matches 'val' (or its negation, with Negate
-        * set); otherwise 'val' is appended as a new immediate.
-        */
-       unsigned neg, swiz, idx, i;
-       /* actually maps 1:1 currently.. not sure if that is safe to rely on: */
-       static const unsigned swiz2tgsi[] = {
-                       TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W,
-       };
-
-       for (i = 0; i < ctx->immediate_idx; i++) {      /* scan every scalar slot used so far */
-               swiz = i % 4;
-               idx  = i / 4;
-
-               if (ctx->so->immediates[idx].val[swiz] == val) {
-                       neg = 0;
-                       break;
-               }
-
-               if (ctx->so->immediates[idx].val[swiz] == -val) {       /* NOTE(review): -val negates the raw
-                        * 32-bit value, while callers in this file pass fui() float
-                        * bit patterns -- confirm this agrees with how the Negate
-                        * modifier is applied by the consuming instruction
-                        */
-                       neg = 1;
-                       break;
-               }
-       }
-
-       if (i == ctx->immediate_idx) {
-               /* need to generate a new immediate:
-                *
-                * NOTE(review): no bound on immediates[] is checked here --
-                * confirm the array is sized for compiler-generated entries
-                */
-               swiz = i % 4;
-               idx  = i / 4;
-               neg  = 0;
-               ctx->so->immediates[idx].val[swiz] = val;
-               ctx->so->immediates_count = idx + 1;
-               ctx->immediate_idx++;
-       }
-
-       reg->File      = TGSI_FILE_IMMEDIATE;
-       reg->Indirect  = 0;
-       reg->Dimension = 0;
-       reg->Index     = idx;
-       reg->Absolute  = 0;
-       reg->Negate    = neg;
-       reg->SwizzleX  = swiz2tgsi[swiz];
-       reg->SwizzleY  = swiz2tgsi[swiz];
-       reg->SwizzleZ  = swiz2tgsi[swiz];
-       reg->SwizzleW  = swiz2tgsi[swiz];
-}
-
-/* Copy 'src' to 'dst' one component at a time, honouring the dst
- * writemask and the src swizzle.  A nop is emitted in place of each
- * masked-out component, so four instructions are always generated.
- */
-static void
-create_mov(struct fd3_compile_context *ctx, struct tgsi_dst_register *dst,
-               struct tgsi_src_register *src)
-{
-       type_t type_mov = get_ftype(ctx);
-       unsigned chan;
-
-       for (chan = 0; chan < 4; chan++) {
-               struct ir3_instruction *instr;
-
-               if (!(dst->WriteMask & (1 << chan))) {
-                       add_nop(ctx, 1);
-                       continue;
-               }
-
-               /* move to destination: */
-               if (src->Absolute || src->Negate) {
-                       /* can't have abs or neg on a mov instr, so use
-                        * absneg.f instead to handle these cases:
-                        */
-                       instr = instr_create(ctx, 2, OPC_ABSNEG_F);
-               } else {
-                       instr = instr_create(ctx, 1, 0);
-                       instr->cat1.src_type = type_mov;
-                       instr->cat1.dst_type = type_mov;
-               }
-
-               add_dst_reg(ctx, instr, dst, chan);
-               add_src_reg(ctx, instr, src, src_swiz(src, chan));
-       }
-}
-
-/* Emit a vectorized max.f(val, minval) followed by a vectorized
- * min.f(val, maxval), both writing 'dst'.
- */
-static void
-create_clamp(struct fd3_compile_context *ctx,
-               struct tgsi_dst_register *dst, struct tgsi_src_register *val,
-               struct tgsi_src_register *minval, struct tgsi_src_register *maxval)
-{
-       struct ir3_instruction *lower_bound;
-       struct ir3_instruction *upper_bound;
-
-       lower_bound = instr_create(ctx, 2, OPC_MAX_F);
-       vectorize(ctx, lower_bound, dst, 2, val, 0, minval, 0);
-
-       upper_bound = instr_create(ctx, 2, OPC_MIN_F);
-       vectorize(ctx, upper_bound, dst, 2, val, 0, maxval, 0);
-}
-
-/* Clamp 'dst' in place between two immediate values: the register is
- * read back as the value operand and the bounds are fetched (or
- * allocated) through get_immediate().
- */
-static void
-create_clamp_imm(struct fd3_compile_context *ctx,
-               struct tgsi_dst_register *dst,
-               uint32_t minval, uint32_t maxval)
-{
-       struct tgsi_src_register self;
-       struct tgsi_src_register lo, hi;
-
-       src_from_dst(&self, dst);
-
-       get_immediate(ctx, &lo, minval);
-       get_immediate(ctx, &hi, maxval);
-
-       create_clamp(ctx, dst, &self, &lo, &hi);
-}
-
-/* Pick the destination to use while expanding 'inst'.  Normally that
- * is the instruction's own dst, but when a source names the same
- * register (other than as a plain full-writemask, identity-swizzle
- * read) an internal temporary carrying the original writemask is
- * returned instead; put_dst() later copies it back.
- */
-static struct tgsi_dst_register *
-get_dst(struct fd3_compile_context *ctx, struct tgsi_full_instruction *inst)
-{
-       struct tgsi_dst_register *dst = &inst->Dst[0].Register;
-       unsigned i;
-
-       for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
-               struct tgsi_src_register *src = &inst->Src[i].Register;
-
-               /* different register, no hazard: */
-               if ((src->File != dst->File) || (src->Index != dst->Index))
-                       continue;
-
-               /* same register, but each component only reads itself: */
-               if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) &&
-                               (src->SwizzleX == TGSI_SWIZZLE_X) &&
-                               (src->SwizzleY == TGSI_SWIZZLE_Y) &&
-                               (src->SwizzleZ == TGSI_SWIZZLE_Z) &&
-                               (src->SwizzleW == TGSI_SWIZZLE_W))
-                       continue;
-
-               ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst);
-               ctx->tmp_dst.WriteMask = dst->WriteMask;
-
-               return &ctx->tmp_dst;
-       }
-
-       return dst;
-}
-
-/* Counterpart of get_dst(): when 'dst' is not the instruction's own
- * destination, move the temporary result back into it.
- */
-static void
-put_dst(struct fd3_compile_context *ctx, struct tgsi_full_instruction *inst,
-               struct tgsi_dst_register *dst)
-{
-       struct tgsi_dst_register *real_dst = &inst->Dst[0].Register;
-
-       if (dst == real_dst)
-               return;
-
-       /* add mov back into original dst: */
-       create_mov(ctx, real_dst, ctx->tmp_src);
-}
-
-/* helper to generate the necessary repeat and/or additional instructions
- * to turn a scalar instruction into a vector operation:
- *
- * 'instr' is the already-created scalar instruction; it is used for
- * the first component enabled in dst->WriteMask and cloned for each
- * further enabled component.  The variable arguments are 'nsrcs'
- * pairs of (struct tgsi_src_register *src, unsigned flags); when
- * flags contains IR3_REG_IMMED the "pointer" argument is not a
- * register but carries the immediate value itself.  Nops are added
- * for the components that are not written.
- */
-static void
-vectorize(struct fd3_compile_context *ctx, struct ir3_instruction *instr,
-               struct tgsi_dst_register *dst, int nsrcs, ...)
-{
-       va_list ap;
-       int i, j, n = 0;
-       bool indirect = dst->Indirect;
-
-       add_dst_reg(ctx, instr, dst, TGSI_SWIZZLE_X);
-
-       va_start(ap, nsrcs);
-       for (j = 0; j < nsrcs; j++) {
-               struct tgsi_src_register *src =
-                               va_arg(ap, struct tgsi_src_register *);
-               unsigned flags = va_arg(ap, unsigned);
-               struct ir3_register *reg;
-               if (flags & IR3_REG_IMMED) {
-                       reg = ir3_reg_create(instr, 0, IR3_REG_IMMED);
-                       /* this is an ugly cast.. should have put flags first!
-                        *
-                        * NOTE(review): this reads the first sizeof(int) bytes of
-                        * the pointer variable, which presumably only yields the
-                        * intended value on little-endian targets -- confirm
-                        */
-                       reg->iim_val = *(int *)&src;
-               } else {
-                       reg = add_src_reg(ctx, instr, src, TGSI_SWIZZLE_X);
-                       indirect |= src->Indirect;
-               }
-               reg->flags |= flags & ~IR3_REG_NEGATE;
-               if (flags & IR3_REG_NEGATE)     /* a requested negate toggles the register's own negate */
-                       reg->flags ^= IR3_REG_NEGATE;
-       }
-       va_end(ap);
-
-       for (i = 0; i < 4; i++) {
-               if (dst->WriteMask & (1 << i)) {
-                       struct ir3_instruction *cur;
-
-                       if (n++ == 0) {
-                               cur = instr;
-                       } else {
-                               cur = ir3_instr_clone(instr);
-                               cur->flags &= ~(IR3_INSTR_SY | IR3_INSTR_SS | IR3_INSTR_JP);
-                       }
-
-                       /* fix-up dst register component: */
-                       cur->regs[0]->num = regid(cur->regs[0]->num >> 2, i);
-
-                       /* fix-up src register component: */
-                       va_start(ap, nsrcs);
-                       for (j = 0; j < nsrcs; j++) {
-                               struct tgsi_src_register *src =
-                                               va_arg(ap, struct tgsi_src_register *);
-                               unsigned flags = va_arg(ap, unsigned);
-                               if (!(flags & IR3_REG_IMMED)) {
-                                       cur->regs[j+1]->num =
-                                                       regid(cur->regs[j+1]->num >> 2,
-                                                                       src_swiz(src, i));
-                                       cur->flags |= src_flags(ctx, cur->regs[j+1]);
-                               }
-                       }
-                       va_end(ap);
-
-                       if (indirect)
-                               ctx->last_rel = cur;
-               }
-       }
-
-       /* pad w/ nop's.. at least until we are clever enough to
-        * figure out if we really need to..
-        */
-       add_nop(ctx, 4 - n);
-}
-
-/*
- * Handlers for TGSI instructions which do not have a 1:1 mapping to
- * native instructions:
- */
-
-/* CLAMP(val, min, max): clamp Src[0] between Src[1] and Src[2]. */
-static void
-trans_clamp(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       struct tgsi_dst_register *dst = get_dst(ctx, inst);
-       struct tgsi_src_register *val    = &inst->Src[0].Register;
-       struct tgsi_src_register *minval = &inst->Src[1].Register;
-       struct tgsi_src_register *maxval = &inst->Src[2].Register;
-
-       create_clamp(ctx, dst, val, minval, maxval);
-
-       /* copy back if get_dst() substituted a temporary: */
-       put_dst(ctx, inst, dst);
-}
-
-/* ARL(x) = x, but mova from hrN.x to a0..
- *
- * Emitted sequence: convert the source to s16 in a half-precision
- * temp, shift that left by 2, then move it into the address register,
- * with nop padding after each step.
- */
-static void
-trans_arl(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       struct tgsi_dst_register *dst = &inst->Dst[0].Register;
-       struct tgsi_src_register *src = &inst->Src[0].Register;
-       struct tgsi_dst_register tmp_dst;
-       struct tgsi_src_register *tmp_src;
-       struct ir3_instruction *instr;
-       struct ir3_register *reg;
-       unsigned chan = src->SwizzleX;
-
-       compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS);
-
-       handle_last_rel(ctx);
-
-       tmp_src = get_internal_temp_hr(ctx, &tmp_dst);
-
-       /* cov.{f32,f16}s16 Rtmp, Rsrc */
-       instr = instr_create(ctx, 1, 0);
-       instr->cat1.src_type = get_ftype(ctx);
-       instr->cat1.dst_type = TYPE_S16;
-       reg = add_dst_reg(ctx, instr, &tmp_dst, chan);
-       reg->flags |= IR3_REG_HALF;
-       add_src_reg(ctx, instr, src, chan);
-
-       add_nop(ctx, 3);
-
-       /* shl.b Rtmp, Rtmp, 2 */
-       instr = instr_create(ctx, 2, OPC_SHL_B);
-       reg = add_dst_reg(ctx, instr, &tmp_dst, chan);
-       reg->flags |= IR3_REG_HALF;
-       reg = add_src_reg(ctx, instr, tmp_src, chan);
-       reg->flags |= IR3_REG_HALF;
-       reg = ir3_reg_create(instr, 0, IR3_REG_IMMED);
-       reg->iim_val = 2;
-
-       add_nop(ctx, 3);
-
-       /* mova a0, Rtmp */
-       instr = instr_create(ctx, 1, 0);
-       instr->cat1.src_type = TYPE_S16;
-       instr->cat1.dst_type = TYPE_S16;
-       reg = add_dst_reg(ctx, instr, dst, 0);
-       reg->flags |= IR3_REG_HALF;
-       reg = add_src_reg(ctx, instr, tmp_src, chan);
-       reg->flags |= IR3_REG_HALF;
-
-       /* need to ensure 5 instr slots before a0 is used (6 nops are
-        * emitted here):
-        */
-       add_nop(ctx, 6);
-}
-
-/* texture fetch/sample instructions:
- *
- * Emits a cat5 sample instruction for TEX or TXP (selected by t->arg).
- * The coordinate is first copied into an internal temporary when it
- * is const/relative, or when its swizzle does not already place the
- * needed components in successive channels.  The sampled dst register
- * is then marked as needing (sy) before its next use.
- */
-static void
-trans_samp(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       /* Source channel feeding each successive coordinate component;
-        * -1 ends the list.  These have static storage so that 'order'
-        * stays valid for the whole function: a compound literal created
-        * inside the inner if/else blocks below would cease to exist at
-        * that block's closing brace.
-        */
-       static const int8_t order_tex_2d[4]    = { 0,  1, -1, -1 };
-       static const int8_t order_tex_other[4] = { 0,  1,  2, -1 };
-       static const int8_t order_txp_2d[4]    = { 0,  1,  3, -1 };
-       static const int8_t order_txp_other[4] = { 0,  1,  2,  3 };
-       struct ir3_register *r;
-       struct ir3_instruction *instr;
-       struct tgsi_src_register *coord = &inst->Src[0].Register;
-       struct tgsi_src_register *samp  = &inst->Src[1].Register;
-       unsigned tex = inst->Texture.Texture;
-       const int8_t *order;
-       unsigned i, flags = 0, src_wrmask;
-       bool needs_mov = false;
-
-       switch (t->arg) {
-       case TGSI_OPCODE_TEX:
-               if (tex == TGSI_TEXTURE_2D) {
-                       order = order_tex_2d;
-                       src_wrmask = TGSI_WRITEMASK_XY;
-               } else {
-                       order = order_tex_other;
-                       src_wrmask = TGSI_WRITEMASK_XYZ;
-               }
-               break;
-       case TGSI_OPCODE_TXP:
-               if (tex == TGSI_TEXTURE_2D) {
-                       order = order_txp_2d;
-                       src_wrmask = TGSI_WRITEMASK_XYZ;
-               } else {
-                       order = order_txp_other;
-                       src_wrmask = TGSI_WRITEMASK_XYZW;
-               }
-               flags |= IR3_INSTR_P;
-               break;
-       default:
-               /* unsupported opcode: report it and emit nothing, rather
-                * than carrying on with order/src_wrmask unset
-                */
-               compile_assert(ctx, 0);
-               return;
-       }
-
-       if ((tex == TGSI_TEXTURE_3D) || (tex == TGSI_TEXTURE_CUBE)) {
-               add_nop(ctx, 3);
-               flags |= IR3_INSTR_3D;
-       }
-
-       /* cat5 instruction cannot seem to handle const or relative: */
-       if (is_rel_or_const(coord))
-               needs_mov = true;
-
-       /* The texture sample instructions need the coord in successive
-        * registers/components (ie. src.xy but not src.yx).  And TXP
-        * needs the .w component in .z for 2D..  so in some cases we
-        * might need to emit some mov instructions to shuffle things
-        * around:
-        */
-       for (i = 1; (i < 4) && (order[i] >= 0) && !needs_mov; i++)
-               if (src_swiz(coord, i) != (src_swiz(coord, 0) + order[i]))
-                       needs_mov = true;
-
-       if (needs_mov) {
-               struct tgsi_dst_register tmp_dst;
-               struct tgsi_src_register *tmp_src;
-               unsigned j;
-
-               type_t type_mov = get_ftype(ctx);
-
-               /* need to move things around: */
-               tmp_src = get_internal_temp(ctx, &tmp_dst);
-
-               for (j = 0; (j < 4) && (order[j] >= 0); j++) {
-                       instr = instr_create(ctx, 1, 0);
-                       instr->cat1.src_type = type_mov;
-                       instr->cat1.dst_type = type_mov;
-                       add_dst_reg(ctx, instr, &tmp_dst, j);
-                       add_src_reg(ctx, instr, coord,
-                                       src_swiz(coord, order[j]));
-               }
-
-               coord = tmp_src;
-
-               /* pad the mov sequence out to four instructions: */
-               add_nop(ctx, 4 - j);
-       }
-
-       instr = instr_create(ctx, 5, t->opc);
-       instr->cat5.type = get_ftype(ctx);
-       instr->cat5.samp = samp->Index;
-       instr->cat5.tex  = samp->Index;
-       instr->flags |= flags;
-
-       r = add_dst_reg(ctx, instr, &inst->Dst[0].Register, 0);
-       r->wrmask = inst->Dst[0].Register.WriteMask;
-
-       add_src_reg(ctx, instr, coord, coord->SwizzleX)->wrmask = src_wrmask;
-
-       /* after add_src_reg() so we don't set (sy) on sam instr itself! */
-       regmask_set(&ctx->needs_sy, r);
-}
-
-/*
- * SEQ(a,b) = (a == b) ? 1.0 : 0.0
- *   cmps.f.eq tmp0, b, a
- *   cov.u16f16 dst, tmp0
- *
- * SNE(a,b) = (a != b) ? 1.0 : 0.0
- *   cmps.f.eq tmp0, b, a
- *   add.s tmp0, tmp0, -1
- *   sel.f16 dst, {0.0}, tmp0, {1.0}
- *
- * SGE(a,b) = (a >= b) ? 1.0 : 0.0
- *   cmps.f.ge tmp0, a, b
- *   cov.u16f16 dst, tmp0
- *
- * SLE(a,b) = (a <= b) ? 1.0 : 0.0
- *   cmps.f.ge tmp0, b, a
- *   cov.u16f16 dst, tmp0
- *
- * SGT(a,b) = (a > b)  ? 1.0 : 0.0
- *   cmps.f.ge tmp0, b, a
- *   add.s tmp0, tmp0, -1
- *   sel.f16 dst, {0.0}, tmp0, {1.0}
- *
- * SLT(a,b) = (a < b)  ? 1.0 : 0.0
- *   cmps.f.ge tmp0, a, b
- *   add.s tmp0, tmp0, -1
- *   sel.f16 dst, {0.0}, tmp0, {1.0}
- *
- * CMP(a,b,c) = (a < 0.0) ? b : c
- *   cmps.f.ge tmp0, a, {0.0}
- *   add.s tmp0, tmp0, -1
- *   sel.f16 dst, c, tmp0, b
- */
-static void
-trans_cmp(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       struct ir3_instruction *instr;
-       struct tgsi_dst_register tmp_dst;
-       struct tgsi_src_register *tmp_src;
-       struct tgsi_src_register constval0, constval1;
-       /* final instruction for CMP() uses orig src1 and src2: */
-       struct tgsi_dst_register *dst = get_dst(ctx, inst);
-       struct tgsi_src_register *a0, *a1;
-       unsigned condition;
-
-       tmp_src = get_internal_temp(ctx, &tmp_dst);
-
-       switch (t->tgsi_opc) {
-       case TGSI_OPCODE_SEQ:
-       case TGSI_OPCODE_SNE:
-               a0 = &inst->Src[1].Register;  /* b */
-               a1 = &inst->Src[0].Register;  /* a */
-               condition = IR3_COND_EQ;
-               break;
-       case TGSI_OPCODE_SGE:
-       case TGSI_OPCODE_SLT:
-               a0 = &inst->Src[0].Register;  /* a */
-               a1 = &inst->Src[1].Register;  /* b */
-               condition = IR3_COND_GE;
-               break;
-       case TGSI_OPCODE_SLE:
-       case TGSI_OPCODE_SGT:
-               a0 = &inst->Src[1].Register;  /* b */
-               a1 = &inst->Src[0].Register;  /* a */
-               condition = IR3_COND_GE;
-               break;
-       case TGSI_OPCODE_CMP:
-               get_immediate(ctx, &constval0, fui(0.0));
-               a0 = &inst->Src[0].Register;  /* a */
-               a1 = &constval0;              /* {0.0} */
-               condition = IR3_COND_GE;
-               break;
-       default:
-               compile_assert(ctx, 0);
-               return;
-       }
-
-       if (is_const(a0) && is_const(a1))
-               a0 = get_unconst(ctx, a0);
-
-       /* cmps.f.ge tmp, a0, a1 */
-       instr = instr_create(ctx, 2, OPC_CMPS_F);
-       instr->cat2.condition = condition;
-       vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);
-
-       switch (t->tgsi_opc) {
-       case TGSI_OPCODE_SEQ:
-       case TGSI_OPCODE_SGE:
-       case TGSI_OPCODE_SLE:
-               /* cov.u16f16 dst, tmp0 */
-               instr = instr_create(ctx, 1, 0);
-               instr->cat1.src_type = get_utype(ctx);
-               instr->cat1.dst_type = get_ftype(ctx);
-               vectorize(ctx, instr, dst, 1, tmp_src, 0);
-               break;
-       case TGSI_OPCODE_SNE:
-       case TGSI_OPCODE_SGT:
-       case TGSI_OPCODE_SLT:
-       case TGSI_OPCODE_CMP:
-               /* add.s tmp, tmp, -1 */
-               instr = instr_create(ctx, 2, OPC_ADD_S);
-               vectorize(ctx, instr, &tmp_dst, 2, tmp_src, 0, -1, IR3_REG_IMMED);
-
-               if (t->tgsi_opc == TGSI_OPCODE_CMP) {
-                       /* sel.{f32,f16} dst, src2, tmp, src1 */
-                       instr = instr_create(ctx, 3,
-                                       ctx->so->key.half_precision ? OPC_SEL_F16 : OPC_SEL_F32);
-                       vectorize(ctx, instr, dst, 3,
-                                       &inst->Src[2].Register, 0,
-                                       tmp_src, 0,
-                                       &inst->Src[1].Register, 0);
-               } else {
-                       get_immediate(ctx, &constval0, fui(0.0));
-                       get_immediate(ctx, &constval1, fui(1.0));
-                       /* sel.{f32,f16} dst, {0.0}, tmp0, {1.0} */
-                       instr = instr_create(ctx, 3,
-                                       ctx->so->key.half_precision ? OPC_SEL_F16 : OPC_SEL_F32);
-                       vectorize(ctx, instr, dst, 3,
-                                       &constval0, 0, tmp_src, 0, &constval1, 0);
-               }
-
-               break;
-       }
-
-       put_dst(ctx, inst, dst);
-}
-
-/*
- * Conditional / Flow control
- */
-
-static unsigned
-find_instruction(struct fd3_compile_context *ctx, struct ir3_instruction *instr)
-{
-       unsigned i;
-       for (i = 0; i < ctx->ir->instrs_count; i++)
-               if (ctx->ir->instrs[i] == instr)
-                       return i;
-       return ~0;
-}
-
-static void
-push_branch(struct fd3_compile_context *ctx, struct ir3_instruction *instr)
-{
-       ctx->branch[ctx->branch_count++] = instr;
-}
-
-static void
-pop_branch(struct fd3_compile_context *ctx)
-{
-       struct ir3_instruction *instr;
-
-       /* if we were clever enough, we'd patch this up after the fact,
-        * and set (jp) flag on whatever the next instruction was, rather
-        * than inserting an extra nop..
-        */
-       instr = instr_create(ctx, 0, OPC_NOP);
-       instr->flags |= IR3_INSTR_JP;
-
-       /* pop the branch instruction from the stack and fix up branch target: */
-       instr = ctx->branch[--ctx->branch_count];
-       instr->cat0.immed = ctx->ir->instrs_count - find_instruction(ctx, instr) - 1;
-}
-
-/* We probably don't really want to translate if/else/endif into branches..
- * the blob driver evaluates both legs of the if and then uses the sel
- * instruction to pick which sides of the branch to "keep".. but figuring
- * that out will take somewhat more compiler smarts.  So hopefully branches
- * don't kill performance too badly.
- */
-static void
-trans_if(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       struct ir3_instruction *instr;
-       struct tgsi_src_register *src = &inst->Src[0].Register;
-       struct tgsi_src_register constval;
-
-       get_immediate(ctx, &constval, fui(0.0));
-
-       if (is_const(src))
-               src = get_unconst(ctx, src);
-
-       instr = instr_create(ctx, 2, OPC_CMPS_F);
-       ir3_reg_create(instr, regid(REG_P0, 0), 0);
-       add_src_reg(ctx, instr, src, src->SwizzleX);
-       add_src_reg(ctx, instr, &constval, constval.SwizzleX);
-       instr->cat2.condition = IR3_COND_EQ;
-
-       instr = instr_create(ctx, 0, OPC_BR);
-       push_branch(ctx, instr);
-}
-
-static void
-trans_else(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       struct ir3_instruction *instr;
-
-       /* for first half of if/else/endif, generate a jump past the else: */
-       instr = instr_create(ctx, 0, OPC_JUMP);
-
-       pop_branch(ctx);
-       push_branch(ctx, instr);
-}
-
-static void
-trans_endif(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       pop_branch(ctx);
-}
-
-/*
- * Handlers for TGSI instructions which do have 1:1 mapping to native
- * instructions:
- */
-
-static void
-instr_cat0(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       instr_create(ctx, 0, t->opc);
-}
-
-static void
-instr_cat1(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       struct tgsi_dst_register *dst = get_dst(ctx, inst);
-       struct tgsi_src_register *src = &inst->Src[0].Register;
-
-       /* mov instructions can't handle a negate on src: */
-       if (src->Negate) {
-               struct tgsi_src_register constval;
-               struct ir3_instruction *instr;
-
-               /* since right now, we are using uniformly either TYPE_F16 or
-                * TYPE_F32, and we don't utilize the conversion possibilities
-                * of mov instructions, we can get away with substituting an
-                * add.f which can handle negate.  Might need to revisit this
-                * in the future if we start supporting widening/narrowing or
-                * conversion to/from integer..
-                */
-               instr = instr_create(ctx, 2, OPC_ADD_F);
-               get_immediate(ctx, &constval, fui(0.0));
-               vectorize(ctx, instr, dst, 2, src, 0, &constval, 0);
-       } else {
-               create_mov(ctx, dst, src);
-               /* create_mov() generates vector sequence, so no vectorize() */
-       }
-       put_dst(ctx, inst, dst);
-}
-
-static void
-instr_cat2(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       struct tgsi_dst_register *dst = get_dst(ctx, inst);
-       struct tgsi_src_register *src0 = &inst->Src[0].Register;
-       struct tgsi_src_register *src1 = &inst->Src[1].Register;
-       struct ir3_instruction *instr;
-       unsigned src0_flags = 0, src1_flags = 0;
-
-       switch (t->tgsi_opc) {
-       case TGSI_OPCODE_ABS:
-               src0_flags = IR3_REG_ABS;
-               break;
-       case TGSI_OPCODE_SUB:
-               src1_flags = IR3_REG_NEGATE;
-               break;
-       }
-
-       switch (t->opc) {
-       case OPC_ABSNEG_F:
-       case OPC_ABSNEG_S:
-       case OPC_CLZ_B:
-       case OPC_CLZ_S:
-       case OPC_SIGN_F:
-       case OPC_FLOOR_F:
-       case OPC_CEIL_F:
-       case OPC_RNDNE_F:
-       case OPC_RNDAZ_F:
-       case OPC_TRUNC_F:
-       case OPC_NOT_B:
-       case OPC_BFREV_B:
-       case OPC_SETRM:
-       case OPC_CBITS_B:
-               /* these only have one src reg */
-               instr = instr_create(ctx, 2, t->opc);
-               vectorize(ctx, instr, dst, 1, src0, src0_flags);
-               break;
-       default:
-               if (is_const(src0) && is_const(src1))
-                       src0 = get_unconst(ctx, src0);
-
-               instr = instr_create(ctx, 2, t->opc);
-               vectorize(ctx, instr, dst, 2, src0, src0_flags,
-                               src1, src1_flags);
-               break;
-       }
-
-       put_dst(ctx, inst, dst);
-}
-
-static void
-instr_cat3(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       struct tgsi_dst_register *dst = get_dst(ctx, inst);
-       struct tgsi_src_register *src0 = &inst->Src[0].Register;
-       struct tgsi_src_register *src1 = &inst->Src[1].Register;
-       struct ir3_instruction *instr;
-
-       /* in particular, can't handle const for src1 for cat3..
-        * for mad, we can swap first two src's if needed:
-        */
-       if (is_rel_or_const(src1)) {
-               if (is_mad(t->opc) && !is_rel_or_const(src0)) {
-                       struct tgsi_src_register *tmp;
-                       tmp = src0;
-                       src0 = src1;
-                       src1 = tmp;
-               } else {
-                       src1 = get_unconst(ctx, src1);
-               }
-       }
-
-       instr = instr_create(ctx, 3,
-                       ctx->so->key.half_precision ? t->hopc : t->opc);
-       vectorize(ctx, instr, dst, 3, src0, 0, src1, 0,
-                       &inst->Src[2].Register, 0);
-       put_dst(ctx, inst, dst);
-}
-
-static void
-instr_cat4(const struct instr_translater *t,
-               struct fd3_compile_context *ctx,
-               struct tgsi_full_instruction *inst)
-{
-       struct tgsi_dst_register *dst = get_dst(ctx, inst);
-       struct tgsi_src_register *src = &inst->Src[0].Register;
-       struct ir3_instruction *instr;
-       unsigned i, n;
-
-       /* seems like blob compiler avoids const as src.. */
-       if (is_const(src))
-               src = get_unconst(ctx, src);
-
-       /* worst case: */
-       add_nop(ctx, 6);
-
-       /* we need to replicate into each component: */
-       for (i = 0, n = 0; i < 4; i++) {
-               if (dst->WriteMask & (1 << i)) {
-                       if (n++)
-                               add_nop(ctx, 1);
-                       instr = instr_create(ctx, 4, t->opc);
-                       add_dst_reg(ctx, instr, dst, i);
-                       add_src_reg(ctx, instr, src, src->SwizzleX);
-               }
-       }
-
-       regmask_set(&ctx->needs_ss, instr->regs[0]);
-       put_dst(ctx, inst, dst);
-}
-
-static const struct instr_translater translaters[TGSI_OPCODE_LAST] = {
-#define INSTR(n, f, ...) \
-       [TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ }
-
-       INSTR(MOV,          instr_cat1),
-       INSTR(RCP,          instr_cat4, .opc = OPC_RCP),
-       INSTR(RSQ,          instr_cat4, .opc = OPC_RSQ),
-       INSTR(SQRT,         instr_cat4, .opc = OPC_SQRT),
-       INSTR(MUL,          instr_cat2, .opc = OPC_MUL_F),
-       INSTR(ADD,          instr_cat2, .opc = OPC_ADD_F),
-       INSTR(SUB,          instr_cat2, .opc = OPC_ADD_F),
-       INSTR(MIN,          instr_cat2, .opc = OPC_MIN_F),
-       INSTR(MAX,          instr_cat2, .opc = OPC_MAX_F),
-       INSTR(MAD,          instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16),
-       INSTR(TRUNC,        instr_cat2, .opc = OPC_TRUNC_F),
-       INSTR(CLAMP,        trans_clamp),
-       INSTR(FLR,          instr_cat2, .opc = OPC_FLOOR_F),
-       INSTR(ROUND,        instr_cat2, .opc = OPC_RNDNE_F),
-       INSTR(SSG,          instr_cat2, .opc = OPC_SIGN_F),
-       INSTR(ARL,          trans_arl),
-       INSTR(EX2,          instr_cat4, .opc = OPC_EXP2),
-       INSTR(LG2,          instr_cat4, .opc = OPC_LOG2),
-       INSTR(ABS,          instr_cat2, .opc = OPC_ABSNEG_F),
-       INSTR(COS,          instr_cat4, .opc = OPC_COS),
-       INSTR(SIN,          instr_cat4, .opc = OPC_SIN),
-       INSTR(TEX,          trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TEX),
-       INSTR(TXP,          trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TXP),
-       INSTR(SGT,          trans_cmp),
-       INSTR(SLT,          trans_cmp),
-       INSTR(SGE,          trans_cmp),
-       INSTR(SLE,          trans_cmp),
-       INSTR(SNE,          trans_cmp),
-       INSTR(SEQ,          trans_cmp),
-       INSTR(CMP,          trans_cmp),
-       INSTR(IF,           trans_if),
-       INSTR(ELSE,         trans_else),
-       INSTR(ENDIF,        trans_endif),
-       INSTR(END,          instr_cat0, .opc = OPC_END),
-       INSTR(KILL,         instr_cat0, .opc = OPC_KILL),
-};
-
-static fd3_semantic
-decl_semantic(const struct tgsi_declaration_semantic *sem)
-{
-       return fd3_semantic_name(sem->Name, sem->Index);
-}
-
-static int
-decl_in(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl)
-{
-       struct fd3_shader_variant *so = ctx->so;
-       unsigned base = ctx->base_reg[TGSI_FILE_INPUT];
-       unsigned i, flags = 0;
-       int nop = 0;
-
-       /* I don't think we should get frag shader input without
-        * semantic info?  Otherwise how do inputs get linked to
-        * vert outputs?
-        */
-       compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) ||
-                       decl->Declaration.Semantic);
-
-       if (ctx->so->key.half_precision)
-               flags |= IR3_REG_HALF;
-
-       for (i = decl->Range.First; i <= decl->Range.Last; i++) {
-               unsigned n = so->inputs_count++;
-               unsigned r = regid(i + base, 0);
-               unsigned ncomp;
-
-               /* TODO use ctx->info.input_usage_mask[decl->Range.n] to figure out ncomp: */
-               ncomp = 4;
-
-               DBG("decl in -> r%d", i + base);   // XXX
-
-               compile_assert(ctx, n < ARRAY_SIZE(so->inputs));
-
-               so->inputs[n].semantic = decl_semantic(&decl->Semantic);
-               so->inputs[n].compmask = (1 << ncomp) - 1;
-               so->inputs[n].ncomp = ncomp;
-               so->inputs[n].regid = r;
-               so->inputs[n].inloc = ctx->next_inloc;
-               so->inputs[n].bary = true;   /* all that is supported */
-               ctx->next_inloc += ncomp;
-
-               so->total_in += ncomp;
-
-               /* for frag shaders, we need to generate the corresponding bary instr: */
-               if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
-                       unsigned j;
-
-                       for (j = 0; j < ncomp; j++) {
-                               struct ir3_instruction *instr;
-                               struct ir3_register *dst;
-
-                               instr = instr_create(ctx, 2, OPC_BARY_F);
-
-                               /* dst register: */
-                               dst = ir3_reg_create(instr, r + j, flags);
-                               ctx->last_input = dst;
-
-                               /* input position: */
-                               ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val =
-                                               so->inputs[n].inloc + j - 8;
-
-                               /* input base (always r0.xy): */
-                               ir3_reg_create(instr, regid(0,0), 0)->wrmask = 0x3;
-                       }
-
-                       nop = 6;
-               }
-       }
-
-       return nop;
-}
-
-static void
-decl_out(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl)
-{
-       struct fd3_shader_variant *so = ctx->so;
-       unsigned base = ctx->base_reg[TGSI_FILE_OUTPUT];
-       unsigned comp = 0;
-       unsigned name = decl->Semantic.Name;
-       unsigned i;
-
-       compile_assert(ctx, decl->Declaration.Semantic);  // TODO is this ever not true?
-
-       DBG("decl out[%d] -> r%d", name, decl->Range.First + base);   // XXX
-
-       if (ctx->type == TGSI_PROCESSOR_VERTEX) {
-               switch (name) {
-               case TGSI_SEMANTIC_POSITION:
-                       so->writes_pos = true;
-                       break;
-               case TGSI_SEMANTIC_PSIZE:
-                       so->writes_psize = true;
-                       break;
-               case TGSI_SEMANTIC_COLOR:
-               case TGSI_SEMANTIC_BCOLOR:
-               case TGSI_SEMANTIC_GENERIC:
-               case TGSI_SEMANTIC_FOG:
-               case TGSI_SEMANTIC_TEXCOORD:
-                       break;
-               default:
-                       compile_error(ctx, "unknown VS semantic name: %s\n",
-                                       tgsi_semantic_names[name]);
-               }
-       } else {
-               switch (name) {
-               case TGSI_SEMANTIC_POSITION:
-                       comp = 2;  /* tgsi will write to .z component */
-                       so->writes_pos = true;
-                       break;
-               case TGSI_SEMANTIC_COLOR:
-                       break;
-               default:
-                       compile_error(ctx, "unknown FS semantic name: %s\n",
-                                       tgsi_semantic_names[name]);
-               }
-       }
-
-       for (i = decl->Range.First; i <= decl->Range.Last; i++) {
-               unsigned n = so->outputs_count++;
-               compile_assert(ctx, n < ARRAY_SIZE(so->outputs));
-               so->outputs[n].semantic = decl_semantic(&decl->Semantic);
-               so->outputs[n].regid = regid(i + base, comp);
-       }
-}
-
-static void
-decl_samp(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl)
-{
-       ctx->so->has_samp = true;
-}
-
-static void
-compile_instructions(struct fd3_compile_context *ctx)
-{
-       struct ir3 *ir = ctx->ir;
-       int nop = 0;
-
-       while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
-               tgsi_parse_token(&ctx->parser);
-
-               switch (ctx->parser.FullToken.Token.Type) {
-               case TGSI_TOKEN_TYPE_DECLARATION: {
-                       struct tgsi_full_declaration *decl =
-                                       &ctx->parser.FullToken.FullDeclaration;
-                       if (decl->Declaration.File == TGSI_FILE_OUTPUT) {
-                               decl_out(ctx, decl);
-                       } else if (decl->Declaration.File == TGSI_FILE_INPUT) {
-                               nop = decl_in(ctx, decl);
-                       } else if (decl->Declaration.File == TGSI_FILE_SAMPLER) {
-                               decl_samp(ctx, decl);
-                       }
-                       break;
-               }
-               case TGSI_TOKEN_TYPE_IMMEDIATE: {
-                       /* TODO: if we know the immediate is small enough, and only
-                        * used with instructions that can embed an immediate, we
-                        * can skip this:
-                        */
-                       struct tgsi_full_immediate *imm =
-                                       &ctx->parser.FullToken.FullImmediate;
-                       unsigned n = ctx->so->immediates_count++;
-                       memcpy(ctx->so->immediates[n].val, imm->u, 16);
-                       break;
-               }
-               case TGSI_TOKEN_TYPE_INSTRUCTION: {
-                       struct tgsi_full_instruction *inst =
-                                       &ctx->parser.FullToken.FullInstruction;
-                       unsigned opc = inst->Instruction.Opcode;
-                       const struct instr_translater *t = &translaters[opc];
-
-                       add_nop(ctx, nop);
-                       nop = 0;
-
-                       if (t->fxn) {
-                               t->fxn(t, ctx, inst);
-                               ctx->num_internal_temps = 0;
-                       } else {
-                               compile_error(ctx, "unknown TGSI opc: %s\n",
-                                               tgsi_get_opcode_name(opc));
-                       }
-
-                       switch (inst->Instruction.Saturate) {
-                       case TGSI_SAT_ZERO_ONE:
-                               create_clamp_imm(ctx, &inst->Dst[0].Register,
-                                               fui(0.0), fui(1.0));
-                               break;
-                       case TGSI_SAT_MINUS_PLUS_ONE:
-                               create_clamp_imm(ctx, &inst->Dst[0].Register,
-                                               fui(-1.0), fui(1.0));
-                               break;
-                       }
-
-                       break;
-               }
-               default:
-                       break;
-               }
-       }
-
-       if (ir->instrs_count > 0)
-               ir->instrs[0]->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
-
-       if (ctx->last_input)
-               ctx->last_input->flags |= IR3_REG_EI;
-
-       handle_last_rel(ctx);
-}
-
-int
-fd3_compile_shader_old(struct fd3_shader_variant *so,
-               const struct tgsi_token *tokens, struct fd3_shader_key key)
-{
-       struct fd3_compile_context ctx;
-
-       assert(!so->ir);
-
-       so->ir = ir3_create();
-
-       assert(so->ir);
-
-       if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK)
-               return -1;
-
-       compile_instructions(&ctx);
-
-       compile_free(&ctx);
-
-       return 0;
-}
index 4b2d94103f5ee10f551da7dfa6e6eba28dd7ca2b..89af740c07c6e0eba6221b1fe340dd6fe0736c4d 100644 (file)
@@ -44,7 +44,7 @@
 
 static void
 emit_vertexbufs(struct fd_context *ctx, struct fd_ringbuffer *ring,
-               struct fd3_shader_key key)
+               struct ir3_shader_key key)
 {
        struct fd_vertex_stateobj *vtx = ctx->vtx;
        struct fd_vertexbuf_stateobj *vertexbuf = &ctx->vertexbuf;
@@ -70,7 +70,7 @@ emit_vertexbufs(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
 static void
 draw_impl(struct fd_context *ctx, const struct pipe_draw_info *info,
-               struct fd_ringbuffer *ring, unsigned dirty, struct fd3_shader_key key)
+               struct fd_ringbuffer *ring, unsigned dirty, struct ir3_shader_key key)
 {
        fd3_emit_state(ctx, ring, &ctx->prog, dirty, key);
 
@@ -99,7 +99,7 @@ static void
 fd3_draw(struct fd_context *ctx, const struct pipe_draw_info *info)
 {
        unsigned dirty = ctx->dirty;
-       struct fd3_shader_key key = {
+       struct ir3_shader_key key = {
                        /* do binning pass first: */
                        .binning_pass = true,
                        .color_two_side = ctx->rasterizer ? ctx->rasterizer->light_twoside : false,
@@ -127,7 +127,7 @@ fd3_clear_binning(struct fd_context *ctx, unsigned dirty)
 {
        struct fd3_context *fd3_ctx = fd3_context(ctx);
        struct fd_ringbuffer *ring = ctx->binning_ring;
-       struct fd3_shader_key key = {
+       struct ir3_shader_key key = {
                        .binning_pass = true,
                        .half_precision = true,
        };
@@ -168,7 +168,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers,
        struct fd_ringbuffer *ring = ctx->ring;
        unsigned dirty = ctx->dirty;
        unsigned ce, i;
-       struct fd3_shader_key key = {
+       struct ir3_shader_key key = {
                        .half_precision = true,
        };
 
index 1e4de26406a8425bd068c2ed4e6f706370652231..44932dc241dc9f04775acc9d88f3a5e34601ac93 100644 (file)
@@ -87,7 +87,7 @@ static void
 emit_constants(struct fd_ringbuffer *ring,
                enum adreno_state_block sb,
                struct fd_constbuf_stateobj *constbuf,
-               struct fd3_shader_variant *shader)
+               struct ir3_shader_variant *shader)
 {
        uint32_t enabled_mask = constbuf->enabled_mask;
        uint32_t first_immediate;
@@ -291,7 +291,7 @@ fd3_emit_gmem_restore_tex(struct fd_ringbuffer *ring, struct pipe_surface *psurf
 
 void
 fd3_emit_vertex_bufs(struct fd_ringbuffer *ring,
-               struct fd3_shader_variant *vp,
+               struct ir3_shader_variant *vp,
                struct fd3_vertex_buf *vbufs, uint32_t n)
 {
        uint32_t i, j, last = 0;
@@ -350,10 +350,10 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring,
 void
 fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
                struct fd_program_stateobj *prog, uint32_t dirty,
-               struct fd3_shader_key key)
+               struct ir3_shader_key key)
 {
-       struct fd3_shader_variant *vp;
-       struct fd3_shader_variant *fp;
+       struct ir3_shader_variant *vp;
+       struct ir3_shader_variant *fp;
 
        fp = fd3_shader_variant(prog->fp, key);
        vp = fd3_shader_variant(prog->vp, key);
index f2ae4dc295eb3dd1ddff0cce758d4378607d857e..5735c9f873d9f9bbf294d5c58cb153e6e0d48144 100644 (file)
@@ -33,7 +33,7 @@
 
 #include "freedreno_context.h"
 #include "fd3_util.h"
-
+#include "ir3_shader.h"
 
 struct fd_ringbuffer;
 enum adreno_state_block;
@@ -56,11 +56,11 @@ struct fd3_vertex_buf {
 };
 
 void fd3_emit_vertex_bufs(struct fd_ringbuffer *ring,
-               struct fd3_shader_variant *vp,
+               struct ir3_shader_variant *vp,
                struct fd3_vertex_buf *vbufs, uint32_t n);
 void fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
                struct fd_program_stateobj *prog, uint32_t dirty,
-               struct fd3_shader_key key);
+               struct ir3_shader_key key);
 void fd3_emit_restore(struct fd_context *ctx);
 
 #endif /* FD3_EMIT_H */
index 8519a90ccfa2b58c1d692cee804ff948deff0285..6828d0e1fb4222703bf6164c12ddadd49ed0c576 100644 (file)
@@ -43,7 +43,7 @@
 #include "fd3_util.h"
 #include "fd3_zsa.h"
 
-static const struct fd3_shader_key key = {
+static const struct ir3_shader_key key = {
                // XXX should set this based on render target format!  We don't
                // want half_precision if float32 render target!!!
                .half_precision = true,
index 164b1521a89fecb05c594e3c33f3277e643edf26..78c71d42e3940c7923fc53a4434eaffb4d52c762 100644 (file)
 #include "freedreno_program.h"
 
 #include "fd3_program.h"
-#include "fd3_compiler.h"
 #include "fd3_emit.h"
 #include "fd3_texture.h"
 #include "fd3_util.h"
 
 static void
-delete_variant(struct fd3_shader_variant *v)
+delete_shader_stateobj(struct fd3_shader_stateobj *so)
 {
-       ir3_destroy(v->ir);
-       fd_bo_del(v->bo);
-       free(v);
-}
-
-static void
-assemble_variant(struct fd3_shader_variant *so)
-{
-       struct fd_context *ctx = fd_context(so->so->pctx);
-       uint32_t sz, *bin;
-
-       bin = ir3_assemble(so->ir, &so->info);
-       sz = so->info.sizedwords * 4;
-
-       so->bo = fd_bo_new(ctx->dev, sz,
-                       DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
-                       DRM_FREEDRENO_GEM_TYPE_KMEM);
-
-       memcpy(fd_bo_map(so->bo), bin, sz);
-
-       free(bin);
-
-       so->instrlen = so->info.sizedwords / 8;
-       so->constlen = so->info.max_const + 1;
-}
-
-/* for vertex shader, the inputs are loaded into registers before the shader
- * is executed, so max_regs from the shader instructions might not properly
- * reflect the # of registers actually used:
- */
-static void
-fixup_vp_regfootprint(struct fd3_shader_variant *so)
-{
-       unsigned i;
-       for (i = 0; i < so->inputs_count; i++) {
-               if (so->inputs[i].compmask) {
-                       uint32_t regid = (so->inputs[i].regid + 3) >> 2;
-                       so->info.max_reg = MAX2(so->info.max_reg, regid);
-               }
-       }
-       for (i = 0; i < so->outputs_count; i++) {
-               uint32_t regid = (so->outputs[i].regid + 3) >> 2;
-               so->info.max_reg = MAX2(so->info.max_reg, regid);
-       }
-}
-
-static struct fd3_shader_variant *
-create_variant(struct fd3_shader_stateobj *so, struct fd3_shader_key key)
-{
-       struct fd3_shader_variant *v = CALLOC_STRUCT(fd3_shader_variant);
-       const struct tgsi_token *tokens = so->tokens;
-       int ret;
-
-       if (!v)
-               return NULL;
-
-       v->so = so;
-       v->key = key;
-       v->type = so->type;
-
-       if (fd_mesa_debug & FD_DBG_DISASM) {
-               DBG("dump tgsi: type=%d, k={bp=%u,cts=%u,hp=%u}", so->type,
-                       key.binning_pass, key.color_two_side, key.half_precision);
-               tgsi_dump(tokens, 0);
-       }
-
-       if (!(fd_mesa_debug & FD_DBG_NOOPT)) {
-               ret = fd3_compile_shader(v, tokens, key);
-               if (ret) {
-                       debug_error("new compiler failed, trying fallback!");
-
-                       v->inputs_count = 0;
-                       v->outputs_count = 0;
-                       v->total_in = 0;
-                       v->has_samp = false;
-                       v->immediates_count = 0;
-               }
-       } else {
-               ret = -1;  /* force fallback to old compiler */
-       }
-
-       if (ret)
-               ret = fd3_compile_shader_old(v, tokens, key);
-
-       if (ret) {
-               debug_error("compile failed!");
-               goto fail;
-       }
-
-       assemble_variant(v);
-       if (!v->bo) {
-               debug_error("assemble failed!");
-               goto fail;
-       }
-
-       if (so->type == SHADER_VERTEX)
-               fixup_vp_regfootprint(v);
-
-       if (fd_mesa_debug & FD_DBG_DISASM) {
-               DBG("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type,
-                       key.binning_pass, key.color_two_side, key.half_precision);
-               disasm_a3xx(fd_bo_map(v->bo), v->info.sizedwords, 0, v->type);
-       }
-
-       return v;
-
-fail:
-       delete_variant(v);
-       return NULL;
-}
-
-struct fd3_shader_variant *
-fd3_shader_variant(struct fd3_shader_stateobj *so, struct fd3_shader_key key)
-{
-       struct fd3_shader_variant *v;
-
-       /* some shader key values only apply to vertex or frag shader,
-        * so normalize the key to avoid constructing multiple identical
-        * variants:
-        */
-       if (so->type == SHADER_FRAGMENT) {
-               key.binning_pass = false;
-       }
-       if (so->type == SHADER_VERTEX) {
-               key.color_two_side = false;
-               key.half_precision = false;
-       }
-
-       for (v = so->variants; v; v = v->next)
-               if (!memcmp(&key, &v->key, sizeof(key)))
-                       return v;
-
-       /* compile new variant if it doesn't exist already: */
-       v = create_variant(so, key);
-       v->next = so->variants;
-       so->variants = v;
-
-       return v;
-}
-
-
-static void
-delete_shader(struct fd3_shader_stateobj *so)
-{
-       struct fd3_shader_variant *v, *t;
-       for (v = so->variants; v; ) {
-               t = v;
-               v = v->next;
-               delete_variant(t);
-       }
-       free((void *)so->tokens);
+       ir3_shader_destroy(so->shader);
        free(so);
 }
 
 static struct fd3_shader_stateobj *
-create_shader(struct pipe_context *pctx, const struct pipe_shader_state *cso,
+create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state *cso,
                enum shader_t type)
 {
        struct fd3_shader_stateobj *so = CALLOC_STRUCT(fd3_shader_stateobj);
-       so->pctx = pctx;
-       so->type = type;
-       so->tokens = tgsi_dup_tokens(cso->tokens);
+       so->shader = ir3_shader_create(pctx, cso->tokens, type);
        return so;
 }
 
@@ -215,32 +62,32 @@ static void *
 fd3_fp_state_create(struct pipe_context *pctx,
                const struct pipe_shader_state *cso)
 {
-       return create_shader(pctx, cso, SHADER_FRAGMENT);
+       return create_shader_stateobj(pctx, cso, SHADER_FRAGMENT);
 }
 
 static void
 fd3_fp_state_delete(struct pipe_context *pctx, void *hwcso)
 {
        struct fd3_shader_stateobj *so = hwcso;
-       delete_shader(so);
+       delete_shader_stateobj(so);
 }
 
 static void *
 fd3_vp_state_create(struct pipe_context *pctx,
                const struct pipe_shader_state *cso)
 {
-       return create_shader(pctx, cso, SHADER_VERTEX);
+       return create_shader_stateobj(pctx, cso, SHADER_VERTEX);
 }
 
 static void
 fd3_vp_state_delete(struct pipe_context *pctx, void *hwcso)
 {
        struct fd3_shader_stateobj *so = hwcso;
-       delete_shader(so);
+       delete_shader_stateobj(so);
 }
 
 static void
-emit_shader(struct fd_ringbuffer *ring, const struct fd3_shader_variant *so)
+emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so)
 {
        const struct ir3_info *si = &so->info;
        enum adreno_state_block sb;
@@ -281,7 +128,7 @@ emit_shader(struct fd_ringbuffer *ring, const struct fd3_shader_variant *so)
 }
 
 static int
-find_output(const struct fd3_shader_variant *so, fd3_semantic semantic)
+find_output(const struct ir3_shader_variant *so, ir3_semantic semantic)
 {
        int j;
 
@@ -297,7 +144,7 @@ find_output(const struct fd3_shader_variant *so, fd3_semantic semantic)
         */
        if (sem2name(semantic) == TGSI_SEMANTIC_BCOLOR) {
                unsigned idx = sem2idx(semantic);
-               return find_output(so, fd3_semantic_name(TGSI_SEMANTIC_COLOR, idx));
+               return find_output(so, ir3_semantic_name(TGSI_SEMANTIC_COLOR, idx));
        }
 
        debug_assert(0);
@@ -306,7 +153,7 @@ find_output(const struct fd3_shader_variant *so, fd3_semantic semantic)
 }
 
 static int
-next_varying(const struct fd3_shader_variant *so, int i)
+next_varying(const struct ir3_shader_variant *so, int i)
 {
        while (++i < so->inputs_count)
                if (so->inputs[i].compmask && so->inputs[i].bary)
@@ -315,7 +162,7 @@ next_varying(const struct fd3_shader_variant *so, int i)
 }
 
 static uint32_t
-find_output_regid(const struct fd3_shader_variant *so, fd3_semantic semantic)
+find_output_regid(const struct ir3_shader_variant *so, ir3_semantic semantic)
 {
        int j;
        for (j = 0; j < so->outputs_count; j++)
@@ -326,9 +173,9 @@ find_output_regid(const struct fd3_shader_variant *so, fd3_semantic semantic)
 
 void
 fd3_program_emit(struct fd_ringbuffer *ring,
-               struct fd_program_stateobj *prog, struct fd3_shader_key key)
+               struct fd_program_stateobj *prog, struct ir3_shader_key key)
 {
-       const struct fd3_shader_variant *vp, *fp;
+       const struct ir3_shader_variant *vp, *fp;
        const struct ir3_info *vsi, *fsi;
        uint32_t pos_regid, posz_regid, psize_regid, color_regid;
        int i, j, k;
@@ -337,7 +184,7 @@ fd3_program_emit(struct fd_ringbuffer *ring,
 
        if (key.binning_pass) {
                /* use dummy stateobj to simplify binning vs non-binning: */
-               static const struct fd3_shader_variant binning_fp = {};
+               static const struct ir3_shader_variant binning_fp = {};
                fp = &binning_fp;
        } else {
                fp = fd3_shader_variant(prog->fp, key);
@@ -347,13 +194,13 @@ fd3_program_emit(struct fd_ringbuffer *ring,
        fsi = &fp->info;
 
        pos_regid = find_output_regid(vp,
-               fd3_semantic_name(TGSI_SEMANTIC_POSITION, 0));
+               ir3_semantic_name(TGSI_SEMANTIC_POSITION, 0));
        posz_regid = find_output_regid(fp,
-               fd3_semantic_name(TGSI_SEMANTIC_POSITION, 0));
+               ir3_semantic_name(TGSI_SEMANTIC_POSITION, 0));
        psize_regid = find_output_regid(vp,
-               fd3_semantic_name(TGSI_SEMANTIC_PSIZE, 0));
+               ir3_semantic_name(TGSI_SEMANTIC_PSIZE, 0));
        color_regid = find_output_regid(fp,
-               fd3_semantic_name(TGSI_SEMANTIC_COLOR, 0));
+               ir3_semantic_name(TGSI_SEMANTIC_COLOR, 0));
 
        /* we could probably divide this up into things that need to be
         * emitted if frag-prog is dirty vs if vert-prog is dirty..
@@ -522,16 +369,16 @@ fd3_program_emit(struct fd_ringbuffer *ring,
                                A3XX_VPC_PACK_NUMNONPOSVSVAR(fp->total_in));
 
                OUT_PKT0(ring, REG_A3XX_VPC_VARYING_INTERP_MODE(0), 4);
-               OUT_RING(ring, fp->so->vinterp[0]);    /* VPC_VARYING_INTERP[0].MODE */
-               OUT_RING(ring, fp->so->vinterp[1]);    /* VPC_VARYING_INTERP[1].MODE */
-               OUT_RING(ring, fp->so->vinterp[2]);    /* VPC_VARYING_INTERP[2].MODE */
-               OUT_RING(ring, fp->so->vinterp[3]);    /* VPC_VARYING_INTERP[3].MODE */
+               OUT_RING(ring, fp->shader->vinterp[0]);    /* VPC_VARYING_INTERP[0].MODE */
+               OUT_RING(ring, fp->shader->vinterp[1]);    /* VPC_VARYING_INTERP[1].MODE */
+               OUT_RING(ring, fp->shader->vinterp[2]);    /* VPC_VARYING_INTERP[2].MODE */
+               OUT_RING(ring, fp->shader->vinterp[3]);    /* VPC_VARYING_INTERP[3].MODE */
 
                OUT_PKT0(ring, REG_A3XX_VPC_VARYING_PS_REPL_MODE(0), 4);
-               OUT_RING(ring, fp->so->vpsrepl[0]);    /* VPC_VARYING_PS_REPL[0].MODE */
-               OUT_RING(ring, fp->so->vpsrepl[1]);    /* VPC_VARYING_PS_REPL[1].MODE */
-               OUT_RING(ring, fp->so->vpsrepl[2]);    /* VPC_VARYING_PS_REPL[2].MODE */
-               OUT_RING(ring, fp->so->vpsrepl[3]);    /* VPC_VARYING_PS_REPL[3].MODE */
+               OUT_RING(ring, fp->shader->vpsrepl[0]);    /* VPC_VARYING_PS_REPL[0].MODE */
+               OUT_RING(ring, fp->shader->vpsrepl[1]);    /* VPC_VARYING_PS_REPL[1].MODE */
+               OUT_RING(ring, fp->shader->vpsrepl[2]);    /* VPC_VARYING_PS_REPL[2].MODE */
+               OUT_RING(ring, fp->shader->vpsrepl[3]);    /* VPC_VARYING_PS_REPL[3].MODE */
        }
 
        OUT_PKT0(ring, REG_A3XX_VFD_VS_THREADING_THRESHOLD, 1);
@@ -558,10 +405,10 @@ fix_blit_fp(struct pipe_context *pctx)
        struct fd_context *ctx = fd_context(pctx);
        struct fd3_shader_stateobj *so = ctx->blit_prog.fp;
 
-       so->vpsrepl[0] = 0x99999999;
-       so->vpsrepl[1] = 0x99999999;
-       so->vpsrepl[2] = 0x99999999;
-       so->vpsrepl[3] = 0x99999999;
+       so->shader->vpsrepl[0] = 0x99999999;
+       so->shader->vpsrepl[1] = 0x99999999;
+       so->shader->vpsrepl[2] = 0x99999999;
+       so->shader->vpsrepl[3] = 0x99999999;
 }
 
 void
index e2ed1cc3dda8fe7eab2e1c5b7cb899a95f9edebb..cebaeecc5bc20180ffb68f1161dd5379f90f084d 100644 (file)
 #define FD3_PROGRAM_H_
 
 #include "pipe/p_context.h"
-
 #include "freedreno_context.h"
-#include "fd3_util.h"
-#include "ir3.h"
-#include "disasm.h"
-
-typedef uint16_t fd3_semantic;  /* semantic name + index */
-static inline fd3_semantic
-fd3_semantic_name(uint8_t name, uint16_t index)
-{
-       return (name << 8) | (index & 0xff);
-}
-
-static inline uint8_t sem2name(fd3_semantic sem)
-{
-       return sem >> 8;
-}
-
-static inline uint16_t sem2idx(fd3_semantic sem)
-{
-       return sem & 0xff;
-}
-
-struct fd3_shader_variant {
-       struct fd_bo *bo;
-
-       struct fd3_shader_key key;
-
-       struct ir3_info info;
-       struct ir3 *ir;
-
-       /* the instructions length is in units of instruction groups
-        * (4 instructions, 8 dwords):
-        */
-       unsigned instrlen;
-
-       /* the constants length is in units of vec4's, and is the sum of
-        * the uniforms and the built-in compiler constants
-        */
-       unsigned constlen;
-
-       /* About Linkage:
-        *   + Let the frag shader determine the position/compmask for the
-        *     varyings, since it is the place where we know if the varying
-        *     is actually used, and if so, which components are used.  So
-        *     what the hw calls "outloc" is taken from the "inloc" of the
-        *     frag shader.
-        *   + From the vert shader, we only need the output regid
-        */
-
-       /* for frag shader, pos_regid holds the frag_pos, ie. what is passed
-        * to bary.f instructions
-        */
-       uint8_t pos_regid;
-       bool frag_coord, frag_face;
-
-       /* varyings/outputs: */
-       unsigned outputs_count;
-       struct {
-               fd3_semantic semantic;
-               uint8_t regid;
-       } outputs[16 + 2];  /* +POSITION +PSIZE */
-       bool writes_pos, writes_psize;
-
-       /* vertices/inputs: */
-       unsigned inputs_count;
-       struct {
-               fd3_semantic semantic;
-               uint8_t regid;
-               uint8_t compmask;
-               uint8_t ncomp;
-               /* in theory inloc of fs should match outloc of vs: */
-               uint8_t inloc;
-               uint8_t bary;
-       } inputs[16 + 2];  /* +POSITION +FACE */
-
-       unsigned total_in;       /* sum of inputs (scalar) */
-
-       /* do we have one or more texture sample instructions: */
-       bool has_samp;
-
-       /* const reg # of first immediate, ie. 1 == c1
-        * (not regid, because TGSI thinks in terms of vec4 registers,
-        * not scalar registers)
-        */
-       unsigned first_immediate;
-       unsigned immediates_count;
-       struct {
-               uint32_t val[4];
-       } immediates[64];
-
-       /* shader varients form a linked list: */
-       struct fd3_shader_variant *next;
-
-       /* replicated here to avoid passing extra ptrs everywhere: */
-       enum shader_t type;
-       struct fd3_shader_stateobj *so;
-};
+#include "ir3_shader.h"
 
 struct fd3_shader_stateobj {
-       enum shader_t type;
-
-       struct pipe_context *pctx;
-       const struct tgsi_token *tokens;
-
-       struct fd3_shader_variant *variants;
-
-       /* so far, only used for blit_prog shader.. values for
-        * VPC_VARYING_INTERP[i].MODE and VPC_VARYING_PS_REPL[i].MODE
-        *
-        * Possibly should be in fd3_program_variant?
-        */
-       uint32_t vinterp[4], vpsrepl[4];
+       struct ir3_shader *shader;
 };
 
-struct fd3_shader_variant * fd3_shader_variant(struct fd3_shader_stateobj *so,
-               struct fd3_shader_key key);
-
 void fd3_program_emit(struct fd_ringbuffer *ring,
-               struct fd_program_stateobj *prog, struct fd3_shader_key key);
+               struct fd_program_stateobj *prog, struct ir3_shader_key key);
 
 void fd3_prog_init(struct pipe_context *pctx);
 
+static inline struct ir3_shader_variant *
+fd3_shader_variant(struct fd3_shader_stateobj *so, struct ir3_shader_key key)
+{
+       return ir3_shader_variant(so->shader, key);
+}
+
 #endif /* FD3_PROGRAM_H_ */
index 6462d18f913731a6f7031a6d3d2dce01623f4d06..4681840b1732c28f9f54cf82e0454f911908586d 100644 (file)
@@ -43,22 +43,4 @@ enum a3xx_color_swap fd3_pipe2swap(enum pipe_format format);
 uint32_t fd3_tex_swiz(enum pipe_format format, unsigned swizzle_r,
                unsigned swizzle_g, unsigned swizzle_b, unsigned swizzle_a);
 
-/* Configuration key used to identify a shader variant.. different
- * shader variants can be used to implement features not supported
- * in hw (two sided color), binning-pass vertex shader, etc.
- *
- * NOTE: this is declared here (rather than fd3_program.h) as it is
- * passed around through a lot of the emit code in various parts
- * which would otherwise not necessarily need to incl fd3_program.h
- */
-struct fd3_shader_key {
-       /* vertex shader variant parameters: */
-       unsigned binning_pass : 1;
-
-       /* fragment shader variant parameters: */
-       unsigned color_two_side : 1;
-       unsigned half_precision : 1;
-};
-struct fd3_shader_variant;
-
 #endif /* FD3_UTIL_H_ */
diff --git a/src/gallium/drivers/freedreno/a3xx/instr-a3xx.h b/src/gallium/drivers/freedreno/a3xx/instr-a3xx.h
deleted file mode 100644 (file)
index c67f103..0000000
+++ /dev/null
@@ -1,691 +0,0 @@
-/*
- * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef INSTR_A3XX_H_
-#define INSTR_A3XX_H_
-
-#define PACKED __attribute__((__packed__))
-
-#include <stdint.h>
-#include <assert.h>
-
-typedef enum {
-       /* category 0: */
-       OPC_NOP = 0,
-       OPC_BR = 1,
-       OPC_JUMP = 2,
-       OPC_CALL = 3,
-       OPC_RET = 4,
-       OPC_KILL = 5,
-       OPC_END = 6,
-       OPC_EMIT = 7,
-       OPC_CUT = 8,
-       OPC_CHMASK = 9,
-       OPC_CHSH = 10,
-       OPC_FLOW_REV = 11,
-
-       /* category 1: */
-       /* no opc.. all category 1 are variants of mov */
-
-       /* category 2: */
-       OPC_ADD_F = 0,
-       OPC_MIN_F = 1,
-       OPC_MAX_F = 2,
-       OPC_MUL_F = 3,
-       OPC_SIGN_F = 4,
-       OPC_CMPS_F = 5,
-       OPC_ABSNEG_F = 6,
-       OPC_CMPV_F = 7,
-       /* 8 - invalid */
-       OPC_FLOOR_F = 9,
-       OPC_CEIL_F = 10,
-       OPC_RNDNE_F = 11,
-       OPC_RNDAZ_F = 12,
-       OPC_TRUNC_F = 13,
-       /* 14-15 - invalid */
-       OPC_ADD_U = 16,
-       OPC_ADD_S = 17,
-       OPC_SUB_U = 18,
-       OPC_SUB_S = 19,
-       OPC_CMPS_U = 20,
-       OPC_CMPS_S = 21,
-       OPC_MIN_U = 22,
-       OPC_MIN_S = 23,
-       OPC_MAX_U = 24,
-       OPC_MAX_S = 25,
-       OPC_ABSNEG_S = 26,
-       /* 27 - invalid */
-       OPC_AND_B = 28,
-       OPC_OR_B = 29,
-       OPC_NOT_B = 30,
-       OPC_XOR_B = 31,
-       /* 32 - invalid */
-       OPC_CMPV_U = 33,
-       OPC_CMPV_S = 34,
-       /* 35-47 - invalid */
-       OPC_MUL_U = 48,
-       OPC_MUL_S = 49,
-       OPC_MULL_U = 50,
-       OPC_BFREV_B = 51,
-       OPC_CLZ_S = 52,
-       OPC_CLZ_B = 53,
-       OPC_SHL_B = 54,
-       OPC_SHR_B = 55,
-       OPC_ASHR_B = 56,
-       OPC_BARY_F = 57,
-       OPC_MGEN_B = 58,
-       OPC_GETBIT_B = 59,
-       OPC_SETRM = 60,
-       OPC_CBITS_B = 61,
-       OPC_SHB = 62,
-       OPC_MSAD = 63,
-
-       /* category 3: */
-       OPC_MAD_U16 = 0,
-       OPC_MADSH_U16 = 1,
-       OPC_MAD_S16 = 2,
-       OPC_MADSH_M16 = 3,   /* should this be .s16? */
-       OPC_MAD_U24 = 4,
-       OPC_MAD_S24 = 5,
-       OPC_MAD_F16 = 6,
-       OPC_MAD_F32 = 7,
-       OPC_SEL_B16 = 8,
-       OPC_SEL_B32 = 9,
-       OPC_SEL_S16 = 10,
-       OPC_SEL_S32 = 11,
-       OPC_SEL_F16 = 12,
-       OPC_SEL_F32 = 13,
-       OPC_SAD_S16 = 14,
-       OPC_SAD_S32 = 15,
-
-       /* category 4: */
-       OPC_RCP = 0,
-       OPC_RSQ = 1,
-       OPC_LOG2 = 2,
-       OPC_EXP2 = 3,
-       OPC_SIN = 4,
-       OPC_COS = 5,
-       OPC_SQRT = 6,
-       // 7-63 - invalid
-
-       /* category 5: */
-       OPC_ISAM = 0,
-       OPC_ISAML = 1,
-       OPC_ISAMM = 2,
-       OPC_SAM = 3,
-       OPC_SAMB = 4,
-       OPC_SAML = 5,
-       OPC_SAMGQ = 6,
-       OPC_GETLOD = 7,
-       OPC_CONV = 8,
-       OPC_CONVM = 9,
-       OPC_GETSIZE = 10,
-       OPC_GETBUF = 11,
-       OPC_GETPOS = 12,
-       OPC_GETINFO = 13,
-       OPC_DSX = 14,
-       OPC_DSY = 15,
-       OPC_GATHER4R = 16,
-       OPC_GATHER4G = 17,
-       OPC_GATHER4B = 18,
-       OPC_GATHER4A = 19,
-       OPC_SAMGP0 = 20,
-       OPC_SAMGP1 = 21,
-       OPC_SAMGP2 = 22,
-       OPC_SAMGP3 = 23,
-       OPC_DSXPP_1 = 24,
-       OPC_DSYPP_1 = 25,
-       OPC_RGETPOS = 26,
-       OPC_RGETINFO = 27,
-
-       /* category 6: */
-       OPC_LDG = 0,        /* load-global */
-       OPC_LDL = 1,
-       OPC_LDP = 2,
-       OPC_STG = 3,        /* store-global */
-       OPC_STL = 4,
-       OPC_STP = 5,
-       OPC_STI = 6,
-       OPC_G2L = 7,
-       OPC_L2G = 8,
-       OPC_PREFETCH = 9,
-       OPC_LDLW = 10,
-       OPC_STLW = 11,
-       OPC_RESFMT = 14,
-       OPC_RESINFO = 15,
-       OPC_ATOMIC_ADD_L = 16,
-       OPC_ATOMIC_SUB_L = 17,
-       OPC_ATOMIC_XCHG_L = 18,
-       OPC_ATOMIC_INC_L = 19,
-       OPC_ATOMIC_DEC_L = 20,
-       OPC_ATOMIC_CMPXCHG_L = 21,
-       OPC_ATOMIC_MIN_L = 22,
-       OPC_ATOMIC_MAX_L = 23,
-       OPC_ATOMIC_AND_L = 24,
-       OPC_ATOMIC_OR_L = 25,
-       OPC_ATOMIC_XOR_L = 26,
-       OPC_LDGB_TYPED_4D = 27,
-       OPC_STGB_4D_4 = 28,
-       OPC_STIB = 29,
-       OPC_LDC_4 = 30,
-       OPC_LDLV = 31,
-
-       /* meta instructions (category -1): */
-       /* placeholder instr to mark inputs/outputs: */
-       OPC_META_INPUT = 0,
-       OPC_META_OUTPUT = 1,
-       /* The "fan-in" and "fan-out" instructions are used for keeping
-        * track of instructions that write to multiple dst registers
-        * (fan-out) like texture sample instructions, or read multiple
-        * consecutive scalar registers (fan-in) (bary.f, texture samp)
-        */
-       OPC_META_FO = 2,
-       OPC_META_FI = 3,
-       /* branches/flow control */
-       OPC_META_FLOW = 4,
-       OPC_META_PHI = 5,
-       /* relative addressing */
-       OPC_META_DEREF = 6,
-
-
-} opc_t;
-
-typedef enum {
-       TYPE_F16 = 0,
-       TYPE_F32 = 1,
-       TYPE_U16 = 2,
-       TYPE_U32 = 3,
-       TYPE_S16 = 4,
-       TYPE_S32 = 5,
-       TYPE_U8  = 6,
-       TYPE_S8  = 7,  // XXX I assume?
-} type_t;
-
-static inline uint32_t type_size(type_t type)
-{
-       switch (type) {
-       case TYPE_F32:
-       case TYPE_U32:
-       case TYPE_S32:
-               return 32;
-       case TYPE_F16:
-       case TYPE_U16:
-       case TYPE_S16:
-               return 16;
-       case TYPE_U8:
-       case TYPE_S8:
-               return 8;
-       default:
-               assert(0); /* invalid type */
-               return 0;
-       }
-}
-
-static inline int type_float(type_t type)
-{
-       return (type == TYPE_F32) || (type == TYPE_F16);
-}
-
-static inline int type_uint(type_t type)
-{
-       return (type == TYPE_U32) || (type == TYPE_U16) || (type == TYPE_U8);
-}
-
-static inline int type_sint(type_t type)
-{
-       return (type == TYPE_S32) || (type == TYPE_S16) || (type == TYPE_S8);
-}
-
-typedef union PACKED {
-       /* normal gpr or const src register: */
-       struct PACKED {
-               uint32_t comp  : 2;
-               uint32_t num   : 10;
-       };
-       /* for immediate val: */
-       int32_t  iim_val   : 11;
-       /* to make compiler happy: */
-       uint32_t dummy32;
-       uint32_t dummy10   : 10;
-       uint32_t dummy11   : 11;
-       uint32_t dummy12   : 12;
-       uint32_t dummy13   : 13;
-       uint32_t dummy8    : 8;
-} reg_t;
-
-/* special registers: */
-#define REG_A0 61       /* address register */
-#define REG_P0 62       /* predicate register */
-
-static inline int reg_special(reg_t reg)
-{
-       return (reg.num == REG_A0) || (reg.num == REG_P0);
-}
-
-typedef struct PACKED {
-       /* dword0: */
-       int16_t  immed    : 16;
-       uint32_t dummy1   : 16;
-
-       /* dword1: */
-       uint32_t dummy2   : 8;
-       uint32_t repeat   : 3;
-       uint32_t dummy3   : 1;
-       uint32_t ss       : 1;
-       uint32_t dummy4   : 7;
-       uint32_t inv      : 1;
-       uint32_t comp     : 2;
-       uint32_t opc      : 4;
-       uint32_t jmp_tgt  : 1;
-       uint32_t sync     : 1;
-       uint32_t opc_cat  : 3;
-} instr_cat0_t;
-
-typedef struct PACKED {
-       /* dword0: */
-       union PACKED {
-               /* for normal src register: */
-               struct PACKED {
-                       uint32_t src : 11;
-                       /* at least low bit of pad must be zero or it will
-                        * look like a address relative src
-                        */
-                       uint32_t pad : 21;
-               };
-               /* for address relative: */
-               struct PACKED {
-                       int32_t  off : 10;
-                       uint32_t src_rel_c : 1;
-                       uint32_t src_rel : 1;
-                       uint32_t unknown : 20;
-               };
-               /* for immediate: */
-               int32_t iim_val;
-               float   fim_val;
-       };
-
-       /* dword1: */
-       uint32_t dst        : 8;
-       uint32_t repeat     : 3;
-       uint32_t src_r      : 1;
-       uint32_t ss         : 1;
-       uint32_t ul         : 1;
-       uint32_t dst_type   : 3;
-       uint32_t dst_rel    : 1;
-       uint32_t src_type   : 3;
-       uint32_t src_c      : 1;
-       uint32_t src_im     : 1;
-       uint32_t even       : 1;
-       uint32_t pos_inf    : 1;
-       uint32_t must_be_0  : 2;
-       uint32_t jmp_tgt    : 1;
-       uint32_t sync       : 1;
-       uint32_t opc_cat    : 3;
-} instr_cat1_t;
-
-typedef struct PACKED {
-       /* dword0: */
-       union PACKED {
-               struct PACKED {
-                       uint32_t src1         : 11;
-                       uint32_t must_be_zero1: 2;
-                       uint32_t src1_im      : 1;   /* immediate */
-                       uint32_t src1_neg     : 1;   /* negate */
-                       uint32_t src1_abs     : 1;   /* absolute value */
-               };
-               struct PACKED {
-                       uint32_t src1         : 10;
-                       uint32_t src1_c       : 1;   /* relative-const */
-                       uint32_t src1_rel     : 1;   /* relative address */
-                       uint32_t must_be_zero : 1;
-                       uint32_t dummy        : 3;
-               } rel1;
-               struct PACKED {
-                       uint32_t src1         : 12;
-                       uint32_t src1_c       : 1;   /* const */
-                       uint32_t dummy        : 3;
-               } c1;
-       };
-
-       union PACKED {
-               struct PACKED {
-                       uint32_t src2         : 11;
-                       uint32_t must_be_zero2: 2;
-                       uint32_t src2_im      : 1;   /* immediate */
-                       uint32_t src2_neg     : 1;   /* negate */
-                       uint32_t src2_abs     : 1;   /* absolute value */
-               };
-               struct PACKED {
-                       uint32_t src2         : 10;
-                       uint32_t src2_c       : 1;   /* relative-const */
-                       uint32_t src2_rel     : 1;   /* relative address */
-                       uint32_t must_be_zero : 1;
-                       uint32_t dummy        : 3;
-               } rel2;
-               struct PACKED {
-                       uint32_t src2         : 12;
-                       uint32_t src2_c       : 1;   /* const */
-                       uint32_t dummy        : 3;
-               } c2;
-       };
-
-       /* dword1: */
-       uint32_t dst      : 8;
-       uint32_t repeat   : 3;
-       uint32_t src1_r   : 1;
-       uint32_t ss       : 1;
-       uint32_t ul       : 1;   /* dunno */
-       uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
-       uint32_t ei       : 1;
-       uint32_t cond     : 3;
-       uint32_t src2_r   : 1;
-       uint32_t full     : 1;   /* not half */
-       uint32_t opc      : 6;
-       uint32_t jmp_tgt  : 1;
-       uint32_t sync     : 1;
-       uint32_t opc_cat  : 3;
-} instr_cat2_t;
-
-typedef struct PACKED {
-       /* dword0: */
-       union PACKED {
-               struct PACKED {
-                       uint32_t src1         : 11;
-                       uint32_t must_be_zero1: 2;
-                       uint32_t src2_c       : 1;
-                       uint32_t src1_neg     : 1;
-                       uint32_t src2_r       : 1;
-               };
-               struct PACKED {
-                       uint32_t src1         : 10;
-                       uint32_t src1_c       : 1;
-                       uint32_t src1_rel     : 1;
-                       uint32_t must_be_zero : 1;
-                       uint32_t dummy        : 3;
-               } rel1;
-               struct PACKED {
-                       uint32_t src1         : 12;
-                       uint32_t src1_c       : 1;
-                       uint32_t dummy        : 3;
-               } c1;
-       };
-
-       union PACKED {
-               struct PACKED {
-                       uint32_t src3         : 11;
-                       uint32_t must_be_zero2: 2;
-                       uint32_t src3_r       : 1;
-                       uint32_t src2_neg     : 1;
-                       uint32_t src3_neg     : 1;
-               };
-               struct PACKED {
-                       uint32_t src3         : 10;
-                       uint32_t src3_c       : 1;
-                       uint32_t src3_rel     : 1;
-                       uint32_t must_be_zero : 1;
-                       uint32_t dummy        : 3;
-               } rel2;
-               struct PACKED {
-                       uint32_t src3         : 12;
-                       uint32_t src3_c       : 1;
-                       uint32_t dummy        : 3;
-               } c2;
-       };
-
-       /* dword1: */
-       uint32_t dst      : 8;
-       uint32_t repeat   : 3;
-       uint32_t src1_r   : 1;
-       uint32_t ss       : 1;
-       uint32_t ul       : 1;
-       uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
-       uint32_t src2     : 8;
-       uint32_t opc      : 4;
-       uint32_t jmp_tgt  : 1;
-       uint32_t sync     : 1;
-       uint32_t opc_cat  : 3;
-} instr_cat3_t;
-
-static inline bool instr_cat3_full(instr_cat3_t *cat3)
-{
-       switch (cat3->opc) {
-       case OPC_MAD_F16:
-       case OPC_MAD_U16:
-       case OPC_MAD_S16:
-       case OPC_SEL_B16:
-       case OPC_SEL_S16:
-       case OPC_SEL_F16:
-       case OPC_SAD_S16:
-       case OPC_SAD_S32:  // really??
-               return false;
-       default:
-               return true;
-       }
-}
-
-typedef struct PACKED {
-       /* dword0: */
-       union PACKED {
-               struct PACKED {
-                       uint32_t src          : 11;
-                       uint32_t must_be_zero1: 2;
-                       uint32_t src_im       : 1;   /* immediate */
-                       uint32_t src_neg      : 1;   /* negate */
-                       uint32_t src_abs      : 1;   /* absolute value */
-               };
-               struct PACKED {
-                       uint32_t src          : 10;
-                       uint32_t src_c        : 1;   /* relative-const */
-                       uint32_t src_rel      : 1;   /* relative address */
-                       uint32_t must_be_zero : 1;
-                       uint32_t dummy        : 3;
-               } rel;
-               struct PACKED {
-                       uint32_t src          : 12;
-                       uint32_t src_c        : 1;   /* const */
-                       uint32_t dummy        : 3;
-               } c;
-       };
-       uint32_t dummy1   : 16;  /* seem to be ignored */
-
-       /* dword1: */
-       uint32_t dst      : 8;
-       uint32_t repeat   : 3;
-       uint32_t src_r    : 1;
-       uint32_t ss       : 1;
-       uint32_t ul       : 1;
-       uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
-       uint32_t dummy2   : 5;   /* seem to be ignored */
-       uint32_t full     : 1;   /* not half */
-       uint32_t opc      : 6;
-       uint32_t jmp_tgt  : 1;
-       uint32_t sync     : 1;
-       uint32_t opc_cat  : 3;
-} instr_cat4_t;
-
-typedef struct PACKED {
-       /* dword0: */
-       union PACKED {
-               /* normal case: */
-               struct PACKED {
-                       uint32_t full     : 1;   /* not half */
-                       uint32_t src1     : 8;
-                       uint32_t src2     : 8;
-                       uint32_t dummy1   : 4;   /* seem to be ignored */
-                       uint32_t samp     : 4;
-                       uint32_t tex      : 7;
-               } norm;
-               /* s2en case: */
-               struct PACKED {
-                       uint32_t full     : 1;   /* not half */
-                       uint32_t src1     : 8;
-                       uint32_t src2     : 11;
-                       uint32_t dummy1   : 1;
-                       uint32_t src3     : 8;
-                       uint32_t dummy2   : 3;
-               } s2en;
-               /* same in either case: */
-               // XXX I think, confirm this
-               struct PACKED {
-                       uint32_t full     : 1;   /* not half */
-                       uint32_t src1     : 8;
-                       uint32_t pad      : 23;
-               };
-       };
-
-       /* dword1: */
-       uint32_t dst      : 8;
-       uint32_t wrmask   : 4;   /* write-mask */
-       uint32_t type     : 3;
-       uint32_t dummy2   : 1;   /* seems to be ignored */
-       uint32_t is_3d    : 1;
-
-       uint32_t is_a     : 1;
-       uint32_t is_s     : 1;
-       uint32_t is_s2en  : 1;
-       uint32_t is_o     : 1;
-       uint32_t is_p     : 1;
-
-       uint32_t opc      : 5;
-       uint32_t jmp_tgt  : 1;
-       uint32_t sync     : 1;
-       uint32_t opc_cat  : 3;
-} instr_cat5_t;
-
-/* used for load instructions: */
-typedef struct PACKED {
-       /* dword0: */
-       uint32_t must_be_one1 : 1;
-       int16_t  off      : 13;
-       uint32_t src      : 8;
-       uint32_t dummy1   : 1;
-       uint32_t must_be_one2 : 1;
-       int32_t  iim_val  : 8;
-
-       /* dword1: */
-       uint32_t dst      : 8;
-       uint32_t dummy2   : 9;
-       uint32_t type     : 3;
-       uint32_t dummy3   : 2;
-       uint32_t opc      : 5;
-       uint32_t jmp_tgt  : 1;
-       uint32_t sync     : 1;
-       uint32_t opc_cat  : 3;
-} instr_cat6a_t;
-
-/* used for store instructions: */
-typedef struct PACKED {
-       /* dword0: */
-       uint32_t must_be_zero1 : 1;
-       uint32_t src      : 8;
-       uint32_t off_hi   : 5;   /* high bits of 'off'... ugly! */
-       uint32_t dummy1   : 9;
-       uint32_t must_be_one1 : 1;
-       int32_t  iim_val  : 8;
-
-       /* dword1: */
-       uint16_t off      : 8;
-       uint32_t must_be_one2 : 1;
-       uint32_t dst      : 8;
-       uint32_t type     : 3;
-       uint32_t dummy2   : 2;
-       uint32_t opc      : 5;
-       uint32_t jmp_tgt  : 1;
-       uint32_t sync     : 1;
-       uint32_t opc_cat  : 3;
-} instr_cat6b_t;
-
-typedef union PACKED {
-       instr_cat6a_t a;
-       instr_cat6b_t b;
-       struct PACKED {
-               /* dword0: */
-               uint32_t pad1     : 24;
-               int32_t  iim_val  : 8;
-
-               /* dword1: */
-               uint32_t pad2     : 17;
-               uint32_t type     : 3;
-               uint32_t pad3     : 2;
-               uint32_t opc      : 5;
-               uint32_t jmp_tgt  : 1;
-               uint32_t sync     : 1;
-               uint32_t opc_cat  : 3;
-       };
-} instr_cat6_t;
-
-typedef union PACKED {
-       instr_cat0_t cat0;
-       instr_cat1_t cat1;
-       instr_cat2_t cat2;
-       instr_cat3_t cat3;
-       instr_cat4_t cat4;
-       instr_cat5_t cat5;
-       instr_cat6_t cat6;
-       struct PACKED {
-               /* dword0: */
-               uint64_t pad1     : 40;
-               uint32_t repeat   : 3;  /* cat0-cat4 */
-               uint32_t pad2     : 1;
-               uint32_t ss       : 1;  /* cat1-cat4 (cat0??) */
-               uint32_t ul       : 1;  /* cat2-cat4 (and cat1 in blob.. which may be bug??) */
-               uint32_t pad3     : 13;
-               uint32_t jmp_tgt  : 1;
-               uint32_t sync     : 1;
-               uint32_t opc_cat  : 3;
-
-       };
-} instr_t;
-
-static inline uint32_t instr_opc(instr_t *instr)
-{
-       switch (instr->opc_cat) {
-       case 0:  return instr->cat0.opc;
-       case 1:  return 0;
-       case 2:  return instr->cat2.opc;
-       case 3:  return instr->cat3.opc;
-       case 4:  return instr->cat4.opc;
-       case 5:  return instr->cat5.opc;
-       case 6:  return instr->cat6.opc;
-       default: return 0;
-       }
-}
-
-static inline bool is_mad(opc_t opc)
-{
-       switch (opc) {
-       case OPC_MAD_U16:
-       case OPC_MADSH_U16:
-       case OPC_MAD_S16:
-       case OPC_MADSH_M16:
-       case OPC_MAD_U24:
-       case OPC_MAD_S24:
-       case OPC_MAD_F16:
-       case OPC_MAD_F32:
-               return true;
-       default:
-               return false;
-       }
-}
-
-#endif /* INSTR_A3XX_H_ */
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3.c b/src/gallium/drivers/freedreno/a3xx/ir3.c
deleted file mode 100644 (file)
index ea2a925..0000000
+++ /dev/null
@@ -1,675 +0,0 @@
-/*
- * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ir3.h"
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include <stdbool.h>
-#include <errno.h>
-
-#include "freedreno_util.h"
-#include "instr-a3xx.h"
-
-#define CHUNK_SZ 1020
-
-struct ir3_heap_chunk {
-       struct ir3_heap_chunk *next;
-       uint32_t heap[CHUNK_SZ];
-};
-
-static void grow_heap(struct ir3 *shader)
-{
-       struct ir3_heap_chunk *chunk = calloc(1, sizeof(*chunk));
-       chunk->next = shader->chunk;
-       shader->chunk = chunk;
-       shader->heap_idx = 0;
-}
-
-/* simple allocator to carve allocations out of an up-front allocated heap,
- * so that we can free everything easily in one shot.
- */
-void * ir3_alloc(struct ir3 *shader, int sz)
-{
-       void *ptr;
-
-       sz = align(sz, 4) / 4;
-
-       if ((shader->heap_idx + sz) > CHUNK_SZ)
-               grow_heap(shader);
-
-       ptr = &shader->chunk->heap[shader->heap_idx];
-       shader->heap_idx += sz;
-
-       return ptr;
-}
-
-struct ir3 * ir3_create(void)
-{
-       struct ir3 *shader =
-                       calloc(1, sizeof(struct ir3));
-       grow_heap(shader);
-       return shader;
-}
-
-void ir3_destroy(struct ir3 *shader)
-{
-       while (shader->chunk) {
-               struct ir3_heap_chunk *chunk = shader->chunk;
-               shader->chunk = chunk->next;
-               free(chunk);
-       }
-       free(shader);
-}
-
-#define iassert(cond) do { \
-       if (!(cond)) { \
-               assert(cond); \
-               return -1; \
-       } } while (0)
-
-static uint32_t reg(struct ir3_register *reg, struct ir3_info *info,
-               uint32_t repeat, uint32_t valid_flags)
-{
-       reg_t val = { .dummy32 = 0 };
-
-       assert(!(reg->flags & ~valid_flags));
-
-       if (!(reg->flags & IR3_REG_R))
-               repeat = 0;
-
-       if (reg->flags & IR3_REG_IMMED) {
-               val.iim_val = reg->iim_val;
-       } else {
-               int8_t components = util_last_bit(reg->wrmask);
-               int8_t max = (reg->num + repeat + components - 1) >> 2;
-
-               val.comp = reg->num & 0x3;
-               val.num  = reg->num >> 2;
-
-               if (reg->flags & IR3_REG_CONST) {
-                       info->max_const = MAX2(info->max_const, max);
-               } else if ((max != REG_A0) && (max != REG_P0)) {
-                       if (reg->flags & IR3_REG_HALF) {
-                               info->max_half_reg = MAX2(info->max_half_reg, max);
-                       } else {
-                               info->max_reg = MAX2(info->max_reg, max);
-                       }
-               }
-       }
-
-       return val.dummy32;
-}
-
-static int emit_cat0(struct ir3_instruction *instr, void *ptr,
-               struct ir3_info *info)
-{
-       instr_cat0_t *cat0 = ptr;
-
-       cat0->immed    = instr->cat0.immed;
-       cat0->repeat   = instr->repeat;
-       cat0->ss       = !!(instr->flags & IR3_INSTR_SS);
-       cat0->inv      = instr->cat0.inv;
-       cat0->comp     = instr->cat0.comp;
-       cat0->opc      = instr->opc;
-       cat0->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
-       cat0->sync     = !!(instr->flags & IR3_INSTR_SY);
-       cat0->opc_cat  = 0;
-
-       return 0;
-}
-
-static uint32_t type_flags(type_t type)
-{
-       return (type_size(type) == 32) ? 0 : IR3_REG_HALF;
-}
-
-static int emit_cat1(struct ir3_instruction *instr, void *ptr,
-               struct ir3_info *info)
-{
-       struct ir3_register *dst = instr->regs[0];
-       struct ir3_register *src = instr->regs[1];
-       instr_cat1_t *cat1 = ptr;
-
-       iassert(instr->regs_count == 2);
-       iassert(!((dst->flags ^ type_flags(instr->cat1.dst_type)) & IR3_REG_HALF));
-       iassert((src->flags & IR3_REG_IMMED) ||
-                       !((src->flags ^ type_flags(instr->cat1.src_type)) & IR3_REG_HALF));
-
-       if (src->flags & IR3_REG_IMMED) {
-               cat1->iim_val = src->iim_val;
-               cat1->src_im  = 1;
-       } else if (src->flags & IR3_REG_RELATIV) {
-               cat1->off       = src->offset;
-               cat1->src_rel   = 1;
-               cat1->src_rel_c = !!(src->flags & IR3_REG_CONST);
-       } else {
-               cat1->src  = reg(src, info, instr->repeat,
-                               IR3_REG_IMMED | IR3_REG_R |
-                               IR3_REG_CONST | IR3_REG_HALF);
-               cat1->src_c     = !!(src->flags & IR3_REG_CONST);
-       }
-
-       cat1->dst      = reg(dst, info, instr->repeat,
-                       IR3_REG_RELATIV | IR3_REG_EVEN |
-                       IR3_REG_R | IR3_REG_POS_INF | IR3_REG_HALF);
-       cat1->repeat   = instr->repeat;
-       cat1->src_r    = !!(src->flags & IR3_REG_R);
-       cat1->ss       = !!(instr->flags & IR3_INSTR_SS);
-       cat1->ul       = !!(instr->flags & IR3_INSTR_UL);
-       cat1->dst_type = instr->cat1.dst_type;
-       cat1->dst_rel  = !!(dst->flags & IR3_REG_RELATIV);
-       cat1->src_type = instr->cat1.src_type;
-       cat1->even     = !!(dst->flags & IR3_REG_EVEN);
-       cat1->pos_inf  = !!(dst->flags & IR3_REG_POS_INF);
-       cat1->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
-       cat1->sync     = !!(instr->flags & IR3_INSTR_SY);
-       cat1->opc_cat  = 1;
-
-       return 0;
-}
-
-static int emit_cat2(struct ir3_instruction *instr, void *ptr,
-               struct ir3_info *info)
-{
-       struct ir3_register *dst = instr->regs[0];
-       struct ir3_register *src1 = instr->regs[1];
-       struct ir3_register *src2 = instr->regs[2];
-       instr_cat2_t *cat2 = ptr;
-
-       iassert((instr->regs_count == 2) || (instr->regs_count == 3));
-
-       if (src1->flags & IR3_REG_RELATIV) {
-               iassert(src1->num < (1 << 10));
-               cat2->rel1.src1      = reg(src1, info, instr->repeat,
-                               IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE |
-                               IR3_REG_ABS | IR3_REG_R | IR3_REG_HALF);
-               cat2->rel1.src1_c    = !!(src1->flags & IR3_REG_CONST);
-               cat2->rel1.src1_rel  = 1;
-       } else if (src1->flags & IR3_REG_CONST) {
-               iassert(src1->num < (1 << 12));
-               cat2->c1.src1   = reg(src1, info, instr->repeat,
-                               IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_ABS |
-                               IR3_REG_R | IR3_REG_HALF);
-               cat2->c1.src1_c = 1;
-       } else {
-               iassert(src1->num < (1 << 11));
-               cat2->src1 = reg(src1, info, instr->repeat,
-                               IR3_REG_IMMED | IR3_REG_NEGATE | IR3_REG_ABS |
-                               IR3_REG_R | IR3_REG_HALF);
-       }
-       cat2->src1_im  = !!(src1->flags & IR3_REG_IMMED);
-       cat2->src1_neg = !!(src1->flags & IR3_REG_NEGATE);
-       cat2->src1_abs = !!(src1->flags & IR3_REG_ABS);
-       cat2->src1_r   = !!(src1->flags & IR3_REG_R);
-
-       if (src2) {
-               iassert((src2->flags & IR3_REG_IMMED) ||
-                               !((src1->flags ^ src2->flags) & IR3_REG_HALF));
-
-               if (src2->flags & IR3_REG_RELATIV) {
-                       iassert(src2->num < (1 << 10));
-                       cat2->rel2.src2      = reg(src2, info, instr->repeat,
-                                       IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE |
-                                       IR3_REG_ABS | IR3_REG_R | IR3_REG_HALF);
-                       cat2->rel2.src2_c    = !!(src2->flags & IR3_REG_CONST);
-                       cat2->rel2.src2_rel  = 1;
-               } else if (src2->flags & IR3_REG_CONST) {
-                       iassert(src2->num < (1 << 12));
-                       cat2->c2.src2   = reg(src2, info, instr->repeat,
-                                       IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_ABS |
-                                       IR3_REG_R | IR3_REG_HALF);
-                       cat2->c2.src2_c = 1;
-               } else {
-                       iassert(src2->num < (1 << 11));
-                       cat2->src2 = reg(src2, info, instr->repeat,
-                                       IR3_REG_IMMED | IR3_REG_NEGATE | IR3_REG_ABS |
-                                       IR3_REG_R | IR3_REG_HALF);
-               }
-
-               cat2->src2_im  = !!(src2->flags & IR3_REG_IMMED);
-               cat2->src2_neg = !!(src2->flags & IR3_REG_NEGATE);
-               cat2->src2_abs = !!(src2->flags & IR3_REG_ABS);
-               cat2->src2_r   = !!(src2->flags & IR3_REG_R);
-       }
-
-       cat2->dst      = reg(dst, info, instr->repeat,
-                       IR3_REG_R | IR3_REG_EI | IR3_REG_HALF);
-       cat2->repeat   = instr->repeat;
-       cat2->ss       = !!(instr->flags & IR3_INSTR_SS);
-       cat2->ul       = !!(instr->flags & IR3_INSTR_UL);
-       cat2->dst_half = !!((src1->flags ^ dst->flags) & IR3_REG_HALF);
-       cat2->ei       = !!(dst->flags & IR3_REG_EI);
-       cat2->cond     = instr->cat2.condition;
-       cat2->full     = ! (src1->flags & IR3_REG_HALF);
-       cat2->opc      = instr->opc;
-       cat2->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
-       cat2->sync     = !!(instr->flags & IR3_INSTR_SY);
-       cat2->opc_cat  = 2;
-
-       return 0;
-}
-
-static int emit_cat3(struct ir3_instruction *instr, void *ptr,
-               struct ir3_info *info)
-{
-       struct ir3_register *dst = instr->regs[0];
-       struct ir3_register *src1 = instr->regs[1];
-       struct ir3_register *src2 = instr->regs[2];
-       struct ir3_register *src3 = instr->regs[3];
-       instr_cat3_t *cat3 = ptr;
-       uint32_t src_flags = 0;
-
-       switch (instr->opc) {
-       case OPC_MAD_F16:
-       case OPC_MAD_U16:
-       case OPC_MAD_S16:
-       case OPC_SEL_B16:
-       case OPC_SEL_S16:
-       case OPC_SEL_F16:
-       case OPC_SAD_S16:
-       case OPC_SAD_S32:  // really??
-               src_flags |= IR3_REG_HALF;
-               break;
-       default:
-               break;
-       }
-
-       iassert(instr->regs_count == 4);
-       iassert(!((src1->flags ^ src_flags) & IR3_REG_HALF));
-       iassert(!((src2->flags ^ src_flags) & IR3_REG_HALF));
-       iassert(!((src3->flags ^ src_flags) & IR3_REG_HALF));
-
-       if (src1->flags & IR3_REG_RELATIV) {
-               iassert(src1->num < (1 << 10));
-               cat3->rel1.src1      = reg(src1, info, instr->repeat,
-                               IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE |
-                               IR3_REG_R | IR3_REG_HALF);
-               cat3->rel1.src1_c    = !!(src1->flags & IR3_REG_CONST);
-               cat3->rel1.src1_rel  = 1;
-       } else if (src1->flags & IR3_REG_CONST) {
-               iassert(src1->num < (1 << 12));
-               cat3->c1.src1   = reg(src1, info, instr->repeat,
-                               IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_R |
-                               IR3_REG_HALF);
-               cat3->c1.src1_c = 1;
-       } else {
-               iassert(src1->num < (1 << 11));
-               cat3->src1 = reg(src1, info, instr->repeat,
-                               IR3_REG_NEGATE | IR3_REG_R | IR3_REG_HALF);
-       }
-
-       cat3->src1_neg = !!(src1->flags & IR3_REG_NEGATE);
-       cat3->src1_r   = !!(src1->flags & IR3_REG_R);
-
-       cat3->src2     = reg(src2, info, instr->repeat,
-                       IR3_REG_CONST | IR3_REG_NEGATE |
-                       IR3_REG_R | IR3_REG_HALF);
-       cat3->src2_c   = !!(src2->flags & IR3_REG_CONST);
-       cat3->src2_neg = !!(src2->flags & IR3_REG_NEGATE);
-       cat3->src2_r   = !!(src2->flags & IR3_REG_R);
-
-
-       if (src3->flags & IR3_REG_RELATIV) {
-               iassert(src3->num < (1 << 10));
-               cat3->rel2.src3      = reg(src3, info, instr->repeat,
-                               IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE |
-                               IR3_REG_R | IR3_REG_HALF);
-               cat3->rel2.src3_c    = !!(src3->flags & IR3_REG_CONST);
-               cat3->rel2.src3_rel  = 1;
-       } else if (src3->flags & IR3_REG_CONST) {
-               iassert(src3->num < (1 << 12));
-               cat3->c2.src3   = reg(src3, info, instr->repeat,
-                               IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_R |
-                               IR3_REG_HALF);
-               cat3->c2.src3_c = 1;
-       } else {
-               iassert(src3->num < (1 << 11));
-               cat3->src3 = reg(src3, info, instr->repeat,
-                               IR3_REG_NEGATE | IR3_REG_R | IR3_REG_HALF);
-       }
-
-       cat3->src3_neg = !!(src3->flags & IR3_REG_NEGATE);
-       cat3->src3_r   = !!(src3->flags & IR3_REG_R);
-
-       cat3->dst      = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
-       cat3->repeat   = instr->repeat;
-       cat3->ss       = !!(instr->flags & IR3_INSTR_SS);
-       cat3->ul       = !!(instr->flags & IR3_INSTR_UL);
-       cat3->dst_half = !!((src_flags ^ dst->flags) & IR3_REG_HALF);
-       cat3->opc      = instr->opc;
-       cat3->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
-       cat3->sync     = !!(instr->flags & IR3_INSTR_SY);
-       cat3->opc_cat  = 3;
-
-       return 0;
-}
-
-static int emit_cat4(struct ir3_instruction *instr, void *ptr,
-               struct ir3_info *info)
-{
-       struct ir3_register *dst = instr->regs[0];
-       struct ir3_register *src = instr->regs[1];
-       instr_cat4_t *cat4 = ptr;
-
-       iassert(instr->regs_count == 2);
-
-       if (src->flags & IR3_REG_RELATIV) {
-               iassert(src->num < (1 << 10));
-               cat4->rel.src      = reg(src, info, instr->repeat,
-                               IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE |
-                               IR3_REG_ABS | IR3_REG_R | IR3_REG_HALF);
-               cat4->rel.src_c    = !!(src->flags & IR3_REG_CONST);
-               cat4->rel.src_rel  = 1;
-       } else if (src->flags & IR3_REG_CONST) {
-               iassert(src->num < (1 << 12));
-               cat4->c.src   = reg(src, info, instr->repeat,
-                               IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_ABS |
-                               IR3_REG_R | IR3_REG_HALF);
-               cat4->c.src_c = 1;
-       } else {
-               iassert(src->num < (1 << 11));
-               cat4->src = reg(src, info, instr->repeat,
-                               IR3_REG_IMMED | IR3_REG_NEGATE | IR3_REG_ABS |
-                               IR3_REG_R | IR3_REG_HALF);
-       }
-
-       cat4->src_im   = !!(src->flags & IR3_REG_IMMED);
-       cat4->src_neg  = !!(src->flags & IR3_REG_NEGATE);
-       cat4->src_abs  = !!(src->flags & IR3_REG_ABS);
-       cat4->src_r    = !!(src->flags & IR3_REG_R);
-
-       cat4->dst      = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
-       cat4->repeat   = instr->repeat;
-       cat4->ss       = !!(instr->flags & IR3_INSTR_SS);
-       cat4->ul       = !!(instr->flags & IR3_INSTR_UL);
-       cat4->dst_half = !!((src->flags ^ dst->flags) & IR3_REG_HALF);
-       cat4->full     = ! (src->flags & IR3_REG_HALF);
-       cat4->opc      = instr->opc;
-       cat4->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
-       cat4->sync     = !!(instr->flags & IR3_INSTR_SY);
-       cat4->opc_cat  = 4;
-
-       return 0;
-}
-
-static int emit_cat5(struct ir3_instruction *instr, void *ptr,
-               struct ir3_info *info)
-{
-       struct ir3_register *dst = instr->regs[0];
-       struct ir3_register *src1 = instr->regs[1];
-       struct ir3_register *src2 = instr->regs[2];
-       struct ir3_register *src3 = instr->regs[3];
-       instr_cat5_t *cat5 = ptr;
-
-       iassert(!((dst->flags ^ type_flags(instr->cat5.type)) & IR3_REG_HALF));
-
-       if (src1) {
-               cat5->full = ! (src1->flags & IR3_REG_HALF);
-               cat5->src1 = reg(src1, info, instr->repeat, IR3_REG_HALF);
-       }
-
-
-       if (instr->flags & IR3_INSTR_S2EN) {
-               if (src2) {
-                       iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF));
-                       cat5->s2en.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF);
-               }
-               if (src3) {
-                       iassert(src3->flags & IR3_REG_HALF);
-                       cat5->s2en.src3 = reg(src3, info, instr->repeat, IR3_REG_HALF);
-               }
-               iassert(!(instr->cat5.samp | instr->cat5.tex));
-       } else {
-               iassert(!src3);
-               if (src2) {
-                       iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF));
-                       cat5->norm.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF);
-               }
-               cat5->norm.samp = instr->cat5.samp;
-               cat5->norm.tex  = instr->cat5.tex;
-       }
-
-       cat5->dst      = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
-       cat5->wrmask   = dst->wrmask;
-       cat5->type     = instr->cat5.type;
-       cat5->is_3d    = !!(instr->flags & IR3_INSTR_3D);
-       cat5->is_a     = !!(instr->flags & IR3_INSTR_A);
-       cat5->is_s     = !!(instr->flags & IR3_INSTR_S);
-       cat5->is_s2en  = !!(instr->flags & IR3_INSTR_S2EN);
-       cat5->is_o     = !!(instr->flags & IR3_INSTR_O);
-       cat5->is_p     = !!(instr->flags & IR3_INSTR_P);
-       cat5->opc      = instr->opc;
-       cat5->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
-       cat5->sync     = !!(instr->flags & IR3_INSTR_SY);
-       cat5->opc_cat  = 5;
-
-       return 0;
-}
-
-static int emit_cat6(struct ir3_instruction *instr, void *ptr,
-               struct ir3_info *info)
-{
-       struct ir3_register *dst = instr->regs[0];
-       struct ir3_register *src = instr->regs[1];
-       instr_cat6_t *cat6 = ptr;
-
-       iassert(instr->regs_count == 2);
-
-       switch (instr->opc) {
-       /* load instructions: */
-       case OPC_LDG:
-       case OPC_LDP:
-       case OPC_LDL:
-       case OPC_LDLW:
-       case OPC_LDLV:
-       case OPC_PREFETCH: {
-               instr_cat6a_t *cat6a = ptr;
-
-               iassert(!((dst->flags ^ type_flags(instr->cat6.type)) & IR3_REG_HALF));
-
-               cat6a->must_be_one1  = 1;
-               cat6a->must_be_one2  = 1;
-               cat6a->off = instr->cat6.offset;
-               cat6a->src = reg(src, info, instr->repeat, 0);
-               cat6a->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
-               break;
-       }
-       /* store instructions: */
-       case OPC_STG:
-       case OPC_STP:
-       case OPC_STL:
-       case OPC_STLW:
-       case OPC_STI: {
-               instr_cat6b_t *cat6b = ptr;
-               uint32_t src_flags = type_flags(instr->cat6.type);
-               uint32_t dst_flags = (instr->opc == OPC_STI) ? IR3_REG_HALF : 0;
-
-               iassert(!((src->flags ^ src_flags) & IR3_REG_HALF));
-
-               cat6b->must_be_one1  = 1;
-               cat6b->must_be_one2  = 1;
-               cat6b->src    = reg(src, info, instr->repeat, src_flags);
-               cat6b->off_hi = instr->cat6.offset >> 8;
-               cat6b->off    = instr->cat6.offset;
-               cat6b->dst    = reg(dst, info, instr->repeat, IR3_REG_R | dst_flags);
-
-               break;
-       }
-       default:
-               // TODO
-               break;
-       }
-
-       cat6->iim_val  = instr->cat6.iim_val;
-       cat6->type     = instr->cat6.type;
-       cat6->opc      = instr->opc;
-       cat6->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
-       cat6->sync     = !!(instr->flags & IR3_INSTR_SY);
-       cat6->opc_cat  = 6;
-
-       return 0;
-}
-
-static int (*emit[])(struct ir3_instruction *instr, void *ptr,
-               struct ir3_info *info) = {
-       emit_cat0, emit_cat1, emit_cat2, emit_cat3, emit_cat4, emit_cat5, emit_cat6,
-};
-
-void * ir3_assemble(struct ir3 *shader, struct ir3_info *info)
-{
-       uint32_t *ptr, *dwords;
-       uint32_t i;
-
-       info->max_reg       = -1;
-       info->max_half_reg  = -1;
-       info->max_const     = -1;
-       info->instrs_count  = 0;
-
-       /* need a integer number of instruction "groups" (sets of four
-        * instructions), so pad out w/ NOPs if needed:
-        * (each instruction is 64bits)
-        */
-       info->sizedwords = 2 * align(shader->instrs_count, 4);
-
-       ptr = dwords = calloc(1, 4 * info->sizedwords);
-
-       for (i = 0; i < shader->instrs_count; i++) {
-               struct ir3_instruction *instr = shader->instrs[i];
-               int ret = emit[instr->category](instr, dwords, info);
-               if (ret)
-                       goto fail;
-               info->instrs_count += 1 + instr->repeat;
-               dwords += 2;
-       }
-
-       return ptr;
-
-fail:
-       free(ptr);
-       return NULL;
-}
-
-static struct ir3_register * reg_create(struct ir3 *shader,
-               int num, int flags)
-{
-       struct ir3_register *reg =
-                       ir3_alloc(shader, sizeof(struct ir3_register));
-       reg->wrmask = 1;
-       reg->flags = flags;
-       reg->num = num;
-       return reg;
-}
-
-static void insert_instr(struct ir3 *shader,
-               struct ir3_instruction *instr)
-{
-#ifdef DEBUG
-       static uint32_t serialno = 0;
-       instr->serialno = ++serialno;
-#endif
-       if (shader->instrs_count == shader->instrs_sz) {
-               shader->instrs_sz = MAX2(2 * shader->instrs_sz, 16);
-               shader->instrs = realloc(shader->instrs,
-                               shader->instrs_sz * sizeof(shader->instrs[0]));
-       }
-       shader->instrs[shader->instrs_count++] = instr;
-}
-
-struct ir3_block * ir3_block_create(struct ir3 *shader,
-               unsigned ntmp, unsigned nin, unsigned nout)
-{
-       struct ir3_block *block;
-       unsigned size;
-       char *ptr;
-
-       size = sizeof(*block);
-       size += sizeof(block->temporaries[0]) * ntmp;
-       size += sizeof(block->inputs[0]) * nin;
-       size += sizeof(block->outputs[0]) * nout;
-
-       ptr = ir3_alloc(shader, size);
-
-       block = (void *)ptr;
-       ptr += sizeof(*block);
-
-       block->temporaries = (void *)ptr;
-       block->ntemporaries = ntmp;
-       ptr += sizeof(block->temporaries[0]) * ntmp;
-
-       block->inputs = (void *)ptr;
-       block->ninputs = nin;
-       ptr += sizeof(block->inputs[0]) * nin;
-
-       block->outputs = (void *)ptr;
-       block->noutputs = nout;
-       ptr += sizeof(block->outputs[0]) * nout;
-
-       block->shader = shader;
-
-       return block;
-}
-
-struct ir3_instruction * ir3_instr_create(struct ir3_block *block,
-               int category, opc_t opc)
-{
-       struct ir3_instruction *instr =
-                       ir3_alloc(block->shader, sizeof(struct ir3_instruction));
-       instr->block = block;
-       instr->category = category;
-       instr->opc = opc;
-       insert_instr(block->shader, instr);
-       return instr;
-}
-
-struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr)
-{
-       struct ir3_instruction *new_instr =
-                       ir3_alloc(instr->block->shader, sizeof(struct ir3_instruction));
-       unsigned i;
-
-       *new_instr = *instr;
-       insert_instr(instr->block->shader, new_instr);
-
-       /* clone registers: */
-       new_instr->regs_count = 0;
-       for (i = 0; i < instr->regs_count; i++) {
-               struct ir3_register *reg = instr->regs[i];
-               struct ir3_register *new_reg =
-                               ir3_reg_create(new_instr, reg->num, reg->flags);
-               *new_reg = *reg;
-       }
-
-       return new_instr;
-}
-
-struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
-               int num, int flags)
-{
-       struct ir3_register *reg = reg_create(instr->block->shader, num, flags);
-       assert(instr->regs_count < ARRAY_SIZE(instr->regs));
-       instr->regs[instr->regs_count++] = reg;
-       return reg;
-}
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3.h b/src/gallium/drivers/freedreno/a3xx/ir3.h
deleted file mode 100644 (file)
index 9ed914b..0000000
+++ /dev/null
@@ -1,480 +0,0 @@
-/*
- * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef IR3_H_
-#define IR3_H_
-
-#include <stdint.h>
-#include <stdbool.h>
-
-#include "instr-a3xx.h"
-#include "disasm.h"  /* TODO move 'enum shader_t' somewhere else.. */
-
-/* low level intermediate representation of an adreno shader program */
-
-struct ir3;
-struct ir3_instruction;
-struct ir3_block;
-
-struct ir3 * fd_asm_parse(const char *src);
-
-/* Per-shader summary passed to ir3_assemble() through its 'info'
- * argument (presumably filled in while the shader is assembled --
- * confirm against ir3_assemble()):
- */
-struct ir3_info {
-       /* presumably the size of the assembled shader in dwords -- TODO confirm */
-       uint16_t sizedwords;
-       uint16_t instrs_count;   /* expanded to account for rpt's */
-       /* NOTE: max_reg, etc, does not include registers not touched
-        * by the shader (ie. vertex fetched via VFD_DECODE but not
-        * touched by shader)
-        */
-       int8_t   max_reg;   /* highest GPR # used by shader */
-       int8_t   max_half_reg;
-       int8_t   max_const;
-};
-
-/* A single register operand of an instruction.  'flags' qualifies the
- * operand and the anonymous union holds its value; the comments on the
- * union members say which kind of operand each one is meant for.
- */
-struct ir3_register {
-       enum {
-               IR3_REG_CONST  = 0x001,
-               IR3_REG_IMMED  = 0x002,
-               IR3_REG_HALF   = 0x004,
-               IR3_REG_RELATIV= 0x008,
-               IR3_REG_R      = 0x010,
-               IR3_REG_NEGATE = 0x020,
-               IR3_REG_ABS    = 0x040,
-               IR3_REG_EVEN   = 0x080,
-               IR3_REG_POS_INF= 0x100,
-               /* (ei) flag, end-input?  Set on last bary, presumably to signal
-                * that the shader needs no more input:
-                */
-               IR3_REG_EI     = 0x200,
-               /* meta-flags, for intermediate stages of IR, ie.
-                * before register assignment is done:
-                */
-               IR3_REG_SSA    = 0x1000,   /* 'instr' is ptr to assigning instr */
-               IR3_REG_IA     = 0x2000,   /* meta-input dst is "assigned" */
-               IR3_REG_ADDR   = 0x4000,   /* register is a0.x */
-       } flags;
-       union {
-               /* normal registers:
-                * the component is in the low two bits of the reg #, so
-                * rN.x becomes: (N << 2) | x
-                * (see regid(), reg_num() and reg_comp() below)
-                */
-               int num;
-               /* immediate: */
-               int     iim_val;
-               float   fim_val;
-               /* relative: */
-               int offset;
-               /* for IR3_REG_SSA, src registers contain ptr back to
-                * assigning instruction.
-                */
-               struct ir3_instruction *instr;
-       };
-
-       /* used for cat5 instructions, but also for internal/IR level
-        * tracking of what registers are read/written by an instruction.
-        * wrmask may be a bad name since it is used to represent both
-        * src and dst that touch multiple adjacent registers.
-        *
-        * The regmask_*() helpers below test bits 0..3 only, bit i
-        * standing for the register at index num + i.
-        */
-       int wrmask;
-};
-
-/* One instruction of the IR.  The helpers further down in this header
- * classify instructions by 'category': 0 is flow, 1..3 alu, 4 sfu,
- * 5 tex, and -1 marks a meta instruction (see is_flow(), is_alu(),
- * is_sfu(), is_tex() and is_meta()).
- */
-struct ir3_instruction {
-       struct ir3_block *block;
-       int category;
-       opc_t opc;
-       enum {
-               /* (sy) flag is set on first instruction, and after sample
-                * instructions (probably just on RAW hazard).
-                */
-               IR3_INSTR_SY    = 0x001,
-               /* (ss) flag is set on first instruction, and first instruction
-                * to depend on the result of "long" instructions (RAW hazard):
-                *
-                *   rcp, rsq, log2, exp2, sin, cos, sqrt
-                *
-                * It seems to synchronize until all in-flight instructions are
-                * completed, for example:
-                *
-                *   rsq hr1.w, hr1.w
-                *   add.f hr2.z, (neg)hr2.z, hc0.y
-                *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
-                *   rsq hr2.x, hr2.x
-                *   (rpt1)nop
-                *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
-                *   nop
-                *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
-                *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
-                *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
-                *
-                * The last mul.f does not have (ss) set, presumably because the
-                * (ss) on the previous instruction does the job.
-                *
-                * The blob driver also seems to set it on WAR hazards, although
-                * not really clear if this is needed or just blob compiler being
-                * sloppy.  So far I haven't found a case where removing the (ss)
-                * causes problems for WAR hazard, but I could just be getting
-                * lucky:
-                *
-                *   rcp r1.y, r3.y
-                *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
-                *
-                */
-               IR3_INSTR_SS    = 0x002,
-               /* (jp) flag is set on jump targets:
-                */
-               IR3_INSTR_JP    = 0x004,
-               IR3_INSTR_UL    = 0x008,
-               IR3_INSTR_3D    = 0x010,
-               IR3_INSTR_A     = 0x020,
-               IR3_INSTR_O     = 0x040,
-               IR3_INSTR_P     = 0x080,
-               IR3_INSTR_S     = 0x100,
-               IR3_INSTR_S2EN  = 0x200,
-               /* meta-flags, for intermediate stages of IR, ie.
-                * before register assignment is done:
-                */
-               IR3_INSTR_MARK  = 0x1000,   /* "visited", see ir3_instr_check_mark() */
-       } flags;
-       int repeat;
-       /* regs[0] is treated as the dst (see writes_addr()/writes_pred());
-        * the graph walks in the passes iterate the srcs from regs[1] on.
-        */
-       unsigned regs_count;
-       struct ir3_register *regs[5];
-       union {
-               struct {
-                       char inv;
-                       char comp;
-                       int  immed;
-               } cat0;
-               struct {
-                       type_t src_type, dst_type;
-               } cat1;
-               struct {
-                       enum {
-                               IR3_COND_LT = 0,
-                               IR3_COND_LE = 1,
-                               IR3_COND_GT = 2,
-                               IR3_COND_GE = 3,
-                               IR3_COND_EQ = 4,
-                               IR3_COND_NE = 5,
-                       } condition;
-               } cat2;
-               struct {
-                       unsigned samp, tex;
-                       type_t type;
-               } cat5;
-               struct {
-                       type_t type;
-                       int offset;
-                       int iim_val;
-               } cat6;
-               /* for meta-instructions, just used to hold extra data
-                * before instruction scheduling, etc
-                */
-               struct {
-                       int off;              /* component/offset */
-               } fo;
-               struct {
-                       struct ir3_block *if_block, *else_block;
-               } flow;
-               struct {
-                       struct ir3_block *block;
-               } inout;
-       };
-
-       /* transient values used during various algorithms: */
-       union {
-               /* The instruction depth is the max dependency distance to output.
-                *
-                * You can also think of it as the "cost", if we did any sort of
-                * optimization for register footprint.  Ie. a value that is just
-                * the result of moving a const to a reg would have a low cost, so
-                * it could make sense to duplicate the instruction at various
-                * points where the result is needed to reduce register footprint.
-                */
-               unsigned depth;
-       };
-       /* link used for the per-block list hanging off ir3_block::head */
-       struct ir3_instruction *next;
-#ifdef DEBUG
-       uint32_t serialno;
-#endif
-};
-
-struct ir3_heap_chunk;
-
-/* Top-level shader object that blocks and instructions belong to. */
-struct ir3 {
-       /* instrs[0..instrs_count) is the array ir3_clear_mark() walks;
-        * instrs_sz is presumably its allocated capacity -- TODO confirm
-        */
-       unsigned instrs_count, instrs_sz;
-       struct ir3_instruction **instrs;
-       /* presumably allocator state behind ir3_alloc() -- TODO confirm */
-       unsigned heap_idx;
-       struct ir3_heap_chunk *chunk;
-};
-
-/* A block of instructions together with the tables of instructions
- * standing for its temporaries, inputs and outputs.  The three counts
- * give the number of entries in the matching arrays.
- */
-struct ir3_block {
-       struct ir3 *shader;
-       unsigned ntemporaries, ninputs, noutputs;
-       /* maps TGSI_FILE_TEMPORARY index back to the assigning instruction: */
-       struct ir3_instruction **temporaries;
-       struct ir3_instruction **inputs;
-       struct ir3_instruction **outputs;
-       /* only a single address register: */
-       struct ir3_instruction *address;
-       struct ir3_block *parent;
-       /* first entry of the list chained through ir3_instruction::next */
-       struct ir3_instruction *head;
-};
-
-struct ir3 * ir3_create(void);
-void ir3_destroy(struct ir3 *shader);
-void * ir3_assemble(struct ir3 *shader,
-               struct ir3_info *info);
-void * ir3_alloc(struct ir3 *shader, int sz);
-
-struct ir3_block * ir3_block_create(struct ir3 *shader,
-               unsigned ntmp, unsigned nin, unsigned nout);
-
-struct ir3_instruction * ir3_instr_create(struct ir3_block *block,
-               int category, opc_t opc);
-struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr);
-const char *ir3_instr_name(struct ir3_instruction *instr);
-
-struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
-               int num, int flags);
-
-
-/* Test-and-set of IR3_INSTR_MARK: returns true when 'instr' was already
- * marked; otherwise marks it and returns false.
- */
-static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
-{
-       bool visited = !!(instr->flags & IR3_INSTR_MARK);
-
-       /* on the first visit the bit is known to be clear, so setting it
-        * has the same effect as toggling it:
-        */
-       if (!visited)
-               instr->flags |= IR3_INSTR_MARK;
-
-       return visited;
-}
-
-/* Drop IR3_INSTR_MARK from every instruction in the shader's array. */
-static inline void ir3_clear_mark(struct ir3 *shader)
-{
-       /* TODO would be nice to drop the instruction array.. for the
-        * new compiler, _clear_mark() is all we use it for, and a
-        * linked list could probably do the job instead..
-        */
-       unsigned idx = 0;
-
-       while (idx < shader->instrs_count) {
-               shader->instrs[idx]->flags &= ~IR3_INSTR_MARK;
-               idx++;
-       }
-}
-
-/* Position of 'reg' within instr->regs[] (matched by pointer), or -1
- * if the instruction does not hold that register.
- */
-static inline int ir3_instr_regno(struct ir3_instruction *instr,
-               struct ir3_register *reg)
-{
-       int found = -1;
-       unsigned idx;
-
-       for (idx = 0; idx < instr->regs_count; idx++) {
-               if (instr->regs[idx] == reg) {
-                       found = idx;
-                       break;
-               }
-       }
-
-       return found;
-}
-
-
-/* Pack a register number and a component into one id.
- *
- * comp selects the component:
- *   0 - x
- *   1 - y
- *   2 - z
- *   3 - w
- */
-static inline uint32_t regid(int num, int comp)
-{
-       int base = num << 2;         /* register number above the low two bits */
-       int component = comp & 0x3;  /* only the low two bits of comp are kept */
-
-       return base | component;
-}
-
-/* Register number part of reg->num, ie. everything above the two
- * component bits.
- */
-static inline uint32_t reg_num(struct ir3_register *reg)
-{
-       int regno = reg->num >> 2;
-
-       return regno;
-}
-
-/* Component part of reg->num, ie. its low two bits. */
-static inline uint32_t reg_comp(struct ir3_register *reg)
-{
-       int component = reg->num & 0x3;
-
-       return component;
-}
-
-/* true for category 0 instructions */
-static inline bool is_flow(struct ir3_instruction *instr)
-{
-       return !instr->category;
-}
-
-/* true for a category 0 instruction whose opcode is OPC_KILL */
-static inline bool is_kill(struct ir3_instruction *instr)
-{
-       if (!is_flow(instr))
-               return false;
-       return instr->opc == OPC_KILL;
-}
-
-/* true for a category 0 instruction whose opcode is OPC_NOP */
-static inline bool is_nop(struct ir3_instruction *instr)
-{
-       if (!is_flow(instr))
-               return false;
-       return instr->opc == OPC_NOP;
-}
-
-/* true for instructions in categories 1, 2 and 3 */
-static inline bool is_alu(struct ir3_instruction *instr)
-{
-       switch (instr->category) {
-       case 1:
-       case 2:
-       case 3:
-               return true;
-       default:
-               return false;
-       }
-}
-
-/* true for category 4 instructions */
-static inline bool is_sfu(struct ir3_instruction *instr)
-{
-       const int cat = instr->category;
-
-       return cat == 4;
-}
-
-/* true for category 5 instructions */
-static inline bool is_tex(struct ir3_instruction *instr)
-{
-       const int cat = instr->category;
-
-       return cat == 5;
-}
-
-/* true for a category 2 instruction whose opcode is OPC_BARY_F */
-static inline bool is_input(struct ir3_instruction *instr)
-{
-       if (instr->category != 2)
-               return false;
-       return instr->opc == OPC_BARY_F;
-}
-
-/* true for meta instructions, which are identified by category -1 */
-static inline bool is_meta(struct ir3_instruction *instr)
-{
-       /* TODO how should PHI (and maybe fan-in/out) be counted?  They
-        * might actually contribute some instructions to the final
-        * result.
-        */
-       const int cat = instr->category;
-
-       return cat == -1;
-}
-
-/* true for a meta instruction whose opcode is OPC_META_DEREF */
-static inline bool is_addr(struct ir3_instruction *instr)
-{
-       if (!is_meta(instr))
-               return false;
-       return instr->opc == OPC_META_DEREF;
-}
-
-/* true if the instruction has a dst (regs[0]) and that dst carries
- * IR3_REG_ADDR; an instruction without registers writes nothing.
- */
-static inline bool writes_addr(struct ir3_instruction *instr)
-{
-       if (instr->regs_count == 0)
-               return false;
-
-       return (instr->regs[0]->flags & IR3_REG_ADDR) != 0;
-}
-
-/* true if the instruction has a dst (regs[0]) whose register number is
- * REG_P0; an instruction without registers writes nothing.
- */
-static inline bool writes_pred(struct ir3_instruction *instr)
-{
-       if (instr->regs_count == 0)
-               return false;
-
-       return reg_num(instr->regs[0]) == REG_P0;
-}
-
-/* true if 'r' names a general purpose register: none of the const,
- * immed, relativ, ssa or addr flags is set, and the register number is
- * neither REG_A0 nor REG_P0.
- */
-static inline bool reg_gpr(struct ir3_register *r)
-{
-       const int non_gpr_flags = IR3_REG_CONST | IR3_REG_IMMED |
-                       IR3_REG_RELATIV | IR3_REG_SSA | IR3_REG_ADDR;
-       uint32_t regno;
-
-       if (r->flags & non_gpr_flags)
-               return false;
-
-       regno = reg_num(r);
-       return (regno != REG_A0) && (regno != REG_P0);
-}
-
-/* dump: */
-#include <stdio.h>
-void ir3_dump(struct ir3 *shader, const char *name,
-               struct ir3_block *block /* XXX maybe 'block' ptr should move to ir3? */,
-               FILE *f);
-void ir3_dump_instr_single(struct ir3_instruction *instr);
-void ir3_dump_instr_list(struct ir3_instruction *instr);
-
-/* flatten if/else: */
-int ir3_block_flatten(struct ir3_block *block);
-
-/* depth calculation: */
-int ir3_delayslots(struct ir3_instruction *assigner,
-               struct ir3_instruction *consumer, unsigned n);
-void ir3_block_depth(struct ir3_block *block);
-
-/* copy-propagate: */
-void ir3_block_cp(struct ir3_block *block);
-
-/* scheduling: */
-void ir3_block_sched(struct ir3_block *block);
-
-/* register assignment: */
-int ir3_block_ra(struct ir3_block *block, enum shader_t type,
-               bool half_precision, bool frag_coord, bool frag_face,
-               bool *has_samp);
-
-/* element count of a true array (a pointer argument gives a wrong
- * answer); only defined here if nothing has defined it already:
- */
-#ifndef ARRAY_SIZE
-#  define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
-#endif
-
-/* ************************************************************************* */
-/* split this out or find some helper to use.. like main/bitset.h.. */
-
-#include <string.h>
-
-/* upper bound (exclusive) that regmask_idx() asserts on reg->num */
-#define MAX_REG 256
-
-/* bitmask with one bit per register index: regmask_idx() maps full
- * registers to bits [0, MAX_REG) and IR3_REG_HALF registers to bits
- * [MAX_REG, 2 * MAX_REG), hence the factor of two.
- */
-typedef uint8_t regmask_t[2 * MAX_REG / 8];
-
-/* Bit index of 'reg' in a regmask_t: reg->num for a full register,
- * reg->num + MAX_REG for an IR3_REG_HALF one.
- */
-static inline unsigned regmask_idx(struct ir3_register *reg)
-{
-       unsigned idx = reg->num;
-
-       assert(idx < MAX_REG);
-
-       return (reg->flags & IR3_REG_HALF) ? idx + MAX_REG : idx;
-}
-
-/* Clear every bit of the mask. */
-static inline void regmask_init(regmask_t *regmask)
-{
-       memset(regmask, 0, sizeof(regmask_t));
-}
-
-/* Set the mask bit of every register selected by the low four bits of
- * reg->wrmask, bit i of wrmask standing for index regmask_idx(reg) + i.
- */
-static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg)
-{
-       unsigned base = regmask_idx(reg);
-       unsigned comp;
-
-       for (comp = 0; comp < 4; comp++) {
-               unsigned bit = base + comp;
-
-               if (!(reg->wrmask & (1 << comp)))
-                       continue;
-
-               (*regmask)[bit / 8] |= 1 << (bit % 8);
-       }
-}
-
-/* For every register selected by reg->wrmask, set its bit in 'a'
- * unless that bit is already set in 'b'.  Conceptually:
- *   a |= (reg & ~b)
- */
-static inline void regmask_set_if_not(regmask_t *a,
-               struct ir3_register *reg, regmask_t *b)
-{
-       unsigned base = regmask_idx(reg);
-       unsigned comp;
-
-       for (comp = 0; comp < 4; comp++) {
-               unsigned bit = base + comp;
-
-               if (!(reg->wrmask & (1 << comp)))
-                       continue;
-               if ((*b)[bit / 8] & (1 << (bit % 8)))
-                       continue;
-
-               (*a)[bit / 8] |= 1 << (bit % 8);
-       }
-}
-
-/* Returns true if the mask has a bit set for at least one of the
- * registers selected by reg->wrmask, false otherwise.
- */
-static inline unsigned regmask_get(regmask_t *regmask,
-               struct ir3_register *reg)
-{
-       unsigned base = regmask_idx(reg);
-       unsigned comp;
-
-       for (comp = 0; comp < 4; comp++) {
-               unsigned bit = base + comp;
-
-               if (!(reg->wrmask & (1 << comp)))
-                       continue;
-
-               if ((*regmask)[bit / 8] & (1 << (bit % 8)))
-                       return true;
-       }
-
-       return false;
-}
-
-/* ************************************************************************* */
-
-#endif /* IR3_H_ */
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_cp.c b/src/gallium/drivers/freedreno/a3xx/ir3_cp.c
deleted file mode 100644 (file)
index 73c2a27..0000000
+++ /dev/null
@@ -1,158 +0,0 @@
-/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
-
-/*
- * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#include "ir3.h"
-
-/*
- * Copy Propagate:
- *
- * TODO probably want some sort of visitor interface to avoid
- * duplicating the same graph traversal logic everywhere..
- *
- */
-
-static void block_cp(struct ir3_block *block);
-static struct ir3_instruction * instr_cp(struct ir3_instruction *instr, bool keep);
-
-/* Decide whether 'instr' is a mov that copy propagation may skip over:
- * a category 1 instruction with identical src and dst types, whose dst
- * is not the address register and whose src is an SSA value without
- * abs/neg/relative modifiers.
- */
-static bool is_eligible_mov(struct ir3_instruction *instr)
-{
-       struct ir3_register *dst, *src;
-
-       if (instr->category != 1)
-               return false;
-       if (instr->cat1.src_type != instr->cat1.dst_type)
-               return false;
-
-       dst = instr->regs[0];
-       src = instr->regs[1];
-
-       if (dst->flags & IR3_REG_ADDR)
-               return false;
-       if (!(src->flags & IR3_REG_SSA))
-               return false;
-
-       /* TODO: propagate abs/neg modifiers if possible */
-       return !(src->flags & (IR3_REG_ABS | IR3_REG_NEGATE | IR3_REG_RELATIV));
-}
-
-/* Run instr_cp() on the producer of every SSA src of 'instr' and point
- * the src at whatever instruction instr_cp() returns.
- */
-static void walk_children(struct ir3_instruction *instr, bool keep)
-{
-       unsigned src_idx = 1;   /* regs[0] is skipped, srcs start at 1 */
-
-       while (src_idx < instr->regs_count) {
-               struct ir3_register *src = instr->regs[src_idx++];
-
-               if (!(src->flags & IR3_REG_SSA))
-                       continue;
-
-               src->instr = instr_cp(src->instr, keep);
-       }
-}
-
-/* Copy-propagate through the srcs of a fan-in.  Fan-in gets its own
- * routine so that the cases where a mov has to stay in place can be
- * detected.
- */
-static struct ir3_instruction *
-instr_cp_fanin(struct ir3_instruction *instr)
-{
-       unsigned src_idx;
-
-       for (src_idx = 1; src_idx < instr->regs_count; src_idx++) {
-               struct ir3_register *src = instr->regs[src_idx];
-               struct ir3_instruction *cand;
-
-               if (!(src->flags & IR3_REG_SSA))
-                       continue;
-
-               cand = instr_cp(src->instr, false);
-
-               /* A fanout candidate means the move must be kept.
-                *
-                * This is a bit, um, fragile, but it should catch
-                * the extra mov's that the front-end already puts
-                * in for us in these cases.
-                */
-               if (is_meta(cand) && (cand->opc == OPC_META_FO))
-                       cand = instr_cp(src->instr, true);
-
-               src->instr = cand;
-       }
-
-       walk_children(instr, false);
-
-       return instr;
-}
-
-/* Copy-propagate starting at 'instr' and return the instruction that
- * consumers should reference instead of it.  With 'keep' set, an
- * eligible mov is left in place rather than skipped.
- */
-static struct ir3_instruction *
-instr_cp(struct ir3_instruction *instr, bool keep)
-{
-       bool fanin, droppable;
-
-       /* a set mark means this instruction was already visited: */
-       if (ir3_instr_check_mark(instr))
-               return instr;
-
-       fanin = is_meta(instr) && (instr->opc == OPC_META_FI);
-       if (fanin)
-               return instr_cp_fanin(instr);
-
-       droppable = is_eligible_mov(instr) && !keep;
-       if (droppable)
-               return instr_cp(instr->regs[1]->instr, false);
-
-       walk_children(instr, false);
-
-       return instr;
-}
-
-/* Copy-propagate from every non-NULL output of the block, replacing
- * each output with the instruction instr_cp() returns for it.
- */
-static void block_cp(struct ir3_block *block)
-{
-       unsigned out_idx;
-
-       for (out_idx = 0; out_idx < block->noutputs; out_idx++) {
-               struct ir3_instruction *result;
-               unsigned prev;
-
-               if (!block->outputs[out_idx])
-                       continue;
-
-               result = instr_cp(block->outputs[out_idx], false);
-
-               /* No two outputs may end up pointing at the same
-                * instruction, which would otherwise happen for
-                * things like this:
-                *
-                *   43: MOV OUT[2], TEMP[5]
-                *   44: MOV OUT[0], TEMP[5]
-                *
-                * so on a clash with an earlier output, redo the
-                * lookup with 'keep' set.
-                */
-               for (prev = 0; prev < out_idx; prev++) {
-                       if (block->outputs[prev] != result)
-                               continue;
-                       result = instr_cp(block->outputs[out_idx], true);
-                       break;
-               }
-
-               block->outputs[out_idx] = result;
-       }
-}
-
-/* Entry point of the copy-propagate pass: reset the visited marks of
- * the whole shader, then process the block.
- */
-void ir3_block_cp(struct ir3_block *block)
-{
-       struct ir3 *shader = block->shader;
-
-       ir3_clear_mark(shader);
-       block_cp(block);
-}
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_depth.c b/src/gallium/drivers/freedreno/a3xx/ir3_depth.c
deleted file mode 100644 (file)
index dcc0362..0000000
+++ /dev/null
@@ -1,159 +0,0 @@
-/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
-
-/*
- * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#include "util/u_math.h"
-
-#include "ir3.h"
-
-/*
- * Instruction Depth:
- *
- * Calculates weighted instruction depth, ie. the sum of # of needed
- * instructions plus delay slots back to original input (ie INPUT or
- * CONST).  That is to say, an instruction's depth is:
- *
- *   depth(instr) {
- *     d = 0;
- *     // for each src register:
- *     foreach (src in instr->regs[1..n])
- *       d = max(d, delayslots(src->instr, n) + depth(src->instr));
- *     return d + 1;
- *   }
- *
- * After an instruction's depth is calculated, it is inserted into the
- * block's depth-sorted list, which is used by the scheduling pass.
- */
-
-/* Number of delay slots required between the instruction that assigns
- * a value ('assigner') and the one that consumes it ('consumer'); 'n'
- * is the index of the src in the consumer that the value feeds.
- */
-int ir3_delayslots(struct ir3_instruction *assigner,
-               struct ir3_instruction *consumer, unsigned n)
-{
-       bool slow_consumer;
-
-       /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
-        * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
-        * handled with sync bits
-        */
-
-       if (is_meta(assigner))
-               return 0;
-
-       if (writes_addr(assigner))
-               return 6;
-
-       /* sfu and tex results are handled via sync flags: */
-       if (is_sfu(assigner) || is_tex(assigner))
-               return 0;
-
-       /* from here on the assigner must be alu: */
-       slow_consumer = is_flow(consumer) || is_sfu(consumer) ||
-                       is_tex(consumer);
-       if (slow_consumer)
-               return 6;
-
-       /* special case, 3rd src to cat3 not required on first cycle */
-       if ((consumer->category == 3) && is_mad(consumer->opc) && (n == 2))
-               return 1;
-
-       return 3;
-}
-
-/* Link 'instr' into its block's list (chained through ->next from
- * block->head): it goes in front of the first entry whose depth is not
- * greater than its own, or at the end if there is no such entry.  The
- * scan also stops if it runs into 'instr' itself.
- */
-static void insert_by_depth(struct ir3_instruction *instr)
-{
-       /* 'link' is the pointer that will be redirected to instr: either
-        * block->head or the ->next field of the preceding entry.
-        */
-       struct ir3_instruction **link = &instr->block->head;
-
-       while (*link && (*link != instr) && ((*link)->depth > instr->depth))
-               link = &(*link)->next;
-
-       instr->next = *link;
-       *link = instr;
-}
-
-/* Compute instr->depth from the depths of the instructions producing
- * its SSA srcs (recursing into them first), then insert the instruction
- * into its block's list via insert_by_depth().
- */
-static void ir3_instr_depth(struct ir3_instruction *instr)
-{
-       unsigned src_idx;
-
-       /* each instruction is processed once; the mark records the visit: */
-       if (ir3_instr_check_mark(instr))
-               return;
-
-       instr->depth = 0;
-
-       /* regs[0] is skipped, the srcs start at index 1: */
-       for (src_idx = 1; src_idx < instr->regs_count; src_idx++) {
-               struct ir3_register *src = instr->regs[src_idx];
-               unsigned via_src;
-
-               if (!(src->flags & IR3_REG_SSA))
-                       continue;
-
-               /* visit the child first so that its depth is computed: */
-               ir3_instr_depth(src->instr);
-
-               via_src = ir3_delayslots(src->instr, instr, src_idx - 1) +
-                               src->instr->depth;
-
-               instr->depth = MAX2(instr->depth, via_src);
-       }
-
-       /* meta-instructions don't add cycles, other than PHI.. which
-        * might translate to a real instruction..
-        *
-        * well, not entirely true: fan-in/out, etc might need to
-        * generate some extra mov's in edge cases.. so we might want
-        * the depth calculation to consider the worst case for
-        * these??
-        */
-       if (!is_meta(instr))
-               instr->depth++;
-
-       insert_by_depth(instr);
-}
-
-/* Entry point of the depth pass: rebuild the block's list by walking
- * back from every output, then drop the inputs nothing reached.
- */
-void ir3_block_depth(struct ir3_block *block)
-{
-       unsigned n;
-
-       /* start from an empty list and a clean set of marks: */
-       block->head = NULL;
-       ir3_clear_mark(block->shader);
-
-       for (n = 0; n < block->noutputs; n++) {
-               struct ir3_instruction *out = block->outputs[n];
-
-               if (out)
-                       ir3_instr_depth(out);
-       }
-
-       /* at this point, any unvisited input is unused: */
-       for (n = 0; n < block->ninputs; n++) {
-               struct ir3_instruction *in = block->inputs[n];
-
-               if (!in)
-                       continue;
-               if (!ir3_instr_check_mark(in))
-                       block->inputs[n] = NULL;
-       }
-}
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_dump.c b/src/gallium/drivers/freedreno/a3xx/ir3_dump.c
deleted file mode 100644 (file)
index 1a6f49d..0000000
+++ /dev/null
@@ -1,425 +0,0 @@
-/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
-
-/*
- * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#include <stdarg.h>
-
-#include "ir3.h"
-
-#define PTRID(x) ((unsigned long)(x))
-
-struct ir3_dump_ctx {
-       FILE *f;
-       bool verbose;
-};
-
-static void dump_instr_name(struct ir3_dump_ctx *ctx,
-               struct ir3_instruction *instr)
-{
-       /* for debugging: */
-       if (ctx->verbose) {
-#ifdef DEBUG
-               fprintf(ctx->f, "%04u:", instr->serialno);
-#endif
-               fprintf(ctx->f, "%03u: ", instr->depth);
-       }
-
-       if (instr->flags & IR3_INSTR_SY)
-               fprintf(ctx->f, "(sy)");
-       if (instr->flags & IR3_INSTR_SS)
-               fprintf(ctx->f, "(ss)");
-
-       if (is_meta(instr)) {
-               switch(instr->opc) {
-               case OPC_META_PHI:
-                       fprintf(ctx->f, "&#934;");
-                       break;
-               case OPC_META_DEREF:
-                       fprintf(ctx->f, "(*)");
-                       break;
-               default:
-                       /* shouldn't hit here.. just for debugging: */
-                       switch (instr->opc) {
-                       case OPC_META_INPUT:  fprintf(ctx->f, "_meta:in");   break;
-                       case OPC_META_OUTPUT: fprintf(ctx->f, "_meta:out");  break;
-                       case OPC_META_FO:     fprintf(ctx->f, "_meta:fo");   break;
-                       case OPC_META_FI:     fprintf(ctx->f, "_meta:fi");   break;
-                       case OPC_META_FLOW:   fprintf(ctx->f, "_meta:flow"); break;
-
-                       default: fprintf(ctx->f, "_meta:%d", instr->opc); break;
-                       }
-                       break;
-               }
-       } else if (instr->category == 1) {
-               static const char *type[] = {
-                               [TYPE_F16] = "f16",
-                               [TYPE_F32] = "f32",
-                               [TYPE_U16] = "u16",
-                               [TYPE_U32] = "u32",
-                               [TYPE_S16] = "s16",
-                               [TYPE_S32] = "s32",
-                               [TYPE_U8]  = "u8",
-                               [TYPE_S8]  = "s8",
-               };
-               if (instr->cat1.src_type == instr->cat1.dst_type)
-                       fprintf(ctx->f, "mov");
-               else
-                       fprintf(ctx->f, "cov");
-               fprintf(ctx->f, ".%s%s", type[instr->cat1.src_type], type[instr->cat1.dst_type]);
-       } else {
-               fprintf(ctx->f, "%s", ir3_instr_name(instr));
-               if (instr->flags & IR3_INSTR_3D)
-                       fprintf(ctx->f, ".3d");
-               if (instr->flags & IR3_INSTR_A)
-                       fprintf(ctx->f, ".a");
-               if (instr->flags & IR3_INSTR_O)
-                       fprintf(ctx->f, ".o");
-               if (instr->flags & IR3_INSTR_P)
-                       fprintf(ctx->f, ".p");
-               if (instr->flags & IR3_INSTR_S)
-                       fprintf(ctx->f, ".s");
-               if (instr->flags & IR3_INSTR_S2EN)
-                       fprintf(ctx->f, ".s2en");
-       }
-}
-
-static void dump_reg_name(struct ir3_dump_ctx *ctx,
-               struct ir3_register *reg)
-{
-       if ((reg->flags & IR3_REG_ABS) && (reg->flags & IR3_REG_NEGATE))
-               fprintf(ctx->f, "(absneg)");
-       else if (reg->flags & IR3_REG_NEGATE)
-               fprintf(ctx->f, "(neg)");
-       else if (reg->flags & IR3_REG_ABS)
-               fprintf(ctx->f, "(abs)");
-
-       if (reg->flags & IR3_REG_IMMED) {
-               fprintf(ctx->f, "imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val);
-       } else if (reg->flags & IR3_REG_SSA) {
-               if (ctx->verbose) {
-                       fprintf(ctx->f, "_[");
-                       dump_instr_name(ctx, reg->instr);
-                       fprintf(ctx->f, "]");
-               }
-       } else {
-               if (reg->flags & IR3_REG_HALF)
-                       fprintf(ctx->f, "h");
-               if (reg->flags & IR3_REG_CONST)
-                       fprintf(ctx->f, "c%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]);
-               else
-                       fprintf(ctx->f, "r%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]);
-       }
-}
-
-static void ir3_instr_dump(struct ir3_dump_ctx *ctx,
-               struct ir3_instruction *instr);
-static void ir3_block_dump(struct ir3_dump_ctx *ctx,
-               struct ir3_block *block, const char *name);
-
-static void dump_instr(struct ir3_dump_ctx *ctx,
-               struct ir3_instruction *instr)
-{
-       /* if we've already visited this instruction, bail now: */
-       if (ir3_instr_check_mark(instr))
-               return;
-
-       /* some meta-instructions need to be handled specially: */
-       if (is_meta(instr)) {
-               if ((instr->opc == OPC_META_FO) ||
-                               (instr->opc == OPC_META_FI)) {
-                       unsigned i;
-                       for (i = 1; i < instr->regs_count; i++) {
-                               struct ir3_register *reg = instr->regs[i];
-                               if (reg->flags & IR3_REG_SSA)
-                                       dump_instr(ctx, reg->instr);
-                       }
-               } else if (instr->opc == OPC_META_FLOW) {
-                       struct ir3_register *reg = instr->regs[1];
-                       ir3_block_dump(ctx, instr->flow.if_block, "if");
-                       if (instr->flow.else_block)
-                               ir3_block_dump(ctx, instr->flow.else_block, "else");
-                       if (reg->flags & IR3_REG_SSA)
-                               dump_instr(ctx, reg->instr);
-               } else if ((instr->opc == OPC_META_PHI) ||
-                               (instr->opc == OPC_META_DEREF)) {
-                       /* treat like a normal instruction: */
-                       ir3_instr_dump(ctx, instr);
-               }
-       } else {
-               ir3_instr_dump(ctx, instr);
-       }
-}
-
-/* arrarraggh!  if link is to something outside of the current block, we
- * need to defer emitting the link until the end of the block, since the
- * edge triggers pre-creation of the node it links to inside the cluster,
- * even though it is meant to be outside..
- */
-static struct {
-       char buf[40960];
-       unsigned n;
-} edge_buf;
-
-/* helper to print or defer: */
-static void printdef(struct ir3_dump_ctx *ctx,
-               bool defer, const char *fmt, ...)
-{
-       va_list ap;
-       va_start(ap, fmt);
-       if (defer) {
-               unsigned n = edge_buf.n;
-               n += vsnprintf(&edge_buf.buf[n], sizeof(edge_buf.buf) - n,
-                               fmt, ap);
-               edge_buf.n = n;
-       } else {
-               vfprintf(ctx->f, fmt, ap);
-       }
-       va_end(ap);
-}
-
-static void dump_link2(struct ir3_dump_ctx *ctx,
-               struct ir3_instruction *instr, const char *target, bool defer)
-{
-       /* some meta-instructions need to be handled specially: */
-       if (is_meta(instr)) {
-               if (instr->opc == OPC_META_INPUT) {
-                       printdef(ctx, defer, "input%lx:<in%u>:w -> %s",
-                                       PTRID(instr->inout.block),
-                                       instr->regs[0]->num, target);
-               } else if (instr->opc == OPC_META_FO) {
-                       struct ir3_register *reg = instr->regs[1];
-                       dump_link2(ctx, reg->instr, target, defer);
-                       printdef(ctx, defer, "[label=\".%c\"]",
-                                       "xyzw"[instr->fo.off & 0x3]);
-               } else if (instr->opc == OPC_META_FI) {
-                       unsigned i;
-
-                       /* recursively dump all parents and links */
-                       for (i = 1; i < instr->regs_count; i++) {
-                               struct ir3_register *reg = instr->regs[i];
-                               if (reg->flags & IR3_REG_SSA) {
-                                       dump_link2(ctx, reg->instr, target, defer);
-                                       printdef(ctx, defer, "[label=\".%c\"]",
-                                                       "xyzw"[(i - 1) & 0x3]);
-                               }
-                       }
-               } else if (instr->opc == OPC_META_OUTPUT) {
-                       printdef(ctx, defer, "output%lx:<out%u>:w -> %s",
-                                       PTRID(instr->inout.block),
-                                       instr->regs[0]->num, target);
-               } else if ((instr->opc == OPC_META_PHI) ||
-                               (instr->opc == OPC_META_DEREF)) {
-                       /* treat like a normal instruction: */
-                       printdef(ctx, defer, "instr%lx:<dst0> -> %s", PTRID(instr), target);
-               }
-       } else {
-               printdef(ctx, defer, "instr%lx:<dst0> -> %s", PTRID(instr), target);
-       }
-}
-
-static void dump_link(struct ir3_dump_ctx *ctx,
-               struct ir3_instruction *instr,
-               struct ir3_block *block, const char *target)
-{
-       bool defer = instr->block != block;
-       dump_link2(ctx, instr, target, defer);
-       printdef(ctx, defer, "\n");
-}
-
-static struct ir3_register *follow_flow(struct ir3_register *reg)
-{
-       if (reg->flags & IR3_REG_SSA) {
-               struct ir3_instruction *instr = reg->instr;
-               /* go with the flow.. */
-               if (is_meta(instr) && (instr->opc == OPC_META_FLOW))
-                       return instr->regs[1];
-       }
-       return reg;
-}
-
-static void ir3_instr_dump(struct ir3_dump_ctx *ctx,
-               struct ir3_instruction *instr)
-{
-       unsigned i;
-
-       fprintf(ctx->f, "instr%lx [shape=record,style=filled,fillcolor=lightgrey,label=\"{",
-                       PTRID(instr));
-       dump_instr_name(ctx, instr);
-
-       /* destination register: */
-       fprintf(ctx->f, "|<dst0>");
-
-       /* source register(s): */
-       for (i = 1; i < instr->regs_count; i++) {
-               struct ir3_register *reg = follow_flow(instr->regs[i]);
-
-               fprintf(ctx->f, "|");
-
-               if (reg->flags & IR3_REG_SSA)
-                       fprintf(ctx->f, "<src%u> ", (i - 1));
-
-               dump_reg_name(ctx, reg);
-       }
-
-       fprintf(ctx->f, "}\"];\n");
-
-       /* and recursively dump dependent instructions: */
-       for (i = 1; i < instr->regs_count; i++) {
-               struct ir3_register *reg = instr->regs[i];
-               char target[32];  /* link target */
-
-               if (!(reg->flags & IR3_REG_SSA))
-                       continue;
-
-               snprintf(target, sizeof(target), "instr%lx:<src%u>",
-                               PTRID(instr), (i - 1));
-
-               dump_instr(ctx, reg->instr);
-               dump_link(ctx, follow_flow(reg)->instr, instr->block, target);
-       }
-}
-
-static void ir3_block_dump(struct ir3_dump_ctx *ctx,
-               struct ir3_block *block, const char *name)
-{
-       unsigned i, n;
-
-       n = edge_buf.n;
-
-       fprintf(ctx->f, "subgraph cluster%lx {\n", PTRID(block));
-       fprintf(ctx->f, "label=\"%s\";\n", name);
-
-       /* draw inputs: */
-       fprintf(ctx->f, "input%lx [shape=record,label=\"inputs", PTRID(block));
-       for (i = 0; i < block->ninputs; i++)
-               if (block->inputs[i])
-                       fprintf(ctx->f, "|<in%u> i%u.%c", i, (i >> 2), "xyzw"[i & 0x3]);
-       fprintf(ctx->f, "\"];\n");
-
-       /* draw instruction graph: */
-       for (i = 0; i < block->noutputs; i++)
-               dump_instr(ctx, block->outputs[i]);
-
-       /* draw outputs: */
-       fprintf(ctx->f, "output%lx [shape=record,label=\"outputs", PTRID(block));
-       for (i = 0; i < block->noutputs; i++)
-               fprintf(ctx->f, "|<out%u> o%u.%c", i, (i >> 2), "xyzw"[i & 0x3]);
-       fprintf(ctx->f, "\"];\n");
-
-       /* and links to outputs: */
-       for (i = 0; i < block->noutputs; i++) {
-               char target[32];  /* link target */
-
-               /* NOTE: there could be outputs that are never assigned,
-                * so skip them
-                */
-               if (!block->outputs[i])
-                       continue;
-
-               snprintf(target, sizeof(target), "output%lx:<out%u>:e",
-                               PTRID(block), i);
-
-               dump_link(ctx, block->outputs[i], block, target);
-       }
-
-       fprintf(ctx->f, "}\n");
-
-       /* and links to inputs: */
-       if (block->parent) {
-               for (i = 0; i < block->ninputs; i++) {
-                       char target[32];  /* link target */
-
-                       if (!block->inputs[i])
-                               continue;
-
-                       dump_instr(ctx, block->inputs[i]);
-
-                       snprintf(target, sizeof(target), "input%lx:<in%u>:e",
-                                       PTRID(block), i);
-
-                       dump_link(ctx, block->inputs[i], block, target);
-               }
-       }
-
-       /* dump deferred edges: */
-       if (edge_buf.n > n) {
-               fprintf(ctx->f, "%*s", edge_buf.n - n, &edge_buf.buf[n]);
-               edge_buf.n = n;
-       }
-}
-
-void ir3_dump(struct ir3 *shader, const char *name,
-               struct ir3_block *block /* XXX maybe 'block' ptr should move to ir3? */,
-               FILE *f)
-{
-       struct ir3_dump_ctx ctx = {
-                       .f = f,
-       };
-       ir3_clear_mark(shader);
-       fprintf(ctx.f, "digraph G {\n");
-       fprintf(ctx.f, "rankdir=RL;\n");
-       fprintf(ctx.f, "nodesep=0.25;\n");
-       fprintf(ctx.f, "ranksep=1.5;\n");
-       ir3_block_dump(&ctx, block, name);
-       fprintf(ctx.f, "}\n");
-}
-
-/*
- * For Debugging:
- */
-
-void
-ir3_dump_instr_single(struct ir3_instruction *instr)
-{
-       struct ir3_dump_ctx ctx = {
-                       .f = stdout,
-                       .verbose = true,
-       };
-       unsigned i;
-
-       dump_instr_name(&ctx, instr);
-       for (i = 0; i < instr->regs_count; i++) {
-               struct ir3_register *reg = instr->regs[i];
-               printf(i ? ", " : " ");
-               dump_reg_name(&ctx, reg);
-       }
-       printf("\n");
-}
-
-void
-ir3_dump_instr_list(struct ir3_instruction *instr)
-{
-       unsigned n = 0;
-
-       while (instr) {
-               ir3_dump_instr_single(instr);
-               if (!is_meta(instr))
-                       n++;
-               instr = instr->next;
-       }
-       printf("%u instructions\n", n);
-}
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_flatten.c b/src/gallium/drivers/freedreno/a3xx/ir3_flatten.c
deleted file mode 100644 (file)
index 9389227..0000000
+++ /dev/null
@@ -1,155 +0,0 @@
-/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
-
-/*
- * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#include <stdarg.h>
-
-#include "ir3.h"
-
-/*
- * Flatten: flatten out legs of if/else, etc
- *
- * TODO probably should use some heuristic to decide to not flatten
- * if one side of the other is too large / deeply nested / whatever?
- */
-
-struct ir3_flatten_ctx {
-       struct ir3_block *block;
-       unsigned cnt;
-};
-
-static struct ir3_register *unwrap(struct ir3_register *reg)
-{
-
-       if (reg->flags & IR3_REG_SSA) {
-               struct ir3_instruction *instr = reg->instr;
-               if (is_meta(instr)) {
-                       switch (instr->opc) {
-                       case OPC_META_OUTPUT:
-                       case OPC_META_FLOW:
-                               if (instr->regs_count > 1)
-                                       return instr->regs[1];
-                               return NULL;
-                       default:
-                               break;
-                       }
-               }
-       }
-       return reg;
-}
-
-static void ir3_instr_flatten(struct ir3_flatten_ctx *ctx,
-               struct ir3_instruction *instr)
-{
-       unsigned i;
-
-       /* if we've already visited this instruction, bail now: */
-       if (ir3_instr_check_mark(instr))
-               return;
-
-       instr->block = ctx->block;
-
-       /* TODO: maybe some threshold to decide whether to
-        * flatten or not??
-        */
-       if (is_meta(instr)) {
-               if (instr->opc == OPC_META_PHI) {
-                       struct ir3_register *cond, *t, *f;
-
-                       cond = unwrap(instr->regs[1]);
-                       t    = unwrap(instr->regs[2]);  /* true val */
-                       f    = unwrap(instr->regs[3]);  /* false val */
-
-                       /* must have cond, but t or f may be null if only written
-                        * one one side of the if/else (in which case we can just
-                        * convert the PHI to a simple move).
-                        */
-                       assert(cond);
-                       assert(t || f);
-
-                       if (t && f) {
-                               /* convert the PHI instruction to sel.{b16,b32} */
-                               instr->category = 3;
-
-                               /* instruction type based on dst size: */
-                               if (instr->regs[0]->flags & IR3_REG_HALF)
-                                       instr->opc = OPC_SEL_B16;
-                               else
-                                       instr->opc = OPC_SEL_B32;
-
-                               instr->regs[1] = t;
-                               instr->regs[2] = cond;
-                               instr->regs[3] = f;
-                       } else {
-                               /* convert to simple mov: */
-                               instr->category = 1;
-                               instr->cat1.dst_type = TYPE_F32;
-                               instr->cat1.src_type = TYPE_F32;
-                               instr->regs_count = 2;
-                               instr->regs[1] = t ? t : f;
-                       }
-
-                       ctx->cnt++;
-               } else if ((instr->opc == OPC_META_INPUT) &&
-                               (instr->regs_count == 2)) {
-                       type_t ftype;
-
-                       if (instr->regs[0]->flags & IR3_REG_HALF)
-                               ftype = TYPE_F16;
-                       else
-                               ftype = TYPE_F32;
-
-                       /* convert meta:input to mov: */
-                       instr->category = 1;
-                       instr->cat1.src_type = ftype;
-                       instr->cat1.dst_type = ftype;
-               }
-       }
-
-       /* recursively visit children: */
-       for (i = 1; i < instr->regs_count; i++) {
-               struct ir3_register *src = instr->regs[i];
-               if (src->flags & IR3_REG_SSA)
-                       ir3_instr_flatten(ctx, src->instr);
-       }
-}
-
-/* return >= 0 is # of phi's flattened, < 0 is error */
-int ir3_block_flatten(struct ir3_block *block)
-{
-       struct ir3_flatten_ctx ctx = {
-                       .block = block,
-       };
-       unsigned i;
-
-       ir3_clear_mark(block->shader);
-       for(i = 0; i < block->noutputs; i++)
-               if (block->outputs[i])
-                       ir3_instr_flatten(&ctx, block->outputs[i]);
-
-       return ctx.cnt;
-}
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c
deleted file mode 100644 (file)
index b916dd5..0000000
+++ /dev/null
@@ -1,790 +0,0 @@
-/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
-
-/*
- * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#include "pipe/p_shader_tokens.h"
-#include "util/u_math.h"
-
-#include "ir3.h"
-#include "ir3_visitor.h"
-
-/*
- * Register Assignment:
- *
- * NOTE: currently only works on a single basic block.. need to think
- * about how multiple basic blocks are going to get scheduled.  But
- * I think I want to re-arrange how blocks work, ie. get rid of the
- * block nesting thing..
- *
- * NOTE: we could do register coalescing (eliminate moves) as part of
- * the RA step.. OTOH I think we need to do scheduling before register
- * assignment.  And if we remove a mov that effects scheduling (unless
- * we leave a placeholder nop, which seems lame), so I'm not really
- * sure how practical this is to do both in a single stage.  But OTOH
- * I'm not really sure a sane way for the CP stage to realize when it
- * cannot remove a mov due to multi-register constraints..
- *
- */
-
-struct ir3_ra_ctx {
-       struct ir3_block *block;
-       enum shader_t type;
-       bool half_precision;
-       bool frag_coord;
-       bool frag_face;
-       bool has_samp;
-       int cnt;
-       bool error;
-};
-
-/* sorta ugly way to retrofit half-precision support.. rather than
- * passing extra param around, just OR in a high bit.  All the low
- * value arithmetic (ie. +/- offset within a contiguous vec4, etc)
- * will continue to work as long as you don't underflow (and that
- * would go badly anyways).
- */
-#define REG_HALF  0x8000
-
-struct ir3_ra_assignment {
-       int8_t  off;        /* offset of instruction dst within range */
-       uint8_t num;        /* number of components for the range */
-};
-
-static void ra_assign(struct ir3_ra_ctx *ctx,
-               struct ir3_instruction *assigner, int num);
-static struct ir3_ra_assignment ra_calc(struct ir3_instruction *instr);
-
-/*
- * Register Allocation:
- */
-
-#define REG(n, wm, f) (struct ir3_register){ \
-               .flags  = (f), \
-               .num    = (n), \
-               .wrmask = TGSI_WRITEMASK_ ## wm, \
-       }
-
-/* check that the register exists, is a GPR and is not special (a0/p0) */
-static struct ir3_register * reg_check(struct ir3_instruction *instr, unsigned n)
-{
-       if ((n < instr->regs_count) && reg_gpr(instr->regs[n]))
-               return instr->regs[n];
-       return NULL;
-}
-
-static int output_base(struct ir3_ra_ctx *ctx)
-{
-       /* ugg, for fragment shader we need to have input at r0.x
-        * (or at least if there is a way to configure it, I can't
-        * see how because the blob driver always uses r0.x (ie.
-        * all zeros)
-        */
-       if (ctx->type == SHADER_FRAGMENT) {
-               if (ctx->half_precision)
-                       return ctx->frag_face ? 4 : 3;
-               return ctx->frag_coord ? 8 : 4;
-       }
-       return 0;
-}
-
-/* live means read before written */
-static void compute_liveregs(struct ir3_ra_ctx *ctx,
-               struct ir3_instruction *instr, regmask_t *liveregs)
-{
-       struct ir3_block *block = instr->block;
-       regmask_t written;
-       unsigned i, j;
-
-       regmask_init(liveregs);
-       regmask_init(&written);
-
-       for (instr = instr->next; instr; instr = instr->next) {
-               struct ir3_register *r;
-
-               if (is_meta(instr))
-                       continue;
-
-               /* check first src's read: */
-               for (j = 1; j < instr->regs_count; j++) {
-                       r = reg_check(instr, j);
-                       if (r)
-                               regmask_set_if_not(liveregs, r, &written);
-               }
-
-               /* then dst written (if assigned already): */
-               if (instr->flags & IR3_INSTR_MARK) {
-                       r = reg_check(instr, 0);
-                       if (r)
-                               regmask_set(&written, r);
-               }
-       }
-
-       /* be sure to account for output registers too: */
-       for (i = 0; i < block->noutputs; i++) {
-               struct ir3_register reg = REG(output_base(ctx) + i, X, 0);
-               regmask_set_if_not(liveregs, &reg, &written);
-       }
-}
-
-/* calculate registers that are clobbered before last use of 'assigner'.
- * This needs to be done backwards, although it could possibly be
- * combined into compute_liveregs().  (Ie. compute_liveregs() could
- * reverse the list, then do this part backwards reversing the list
- * again back to original order.)  Otoh, probably I should try to
- * construct a proper interference graph instead.
- *
- * XXX this need to follow the same recursion path that is used for
- * to rename/assign registers (ie. ra_assign_src()).. this is a bit
- * ugly right now, maybe refactor into node iterator sort of things
- * that iterates nodes in the correct order?
- */
-static bool compute_clobbers(struct ir3_ra_ctx *ctx,
-               struct ir3_instruction *instr, struct ir3_instruction *assigner,
-               regmask_t *liveregs)
-{
-       unsigned i;
-       bool live = false, was_live = false;
-
-       if (instr == NULL) {
-               struct ir3_block *block = ctx->block;
-
-               /* if at the end, check outputs: */
-               for (i = 0; i < block->noutputs; i++)
-                       if (block->outputs[i] == assigner)
-                               return true;
-               return false;
-       }
-
-       for (i = 1; i < instr->regs_count; i++) {
-               struct ir3_register *reg = instr->regs[i];
-               if ((reg->flags & IR3_REG_SSA) && (reg->instr == assigner)) {
-                       if (is_meta(instr)) {
-                               switch (instr->opc) {
-                               case OPC_META_INPUT:
-                                       // TODO
-                                       assert(0);
-                                       break;
-                               case OPC_META_FO:
-                               case OPC_META_FI:
-                                       was_live |= compute_clobbers(ctx, instr->next,
-                                                       instr, liveregs);
-                                       break;
-                               default:
-                                       break;
-                               }
-                       }
-                       live = true;
-                       break;
-               }
-       }
-
-       was_live |= compute_clobbers(ctx, instr->next, assigner, liveregs);
-
-       if (was_live && (instr->regs_count > 0) &&
-                       (instr->flags & IR3_INSTR_MARK) &&
-                       !is_meta(instr))
-               regmask_set(liveregs, instr->regs[0]);
-
-       return live || was_live;
-}
-
-static int find_available(regmask_t *liveregs, int size, bool half)
-{
-       unsigned i;
-       unsigned f = half ? IR3_REG_HALF : 0;
-       for (i = 0; i < MAX_REG - size; i++) {
-               if (!regmask_get(liveregs, &REG(i, X, f))) {
-                       unsigned start = i++;
-                       for (; (i < MAX_REG) && ((i - start) < size); i++)
-                               if (regmask_get(liveregs, &REG(i, X, f)))
-                                       break;
-                       if ((i - start) >= size)
-                               return start;
-               }
-       }
-       assert(0);
-       return -1;
-}
-
-/* Pick the first register of a block of 'size' registers for the value
- * written by 'instr'.  The set of registers that must be avoided is
- * built from compute_liveregs() plus compute_clobbers().
- *
- * With instr == NULL (shader outputs) nothing has been allocated yet
- * and the block simply starts at register 0.
- */
-static int alloc_block(struct ir3_ra_ctx *ctx,
-               struct ir3_instruction *instr, int size)
-{
-       if (!instr) {
-               /* special case, allocating shader outputs.  At this
-                * point, nothing is allocated, just start the shader
-                * outputs at r0.x and let compute_liveregs() take
-                * care of the rest from here:
-                */
-               return 0;
-       } else {
-               struct ir3_register *dst = instr->regs[0];
-               regmask_t liveregs;
-
-               compute_liveregs(ctx, instr, &liveregs);
-
-               // XXX XXX XXX XXX XXX XXX XXX XXX XXX
-               // XXX hack.. maybe ra_calc should give us a list of
-               // instrs to compute_clobbers() on?
-               if (is_meta(instr) && (instr->opc == OPC_META_INPUT) &&
-                               (instr->regs_count == 1)) {
-                       unsigned i, base = instr->regs[0]->num & ~0x3;
-                       for (i = 0; i < 4; i++) {
-                               struct ir3_instruction *in = NULL;
-                               /* same bounds check as ra_calc_dst_shader_input(),
-                                * the group of four may extend past the inputs:
-                                */
-                               if ((base + i) < ctx->block->ninputs)
-                                       in = ctx->block->inputs[base + i];
-                               if (in)
-                                       compute_clobbers(ctx, in->next, in, &liveregs);
-                       }
-               } else {
-                       compute_clobbers(ctx, instr->next, instr, &liveregs);
-               }
-               // XXX XXX XXX XXX XXX XXX XXX XXX XXX
-
-               return find_available(&liveregs, size,
-                               !!(dst->flags & IR3_REG_HALF));
-       }
-}
-
-/*
- * Constraint Calculation:
- */
-
-struct ra_calc_visitor {
-       struct ir3_visitor base;      /* first member, ra_calc_visitor() casts back from it */
-       struct ir3_ra_assignment a;   /* off/num filled in by the ra_calc_*() callbacks */
-};
-
-/* downcast from the embedded base visitor to the containing ra_calc_visitor */
-static inline struct ra_calc_visitor *ra_calc_visitor(struct ir3_visitor *v)
-{
-       return (struct ra_calc_visitor *)v;
-}
-
-/* Calculate the register assignment constraint for the instruction.  If
- * the register written by this instruction is required to be part of a
- * contiguous range (to satisfy input/output/sam/bary.f/etc constraints),
- * the size of that range is calculated here: is_tex() instructions get
- * a block of four registers, everything else a single one.
- */
-static void ra_calc_dst(struct ir3_visitor *v,
-               struct ir3_instruction *instr, struct ir3_register *reg)
-{
-       struct ra_calc_visitor *calc = ra_calc_visitor(v);
-
-       calc->a.num = is_tex(instr) ? 4 : 1;
-       calc->a.off = 0;
-}
-
-/* Constraint for a shader input: inputs are handled in groups of four
- * (base = dst->num with the low two bits cleared).  The block size is
- * the position of the highest populated input slot within the group,
- * and the offset is this input's position within the group.
- */
-static void
-ra_calc_dst_shader_input(struct ir3_visitor *v,
-               struct ir3_instruction *instr, struct ir3_register *reg)
-{
-       struct ra_calc_visitor *calc = ra_calc_visitor(v);
-       struct ir3_block *blk = instr->block;
-       struct ir3_register *dst = instr->regs[0];
-       unsigned first = dst->num & ~0x3;
-       unsigned n, count = 0;
-
-       assert(!(dst->flags & IR3_REG_IA));
-
-       /* scan down from the last slot of the group, the first populated
-        * input we hit determines how many components are needed:
-        */
-       for (n = 4; n > 0; n--) {
-               unsigned idx = first + n - 1;
-               if ((idx < blk->ninputs) && blk->inputs[idx]) {
-                       count = n;
-                       break;
-               }
-       }
-
-       calc->a.off = dst->num - first;
-       calc->a.num = count;
-}
-
-/* The value being calculated is consumed as a source of 'instr' (installed
- * in the src_fanin visitor slot).  'srcn' is the zero-based source index
- * of 'reg' within 'instr'; both the offset and the size of the block are
- * grown by it, and the block is made at least as large as the number of
- * sources of 'instr'.
- */
-static void ra_calc_src_fanin(struct ir3_visitor *v,
-               struct ir3_instruction *instr, struct ir3_register *reg)
-{
-       struct ra_calc_visitor *c = ra_calc_visitor(v);
-       unsigned srcn = ir3_instr_regno(instr, reg) - 1;   /* regs[0] is the dst */
-       c->a.off += srcn;
-       c->a.num += srcn;
-       c->a.num = MAX2(c->a.num, instr->regs_count - 1);
-}
-
-/* visitor callbacks used by ra_calc() to work out the size of, and the
- * offset within, the register block needed for an instruction's result:
- */
-static const struct ir3_visitor_funcs calc_visitor_funcs = {
-               .instr = ir3_visit_instr,
-               .dst_shader_input = ra_calc_dst_shader_input,
-               .dst_fanout = ra_calc_dst,
-               .dst_fanin = ra_calc_dst,
-               .dst = ra_calc_dst,
-               .src_fanout = ir3_visit_reg,
-               .src_fanin = ra_calc_src_fanin,
-               .src = ir3_visit_reg,
-};
-
-/* Run the calc visitor over 'assigner' and return the resulting
- * assignment (block size in .num, position within the block in .off).
- * The result starts out zeroed, since only .base.funcs is initialized
- * explicitly.
- */
-static struct ir3_ra_assignment ra_calc(struct ir3_instruction *assigner)
-{
-       struct ra_calc_visitor v = {
-                       .base.funcs = &calc_visitor_funcs,
-       };
-
-       ir3_visit_instr(&v.base, assigner);
-
-       return v.a;
-}
-
-/*
- * Register Assignment:
- */
-
-struct ra_assign_visitor {
-       struct ir3_visitor base;   /* first member, ra_assign_visitor() casts back from it */
-       struct ir3_ra_ctx *ctx;    /* passed on to nested ra_assign() calls */
-       int num;                   /* register number to assign, may have REG_HALF or'd in */
-};
-
-/* downcast from the embedded base visitor to the containing ra_assign_visitor */
-static inline struct ra_assign_visitor *ra_assign_visitor(struct ir3_visitor *v)
-{
-       return (struct ra_assign_visitor *)v;
-}
-
-/* Map a 32 bit type to its 16 bit counterpart.  16 bit types are
- * returned unchanged, since the instruction may already have been
- * fixed up.  Any other type asserts (and yields ~0 without asserts).
- */
-static type_t half_type(type_t type)
-{
-       if (type == TYPE_F32)
-               return TYPE_F16;
-       if (type == TYPE_U32)
-               return TYPE_U16;
-       if (type == TYPE_S32)
-               return TYPE_S16;
-
-       /* already a half type, nothing to convert: */
-       if ((type == TYPE_F16) || (type == TYPE_U16) || (type == TYPE_S16))
-               return type;
-
-       assert(0);
-       return ~0;
-}
-
-/* Patch up an instruction whose dst register turned out to be half
- * precision: cat1 and cat5 carry an explicit type that is narrowed,
- * cat3 has separate 16 bit opcodes.  Other categories are left alone.
- */
-static void fixup_half_instr_dst(struct ir3_instruction *instr)
-{
-       if (instr->category == 1) {
-               /* move instructions */
-               instr->cat1.dst_type = half_type(instr->cat1.dst_type);
-       } else if (instr->category == 3) {
-               switch (instr->opc) {
-               /* swap the 32 bit opcode for its 16 bit variant: */
-               case OPC_MAD_F32: instr->opc = OPC_MAD_F16; break;
-               case OPC_SEL_B32: instr->opc = OPC_SEL_B16; break;
-               case OPC_SEL_S32: instr->opc = OPC_SEL_S16; break;
-               case OPC_SEL_F32: instr->opc = OPC_SEL_F16; break;
-               case OPC_SAD_S32: instr->opc = OPC_SAD_S16; break;
-               /* instructions may already be fixed up: */
-               case OPC_MAD_F16:
-               case OPC_SEL_B16:
-               case OPC_SEL_S16:
-               case OPC_SEL_F16:
-               case OPC_SAD_S16:
-                       break;
-               default:
-                       assert(0);
-                       break;
-               }
-       } else if (instr->category == 5) {
-               instr->cat5.type = half_type(instr->cat5.type);
-       }
-}
-/* Patch up an instruction that reads a half precision src register.
- * Only cat1 (move) instructions carry a src type that needs narrowing.
- */
-static void fixup_half_instr_src(struct ir3_instruction *instr)
-{
-       if (instr->category == 1)
-               instr->cat1.src_type = half_type(instr->cat1.src_type);
-}
-
-/* Write the register number carried by the visitor into 'reg', which
- * stops being an SSA reference.  When the number has REG_HALF set the
- * register is flagged as half precision and the instruction is patched
- * up to match.  kill instructions are left untouched.
- */
-static void ra_assign_reg(struct ir3_visitor *v,
-               struct ir3_instruction *instr, struct ir3_register *reg)
-{
-       struct ra_assign_visitor *a = ra_assign_visitor(v);
-       bool half;
-
-       if (is_flow(instr) && (instr->opc == OPC_KILL))
-               return;
-
-       half = !!(a->num & REG_HALF);
-
-       reg->flags &= ~IR3_REG_SSA;
-       reg->num = a->num & ~REG_HALF;
-
-       assert(reg->num >= 0);
-
-       if (!half)
-               return;
-
-       reg->flags |= IR3_REG_HALF;
-
-       /* dst and src registers need different instruction fix-ups: */
-       if (reg == instr->regs[0])
-               fixup_half_instr_dst(instr);
-       else
-               fixup_half_instr_src(instr);
-}
-
-/* Assign the register of a shader input and flag it with IR3_REG_IA,
- * then assign the other inputs of the same group of four, so that the
- * whole group lands in consecutive registers.
- */
-static void ra_assign_dst_shader_input(struct ir3_visitor *v,
-               struct ir3_instruction *instr, struct ir3_register *reg)
-{
-       struct ra_assign_visitor *a = ra_assign_visitor(v);
-       struct ir3_block *block = instr->block;
-       /* NOTE: computed before ra_assign_reg() overwrites reg->num: */
-       unsigned i, base = reg->num & ~0x3;
-       int off = base - reg->num;
-
-       ra_assign_reg(v, instr, reg);
-       reg->flags |= IR3_REG_IA;
-
-       /* trigger assignment of all our companion input components: */
-       for (i = 0; i < 4; i++) {
-               struct ir3_instruction *in;
-
-               /* same bounds check as ra_calc_dst_shader_input(), the
-                * group of four may extend past the end of the inputs:
-                */
-               if ((base + i) >= block->ninputs)
-                       break;
-
-               in = block->inputs[base + i];
-               if (in && is_meta(in) && (in->opc == OPC_META_INPUT))
-                       ra_assign(a->ctx, in, a->num + off + i);
-       }
-}
-
-/* Assign the dst of a fanout, and propagate back to the instruction
- * producing its source (if that is still an SSA reference), which is
- * placed fo.off registers lower.
- */
-static void ra_assign_dst_fanout(struct ir3_visitor *v,
-               struct ir3_instruction *instr, struct ir3_register *reg)
-{
-       struct ra_assign_visitor *a = ra_assign_visitor(v);
-       struct ir3_register *from = instr->regs[1];
-
-       ra_assign_reg(v, instr, reg);
-
-       if (!(from->flags & IR3_REG_SSA))
-               return;
-
-       ra_assign(a->ctx, from->instr, a->num - instr->fo.off);
-}
-
-/* Assign the src of a fanout, then assign the fanout instruction
- * itself the register fo.off above it.
- */
-static void ra_assign_src_fanout(struct ir3_visitor *v,
-               struct ir3_instruction *instr, struct ir3_register *reg)
-{
-       struct ra_assign_visitor *a = ra_assign_visitor(v);
-       ra_assign_reg(v, instr, reg);
-       ra_assign(a->ctx, instr, a->num + instr->fo.off);
-}
-
-
-/* Assign one src of a fanin.  The fanin instruction itself gets the
- * register of its first source, and every source that is still an SSA
- * reference gets the consecutive register matching its position.
- */
-static void ra_assign_src_fanin(struct ir3_visitor *v,
-               struct ir3_instruction *instr, struct ir3_register *reg)
-{
-       struct ra_assign_visitor *a = ra_assign_visitor(v);
-       unsigned srcn = ir3_instr_regno(instr, reg) - 1;
-       unsigned first = a->num - srcn;   /* register of the first source */
-       unsigned idx;
-
-       ra_assign_reg(v, instr, reg);
-       ra_assign(a->ctx, instr, first);
-
-       for (idx = 1; idx < instr->regs_count; idx++) {
-               struct ir3_register *src = instr->regs[idx];
-
-               /* could be renamed already */
-               if (!(src->flags & IR3_REG_SSA))
-                       continue;
-
-               ra_assign(a->ctx, src->instr, first + idx - 1);
-       }
-}
-
-/* visitor callbacks used by ra_assign() to write the chosen register
- * number into the visited registers and propagate it through
- * input/fanout/fanin relationships:
- */
-static const struct ir3_visitor_funcs assign_visitor_funcs = {
-               .instr = ir3_visit_instr,
-               .dst_shader_input = ra_assign_dst_shader_input,
-               .dst_fanout = ra_assign_dst_fanout,
-               .dst_fanin = ra_assign_reg,
-               .dst = ra_assign_reg,
-               .src_fanout = ra_assign_src_fanout,
-               .src_fanin = ra_assign_src_fanin,
-               .src = ra_assign_reg,
-};
-
-/* Assign register 'num' (possibly with REG_HALF or'd in) to 'assigner'
- * by walking it with the assign visitor.  If the instruction was
- * already visited, only check that it ended up with the same register
- * and record a failure in ctx->error if it did not.
- */
-static void ra_assign(struct ir3_ra_ctx *ctx,
-               struct ir3_instruction *assigner, int num)
-{
-       if (!ir3_instr_check_mark(assigner)) {
-               /* first visit, do the actual assignment: */
-               struct ra_assign_visitor visitor = {
-                               .base.funcs = &assign_visitor_funcs,
-                               .ctx = ctx,
-                               .num = num,
-               };
-
-               ir3_visit_instr(&visitor.base, assigner);
-               return;
-       }
-
-       /* visited before, the earlier assignment must agree with this one: */
-       debug_assert(assigner->regs[0]->num == (num & ~REG_HALF));
-       if (assigner->regs[0]->num != (num & ~REG_HALF)) {
-               /* impossible situation, should have been resolved
-                * at an earlier stage by inserting extra mov's:
-                */
-               ctx->error = true;
-       }
-}
-
-/*
- *
- */
-
-/* Choose a register for the value written by 'instr' (unless it has
- * no registers, is already marked, or writes something other than a
- * gpr/address register) and hand it to ra_assign().
- */
-static void ir3_instr_ra(struct ir3_ra_ctx *ctx,
-               struct ir3_instruction *instr)
-{
-       struct ir3_register *dst;
-       unsigned num;
-
-       /* skip over nop's */
-       if (instr->regs_count == 0)
-               return;
-
-       dst = instr->regs[0];
-
-       /* if we've already visited this instruction, bail now: */
-       if (instr->flags & IR3_INSTR_MARK)
-               return;
-
-       /* allocate register(s): */
-       if (is_addr(instr)) {
-               /* takes over the number already present in its regs[2] */
-               num = instr->regs[2]->num;
-       } else if (reg_gpr(dst)) {
-               /* general case: find a free block of a.num registers, this
-                * instruction's value lives at offset a.off within it
-                */
-               struct ir3_ra_assignment a;
-               a = ra_calc(instr);
-               num = alloc_block(ctx, instr, a.num) + a.off;
-       } else if (dst->flags & IR3_REG_ADDR) {
-               /* write to the address register, always a0.x as half reg */
-               dst->flags &= ~IR3_REG_ADDR;
-               num = regid(REG_A0, 0) | REG_HALF;
-       } else {
-               /* predicate register (p0).. etc */
-               return;
-       }
-
-       ra_assign(ctx, instr, num);
-}
-
-/* flatten into shader:
- *
- * Walks the block's instruction list, skips meta instructions and
- * refills shader->instrs[] from scratch.  Along the way it sets the
- * (ss)/(sy) sync flags on consumers of sfu/tex results, sets (ss) for
- * writes to registers still being read by a tex/sfu instruction, folds
- * runs of nop's into the repeat count of the previous nop, flags the
- * last input with IR3_REG_EI and the last user of a relative register
- * with IR3_INSTR_UL, and finally appends the end instruction.
- *
- * NOTE(review): the nop's created in the loop are never stored into
- * shader->instrs[] here, so this presumably relies on ir3_instr_create()
- * appending to that array -- confirm against ir3.c.
- */
-// XXX this should probably be somewhere else:
-static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block)
-{
-       struct ir3_instruction *n;
-       struct ir3 *shader = block->shader;
-       struct ir3_instruction *end =
-                       ir3_instr_create(block, 0, OPC_END);
-       struct ir3_instruction *last_input = NULL;
-       struct ir3_instruction *last_rel = NULL;
-       regmask_t needs_ss_war;       /* write after read */
-       regmask_t needs_ss;
-       regmask_t needs_sy;
-
-       regmask_init(&needs_ss_war);
-       regmask_init(&needs_ss);
-       regmask_init(&needs_sy);
-
-       shader->instrs_count = 0;
-
-       for (n = block->head; n; n = n->next) {
-               struct ir3_register *reg;
-               unsigned i;
-
-               if (is_meta(n))
-                       continue;
-
-               for (i = 1; i < n->regs_count; i++) {
-                       reg = n->regs[i];
-
-                       if (reg_gpr(reg)) {
-
-                               /* TODO: we probably only need (ss) for alu
-                                * instr consuming sfu result.. need to make
-                                * some tests for both this and (sy)..
-                                */
-                               if (regmask_get(&needs_ss, reg)) {
-                                       n->flags |= IR3_INSTR_SS;
-                                       regmask_init(&needs_ss);
-                               }
-
-                               if (regmask_get(&needs_sy, reg)) {
-                                       n->flags |= IR3_INSTR_SY;
-                                       regmask_init(&needs_sy);
-                               }
-                       }
-
-                       /* TODO: is it valid to have address reg loaded from a
-                        * relative src (ie. mova a0, c<a0.x+4>)?  If so, the
-                        * last_rel check below should be moved ahead of this:
-                        */
-                       if (reg->flags & IR3_REG_RELATIV)
-                               last_rel = n;
-               }
-
-               if (n->regs_count > 0) {
-                       reg = n->regs[0];
-                       if (regmask_get(&needs_ss_war, reg)) {
-                               n->flags |= IR3_INSTR_SS;
-                               regmask_init(&needs_ss_war); // ??? I assume?
-                       }
-
-                       /* a0 is about to be overwritten, so the last instruction
-                        * that used the previous value gets the (ul) flag:
-                        */
-                       if (last_rel && (reg->num == regid(REG_A0, 0))) {
-                               last_rel->flags |= IR3_INSTR_UL;
-                               last_rel = NULL;
-                       }
-               }
-
-               /* cat5+ does not have an (ss) bit, if needed we need to
-                * insert a nop to carry the sync flag.  Would be kinda
-                * clever if we were aware of this during scheduling, but
-                * this should be a pretty rare case:
-                */
-               if ((n->flags & IR3_INSTR_SS) && (n->category >= 5)) {
-                       struct ir3_instruction *nop;
-                       nop = ir3_instr_create(block, 0, OPC_NOP);
-                       nop->flags |= IR3_INSTR_SS;
-                       n->flags &= ~IR3_INSTR_SS;
-               }
-
-               /* need to be able to set (ss) on first instruction: */
-               if ((shader->instrs_count == 0) && (n->category >= 5))
-                       ir3_instr_create(block, 0, OPC_NOP);
-
-               /* fold a nop into the previous nop's repeat count when possible: */
-               if (is_nop(n) && shader->instrs_count) {
-                       struct ir3_instruction *last =
-                                       shader->instrs[shader->instrs_count-1];
-                       if (is_nop(last) && (last->repeat < 5)) {
-                               last->repeat++;
-                               last->flags |= n->flags;
-                               continue;
-                       }
-               }
-
-               shader->instrs[shader->instrs_count++] = n;
-
-               if (is_sfu(n))
-                       regmask_set(&needs_ss, n->regs[0]);
-
-               if (is_tex(n)) {
-                       /* record that the shader contains a tex instruction.
-                        * We do this here, rather than when we encounter a
-                        * SAMP decl, because (especially in binning pass shader)
-                        * the samp instruction(s) could get eliminated if the
-                        * result is not used.
-                        */
-                       ctx->has_samp = true;
-                       regmask_set(&needs_sy, n->regs[0]);
-               }
-
-               /* both tex/sfu appear to not always immediately consume
-                * their src register(s):
-                */
-               if (is_tex(n) || is_sfu(n)) {
-                       for (i = 1; i < n->regs_count; i++) {
-                               reg = n->regs[i];
-                               if (reg_gpr(reg))
-                                       regmask_set(&needs_ss_war, reg);
-                       }
-               }
-
-               if (is_input(n))
-                       last_input = n;
-       }
-
-       if (last_input)
-               last_input->regs[0]->flags |= IR3_REG_EI;
-
-       if (last_rel)
-               last_rel->flags |= IR3_INSTR_UL;
-
-       shader->instrs[shader->instrs_count++] = end;
-
-       /* the first instruction always syncs on both (ss) and (sy): */
-       shader->instrs[0]->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
-}
-
-/* Register allocation for one block.  For the top level block (no
- * parent) the shader outputs and inputs are assigned first, then every
- * instruction in the list is visited, and finally the block is
- * legalized into the shader's instruction array.
- *
- * Returns 0 on success, or -1 if a conflicting assignment was found
- * (ctx->error).
- */
-static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
-{
-       struct ir3_instruction *n;
-
-       if (!block->parent) {
-               unsigned i, j;
-               int base, off = output_base(ctx);
-
-               base = alloc_block(ctx, NULL, block->noutputs + off);
-
-               if (ctx->half_precision)
-                       base |= REG_HALF;
-
-               for (i = 0; i < block->noutputs; i++)
-                       if (block->outputs[i] && !is_kill(block->outputs[i]))
-                               ra_assign(ctx, block->outputs[i], base + i + off);
-
-               if (ctx->type == SHADER_FRAGMENT) {
-                       i = 0;
-                       if (ctx->frag_face) {
-                               /* if we have frag_face, it gets hr0.x */
-                               ra_assign(ctx, block->inputs[i], REG_HALF | 0);
-                               i += 4;
-                       }
-                       /* remaining inputs are packed from the (full precision)
-                        * base register on:
-                        */
-                       for (j = 0; i < block->ninputs; i++, j++)
-                               if (block->inputs[i])
-                                       ra_assign(ctx, block->inputs[i], (base & ~REG_HALF) + j);
-               } else {
-                       for (i = 0; i < block->ninputs; i++)
-                               if (block->inputs[i])
-                                       ir3_instr_ra(ctx, block->inputs[i]);
-               }
-
-               /* ra_assign() reports conflicts through ctx->error, don't
-                * carry on (or legalize an empty block) after a failure:
-                */
-               if (ctx->error)
-                       return -1;
-       }
-
-       /* then loop over instruction list and assign registers:
-        */
-       n = block->head;
-       while (n) {
-               ir3_instr_ra(ctx, n);
-               if (ctx->error)
-                       return -1;
-               n = n->next;
-       }
-
-       legalize(ctx, block);
-
-       return 0;
-}
-
-/* Entry point for register allocation of 'block'.  The parameters are
- * copied into a fresh context, the instruction marks of the whole
- * shader are cleared, and block_ra() does the work.
- *
- * Returns the result of block_ra() (0 on success, -1 on failure).
- * *has_samp is always written, and tells whether legalize() saw a tex
- * instruction.
- */
-int ir3_block_ra(struct ir3_block *block, enum shader_t type,
-               bool half_precision, bool frag_coord, bool frag_face,
-               bool *has_samp)
-{
-       struct ir3_ra_ctx ctx = {
-                       .block = block,
-                       .type = type,
-                       .half_precision = half_precision,
-                       .frag_coord = frag_coord,
-                       .frag_face = frag_face,
-       };
-       int ret;
-
-       ir3_clear_mark(block->shader);
-       ret = block_ra(&ctx, block);
-       *has_samp = ctx.has_samp;
-
-       return ret;
-}
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_sched.c b/src/gallium/drivers/freedreno/a3xx/ir3_sched.c
deleted file mode 100644 (file)
index 3ef6773..0000000
+++ /dev/null
@@ -1,401 +0,0 @@
-/* -*- mode: C; c-file-style: "k&r"; tab-width: 4; indent-tabs-mode: t; -*- */
-
-/*
- * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-
-#include "util/u_math.h"
-
-#include "ir3.h"
-
-enum {
-       SCHEDULED = -1,   /* trysched(): an instruction was newly scheduled */
-       DELAYED = -2,     /* trysched(): write to a0.x/p0.x deferred while it is in use */
-};
-
-/*
- * Instruction Scheduling:
- *
- * Using the depth sorted list from depth pass, attempt to recursively
- * schedule deepest unscheduled path.  The first instruction that cannot
- * be scheduled, returns the required delay slots it needs, at which
- * point we return back up to the top and attempt to schedule by next
- * highest depth.  After a sufficient number of instructions have been
- * scheduled, return back to beginning of list and start again.  If you
- * reach the end of depth sorted list without being able to insert any
- * instruction, insert nop's.  Repeat until no more unscheduled
- * instructions.
- *
- * There are a few special cases that need to be handled, since sched
- * is currently independent of register allocation.  Usages of address
- * register (a0.x) or predicate register (p0.x) must be serialized.  Ie.
- * if you have two pairs of instructions that write the same special
- * register and then read it, then those pairs cannot be interleaved.
- * To solve this, when we are in such a scheduling "critical section",
- * and we encounter a conflicting write to a special register, we try
- * to schedule any remaining instructions that use that value first.
- */
-
-struct ir3_sched_ctx {
-       struct ir3_instruction *scheduled; /* last scheduled instr, head of the (reversed) scheduled list */
-       struct ir3_instruction *addr;      /* instr that wrote the a0.x value currently in use, if any */
-       struct ir3_instruction *pred;      /* instr that wrote the p0.x value currently in use, if any */
-       unsigned cnt;                      /* bumped by schedule() for every instr put on the list */
-};
-
-/* Pick the entry with the greatest depth out of srcs[0..nsrcs) (the
- * earliest one wins a tie), clear its slot so that it is not returned
- * again, and return it.  Returns NULL once all slots are empty.
- */
-static struct ir3_instruction *
-deepest(struct ir3_instruction **srcs, unsigned nsrcs)
-{
-       struct ir3_instruction *best = NULL;
-       unsigned i, best_idx = 0;
-
-       for (i = 0; i < nsrcs; i++) {
-               struct ir3_instruction *cand = srcs[i];
-
-               if (!cand)
-                       continue;
-
-               if (!best || (cand->depth > best->depth)) {
-                       best = cand;
-                       best_idx = i;
-               }
-       }
-
-       if (best)
-               srcs[best_idx] = NULL;
-
-       return best;
-}
-
-/* Count the alu/flow instructions on the scheduled list that come
- * before 'instr', starting from the most recently scheduled one.
- * Counting stops once 'maxd' is reached.
- */
-static unsigned distance(struct ir3_sched_ctx *ctx,
-               struct ir3_instruction *instr, unsigned maxd)
-{
-       struct ir3_instruction *cur;
-       unsigned count = 0;
-
-       for (cur = ctx->scheduled;
-                       cur && (cur != instr) && (count < maxd);
-                       cur = cur->next) {
-               if (is_alu(cur) || is_flow(cur))
-                       count++;
-       }
-
-       return count;
-}
-
-/* Find the instruction whose ->next is 'instr' by walking the block's
- * list from the head.  Returns NULL if there is none, which includes
- * the case of 'instr' being the head itself.
- *
- * TODO maybe we want double linked list?
- */
-static struct ir3_instruction * prev(struct ir3_instruction *instr)
-{
-       struct ir3_instruction *cur;
-
-       for (cur = instr->block->head; cur; cur = cur->next)
-               if (cur->next == instr)
-                       break;
-
-       return cur;
-}
-
-/* Put 'instr' at the front of the scheduled list and mark it.  With
- * 'remove' set the instruction is first unlinked from the block's
- * (depth sorted) list; newly created nop's are passed with remove=false.
- */
-static void schedule(struct ir3_sched_ctx *ctx,
-               struct ir3_instruction *instr, bool remove)
-{
-       struct ir3_block *block = instr->block;
-
-       /* maybe there is a better way to handle this than just stuffing
-        * a nop.. ideally we'd know about this constraint in the
-        * scheduling and depth calculation..
-        */
-       if (ctx->scheduled && is_sfu(ctx->scheduled) && is_sfu(instr))
-               schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);
-
-       /* remove from depth list:
-        */
-       if (remove) {
-               struct ir3_instruction *p = prev(instr);
-
-               /* NOTE: this can happen for inputs which are not
-                * read.. in that case there is no need to schedule
-                * the input, so just bail:
-                */
-               if (instr != (p ? p->next : block->head))
-                       return;
-
-               if (p)
-                       p->next = instr->next;
-               else
-                       block->head = instr->next;
-       }
-
-       /* remember the writer of a0.x/p0.x, so that conflicting writes
-        * can be held back in trysched():
-        */
-       if (writes_addr(instr)) {
-               assert(ctx->addr == NULL);
-               ctx->addr = instr;
-       }
-
-       if (writes_pred(instr)) {
-               assert(ctx->pred == NULL);
-               ctx->pred = instr;
-       }
-
-       instr->flags |= IR3_INSTR_MARK;
-
-       /* push on the front, the list is reversed at the end of block_sched() */
-       instr->next = ctx->scheduled;
-       ctx->scheduled = instr;
-
-       ctx->cnt++;
-}
-
-/*
- * Delay-slot calculation.  Follows fanin/fanout.
- */
-
-/* Number of delay slots still missing between 'assigner' and its use
- * as source 'srcn' of 'consumer'.  Meta instructions are looked
- * through: the result is the largest delay of any of their SSA sources.
- */
-static unsigned delay_calc2(struct ir3_sched_ctx *ctx,
-               struct ir3_instruction *assigner,
-               struct ir3_instruction *consumer, unsigned srcn)
-{
-       unsigned i, worst = 0;
-
-       if (!is_meta(assigner)) {
-               /* required slots, minus what has been scheduled since: */
-               unsigned needed = ir3_delayslots(assigner, consumer, srcn);
-               return needed - distance(ctx, assigner, needed);
-       }
-
-       for (i = 1; i < assigner->regs_count; i++) {
-               struct ir3_register *src = assigner->regs[i];
-               unsigned d;
-
-               if (!(src->flags & IR3_REG_SSA))
-                       continue;
-
-               d = delay_calc2(ctx, src->instr, consumer, srcn);
-               if (d > worst)
-                       worst = d;
-       }
-
-       return worst;
-}
-
-/* Largest number of delay slots still required by any of the SSA
- * sources of 'instr'; zero means it can be scheduled right away.
- */
-static unsigned delay_calc(struct ir3_sched_ctx *ctx,
-               struct ir3_instruction *instr)
-{
-       unsigned i, worst = 0;
-
-       for (i = 1; i < instr->regs_count; i++) {
-               struct ir3_register *src = instr->regs[i];
-               unsigned d;
-
-               if (!(src->flags & IR3_REG_SSA))
-                       continue;
-
-               /* regs[0] is the dst, so source numbers are one lower: */
-               d = delay_calc2(ctx, src->instr, instr, i - 1);
-               if (d > worst)
-                       worst = d;
-       }
-
-       return worst;
-}
-
-/* Try to schedule 'instr', after first (recursively) trying to schedule
- * the instructions producing its sources, deepest first.
- *
- * Returns 0 if the instruction was already scheduled, a positive number
- * of delay slots that are still missing, or a negative value:
- * SCHEDULED if an instruction has been newly scheduled, DELAYED if a
- * write to a0.x/p0.x has to wait until the register is free.  Any
- * non-zero result is passed straight back up the stack (to
- * block_sched()).
- */
-static int trysched(struct ir3_sched_ctx *ctx,
-               struct ir3_instruction *instr)
-{
-       struct ir3_instruction *srcs[ARRAY_SIZE(instr->regs) - 1];
-       struct ir3_instruction *src;
-       unsigned i, delay, nsrcs = 0;
-       int ret;
-
-       /* if already scheduled: */
-       if (instr->flags & IR3_INSTR_MARK)
-               return 0;
-
-       /* figure out our src's: */
-       for (i = 1; i < instr->regs_count; i++) {
-               struct ir3_register *reg = instr->regs[i];
-               if (reg->flags & IR3_REG_SSA)
-                       srcs[nsrcs++] = reg->instr;
-       }
-
-       /* for each src register in sorted order:
-        */
-       while ((src = deepest(srcs, nsrcs))) {
-               /* NOTE: kept in a signed variable, the result may be one of
-                * the negative SCHEDULED/DELAYED codes:
-                */
-               ret = trysched(ctx, src);
-               if (ret)
-                       return ret;
-       }
-
-       /* all our dependencies are scheduled, figure out if
-        * we have enough delay slots to schedule ourself:
-        */
-       delay = delay_calc(ctx, instr);
-       if (delay)
-               return delay;
-
-       /* if this is a write to address/predicate register, and that
-        * register is currently in use, we need to defer until it is
-        * free:
-        */
-       if (writes_addr(instr) && ctx->addr) {
-               assert(ctx->addr != instr);
-               return DELAYED;
-       }
-       if (writes_pred(instr) && ctx->pred) {
-               assert(ctx->pred != instr);
-               return DELAYED;
-       }
-
-       schedule(ctx, instr, true);
-       return SCHEDULED;
-}
-
-/* Reverse a ->next linked list of instructions in place, and return
- * its new head (NULL for an empty list).
- */
-static struct ir3_instruction * reverse(struct ir3_instruction *instr)
-{
-       struct ir3_instruction *head = NULL;
-       struct ir3_instruction *rest;
-
-       for (; instr; instr = rest) {
-               rest = instr->next;
-               /* move the node to the front of the reversed list: */
-               instr->next = head;
-               head = instr;
-       }
-
-       return head;
-}
-
-/* Does any SSA source of 'instr' go through an is_addr() instruction
- * whose own source is the instruction recorded in ctx->addr?
- */
-static bool uses_current_addr(struct ir3_sched_ctx *ctx,
-               struct ir3_instruction *instr)
-{
-       unsigned n;
-
-       for (n = 1; n < instr->regs_count; n++) {
-               struct ir3_register *src = instr->regs[n];
-
-               if (!(src->flags & IR3_REG_SSA))
-                       continue;
-
-               if (!is_addr(src->instr))
-                       continue;
-
-               /* regs[1] of the addr instruction refers to the mova: */
-               if (src->instr->regs[1]->instr == ctx->addr)
-                       return true;
-       }
-
-       return false;
-}
-
-/* Is any SSA source of 'instr' produced by the instruction recorded
- * in ctx->pred?
- */
-static bool uses_current_pred(struct ir3_sched_ctx *ctx,
-               struct ir3_instruction *instr)
-{
-       unsigned n;
-
-       for (n = 1; n < instr->regs_count; n++) {
-               struct ir3_register *src = instr->regs[n];
-
-               if (!(src->flags & IR3_REG_SSA))
-                       continue;
-
-               if (src->instr == ctx->pred)
-                       return true;
-       }
-
-       return false;
-}
-
-/* when we encounter an instruction that writes to the address (or
- * predicate) register when it is in use, we delay that instruction and
- * try to schedule all other instructions using the current address
- * (or predicate) register.  If no remaining instruction uses the
- * register any more, it is released (ctx->addr / ctx->pred cleared).
- *
- * Returns 0 if something was scheduled, otherwise the smallest positive
- * delay reported by trysched().  If neither happened the initial ~0 is
- * returned, which block_sched() treats the same as 0.
- */
-static int block_sched_undelayed(struct ir3_sched_ctx *ctx,
-               struct ir3_block *block)
-{
-       struct ir3_instruction *instr = block->head;
-       bool addr_in_use = false;
-       bool pred_in_use = false;
-       unsigned cnt = ~0;
-
-       while (instr) {
-               /* grab next first, trysched() may move instr to the scheduled list */
-               struct ir3_instruction *next = instr->next;
-               bool addr = uses_current_addr(ctx, instr);
-               bool pred = uses_current_pred(ctx, instr);
-
-               if (addr || pred) {
-                       int ret = trysched(ctx, instr);
-                       if (ret == SCHEDULED)
-                               cnt = 0;
-                       else if (ret > 0)
-                               cnt = MIN2(cnt, ret);
-                       if (addr)
-                               addr_in_use = true;
-                       if (pred)
-                               pred_in_use = true;
-               }
-
-               instr = next;
-       }
-
-       if (!addr_in_use)
-               ctx->addr = NULL;
-
-       if (!pred_in_use)
-               ctx->pred = NULL;
-
-       return cnt;
-}
-
-/* Schedule all instructions of 'block': instructions are moved one by
- * one from the block's list onto ctx->scheduled (padding with nop's
- * where delay slots cannot be filled), and at the end the block's list
- * is replaced by the scheduled list in execution order.
- */
-static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block)
-{
-       struct ir3_instruction *instr;
-
-       /* schedule all the shader input's (meta-instr) first so that
-        * the RA step sees that the input registers contain a value
-        * from the start of the shader:
-        */
-       if (!block->parent) {
-               unsigned i;
-               for (i = 0; i < block->ninputs; i++) {
-                       struct ir3_instruction *in = block->inputs[i];
-                       if (in)
-                               schedule(ctx, in, true);
-               }
-       }
-
-       while ((instr = block->head)) {
-               /* NOTE: always grab next *before* trysched(), in case the
-                * instruction is actually scheduled (and therefore moved
-                * from depth list into scheduled list)
-                */
-               struct ir3_instruction *next = instr->next;
-               int cnt = trysched(ctx, instr);
-
-               if (cnt == DELAYED)
-                       cnt = block_sched_undelayed(ctx, block);
-
-               /* -1 is signal to return up stack, but to us means same as 0: */
-               cnt = MAX2(0, cnt);
-               /* from here on cnt is the value ctx->cnt has to reach: */
-               cnt += ctx->cnt;
-               instr = next;
-
-               /* if deepest remaining instruction cannot be scheduled, try
-                * the increasingly more shallow instructions until needed
-                * number of delay slots is filled:
-                */
-               while (instr && (cnt > ctx->cnt)) {
-                       next = instr->next;
-                       trysched(ctx, instr);
-                       instr = next;
-               }
-
-               /* and if we run out of instructions that can be scheduled,
-                * then it is time for nop's:
-                */
-               while (cnt > ctx->cnt)
-                       schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);
-       }
-
-       /* at this point, scheduled list is in reverse order, so fix that: */
-       block->head = reverse(ctx->scheduled);
-}
-
-/* Entry point for instruction scheduling of 'block': starts from a
- * zeroed context and cleared instruction marks, then block_sched()
- * reorders the block's instruction list.
- */
-void ir3_block_sched(struct ir3_block *block)
-{
-       struct ir3_sched_ctx ctx = {0};
-       ir3_clear_mark(block->shader);
-       block_sched(&ctx, block);
-}
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_visitor.h b/src/gallium/drivers/freedreno/a3xx/ir3_visitor.h
deleted file mode 100644 (file)
index 1c60d16..0000000
+++ /dev/null
@@ -1,154 +0,0 @@
-/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
-
-/*
- * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#ifndef IR3_VISITOR_H_
-#define IR3_VISITOR_H_
-
-/**
- * Visitor which follows dst to src relationships between instructions,
- * first visiting the dst (writer) instruction, followed by src (reader)
- * instruction(s).
- *
- * TODO maybe we want multiple different visitors to walk the
- * graph in different ways?
- */
-
-struct ir3_visitor;
-
-typedef void (*ir3_visit_instr_func)(struct ir3_visitor *v,
-               struct ir3_instruction *instr);
-
-typedef void (*ir3_visit_reg_func)(struct ir3_visitor *v,
-               struct ir3_instruction *instr, struct ir3_register *reg);
-
-struct ir3_visitor_funcs {
-       ir3_visit_instr_func instr;  // TODO do we need??
-
-       ir3_visit_reg_func dst_shader_input;
-       ir3_visit_reg_func dst_block_input;
-       ir3_visit_reg_func dst_fanout;
-       ir3_visit_reg_func dst_fanin;
-       ir3_visit_reg_func dst;
-
-       ir3_visit_reg_func src_block_input;
-       ir3_visit_reg_func src_fanout;
-       ir3_visit_reg_func src_fanin;
-       ir3_visit_reg_func src;
-};
-
-struct ir3_visitor {
-       const struct ir3_visitor_funcs *funcs;
-       bool error;
-};
-
-#include "util/u_debug.h"
-
-static void visit_instr_dst(struct ir3_visitor *v,
-               struct ir3_instruction *instr)
-{
-       struct ir3_register *reg = instr->regs[0];
-
-       if (is_meta(instr)) {
-               switch (instr->opc) {
-               case OPC_META_INPUT:
-                       if (instr->regs_count == 1)
-                               v->funcs->dst_shader_input(v, instr, reg);
-                       else
-                               v->funcs->dst_block_input(v, instr, reg);
-                       return;
-               case OPC_META_FO:
-                       v->funcs->dst_fanout(v, instr, reg);
-                       return;
-               case OPC_META_FI:
-                       v->funcs->dst_fanin(v, instr, reg);
-                       return;
-               default:
-                       break;
-
-               }
-       }
-
-       v->funcs->dst(v, instr, reg);
-}
-
-static void visit_instr_src(struct ir3_visitor *v,
-               struct ir3_instruction *instr, struct ir3_register *reg)
-{
-       if (is_meta(instr)) {
-               switch (instr->opc) {
-               case OPC_META_INPUT:
-                       /* shader-input does not have a src, only block input: */
-                       debug_assert(instr->regs_count == 2);
-                       v->funcs->src_block_input(v, instr, reg);
-                       return;
-               case OPC_META_FO:
-                       v->funcs->src_fanout(v, instr, reg);
-                       return;
-               case OPC_META_FI:
-                       v->funcs->src_fanin(v, instr, reg);
-                       return;
-               default:
-                       break;
-
-               }
-       }
-
-       v->funcs->src(v, instr, reg);
-}
-
-static void ir3_visit_instr(struct ir3_visitor *v,
-               struct ir3_instruction *instr)
-{
-       struct ir3_instruction *n;
-
-       /* visit instruction that assigns value: */
-       if (instr->regs_count > 0)
-               visit_instr_dst(v, instr);
-
-       /* and of any following instructions which read that value: */
-       n = instr->next;
-       while (n && !v->error) {
-               unsigned i;
-
-               for (i = 1; i < n->regs_count; i++) {
-                       struct ir3_register *reg = n->regs[i];
-                       if ((reg->flags & IR3_REG_SSA) && (reg->instr == instr))
-                               visit_instr_src(v, n, reg);
-               }
-
-               n = n->next;
-       }
-}
-
-static void ir3_visit_reg(struct ir3_visitor *v,
-               struct ir3_instruction *instr, struct ir3_register *reg)
-{
-       /* no-op */
-}
-
-#endif /* IR3_VISITOR_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
new file mode 100644 (file)
index 0000000..8c3704b
--- /dev/null
@@ -0,0 +1,805 @@
+/*
+ * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+
+#include <util/u_debug.h>
+
+#include "disasm.h"
+#include "instr-a3xx.h"
+
+static enum debug_t debug;
+
+#define printf debug_printf
+
+/* Strings of zero to nine tab characters, followed by six "x" placeholder
+ * entries (16 entries in total).
+ * NOTE(review): presumably an indentation prefix indexed by nesting level;
+ * the code that reads this table is outside this part of the file -- confirm.
+ */
+static const char *levels[] = {
+               "",
+               "\t",
+               "\t\t",
+               "\t\t\t",
+               "\t\t\t\t",
+               "\t\t\t\t\t",
+               "\t\t\t\t\t\t",
+               "\t\t\t\t\t\t\t",
+               "\t\t\t\t\t\t\t\t",
+               "\t\t\t\t\t\t\t\t\t",
+               "x",
+               "x",
+               "x",
+               "x",
+               "x",
+               "x",
+};
+
+/* Component letters; indexed with a register's comp field when printing. */
+static const char *component = "xyzw";
+
+/* Printable name of each TYPE_* value, indexed by that value. */
+static const char *type[] = {
+               [TYPE_F16] = "f16",
+               [TYPE_F32] = "f32",
+               [TYPE_U16] = "u16",
+               [TYPE_U32] = "u32",
+               [TYPE_S16] = "s16",
+               [TYPE_S32] = "s32",
+               [TYPE_U8]  = "u8",
+               [TYPE_S8]  = "s8",
+};
+
+/* Print a single register operand.
+ *
+ *   full     - when false the register name gets an "h" prefix
+ *   r        - emit the "(r)" decoration
+ *   c        - use the 'c' register prefix instead of 'r'
+ *   im       - operand is an immediate: print reg.iim_val as a decimal
+ *   neg/abs  - emit "(neg)", "(abs)" or "(absneg)"
+ *   addr_rel - operand is addressed relative to a0.x, offset by reg.iim_val
+ */
+static void print_reg(reg_t reg, bool full, bool r, bool c, bool im,
+               bool neg, bool abs, bool addr_rel)
+{
+       const char file = c ? 'c' : 'r';
+       const char *half = full ? "" : "h";
+
+       /* The "(neg)"/"(abs)" spelling (rather than - and ||) is kept so the
+        * output can be diffed against libllvm-a3xx.
+        */
+       if (neg && abs)
+               printf("(absneg)");
+       else if (abs)
+               printf("(abs)");
+       else if (neg)
+               printf("(neg)");
+
+       if (r)
+               printf("(r)");
+
+       if (im) {
+               printf("%d", reg.iim_val);
+               return;
+       }
+
+       if (addr_rel) {
+               /* %+d would be shorter, but the spaced form matches
+                * libllvm-a3xx output:
+                */
+               if (reg.iim_val < 0)
+                       printf("%s%c<a0.x - %d>", half, file, -reg.iim_val);
+               else if (reg.iim_val > 0)
+                       printf("%s%c<a0.x + %d>", half, file, reg.iim_val);
+               else
+                       printf("%s%c<a0.x>", half, file);
+               return;
+       }
+
+       /* a0 and p0 get their own names, but only when the c flag is clear: */
+       if (!c) {
+               if (reg.num == REG_A0) {
+                       printf("a0.%c", component[reg.comp]);
+                       return;
+               }
+               if (reg.num == REG_P0) {
+                       printf("p0.%c", component[reg.comp]);
+                       return;
+               }
+       }
+
+       printf("%s%c%d.%c", half, file, reg.num, component[reg.comp]);
+}
+
+
+/* current instruction repeat flag: */
+static unsigned repeat;
+
+/* Print a destination register: forwards to print_reg() with every
+ * decoration flag (r, c, im, neg, abs) cleared; only full and addr_rel
+ * are passed through.
+ */
+static void print_reg_dst(reg_t reg, bool full, bool addr_rel)
+{
+       print_reg(reg, full, false, false, false, false, false, addr_rel);
+}
+
+/* Print a source register: passes all arguments unchanged to print_reg(). */
+static void print_reg_src(reg_t reg, bool full, bool r, bool c, bool im,
+               bool neg, bool abs, bool addr_rel)
+{
+       print_reg(reg, full, r, c, im, neg, abs, addr_rel);
+}
+
+/* Print the operands of a category 0 instruction:
+ *   kill       -> optional "!" plus the p0 component
+ *   br         -> same predicate followed by ", #<immed>"
+ *   jump/call  -> " #<immed>"
+ * Other opcodes print no operands.  With PRINT_VERBOSE set, the dummy1..4
+ * fields are dumped as well if any of them is non-zero.
+ */
+static void print_instr_cat0(instr_t *instr)
+{
+       instr_cat0_t *cat0 = &instr->cat0;
+
+       switch (cat0->opc) {
+       case OPC_KILL:
+               printf(" %sp0.%c", cat0->inv ? "!" : "",
+                               component[cat0->comp]);
+               break;
+       case OPC_BR:
+               printf(" %sp0.%c, #%d", cat0->inv ? "!" : "",
+                               component[cat0->comp], cat0->immed);
+               break;
+       case OPC_JUMP:
+       case OPC_CALL:
+               printf(" #%d", cat0->immed);
+               break;
+       }
+
+       if ((debug & PRINT_VERBOSE) && (cat0->dummy1|cat0->dummy2|cat0->dummy3|cat0->dummy4))
+               printf("\t{0: %x,%x,%x,%x}", cat0->dummy1, cat0->dummy2, cat0->dummy3, cat0->dummy4);
+}
+
+/* Print a category 1 instruction, including its mnemonic:
+ *   - "mov.<src><dst>" when source and destination types match, or "mova"
+ *     for an s16 move whose destination is a0
+ *   - "cov.<src><dst>" when the types differ
+ * followed by the "(even)"/"(pos_infinity)" flags, the destination and the
+ * source (immediate, a0.x-relative, or plain register).  With PRINT_VERBOSE
+ * set, a non-zero must_be_0 field is dumped too.
+ */
+static void print_instr_cat1(instr_t *instr)
+{
+       instr_cat1_t *cat1 = &instr->cat1;
+
+       if (cat1->ul)
+               printf("(ul)");
+
+       if (cat1->src_type == cat1->dst_type) {
+               if ((cat1->src_type == TYPE_S16) && (((reg_t)cat1->dst).num == REG_A0)) {
+                       /* special case (mnemonic?): */
+                       printf("mova");
+               } else {
+                       printf("mov.%s%s", type[cat1->src_type], type[cat1->dst_type]);
+               }
+       } else {
+               printf("cov.%s%s", type[cat1->src_type], type[cat1->dst_type]);
+       }
+
+       printf(" ");
+
+       if (cat1->even)
+               printf("(even)");
+
+       if (cat1->pos_inf)
+               printf("(pos_infinity)");
+
+       print_reg_dst((reg_t)(cat1->dst), type_size(cat1->dst_type) == 32,
+                       cat1->dst_rel);
+
+       printf(", ");
+
+       /* ugg, have to special case this.. vs print_reg().. */
+       if (cat1->src_im) {
+               if (type_float(cat1->src_type))
+                       printf("(%f)", cat1->fim_val);
+               else
+                       printf("%d", cat1->iim_val);
+       } else if (cat1->src_rel && !cat1->src_c) {
+               /* I would just use %+d but trying to make it diff'able with
+                * libllvm-a3xx...
+                */
+               const char file = cat1->src_rel_c ? 'c' : 'r';
+               if (cat1->off < 0)
+                       printf("%c<a0.x - %d>", file, -cat1->off);
+               else if (cat1->off > 0)
+                       printf("%c<a0.x + %d>", file, cat1->off);
+               else
+                       /* zero offset: still honour src_rel_c, like the two
+                        * branches above
+                        */
+                       printf("%c<a0.x>", file);
+       } else {
+               print_reg_src((reg_t)(cat1->src), type_size(cat1->src_type) == 32,
+                               cat1->src_r, cat1->src_c, cat1->src_im, false, false, false);
+       }
+
+       if ((debug & PRINT_VERBOSE) && (cat1->must_be_0))
+               printf("\t{1: %x}", cat1->must_be_0);
+}
+
+/* Print the operands of a category 2 instruction: an optional ".<cond>"
+ * suffix for the cmps/cmpv opcodes, the "(ei)" flag, the destination, src1
+ * and, except for the single-source opcodes listed below, src2.  Each
+ * source is decoded through whichever of its encodings (const, relative,
+ * plain) is flagged.
+ */
+static void print_instr_cat2(instr_t *instr)
+{
+       instr_cat2_t *cat2 = &instr->cat2;
+       /* NOTE(review): assumes cat2->cond is a 3-bit field (values 0..7), so
+        * the table needs eight entries to stay in bounds -- confirm against
+        * instr-a3xx.h.
+        */
+       static const char *cond[] = {
+                       "lt",
+                       "le",
+                       "gt",
+                       "ge",
+                       "eq",
+                       "ne",
+                       "?6?",
+                       "?7?",
+       };
+
+       switch (cat2->opc) {
+       case OPC_CMPS_F:
+       case OPC_CMPS_U:
+       case OPC_CMPS_S:
+       case OPC_CMPV_F:
+       case OPC_CMPV_U:
+       case OPC_CMPV_S:
+               printf(".%s", cond[cat2->cond]);
+               break;
+       }
+
+       printf(" ");
+       if (cat2->ei)
+               printf("(ei)");
+       print_reg_dst((reg_t)(cat2->dst), cat2->full ^ cat2->dst_half, false);
+       printf(", ");
+
+       if (cat2->c1.src1_c) {
+               print_reg_src((reg_t)(cat2->c1.src1), cat2->full, cat2->src1_r,
+                               cat2->c1.src1_c, cat2->src1_im, cat2->src1_neg,
+                               cat2->src1_abs, false);
+       } else if (cat2->rel1.src1_rel) {
+               print_reg_src((reg_t)(cat2->rel1.src1), cat2->full, cat2->src1_r,
+                               cat2->rel1.src1_c, cat2->src1_im, cat2->src1_neg,
+                               cat2->src1_abs, cat2->rel1.src1_rel);
+       } else {
+               print_reg_src((reg_t)(cat2->src1), cat2->full, cat2->src1_r,
+                               false, cat2->src1_im, cat2->src1_neg,
+                               cat2->src1_abs, false);
+       }
+
+       switch (cat2->opc) {
+       case OPC_ABSNEG_F:
+       case OPC_ABSNEG_S:
+       case OPC_CLZ_B:
+       case OPC_CLZ_S:
+       case OPC_SIGN_F:
+       case OPC_FLOOR_F:
+       case OPC_CEIL_F:
+       case OPC_RNDNE_F:
+       case OPC_RNDAZ_F:
+       case OPC_TRUNC_F:
+       case OPC_NOT_B:
+       case OPC_BFREV_B:
+       case OPC_SETRM:
+       case OPC_CBITS_B:
+               /* these only have one src reg */
+               break;
+       default:
+               printf(", ");
+               if (cat2->c2.src2_c) {
+                       print_reg_src((reg_t)(cat2->c2.src2), cat2->full, cat2->src2_r,
+                                       cat2->c2.src2_c, cat2->src2_im, cat2->src2_neg,
+                                       cat2->src2_abs, false);
+               } else if (cat2->rel2.src2_rel) {
+                       print_reg_src((reg_t)(cat2->rel2.src2), cat2->full, cat2->src2_r,
+                                       cat2->rel2.src2_c, cat2->src2_im, cat2->src2_neg,
+                                       cat2->src2_abs, cat2->rel2.src2_rel);
+               } else {
+                       print_reg_src((reg_t)(cat2->src2), cat2->full, cat2->src2_r,
+                                       false, cat2->src2_im, cat2->src2_neg,
+                                       cat2->src2_abs, false);
+               }
+               break;
+       }
+}
+
+/* Print the operands of a category 3 (three-source) instruction: the
+ * destination, then src1, src2 and src3.  src1 and src3 are each decoded
+ * through whichever of their encodings (const, relative, plain) is flagged;
+ * src2 has a single encoding.  Only the neg modifier is passed for sources
+ * (abs and immediate are always false here).
+ */
+static void print_instr_cat3(instr_t *instr)
+{
+       instr_cat3_t *cat3 = &instr->cat3;
+       bool full = instr_cat3_full(cat3);
+
+       printf(" ");
+       print_reg_dst((reg_t)(cat3->dst), full ^ cat3->dst_half, false);
+       printf(", ");
+       if (cat3->c1.src1_c) {
+               print_reg_src((reg_t)(cat3->c1.src1), full,
+                               cat3->src1_r, cat3->c1.src1_c, false, cat3->src1_neg,
+                               false, false);
+       } else if (cat3->rel1.src1_rel) {
+               print_reg_src((reg_t)(cat3->rel1.src1), full,
+                               cat3->src1_r, cat3->rel1.src1_c, false, cat3->src1_neg,
+                               false, cat3->rel1.src1_rel);
+       } else {
+               print_reg_src((reg_t)(cat3->src1), full,
+                               cat3->src1_r, false, false, cat3->src1_neg,
+                               false, false);
+       }
+       printf(", ");
+       print_reg_src((reg_t)cat3->src2, full,
+                       cat3->src2_r, cat3->src2_c, false, cat3->src2_neg,
+                       false, false);
+       printf(", ");
+       if (cat3->c2.src3_c) {
+               print_reg_src((reg_t)(cat3->c2.src3), full,
+                               cat3->src3_r, cat3->c2.src3_c, false, cat3->src3_neg,
+                               false, false);
+       } else if (cat3->rel2.src3_rel) {
+               print_reg_src((reg_t)(cat3->rel2.src3), full,
+                               cat3->src3_r, cat3->rel2.src3_c, false, cat3->src3_neg,
+                               false, cat3->rel2.src3_rel);
+       } else {
+               print_reg_src((reg_t)(cat3->src3), full,
+                               cat3->src3_r, false, false, cat3->src3_neg,
+                               false, false);
+       }
+}
+
+/* Print the operands of a category 4 (single-source) instruction: the
+ * destination followed by the source, decoded through whichever of its
+ * encodings (const, relative, plain) is flagged.  With PRINT_VERBOSE set,
+ * the dummy1/dummy2 fields are dumped if either is non-zero.
+ */
+static void print_instr_cat4(instr_t *instr)
+{
+       instr_cat4_t *cat4 = &instr->cat4;
+
+       printf(" ");
+       print_reg_dst((reg_t)(cat4->dst), cat4->full ^ cat4->dst_half, false);
+       printf(", ");
+
+       if (cat4->c.src_c) {
+               print_reg_src((reg_t)(cat4->c.src), cat4->full,
+                               cat4->src_r, cat4->c.src_c, cat4->src_im,
+                               cat4->src_neg, cat4->src_abs, false);
+       } else if (cat4->rel.src_rel) {
+               print_reg_src((reg_t)(cat4->rel.src), cat4->full,
+                               cat4->src_r, cat4->rel.src_c, cat4->src_im,
+                               cat4->src_neg, cat4->src_abs, cat4->rel.src_rel);
+       } else {
+               print_reg_src((reg_t)(cat4->src), cat4->full,
+                               cat4->src_r, false, cat4->src_im,
+                               cat4->src_neg, cat4->src_abs, false);
+       }
+
+       if ((debug & PRINT_VERBOSE) && (cat4->dummy1|cat4->dummy2))
+               printf("\t{4: %x,%x}", cat4->dummy1, cat4->dummy2);
+}
+
+/* Print the operands of a category 5 instruction: the ".3d/.a/.o/.p/.s/
+ * .s2en" suffixes, the "(type)" (omitted for dsxpp.1/dsypp.1), the write
+ * mask, the destination, and then the sources.  Which of src1, src2, s#
+ * (sampler) and t# (texture) are printed is looked up per opcode in info[].
+ * With is_s2en set, the s2en.src2/s2en.src3 registers are printed instead
+ * of src2/s#/t#.  With PRINT_VERBOSE set, non-zero dummy fields are dumped.
+ */
+static void print_instr_cat5(instr_t *instr)
+{
+       /* NOTE(review): sized for every value of a 5-bit opc field (0..31);
+        * opcodes without an entry are all-false -- confirm the field width
+        * against instr-a3xx.h.
+        */
+       static const struct {
+               bool src1, src2, samp, tex;
+       } info[0x20] = {
+                       [OPC_ISAM]     = { true,  false, true,  true,  },
+                       [OPC_ISAML]    = { true,  true,  true,  true,  },
+                       [OPC_ISAMM]    = { true,  false, true,  true,  },
+                       [OPC_SAM]      = { true,  false, true,  true,  },
+                       [OPC_SAMB]     = { true,  true,  true,  true,  },
+                       [OPC_SAML]     = { true,  true,  true,  true,  },
+                       [OPC_SAMGQ]    = { true,  false, true,  true,  },
+                       [OPC_GETLOD]   = { true,  false, true,  true,  },
+                       [OPC_CONV]     = { true,  true,  true,  true,  },
+                       [OPC_CONVM]    = { true,  true,  true,  true,  },
+                       [OPC_GETSIZE]  = { true,  false, false, true,  },
+                       [OPC_GETBUF]   = { false, false, false, true,  },
+                       [OPC_GETPOS]   = { true,  false, false, true,  },
+                       [OPC_GETINFO]  = { false, false, false, true,  },
+                       [OPC_DSX]      = { true,  false, false, false, },
+                       [OPC_DSY]      = { true,  false, false, false, },
+                       [OPC_GATHER4R] = { true,  false, true,  true,  },
+                       [OPC_GATHER4G] = { true,  false, true,  true,  },
+                       [OPC_GATHER4B] = { true,  false, true,  true,  },
+                       [OPC_GATHER4A] = { true,  false, true,  true,  },
+                       [OPC_SAMGP0]   = { true,  false, true,  true,  },
+                       [OPC_SAMGP1]   = { true,  false, true,  true,  },
+                       [OPC_SAMGP2]   = { true,  false, true,  true,  },
+                       [OPC_SAMGP3]   = { true,  false, true,  true,  },
+                       [OPC_DSXPP_1]  = { true,  false, false, false, },
+                       [OPC_DSYPP_1]  = { true,  false, false, false, },
+                       [OPC_RGETPOS]  = { false, false, false, false, },
+                       [OPC_RGETINFO] = { false, false, false, false, },
+       };
+       instr_cat5_t *cat5 = &instr->cat5;
+       int i;
+
+       if (cat5->is_3d)   printf(".3d");
+       if (cat5->is_a)    printf(".a");
+       if (cat5->is_o)    printf(".o");
+       if (cat5->is_p)    printf(".p");
+       if (cat5->is_s)    printf(".s");
+       if (cat5->is_s2en) printf(".s2en");
+
+       printf(" ");
+
+       switch (cat5->opc) {
+       case OPC_DSXPP_1:
+       case OPC_DSYPP_1:
+               break;
+       default:
+               printf("(%s)", type[cat5->type]);
+               break;
+       }
+
+       /* write mask: one letter per enabled component */
+       printf("(");
+       for (i = 0; i < 4; i++)
+               if (cat5->wrmask & (1 << i))
+                       printf("%c", component[i]);
+       printf(")");
+
+       print_reg_dst((reg_t)(cat5->dst), type_size(cat5->type) == 32, false);
+
+       if (info[cat5->opc].src1) {
+               printf(", ");
+               print_reg_src((reg_t)(cat5->src1), cat5->full, false, false, false,
+                               false, false, false);
+       }
+
+       if (cat5->is_s2en) {
+               printf(", ");
+               print_reg_src((reg_t)(cat5->s2en.src2), cat5->full, false, false, false,
+                               false, false, false);
+               printf(", ");
+               print_reg_src((reg_t)(cat5->s2en.src3), false, false, false, false,
+                               false, false, false);
+       } else {
+               if (cat5->is_o || info[cat5->opc].src2) {
+                       printf(", ");
+                       print_reg_src((reg_t)(cat5->norm.src2), cat5->full,
+                                       false, false, false, false, false, false);
+               }
+               if (info[cat5->opc].samp)
+                       printf(", s#%d", cat5->norm.samp);
+               if (info[cat5->opc].tex)
+                       printf(", t#%d", cat5->norm.tex);
+       }
+
+       if (debug & PRINT_VERBOSE) {
+               if (cat5->is_s2en) {
+                       if (cat5->s2en.dummy1|cat5->s2en.dummy2|cat5->dummy2)
+                               printf("\t{5: %x,%x,%x}", cat5->s2en.dummy1, cat5->s2en.dummy2, cat5->dummy2);
+               } else {
+                       if (cat5->norm.dummy1|cat5->dummy2)
+                               printf("\t{5: %x,%x}", cat5->norm.dummy1, cat5->dummy2);
+               }
+       }
+}
+
+/* Sign-extend the low nbits bits of val to a signed 32-bit value.
+ *
+ * Expects 1 <= nbits <= 31 and no bits set in val at or above bit nbits
+ * (the sign test below relies on val >> (nbits-1) being 0 or 1).
+ */
+static int32_t u2i(uint32_t val, int nbits)
+{
+       /* all bits from nbits upward; computed unsigned so the shift never
+        * reaches the sign bit of a signed int
+        */
+       const uint32_t high = ~((UINT32_C(1) << nbits) - 1);
+
+       /* sign bit (0 or 1) times the high mask, merged with the value */
+       return (int32_t)(((val >> (nbits-1)) * high) | val);
+}
+
+/* Print the operands of a category 6 instruction.  Always prints
+ * ".<type> " first and ", <iim_val>" last; in between, by opcode:
+ *   ldg/ldp/ldl/ldlw/ldlv -> dst,{g|p|l}[src+off]
+ *   prefetch              -> g[src+off]
+ *   stg/stp/stl/stlw      -> {g|p|l}[dst+off],src
+ *   sti                   -> dst+off,src
+ * Other opcodes print nothing in between.  Store offsets are assembled from
+ * the off_hi/off fields and sign-extended from 13 bits.  With PRINT_VERBOSE
+ * set, unexpected dummy/must_be_* field values are dumped for ldg/ldp and
+ * stg/stp/sti.
+ */
+static void print_instr_cat6(instr_t *instr)
+{
+       instr_cat6_t *cat6 = &instr->cat6;
+
+       printf(".%s ", type[cat6->type]);
+
+       switch (cat6->opc) {
+       case OPC_LDG:
+       case OPC_LDP:
+       case OPC_LDL:
+       case OPC_LDLW:
+       case OPC_LDLV:
+               /* load instructions: */
+               print_reg_dst((reg_t)(cat6->a.dst), type_size(cat6->type) == 32, false);
+               printf(",");
+               switch (cat6->opc) {
+               case OPC_LDG:
+                       printf("g");
+                       break;
+               case OPC_LDP:
+                       printf("p");
+                       break;
+               case OPC_LDL:
+               case OPC_LDLW:
+               case OPC_LDLV:
+                       printf("l");
+                       break;
+               }
+               printf("[");
+               print_reg_src((reg_t)(cat6->a.src), true,
+                               false, false, false, false, false, false);
+               if (cat6->a.off)
+                       printf("%+d", cat6->a.off);
+               printf("]");
+               break;
+       case OPC_PREFETCH:
+               /* similar to load instructions: */
+               printf("g[");
+               print_reg_src((reg_t)(cat6->a.src), true,
+                               false, false, false, false, false, false);
+               if (cat6->a.off)
+                       printf("%+d", cat6->a.off);
+               printf("]");
+               break;
+       case OPC_STG:
+       case OPC_STP:
+       case OPC_STL:
+       case OPC_STLW:
+               /* store instructions: */
+               switch (cat6->opc) {
+               case OPC_STG:
+                       printf("g");
+                       break;
+               case OPC_STP:
+                       printf("p");
+                       break;
+               case OPC_STL:
+               case OPC_STLW:
+                       printf("l");
+                       break;
+               }
+               printf("[");
+               print_reg_dst((reg_t)(cat6->b.dst), true, false);
+               if (cat6->b.off || cat6->b.off_hi)
+                       printf("%+d", u2i((cat6->b.off_hi << 8) | cat6->b.off, 13));
+               printf("]");
+               printf(",");
+               print_reg_src((reg_t)(cat6->b.src), type_size(cat6->type) == 32,
+                               false, false, false, false, false, false);
+
+               break;
+       case OPC_STI:
+               /* sti has same encoding as other store instructions, but
+                * slightly different syntax:
+                */
+               print_reg_dst((reg_t)(cat6->b.dst), false /* XXX is it always half? */, false);
+               if (cat6->b.off || cat6->b.off_hi)
+                       printf("%+d", u2i((cat6->b.off_hi << 8) | cat6->b.off, 13));
+               printf(",");
+               print_reg_src((reg_t)(cat6->b.src), type_size(cat6->type) == 32,
+                               false, false, false, false, false, false);
+               break;
+       }
+
+       printf(", %d", cat6->iim_val);
+
+       if (debug & PRINT_VERBOSE) {
+               switch (cat6->opc) {
+               case OPC_LDG:
+               case OPC_LDP:
+                       /* load instructions: */
+                       if (cat6->a.dummy1|cat6->a.dummy2|cat6->a.dummy3)
+                               printf("\t{6: %x,%x,%x}", cat6->a.dummy1, cat6->a.dummy2, cat6->a.dummy3);
+                       if ((cat6->a.must_be_one1 != 1) || (cat6->a.must_be_one2 != 1))
+                               printf("{?? %d,%d ??}", cat6->a.must_be_one1, cat6->a.must_be_one2);
+                       break;
+               case OPC_STG:
+               case OPC_STP:
+               case OPC_STI:
+                       /* store instructions: */
+                       if (cat6->b.dummy1|cat6->b.dummy2)
+                               printf("\t{6: %x,%x}", cat6->b.dummy1, cat6->b.dummy2);
+                       if ((cat6->b.must_be_one1 != 1) || (cat6->b.must_be_one2 != 1) ||
+                                       (cat6->b.must_be_zero1 != 0))
+                               printf("{?? %d,%d,%d ??}", cat6->b.must_be_one1, cat6->b.must_be_one2,
+                                               cat6->b.must_be_zero1);
+                       break;
+               }
+       }
+}
+
+/* size of largest OPC field of all the instruction categories: */
+#define NOPC_BITS 6
+
+/* Per-opcode lookup table, indexed by (category << NOPC_BITS) | opcode.
+ * Each populated entry records its category, opcode, mnemonic string and
+ * the per-category operand printer; slots with no OPC() line are
+ * zero-initialised (NULL name and print).  Category 1 has a single entry
+ * with an empty name because print_instr_cat1() prints the mnemonic itself.
+ */
+struct opc_info {
+       uint16_t cat;
+       uint16_t opc;
+       const char *name;
+       void (*print)(instr_t *instr);
+} opcs[1 << (3+NOPC_BITS)] = {
+#define OPC(cat, opc, name) [((cat) << NOPC_BITS) | (opc)] = { (cat), (opc), #name, print_instr_cat##cat }
+       /* category 0: */
+       OPC(0, OPC_NOP,          nop),
+       OPC(0, OPC_BR,           br),
+       OPC(0, OPC_JUMP,         jump),
+       OPC(0, OPC_CALL,         call),
+       OPC(0, OPC_RET,          ret),
+       OPC(0, OPC_KILL,         kill),
+       OPC(0, OPC_END,          end),
+       OPC(0, OPC_EMIT,         emit),
+       OPC(0, OPC_CUT,          cut),
+       OPC(0, OPC_CHMASK,       chmask),
+       OPC(0, OPC_CHSH,         chsh),
+       OPC(0, OPC_FLOW_REV,     flow_rev),
+
+       /* category 1: */
+       OPC(1, 0, ),
+
+       /* category 2: */
+       OPC(2, OPC_ADD_F,        add.f),
+       OPC(2, OPC_MIN_F,        min.f),
+       OPC(2, OPC_MAX_F,        max.f),
+       OPC(2, OPC_MUL_F,        mul.f),
+       OPC(2, OPC_SIGN_F,       sign.f),
+       OPC(2, OPC_CMPS_F,       cmps.f),
+       OPC(2, OPC_ABSNEG_F,     absneg.f),
+       OPC(2, OPC_CMPV_F,       cmpv.f),
+       OPC(2, OPC_FLOOR_F,      floor.f),
+       OPC(2, OPC_CEIL_F,       ceil.f),
+       OPC(2, OPC_RNDNE_F,      rndne.f),
+       OPC(2, OPC_RNDAZ_F,      rndaz.f),
+       OPC(2, OPC_TRUNC_F,      trunc.f),
+       OPC(2, OPC_ADD_U,        add.u),
+       OPC(2, OPC_ADD_S,        add.s),
+       OPC(2, OPC_SUB_U,        sub.u),
+       OPC(2, OPC_SUB_S,        sub.s),
+       OPC(2, OPC_CMPS_U,       cmps.u),
+       OPC(2, OPC_CMPS_S,       cmps.s),
+       OPC(2, OPC_MIN_U,        min.u),
+       OPC(2, OPC_MIN_S,        min.s),
+       OPC(2, OPC_MAX_U,        max.u),
+       OPC(2, OPC_MAX_S,        max.s),
+       OPC(2, OPC_ABSNEG_S,     absneg.s),
+       OPC(2, OPC_AND_B,        and.b),
+       OPC(2, OPC_OR_B,         or.b),
+       OPC(2, OPC_NOT_B,        not.b),
+       OPC(2, OPC_XOR_B,        xor.b),
+       OPC(2, OPC_CMPV_U,       cmpv.u),
+       OPC(2, OPC_CMPV_S,       cmpv.s),
+       OPC(2, OPC_MUL_U,        mul.u),
+       OPC(2, OPC_MUL_S,        mul.s),
+       OPC(2, OPC_MULL_U,       mull.u),
+       OPC(2, OPC_BFREV_B,      bfrev.b),
+       OPC(2, OPC_CLZ_S,        clz.s),
+       OPC(2, OPC_CLZ_B,        clz.b),
+       OPC(2, OPC_SHL_B,        shl.b),
+       OPC(2, OPC_SHR_B,        shr.b),
+       OPC(2, OPC_ASHR_B,       ashr.b),
+       OPC(2, OPC_BARY_F,       bary.f),
+       OPC(2, OPC_MGEN_B,       mgen.b),
+       OPC(2, OPC_GETBIT_B,     getbit.b),
+       OPC(2, OPC_SETRM,        setrm),
+       OPC(2, OPC_CBITS_B,      cbits.b),
+       OPC(2, OPC_SHB,          shb),
+       OPC(2, OPC_MSAD,         msad),
+
+       /* category 3: */
+       OPC(3, OPC_MAD_U16,      mad.u16),
+       OPC(3, OPC_MADSH_U16,    madsh.u16),
+       OPC(3, OPC_MAD_S16,      mad.s16),
+       OPC(3, OPC_MADSH_M16,    madsh.m16),
+       OPC(3, OPC_MAD_U24,      mad.u24),
+       OPC(3, OPC_MAD_S24,      mad.s24),
+       OPC(3, OPC_MAD_F16,      mad.f16),
+       OPC(3, OPC_MAD_F32,      mad.f32),
+       OPC(3, OPC_SEL_B16,      sel.b16),
+       OPC(3, OPC_SEL_B32,      sel.b32),
+       OPC(3, OPC_SEL_S16,      sel.s16),
+       OPC(3, OPC_SEL_S32,      sel.s32),
+       OPC(3, OPC_SEL_F16,      sel.f16),
+       OPC(3, OPC_SEL_F32,      sel.f32),
+       OPC(3, OPC_SAD_S16,      sad.s16),
+       OPC(3, OPC_SAD_S32,      sad.s32),
+
+       /* category 4: */
+       OPC(4, OPC_RCP,          rcp),
+       OPC(4, OPC_RSQ,          rsq),
+       OPC(4, OPC_LOG2,         log2),
+       OPC(4, OPC_EXP2,         exp2),
+       OPC(4, OPC_SIN,          sin),
+       OPC(4, OPC_COS,          cos),
+       OPC(4, OPC_SQRT,         sqrt),
+
+       /* category 5: */
+       OPC(5, OPC_ISAM,         isam),
+       OPC(5, OPC_ISAML,        isaml),
+       OPC(5, OPC_ISAMM,        isamm),
+       OPC(5, OPC_SAM,          sam),
+       OPC(5, OPC_SAMB,         samb),
+       OPC(5, OPC_SAML,         saml),
+       OPC(5, OPC_SAMGQ,        samgq),
+       OPC(5, OPC_GETLOD,       getlod),
+       OPC(5, OPC_CONV,         conv),
+       OPC(5, OPC_CONVM,        convm),
+       OPC(5, OPC_GETSIZE,      getsize),
+       OPC(5, OPC_GETBUF,       getbuf),
+       OPC(5, OPC_GETPOS,       getpos),
+       OPC(5, OPC_GETINFO,      getinfo),
+       OPC(5, OPC_DSX,          dsx),
+       OPC(5, OPC_DSY,          dsy),
+       OPC(5, OPC_GATHER4R,     gather4r),
+       OPC(5, OPC_GATHER4G,     gather4g),
+       OPC(5, OPC_GATHER4B,     gather4b),
+       OPC(5, OPC_GATHER4A,     gather4a),
+       OPC(5, OPC_SAMGP0,       samgp0),
+       OPC(5, OPC_SAMGP1,       samgp1),
+       OPC(5, OPC_SAMGP2,       samgp2),
+       OPC(5, OPC_SAMGP3,       samgp3),
+       OPC(5, OPC_DSXPP_1,      dsxpp.1),
+       OPC(5, OPC_DSYPP_1,      dsypp.1),
+       OPC(5, OPC_RGETPOS,      rgetpos),
+       OPC(5, OPC_RGETINFO,     rgetinfo),
+
+
+       /* category 6: */
+       OPC(6, OPC_LDG,          ldg),
+       OPC(6, OPC_LDL,          ldl),
+       OPC(6, OPC_LDP,          ldp),
+       OPC(6, OPC_STG,          stg),
+       OPC(6, OPC_STL,          stl),
+       OPC(6, OPC_STP,          stp),
+       OPC(6, OPC_STI,          sti),
+       OPC(6, OPC_G2L,          g2l),
+       OPC(6, OPC_L2G,          l2g),
+       OPC(6, OPC_PREFETCH,     prefetch),
+       OPC(6, OPC_LDLW,         ldlw),
+       OPC(6, OPC_STLW,         stlw),
+       OPC(6, OPC_RESFMT,       resfmt),
+       OPC(6, OPC_RESINFO,      resinfo),
+       OPC(6, OPC_ATOMIC_ADD_L,     atomic.add.l),
+       OPC(6, OPC_ATOMIC_SUB_L,     atomic.sub.l),
+       OPC(6, OPC_ATOMIC_XCHG_L,    atomic.xchg.l),
+       OPC(6, OPC_ATOMIC_INC_L,     atomic.inc.l),
+       OPC(6, OPC_ATOMIC_DEC_L,     atomic.dec.l),
+       OPC(6, OPC_ATOMIC_CMPXCHG_L, atomic.cmpxchg.l),
+       OPC(6, OPC_ATOMIC_MIN_L,     atomic.min.l),
+       OPC(6, OPC_ATOMIC_MAX_L,     atomic.max.l),
+       OPC(6, OPC_ATOMIC_AND_L,     atomic.and.l),
+       OPC(6, OPC_ATOMIC_OR_L,      atomic.or.l),
+       OPC(6, OPC_ATOMIC_XOR_L,     atomic.xor.l),
+       OPC(6, OPC_LDGB_TYPED_4D,    ldgb.typed.4d),
+       OPC(6, OPC_STGB_4D_4,    stgb.4d.4),
+       OPC(6, OPC_STIB,         stib),
+       OPC(6, OPC_LDC_4,        ldc.4),
+       OPC(6, OPC_LDLV,         ldlv),
+
+
+#undef OPC
+};
+
+/* Look up the opcs[] entry for a raw instruction.  The table index is the
+ * instruction's opc_cat shifted left by NOPC_BITS, OR'd with the opcode
+ * returned by instr_opc().  Note that 'instr' is expanded twice.
+ */
+#define GETINFO(instr) (&(opcs[((instr)->opc_cat << NOPC_BITS) | instr_opc(instr)]))
+
+// XXX hack.. probably should move this table somewhere common:
+#include "ir3.h"
+/* Return the mnemonic stored in opcs[] for an ir3 instruction, using the
+ * same (category << NOPC_BITS) | opc indexing as the disassembler.
+ * Instructions with category -1 are not looked up and get a fixed
+ * placeholder string instead.
+ */
+const char *ir3_instr_name(struct ir3_instruction *instr)
+{
+       return (instr->category == -1) ? "??meta??" :
+                       opcs[(instr->category << NOPC_BITS) | instr->opc].name;
+}
+
+/* Print one instruction, which occupies two dwords (dwords[0] and
+ * dwords[1]).  'level' selects the indent prefix from levels[] and 'n' is
+ * the instruction index shown at the start of the line.  As a side effect
+ * the 'repeat' variable declared outside this function is updated for
+ * every instruction printed.
+ */
+static void print_instr(uint32_t *dwords, int level, int n)
+{
+       instr_t *instr = (instr_t *)dwords;
+       uint32_t opc = instr_opc(instr);
+       const int cat = instr->opc_cat;
+       const char *name;
+
+       printf("%s%04d[%08xx_%08xx] ", levels[level], n, dwords[1], dwords[0]);
+
+#if 0
+       /* print unknown bits: */
+       if (debug & PRINT_RAW)
+               printf("[%08xx_%08xx] ", dwords[1] & 0x001ff800, dwords[0] & 0x00000000);
+
+       if (debug & PRINT_VERBOSE)
+               printf("%d,%02d ", instr->opc_cat, opc);
+#endif
+
+       /* NOTE: the order in which flags are printed is a bit ugly, but
+        * it is kept matching the llvm-a3xx disassembler so that output
+        * can be diff'd easily.
+        */
+
+       if (instr->sync)
+               printf("(sy)");
+       if ((cat <= 4) && instr->ss)
+               printf("(ss)");
+       if (instr->jmp_tgt)
+               printf("(jp)");
+
+       /* (rpt) is only decoded for categories 0-4; otherwise clear it: */
+       repeat = 0;
+       if ((cat <= 4) && instr->repeat) {
+               repeat = instr->repeat;
+               printf("(rpt%d)", instr->repeat);
+       }
+
+       /* (ul) is only decoded for categories 2-4: */
+       if ((2 <= cat) && (cat <= 4) && instr->ul)
+               printf("(ul)");
+
+       name = GETINFO(instr)->name;
+
+       if (!name) {
+               /* no table entry for this category/opcode pair: */
+               printf("unknown(%d,%d)", instr->opc_cat, opc);
+       } else {
+               printf("%s", name);
+               GETINFO(instr)->print(instr);
+       }
+
+       printf("\n");
+}
+
+/* Disassemble and print a buffer of instructions.
+ *
+ *   dwords     - instruction stream, two dwords per instruction
+ *   sizedwords - number of dwords in the stream; expected to be even
+ *   level      - indent level passed through to print_instr()
+ *   type       - shader type; not used by this function
+ *
+ * Always returns 0.
+ */
+int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, enum shader_t type)
+{
+       int i;
+
+       assert((sizedwords % 2) == 0);
+
+       /* Only walk complete dword pairs, so that an odd sizedwords cannot
+        * make print_instr() read past the end of the buffer when the
+        * assert above is compiled out:
+        */
+       for (i = 0; (sizedwords - i) >= 2; i += 2)
+               print_instr(&dwords[i], level, i/2);
+
+       return 0;
+}
diff --git a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
new file mode 100644 (file)
index 0000000..c67f103
--- /dev/null
@@ -0,0 +1,691 @@
+/*
+ * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef INSTR_A3XX_H_
+#define INSTR_A3XX_H_
+
+/* GCC-style attribute applied to every instruction struct/union below so
+ * that they are laid out without padding.
+ */
+#define PACKED __attribute__((__packed__))
+
+#include <stdint.h>
+#include <assert.h>
+
+/* Opcode numbers, grouped by instruction category.  The numbering restarts
+ * in every category (for example OPC_NOP, OPC_ADD_F, OPC_MAD_U16, OPC_RCP,
+ * OPC_ISAM and OPC_LDG are all 0), so an opc_t value is only meaningful
+ * together with the category it belongs to.
+ */
+typedef enum {
+       /* category 0: */
+       OPC_NOP = 0,
+       OPC_BR = 1,
+       OPC_JUMP = 2,
+       OPC_CALL = 3,
+       OPC_RET = 4,
+       OPC_KILL = 5,
+       OPC_END = 6,
+       OPC_EMIT = 7,
+       OPC_CUT = 8,
+       OPC_CHMASK = 9,
+       OPC_CHSH = 10,
+       OPC_FLOW_REV = 11,
+
+       /* category 1: */
+       /* no opc.. all category 1 are variants of mov */
+
+       /* category 2: */
+       OPC_ADD_F = 0,
+       OPC_MIN_F = 1,
+       OPC_MAX_F = 2,
+       OPC_MUL_F = 3,
+       OPC_SIGN_F = 4,
+       OPC_CMPS_F = 5,
+       OPC_ABSNEG_F = 6,
+       OPC_CMPV_F = 7,
+       /* 8 - invalid */
+       OPC_FLOOR_F = 9,
+       OPC_CEIL_F = 10,
+       OPC_RNDNE_F = 11,
+       OPC_RNDAZ_F = 12,
+       OPC_TRUNC_F = 13,
+       /* 14-15 - invalid */
+       OPC_ADD_U = 16,
+       OPC_ADD_S = 17,
+       OPC_SUB_U = 18,
+       OPC_SUB_S = 19,
+       OPC_CMPS_U = 20,
+       OPC_CMPS_S = 21,
+       OPC_MIN_U = 22,
+       OPC_MIN_S = 23,
+       OPC_MAX_U = 24,
+       OPC_MAX_S = 25,
+       OPC_ABSNEG_S = 26,
+       /* 27 - invalid */
+       OPC_AND_B = 28,
+       OPC_OR_B = 29,
+       OPC_NOT_B = 30,
+       OPC_XOR_B = 31,
+       /* 32 - invalid */
+       OPC_CMPV_U = 33,
+       OPC_CMPV_S = 34,
+       /* 35-47 - invalid */
+       OPC_MUL_U = 48,
+       OPC_MUL_S = 49,
+       OPC_MULL_U = 50,
+       OPC_BFREV_B = 51,
+       OPC_CLZ_S = 52,
+       OPC_CLZ_B = 53,
+       OPC_SHL_B = 54,
+       OPC_SHR_B = 55,
+       OPC_ASHR_B = 56,
+       OPC_BARY_F = 57,
+       OPC_MGEN_B = 58,
+       OPC_GETBIT_B = 59,
+       OPC_SETRM = 60,
+       OPC_CBITS_B = 61,
+       OPC_SHB = 62,
+       OPC_MSAD = 63,
+
+       /* category 3: */
+       OPC_MAD_U16 = 0,
+       OPC_MADSH_U16 = 1,
+       OPC_MAD_S16 = 2,
+       OPC_MADSH_M16 = 3,   /* should this be .s16? */
+       OPC_MAD_U24 = 4,
+       OPC_MAD_S24 = 5,
+       OPC_MAD_F16 = 6,
+       OPC_MAD_F32 = 7,
+       OPC_SEL_B16 = 8,
+       OPC_SEL_B32 = 9,
+       OPC_SEL_S16 = 10,
+       OPC_SEL_S32 = 11,
+       OPC_SEL_F16 = 12,
+       OPC_SEL_F32 = 13,
+       OPC_SAD_S16 = 14,
+       OPC_SAD_S32 = 15,
+
+       /* category 4: */
+       OPC_RCP = 0,
+       OPC_RSQ = 1,
+       OPC_LOG2 = 2,
+       OPC_EXP2 = 3,
+       OPC_SIN = 4,
+       OPC_COS = 5,
+       OPC_SQRT = 6,
+       // 7-63 - invalid
+
+       /* category 5: */
+       OPC_ISAM = 0,
+       OPC_ISAML = 1,
+       OPC_ISAMM = 2,
+       OPC_SAM = 3,
+       OPC_SAMB = 4,
+       OPC_SAML = 5,
+       OPC_SAMGQ = 6,
+       OPC_GETLOD = 7,
+       OPC_CONV = 8,
+       OPC_CONVM = 9,
+       OPC_GETSIZE = 10,
+       OPC_GETBUF = 11,
+       OPC_GETPOS = 12,
+       OPC_GETINFO = 13,
+       OPC_DSX = 14,
+       OPC_DSY = 15,
+       OPC_GATHER4R = 16,
+       OPC_GATHER4G = 17,
+       OPC_GATHER4B = 18,
+       OPC_GATHER4A = 19,
+       OPC_SAMGP0 = 20,
+       OPC_SAMGP1 = 21,
+       OPC_SAMGP2 = 22,
+       OPC_SAMGP3 = 23,
+       OPC_DSXPP_1 = 24,
+       OPC_DSYPP_1 = 25,
+       OPC_RGETPOS = 26,
+       OPC_RGETINFO = 27,
+
+       /* category 6: */
+       OPC_LDG = 0,        /* load-global */
+       OPC_LDL = 1,
+       OPC_LDP = 2,
+       OPC_STG = 3,        /* store-global */
+       OPC_STL = 4,
+       OPC_STP = 5,
+       OPC_STI = 6,
+       OPC_G2L = 7,
+       OPC_L2G = 8,
+       OPC_PREFETCH = 9,
+       OPC_LDLW = 10,
+       OPC_STLW = 11,
+       /* 12-13 - no enumerator defined here */
+       OPC_RESFMT = 14,
+       OPC_RESINFO = 15,
+       OPC_ATOMIC_ADD_L = 16,
+       OPC_ATOMIC_SUB_L = 17,
+       OPC_ATOMIC_XCHG_L = 18,
+       OPC_ATOMIC_INC_L = 19,
+       OPC_ATOMIC_DEC_L = 20,
+       OPC_ATOMIC_CMPXCHG_L = 21,
+       OPC_ATOMIC_MIN_L = 22,
+       OPC_ATOMIC_MAX_L = 23,
+       OPC_ATOMIC_AND_L = 24,
+       OPC_ATOMIC_OR_L = 25,
+       OPC_ATOMIC_XOR_L = 26,
+       OPC_LDGB_TYPED_4D = 27,
+       OPC_STGB_4D_4 = 28,
+       OPC_STIB = 29,
+       OPC_LDC_4 = 30,
+       OPC_LDLV = 31,
+
+       /* meta instructions (category -1): */
+       /* placeholder instr to mark inputs/outputs: */
+       OPC_META_INPUT = 0,
+       OPC_META_OUTPUT = 1,
+       /* The "fan-in" and "fan-out" instructions are used for keeping
+        * track of instructions that write to multiple dst registers
+        * (fan-out) like texture sample instructions, or read multiple
+        * consecutive scalar registers (fan-in) (bary.f, texture samp)
+        */
+       OPC_META_FO = 2,
+       OPC_META_FI = 3,
+       /* branches/flow control */
+       OPC_META_FLOW = 4,
+       OPC_META_PHI = 5,
+       /* relative addressing */
+       OPC_META_DEREF = 6,
+
+
+} opc_t;
+
+/* Data type codes.  The enumerator name encodes the kind (F = float,
+ * U = unsigned int, S = signed int) and the width in bits; see
+ * type_size(), type_float(), type_uint() and type_sint().
+ */
+typedef enum {
+       TYPE_F16 = 0,
+       TYPE_F32 = 1,
+       TYPE_U16 = 2,
+       TYPE_U32 = 3,
+       TYPE_S16 = 4,
+       TYPE_S32 = 5,
+       TYPE_U8  = 6,
+       TYPE_S8  = 7,  // XXX I assume?
+} type_t;
+
+/* Return the width in bits (8, 16 or 32) of a type_t value.  An
+ * unrecognized value trips an assert and yields 0.
+ */
+static inline uint32_t type_size(type_t type)
+{
+       uint32_t bits = 0;
+
+       switch (type) {
+       case TYPE_U8:
+       case TYPE_S8:
+               bits = 8;
+               break;
+       case TYPE_F16:
+       case TYPE_U16:
+       case TYPE_S16:
+               bits = 16;
+               break;
+       case TYPE_F32:
+       case TYPE_U32:
+       case TYPE_S32:
+               bits = 32;
+               break;
+       default:
+               assert(0); /* invalid type */
+               break;
+       }
+
+       return bits;
+}
+
+/* Return 1 if 'type' is one of the float types (F16/F32), else 0. */
+static inline int type_float(type_t type)
+{
+       switch (type) {
+       case TYPE_F16:
+       case TYPE_F32:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+/* Return 1 if 'type' is one of the unsigned types (U8/U16/U32), else 0. */
+static inline int type_uint(type_t type)
+{
+       switch (type) {
+       case TYPE_U8:
+       case TYPE_U16:
+       case TYPE_U32:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+/* Return 1 if 'type' is one of the signed types (S8/S16/S32), else 0. */
+static inline int type_sint(type_t type)
+{
+       switch (type) {
+       case TYPE_S8:
+       case TYPE_S16:
+       case TYPE_S32:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+/* Encoded register operand.  The same storage can be viewed as a
+ * gpr/const register (2-bit component plus 10-bit register number), as a
+ * signed 11-bit immediate, or through one of the dummyN members, which
+ * expose the value as an N-bit wide field.
+ */
+typedef union PACKED {
+       /* normal gpr or const src register: */
+       struct PACKED {
+               uint32_t comp  : 2;
+               uint32_t num   : 10;
+       };
+       /* for immediate val: */
+       int32_t  iim_val   : 11;
+       /* to make compiler happy: */
+       uint32_t dummy32;
+       uint32_t dummy10   : 10;
+       uint32_t dummy11   : 11;
+       uint32_t dummy12   : 12;
+       uint32_t dummy13   : 13;
+       uint32_t dummy8    : 8;
+} reg_t;
+
+/* special registers (values are compared against reg_t.num, see
+ * reg_special()):
+ */
+#define REG_A0 61       /* address register */
+#define REG_P0 62       /* predicate register */
+
+/* Return 1 if the register number is the address (a0) or predicate (p0)
+ * register, else 0.
+ */
+static inline int reg_special(reg_t reg)
+{
+       switch (reg.num) {
+       case REG_A0:
+       case REG_P0:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+/* Bit layout of a category 0 instruction: a 16-bit signed immediate in
+ * dword0, with flags, opcode and category in dword1.  The dummyN fields
+ * are placeholders for bits that are not named here.
+ */
+typedef struct PACKED {
+       /* dword0: */
+       int16_t  immed    : 16;
+       uint32_t dummy1   : 16;
+
+       /* dword1: */
+       uint32_t dummy2   : 8;
+       uint32_t repeat   : 3;
+       uint32_t dummy3   : 1;
+       uint32_t ss       : 1;
+       uint32_t dummy4   : 7;
+       uint32_t inv      : 1;
+       uint32_t comp     : 2;
+       uint32_t opc      : 4;
+       uint32_t jmp_tgt  : 1;
+       uint32_t sync     : 1;
+       uint32_t opc_cat  : 3;
+} instr_cat0_t;
+
+/* Bit layout of a category 1 instruction (there is no opc field; all
+ * category 1 instructions are variants of mov).  dword0 is a union with
+ * three alternative views of the source operand: a plain register, an
+ * address-relative offset, or a 32-bit integer/float immediate.
+ */
+typedef struct PACKED {
+       /* dword0: */
+       union PACKED {
+               /* for normal src register: */
+               struct PACKED {
+                       uint32_t src : 11;
+                       /* at least low bit of pad must be zero or it will
+                        * look like a address relative src
+                        */
+                       uint32_t pad : 21;
+               };
+               /* for address relative: */
+               struct PACKED {
+                       int32_t  off : 10;
+                       uint32_t src_rel_c : 1;
+                       uint32_t src_rel : 1;
+                       uint32_t unknown : 20;
+               };
+               /* for immediate: */
+               int32_t iim_val;
+               float   fim_val;
+       };
+
+       /* dword1: */
+       uint32_t dst        : 8;
+       uint32_t repeat     : 3;
+       uint32_t src_r      : 1;
+       uint32_t ss         : 1;
+       uint32_t ul         : 1;
+       uint32_t dst_type   : 3;
+       uint32_t dst_rel    : 1;
+       uint32_t src_type   : 3;
+       uint32_t src_c      : 1;
+       uint32_t src_im     : 1;
+       uint32_t even       : 1;
+       uint32_t pos_inf    : 1;
+       uint32_t must_be_0  : 2;
+       uint32_t jmp_tgt    : 1;
+       uint32_t sync       : 1;
+       uint32_t opc_cat    : 3;
+} instr_cat1_t;
+
+/* Bit layout of a category 2 instruction.  dword0 holds two 16-bit source
+ * operand slots (src1 then src2); each slot is a union offering a plain
+ * view, a relative-addressing view (rel1/rel2) and a const view (c1/c2)
+ * of the same bits.  dword1 holds the destination, flags, the 6-bit
+ * opcode and the category.
+ */
+typedef struct PACKED {
+       /* dword0: */
+       union PACKED {
+               struct PACKED {
+                       uint32_t src1         : 11;
+                       uint32_t must_be_zero1: 2;
+                       uint32_t src1_im      : 1;   /* immediate */
+                       uint32_t src1_neg     : 1;   /* negate */
+                       uint32_t src1_abs     : 1;   /* absolute value */
+               };
+               struct PACKED {
+                       uint32_t src1         : 10;
+                       uint32_t src1_c       : 1;   /* relative-const */
+                       uint32_t src1_rel     : 1;   /* relative address */
+                       uint32_t must_be_zero : 1;
+                       uint32_t dummy        : 3;
+               } rel1;
+               struct PACKED {
+                       uint32_t src1         : 12;
+                       uint32_t src1_c       : 1;   /* const */
+                       uint32_t dummy        : 3;
+               } c1;
+       };
+
+       union PACKED {
+               struct PACKED {
+                       uint32_t src2         : 11;
+                       uint32_t must_be_zero2: 2;
+                       uint32_t src2_im      : 1;   /* immediate */
+                       uint32_t src2_neg     : 1;   /* negate */
+                       uint32_t src2_abs     : 1;   /* absolute value */
+               };
+               struct PACKED {
+                       uint32_t src2         : 10;
+                       uint32_t src2_c       : 1;   /* relative-const */
+                       uint32_t src2_rel     : 1;   /* relative address */
+                       uint32_t must_be_zero : 1;
+                       uint32_t dummy        : 3;
+               } rel2;
+               struct PACKED {
+                       uint32_t src2         : 12;
+                       uint32_t src2_c       : 1;   /* const */
+                       uint32_t dummy        : 3;
+               } c2;
+       };
+
+       /* dword1: */
+       uint32_t dst      : 8;
+       uint32_t repeat   : 3;
+       uint32_t src1_r   : 1;
+       uint32_t ss       : 1;
+       uint32_t ul       : 1;   /* dunno */
+       uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
+       uint32_t ei       : 1;
+       uint32_t cond     : 3;
+       uint32_t src2_r   : 1;
+       uint32_t full     : 1;   /* not half */
+       uint32_t opc      : 6;
+       uint32_t jmp_tgt  : 1;
+       uint32_t sync     : 1;
+       uint32_t opc_cat  : 3;
+} instr_cat2_t;
+
+/* Bit layout of a category 3 (three source) instruction.  dword0 holds
+ * the src1 and src3 operand slots, each a union of plain, relative
+ * (rel1/rel2) and const (c1/c2) views; note that some src2 modifier bits
+ * (src2_c, src2_r, src2_neg) live in those slots too.  src2 itself is an
+ * 8-bit field in dword1, next to the destination, flags, 4-bit opcode and
+ * category.
+ */
+typedef struct PACKED {
+       /* dword0: */
+       union PACKED {
+               struct PACKED {
+                       uint32_t src1         : 11;
+                       uint32_t must_be_zero1: 2;
+                       uint32_t src2_c       : 1;
+                       uint32_t src1_neg     : 1;
+                       uint32_t src2_r       : 1;
+               };
+               struct PACKED {
+                       uint32_t src1         : 10;
+                       uint32_t src1_c       : 1;
+                       uint32_t src1_rel     : 1;
+                       uint32_t must_be_zero : 1;
+                       uint32_t dummy        : 3;
+               } rel1;
+               struct PACKED {
+                       uint32_t src1         : 12;
+                       uint32_t src1_c       : 1;
+                       uint32_t dummy        : 3;
+               } c1;
+       };
+
+       union PACKED {
+               struct PACKED {
+                       uint32_t src3         : 11;
+                       uint32_t must_be_zero2: 2;
+                       uint32_t src3_r       : 1;
+                       uint32_t src2_neg     : 1;
+                       uint32_t src3_neg     : 1;
+               };
+               struct PACKED {
+                       uint32_t src3         : 10;
+                       uint32_t src3_c       : 1;
+                       uint32_t src3_rel     : 1;
+                       uint32_t must_be_zero : 1;
+                       uint32_t dummy        : 3;
+               } rel2;
+               struct PACKED {
+                       uint32_t src3         : 12;
+                       uint32_t src3_c       : 1;
+                       uint32_t dummy        : 3;
+               } c2;
+       };
+
+       /* dword1: */
+       uint32_t dst      : 8;
+       uint32_t repeat   : 3;
+       uint32_t src1_r   : 1;
+       uint32_t ss       : 1;
+       uint32_t ul       : 1;
+       uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
+       uint32_t src2     : 8;
+       uint32_t opc      : 4;
+       uint32_t jmp_tgt  : 1;
+       uint32_t sync     : 1;
+       uint32_t opc_cat  : 3;
+} instr_cat3_t;
+
+/* Return false for the category 3 opcodes listed below (the 16-bit
+ * variants, plus sad.s32, which the original author marked as doubtful),
+ * and true for every other opcode.
+ */
+static inline bool instr_cat3_full(instr_cat3_t *cat3)
+{
+       const uint32_t opc = cat3->opc;
+       const bool not_full =
+               (opc == OPC_MAD_F16) ||
+               (opc == OPC_MAD_U16) ||
+               (opc == OPC_MAD_S16) ||
+               (opc == OPC_SEL_B16) ||
+               (opc == OPC_SEL_S16) ||
+               (opc == OPC_SEL_F16) ||
+               (opc == OPC_SAD_S16) ||
+               (opc == OPC_SAD_S32);  // really??
+
+       return !not_full;
+}
+
+/* Bit layout of a category 4 (single source) instruction.  The low 16
+ * bits of dword0 are the source operand slot, a union of plain, relative
+ * (rel) and const (c) views; dword1 holds the destination, flags, 6-bit
+ * opcode and category.
+ */
+typedef struct PACKED {
+       /* dword0: */
+       union PACKED {
+               struct PACKED {
+                       uint32_t src          : 11;
+                       uint32_t must_be_zero1: 2;
+                       uint32_t src_im       : 1;   /* immediate */
+                       uint32_t src_neg      : 1;   /* negate */
+                       uint32_t src_abs      : 1;   /* absolute value */
+               };
+               struct PACKED {
+                       uint32_t src          : 10;
+                       uint32_t src_c        : 1;   /* relative-const */
+                       uint32_t src_rel      : 1;   /* relative address */
+                       uint32_t must_be_zero : 1;
+                       uint32_t dummy        : 3;
+               } rel;
+               struct PACKED {
+                       uint32_t src          : 12;
+                       uint32_t src_c        : 1;   /* const */
+                       uint32_t dummy        : 3;
+               } c;
+       };
+       uint32_t dummy1   : 16;  /* seem to be ignored */
+
+       /* dword1: */
+       uint32_t dst      : 8;
+       uint32_t repeat   : 3;
+       uint32_t src_r    : 1;
+       uint32_t ss       : 1;
+       uint32_t ul       : 1;
+       uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
+       uint32_t dummy2   : 5;   /* seem to be ignored */
+       uint32_t full     : 1;   /* not half */
+       uint32_t opc      : 6;
+       uint32_t jmp_tgt  : 1;
+       uint32_t sync     : 1;
+       uint32_t opc_cat  : 3;
+} instr_cat5_t_placeholder_do_not_use;
+
+/* Bit layout of a category 5 instruction.  dword0 has two alternative
+ * layouts, 'norm' and 's2en' (presumably selected by the is_s2en bit in
+ * dword1 -- TODO confirm), plus an anonymous view of the fields they
+ * share.  dword1 holds the destination, write-mask, type, modifier bits,
+ * 5-bit opcode and category.
+ */
+typedef struct PACKED {
+       /* dword0: */
+       union PACKED {
+               /* normal case: */
+               struct PACKED {
+                       uint32_t full     : 1;   /* not half */
+                       uint32_t src1     : 8;
+                       uint32_t src2     : 8;
+                       uint32_t dummy1   : 4;   /* seem to be ignored */
+                       uint32_t samp     : 4;
+                       uint32_t tex      : 7;
+               } norm;
+               /* s2en case: */
+               struct PACKED {
+                       uint32_t full     : 1;   /* not half */
+                       uint32_t src1     : 8;
+                       uint32_t src2     : 11;
+                       uint32_t dummy1   : 1;
+                       uint32_t src3     : 8;
+                       uint32_t dummy2   : 3;
+               } s2en;
+               /* same in either case: */
+               // XXX I think, confirm this
+               struct PACKED {
+                       uint32_t full     : 1;   /* not half */
+                       uint32_t src1     : 8;
+                       uint32_t pad      : 23;
+               };
+       };
+
+       /* dword1: */
+       uint32_t dst      : 8;
+       uint32_t wrmask   : 4;   /* write-mask */
+       uint32_t type     : 3;
+       uint32_t dummy2   : 1;   /* seems to be ignored */
+       uint32_t is_3d    : 1;
+
+       uint32_t is_a     : 1;
+       uint32_t is_s     : 1;
+       uint32_t is_s2en  : 1;
+       uint32_t is_o     : 1;
+       uint32_t is_p     : 1;
+
+       uint32_t opc      : 5;
+       uint32_t jmp_tgt  : 1;
+       uint32_t sync     : 1;
+       uint32_t opc_cat  : 3;
+} instr_cat5_t;
+
+/* used for load instructions: a signed 13-bit offset, 8-bit src and
+ * signed 8-bit immediate in dword0; the destination, type, 5-bit opcode
+ * and category in dword1.
+ */
+typedef struct PACKED {
+       /* dword0: */
+       uint32_t must_be_one1 : 1;
+       int16_t  off      : 13;
+       uint32_t src      : 8;
+       uint32_t dummy1   : 1;
+       uint32_t must_be_one2 : 1;
+       int32_t  iim_val  : 8;
+
+       /* dword1: */
+       uint32_t dst      : 8;
+       uint32_t dummy2   : 9;
+       uint32_t type     : 3;
+       uint32_t dummy3   : 2;
+       uint32_t opc      : 5;
+       uint32_t jmp_tgt  : 1;
+       uint32_t sync     : 1;
+       uint32_t opc_cat  : 3;
+} instr_cat6a_t;
+
+/* used for store instructions: the offset is split across the two dwords
+ * ('off' in dword1 plus 'off_hi' in dword0), and the dst field sits in
+ * dword1 after it.
+ */
+typedef struct PACKED {
+       /* dword0: */
+       uint32_t must_be_zero1 : 1;
+       uint32_t src      : 8;
+       uint32_t off_hi   : 5;   /* high bits of 'off'... ugly! */
+       uint32_t dummy1   : 9;
+       uint32_t must_be_one1 : 1;
+       int32_t  iim_val  : 8;
+
+       /* dword1: */
+       uint16_t off      : 8;
+       uint32_t must_be_one2 : 1;
+       uint32_t dst      : 8;
+       uint32_t type     : 3;
+       uint32_t dummy2   : 2;
+       uint32_t opc      : 5;
+       uint32_t jmp_tgt  : 1;
+       uint32_t sync     : 1;
+       uint32_t opc_cat  : 3;
+} instr_cat6b_t;
+
+/* Category 6 instruction: a union of the load layout (a), the store
+ * layout (b), and an anonymous view exposing the fields that sit at the
+ * same bit positions in both (iim_val, type, opc, jmp_tgt, sync,
+ * opc_cat).
+ */
+typedef union PACKED {
+       instr_cat6a_t a;
+       instr_cat6b_t b;
+       struct PACKED {
+               /* dword0: */
+               uint32_t pad1     : 24;
+               int32_t  iim_val  : 8;
+
+               /* dword1: */
+               uint32_t pad2     : 17;
+               uint32_t type     : 3;
+               uint32_t pad3     : 2;
+               uint32_t opc      : 5;
+               uint32_t jmp_tgt  : 1;
+               uint32_t sync     : 1;
+               uint32_t opc_cat  : 3;
+       };
+} instr_cat6_t;
+
+/* A complete 64-bit instruction: a union of the per-category layouts plus
+ * an anonymous view of the fields that can be read before the category
+ * is known (opc_cat itself, sync, jmp_tgt, and the repeat/ss/ul flags,
+ * which are only meaningful for the categories noted on each field).
+ */
+typedef union PACKED {
+       instr_cat0_t cat0;
+       instr_cat1_t cat1;
+       instr_cat2_t cat2;
+       instr_cat3_t cat3;
+       instr_cat4_t cat4;
+       instr_cat5_t cat5;
+       instr_cat6_t cat6;
+       struct PACKED {
+               /* dword0: */
+               uint64_t pad1     : 40;
+               uint32_t repeat   : 3;  /* cat0-cat4 */
+               uint32_t pad2     : 1;
+               uint32_t ss       : 1;  /* cat1-cat4 (cat0??) */
+               uint32_t ul       : 1;  /* cat2-cat4 (and cat1 in blob.. which may be bug??) */
+               uint32_t pad3     : 13;
+               uint32_t jmp_tgt  : 1;
+               uint32_t sync     : 1;
+               uint32_t opc_cat  : 3;
+
+       };
+} instr_t;
+
+/* Extract the opcode field from an encoded instruction, using the layout
+ * that matches its category.  Category 1 has no opc field, so it (like
+ * any category without a case below) yields 0.
+ */
+static inline uint32_t instr_opc(instr_t *instr)
+{
+       uint32_t opc = 0;
+
+       switch (instr->opc_cat) {
+       case 0:
+               opc = instr->cat0.opc;
+               break;
+       case 2:
+               opc = instr->cat2.opc;
+               break;
+       case 3:
+               opc = instr->cat3.opc;
+               break;
+       case 4:
+               opc = instr->cat4.opc;
+               break;
+       case 5:
+               opc = instr->cat5.opc;
+               break;
+       case 6:
+               opc = instr->cat6.opc;
+               break;
+       default:
+               break;
+       }
+
+       return opc;
+}
+
+/* Return true if 'opc' equals one of the category 3 mad/madsh opcodes.
+ * Since opcode numbers are reused between categories, the caller is
+ * expected to know that 'opc' is a category 3 opcode.
+ */
+static inline bool is_mad(opc_t opc)
+{
+       return (opc == OPC_MAD_U16) ||
+               (opc == OPC_MADSH_U16) ||
+               (opc == OPC_MAD_S16) ||
+               (opc == OPC_MADSH_M16) ||
+               (opc == OPC_MAD_U24) ||
+               (opc == OPC_MAD_S24) ||
+               (opc == OPC_MAD_F16) ||
+               (opc == OPC_MAD_F32);
+}
+
+#endif /* INSTR_A3XX_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
new file mode 100644 (file)
index 0000000..ea2a925
--- /dev/null
@@ -0,0 +1,675 @@
+/*
+ * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ir3.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <stdbool.h>
+#include <errno.h>
+
+#include "freedreno_util.h"
+#include "instr-a3xx.h"
+
+/* Number of 32-bit words of storage in each heap chunk: */
+#define CHUNK_SZ 1020
+
+/* One block of the shader's allocation heap.  Chunks form a singly linked
+ * list through 'next'; grow_heap() pushes new chunks on the front and
+ * ir3_destroy() walks the list to free them.
+ */
+struct ir3_heap_chunk {
+       struct ir3_heap_chunk *next;
+       uint32_t heap[CHUNK_SZ];
+};
+
+/* Allocate a fresh zeroed chunk, push it on the front of the shader's
+ * chunk list and reset the allocation cursor to the start of it.
+ *
+ * NOTE(review): the calloc() result is used without a NULL check.
+ */
+static void grow_heap(struct ir3 *shader)
+{
+       struct ir3_heap_chunk *head = calloc(1, sizeof(struct ir3_heap_chunk));
+
+       head->next = shader->chunk;
+       shader->heap_idx = 0;
+       shader->chunk = head;
+}
+
+/* simple allocator to carve allocations out of an up-front allocated heap,
+ * so that we can free everything easily in one shot.
+ *
+ *   shader - the ir3 whose heap the memory comes from
+ *   sz     - size of the request in bytes; rounded up to whole 32-bit words
+ *
+ * Returns a pointer into the current heap chunk.  The memory is released
+ * only by ir3_destroy().  A request that is negative or larger than one
+ * chunk (CHUNK_SZ words) can never be satisfied; it trips an assert and,
+ * when asserts are compiled out, returns NULL rather than handing out a
+ * pointer that would overrun the chunk.
+ */
+void * ir3_alloc(struct ir3 *shader, int sz)
+{
+       void *ptr;
+
+       if (sz < 0) {
+               assert(0);
+               return NULL;
+       }
+
+       /* convert bytes to a count of 32-bit words: */
+       sz = align(sz, 4) / 4;
+
+       if (sz > CHUNK_SZ) {
+               assert(0);
+               return NULL;
+       }
+
+       /* start a new chunk if the current one can't hold the request: */
+       if ((shader->heap_idx + sz) > CHUNK_SZ)
+               grow_heap(shader);
+
+       ptr = &shader->chunk->heap[shader->heap_idx];
+       shader->heap_idx += sz;
+
+       return ptr;
+}
+
+/* Create an empty, zero-initialized ir3 with its first heap chunk already
+ * allocated.  Returns NULL if the ir3 object itself cannot be allocated.
+ * The result is released with ir3_destroy().
+ */
+struct ir3 * ir3_create(void)
+{
+       struct ir3 *shader = calloc(1, sizeof(*shader));
+
+       /* don't hand a NULL shader to grow_heap(), which dereferences it: */
+       if (!shader)
+               return NULL;
+
+       grow_heap(shader);
+       return shader;
+}
+
+/* Free an ir3 and every heap chunk it owns, which releases everything
+ * handed out by ir3_alloc() in one shot.  Passing NULL is a no-op, like
+ * free(NULL).
+ */
+void ir3_destroy(struct ir3 *shader)
+{
+       if (!shader)
+               return;
+
+       /* pop and free chunks until the list is empty: */
+       while (shader->chunk) {
+               struct ir3_heap_chunk *chunk = shader->chunk;
+               shader->chunk = chunk->next;
+               free(chunk);
+       }
+       free(shader);
+}
+
+/* Assert that 'cond' holds; if it does not and asserts are compiled out
+ * (so assert() does not abort), make the enclosing function return -1.
+ * Only usable inside functions that return int.  'cond' is evaluated
+ * more than once when it fails.
+ */
+#define iassert(cond) do { \
+       if (!(cond)) { \
+               assert(cond); \
+               return -1; \
+       } } while (0)
+
+/* Encode one ir3 register operand into its reg_t bit pattern and, for
+ * non-immediate registers, record the highest register touched in 'info'.
+ *
+ *   reg         - the operand to encode
+ *   info        - max_const / max_half_reg / max_reg are raised as needed
+ *   repeat      - the instruction's repeat count; ignored unless the
+ *                 operand has IR3_REG_R set
+ *   valid_flags - flags the caller allows on this operand (asserted)
+ *
+ * Returns the encoded value (reg_t.dummy32).
+ *
+ * NOTE(review): 'max' is an int8_t, so a value of
+ * (num + repeat + components - 1) >> 2 above 127 would not fit; confirm
+ * that the register number range makes that unreachable.
+ */
+static uint32_t reg(struct ir3_register *reg, struct ir3_info *info,
+               uint32_t repeat, uint32_t valid_flags)
+{
+       reg_t val = { .dummy32 = 0 };
+       int8_t components, max;
+
+       assert(!(reg->flags & ~valid_flags));
+
+       /* immediates carry a value only, there is no register to track: */
+       if (reg->flags & IR3_REG_IMMED) {
+               val.iim_val = reg->iim_val;
+               return val.dummy32;
+       }
+
+       if (!(reg->flags & IR3_REG_R))
+               repeat = 0;
+
+       components = util_last_bit(reg->wrmask);
+       max = (reg->num + repeat + components - 1) >> 2;
+
+       /* reg->num is a scalar index: low two bits pick the component */
+       val.comp = reg->num & 0x3;
+       val.num  = reg->num >> 2;
+
+       if (reg->flags & IR3_REG_CONST) {
+               info->max_const = MAX2(info->max_const, max);
+       } else if ((max == REG_A0) || (max == REG_P0)) {
+               /* address/predicate registers are not counted */
+       } else if (reg->flags & IR3_REG_HALF) {
+               info->max_half_reg = MAX2(info->max_half_reg, max);
+       } else {
+               info->max_reg = MAX2(info->max_reg, max);
+       }
+
+       return val.dummy32;
+}
+
+/* Encode a category 0 instruction into the instr_cat0_t at 'ptr'.
+ * 'info' is not used here.  Always returns 0.
+ */
+static int emit_cat0(struct ir3_instruction *instr, void *ptr,
+               struct ir3_info *info)
+{
+       instr_cat0_t *enc = ptr;
+
+       /* category and opcode: */
+       enc->opc_cat  = 0;
+       enc->opc      = instr->opc;
+
+       /* instruction flags: */
+       enc->sync     = !!(instr->flags & IR3_INSTR_SY);
+       enc->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
+       enc->ss       = !!(instr->flags & IR3_INSTR_SS);
+
+       /* operands: */
+       enc->repeat   = instr->repeat;
+       enc->immed    = instr->cat0.immed;
+       enc->inv      = instr->cat0.inv;
+       enc->comp     = instr->cat0.comp;
+
+       return 0;
+}
+
+/* Return the register flag implied by a type: IR3_REG_HALF for any type
+ * that is not 32 bits wide, otherwise no flags.
+ */
+static uint32_t type_flags(type_t type)
+{
+       if (type_size(type) == 32)
+               return 0;
+
+       return IR3_REG_HALF;
+}
+
+/* Encode a category 1 instruction into the instr_cat1_t at 'ptr'.
+ * regs[0] is the destination and regs[1] the single source.  Returns 0
+ * on success, or -1 (via iassert) if the operand count or the half/full
+ * flags are inconsistent.
+ */
+static int emit_cat1(struct ir3_instruction *instr, void *ptr,
+               struct ir3_info *info)
+{
+       struct ir3_register *dst = instr->regs[0];
+       struct ir3_register *src = instr->regs[1];
+       instr_cat1_t *cat1 = ptr;
+
+       iassert(instr->regs_count == 2);
+       /* the HALF flag on dst must agree with the size of dst_type: */
+       iassert(!((dst->flags ^ type_flags(instr->cat1.dst_type)) & IR3_REG_HALF));
+       /* same for src and src_type, except that immediates are exempt: */
+       iassert((src->flags & IR3_REG_IMMED) ||
+                       !((src->flags ^ type_flags(instr->cat1.src_type)) & IR3_REG_HALF));
+
+       /* dword0 is a union; fill in the one view that matches the kind of
+        * source operand:
+        */
+       if (src->flags & IR3_REG_IMMED) {
+               cat1->iim_val = src->iim_val;
+               cat1->src_im  = 1;
+       } else if (src->flags & IR3_REG_RELATIV) {
+               cat1->off       = src->offset;
+               cat1->src_rel   = 1;
+               cat1->src_rel_c = !!(src->flags & IR3_REG_CONST);
+       } else {
+               cat1->src  = reg(src, info, instr->repeat,
+                               IR3_REG_IMMED | IR3_REG_R |
+                               IR3_REG_CONST | IR3_REG_HALF);
+               cat1->src_c     = !!(src->flags & IR3_REG_CONST);
+       }
+
+       /* dword1: destination, flags and types */
+       cat1->dst      = reg(dst, info, instr->repeat,
+                       IR3_REG_RELATIV | IR3_REG_EVEN |
+                       IR3_REG_R | IR3_REG_POS_INF | IR3_REG_HALF);
+       cat1->repeat   = instr->repeat;
+       cat1->src_r    = !!(src->flags & IR3_REG_R);
+       cat1->ss       = !!(instr->flags & IR3_INSTR_SS);
+       cat1->ul       = !!(instr->flags & IR3_INSTR_UL);
+       cat1->dst_type = instr->cat1.dst_type;
+       cat1->dst_rel  = !!(dst->flags & IR3_REG_RELATIV);
+       cat1->src_type = instr->cat1.src_type;
+       cat1->even     = !!(dst->flags & IR3_REG_EVEN);
+       cat1->pos_inf  = !!(dst->flags & IR3_REG_POS_INF);
+       cat1->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
+       cat1->sync     = !!(instr->flags & IR3_INSTR_SY);
+       cat1->opc_cat  = 1;
+
+       return 0;
+}
+
+/* Encode a category 2 instruction (one destination, one or two sources)
+ * into the instruction slot at 'ptr'.  Returns 0 once every field has been
+ * written.
+ *
+ * Each source is stored through one of three overlapping layouts of the
+ * same instruction word, picked from its flags: relative (rel1/rel2),
+ * const (c1/c2) or the plain form.
+ *
+ * NOTE(review): src2 is read from regs[2] even when regs_count == 2, so
+ * the 'if (src2)' test relies on unused regs[] slots being NULL --
+ * confirm that ir3_alloc() hands out zeroed memory.
+ */
+static int emit_cat2(struct ir3_instruction *instr, void *ptr,
+               struct ir3_info *info)
+{
+       struct ir3_register *dst = instr->regs[0];
+       struct ir3_register *src1 = instr->regs[1];
+       struct ir3_register *src2 = instr->regs[2];
+       instr_cat2_t *cat2 = ptr;
+
+       iassert((instr->regs_count == 2) || (instr->regs_count == 3));
+
+       /* the range checked on src1->num differs per layout (10, 12 or 11
+        * bits):
+        */
+       if (src1->flags & IR3_REG_RELATIV) {
+               iassert(src1->num < (1 << 10));
+               cat2->rel1.src1      = reg(src1, info, instr->repeat,
+                               IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE |
+                               IR3_REG_ABS | IR3_REG_R | IR3_REG_HALF);
+               cat2->rel1.src1_c    = !!(src1->flags & IR3_REG_CONST);
+               cat2->rel1.src1_rel  = 1;
+       } else if (src1->flags & IR3_REG_CONST) {
+               iassert(src1->num < (1 << 12));
+               cat2->c1.src1   = reg(src1, info, instr->repeat,
+                               IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_ABS |
+                               IR3_REG_R | IR3_REG_HALF);
+               cat2->c1.src1_c = 1;
+       } else {
+               iassert(src1->num < (1 << 11));
+               cat2->src1 = reg(src1, info, instr->repeat,
+                               IR3_REG_IMMED | IR3_REG_NEGATE | IR3_REG_ABS |
+                               IR3_REG_R | IR3_REG_HALF);
+       }
+       cat2->src1_im  = !!(src1->flags & IR3_REG_IMMED);
+       cat2->src1_neg = !!(src1->flags & IR3_REG_NEGATE);
+       cat2->src1_abs = !!(src1->flags & IR3_REG_ABS);
+       cat2->src1_r   = !!(src1->flags & IR3_REG_R);
+
+       if (src2) {
+               /* both sources must have the same width, unless src2 is an
+                * immediate:
+                */
+               iassert((src2->flags & IR3_REG_IMMED) ||
+                               !((src1->flags ^ src2->flags) & IR3_REG_HALF));
+
+               if (src2->flags & IR3_REG_RELATIV) {
+                       iassert(src2->num < (1 << 10));
+                       cat2->rel2.src2      = reg(src2, info, instr->repeat,
+                                       IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE |
+                                       IR3_REG_ABS | IR3_REG_R | IR3_REG_HALF);
+                       cat2->rel2.src2_c    = !!(src2->flags & IR3_REG_CONST);
+                       cat2->rel2.src2_rel  = 1;
+               } else if (src2->flags & IR3_REG_CONST) {
+                       iassert(src2->num < (1 << 12));
+                       cat2->c2.src2   = reg(src2, info, instr->repeat,
+                                       IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_ABS |
+                                       IR3_REG_R | IR3_REG_HALF);
+                       cat2->c2.src2_c = 1;
+               } else {
+                       iassert(src2->num < (1 << 11));
+                       cat2->src2 = reg(src2, info, instr->repeat,
+                                       IR3_REG_IMMED | IR3_REG_NEGATE | IR3_REG_ABS |
+                                       IR3_REG_R | IR3_REG_HALF);
+               }
+
+               cat2->src2_im  = !!(src2->flags & IR3_REG_IMMED);
+               cat2->src2_neg = !!(src2->flags & IR3_REG_NEGATE);
+               cat2->src2_abs = !!(src2->flags & IR3_REG_ABS);
+               cat2->src2_r   = !!(src2->flags & IR3_REG_R);
+       }
+
+       cat2->dst      = reg(dst, info, instr->repeat,
+                       IR3_REG_R | IR3_REG_EI | IR3_REG_HALF);
+       cat2->repeat   = instr->repeat;
+       cat2->ss       = !!(instr->flags & IR3_INSTR_SS);
+       cat2->ul       = !!(instr->flags & IR3_INSTR_UL);
+       /* dst_half is set when dst and src1 differ in width; 'full' tracks
+        * the width of src1:
+        */
+       cat2->dst_half = !!((src1->flags ^ dst->flags) & IR3_REG_HALF);
+       cat2->ei       = !!(dst->flags & IR3_REG_EI);
+       cat2->cond     = instr->cat2.condition;
+       cat2->full     = ! (src1->flags & IR3_REG_HALF);
+       cat2->opc      = instr->opc;
+       cat2->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
+       cat2->sync     = !!(instr->flags & IR3_INSTR_SY);
+       cat2->opc_cat  = 2;
+
+       return 0;
+}
+
+/* Encode a category 3 instruction (one destination, exactly three
+ * sources) into the instruction slot at 'ptr'.  Returns 0 once every field
+ * has been written.
+ *
+ * The expected source width is derived from the opcode: the opcodes listed
+ * in the switch take half-precision sources, all others full ones, and all
+ * three sources are checked against that width.  src1 and src3 are stored
+ * through relative/const/plain layouts; src2 only has the single form.
+ */
+static int emit_cat3(struct ir3_instruction *instr, void *ptr,
+               struct ir3_info *info)
+{
+       struct ir3_register *dst = instr->regs[0];
+       struct ir3_register *src1 = instr->regs[1];
+       struct ir3_register *src2 = instr->regs[2];
+       struct ir3_register *src3 = instr->regs[3];
+       instr_cat3_t *cat3 = ptr;
+       uint32_t src_flags = 0;
+
+       switch (instr->opc) {
+       case OPC_MAD_F16:
+       case OPC_MAD_U16:
+       case OPC_MAD_S16:
+       case OPC_SEL_B16:
+       case OPC_SEL_S16:
+       case OPC_SEL_F16:
+       case OPC_SAD_S16:
+       case OPC_SAD_S32:  // really??
+               src_flags |= IR3_REG_HALF;
+               break;
+       default:
+               break;
+       }
+
+       iassert(instr->regs_count == 4);
+       iassert(!((src1->flags ^ src_flags) & IR3_REG_HALF));
+       iassert(!((src2->flags ^ src_flags) & IR3_REG_HALF));
+       iassert(!((src3->flags ^ src_flags) & IR3_REG_HALF));
+
+       if (src1->flags & IR3_REG_RELATIV) {
+               iassert(src1->num < (1 << 10));
+               cat3->rel1.src1      = reg(src1, info, instr->repeat,
+                               IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE |
+                               IR3_REG_R | IR3_REG_HALF);
+               cat3->rel1.src1_c    = !!(src1->flags & IR3_REG_CONST);
+               cat3->rel1.src1_rel  = 1;
+       } else if (src1->flags & IR3_REG_CONST) {
+               iassert(src1->num < (1 << 12));
+               cat3->c1.src1   = reg(src1, info, instr->repeat,
+                               IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_R |
+                               IR3_REG_HALF);
+               cat3->c1.src1_c = 1;
+       } else {
+               iassert(src1->num < (1 << 11));
+               cat3->src1 = reg(src1, info, instr->repeat,
+                               IR3_REG_NEGATE | IR3_REG_R | IR3_REG_HALF);
+       }
+
+       cat3->src1_neg = !!(src1->flags & IR3_REG_NEGATE);
+       cat3->src1_r   = !!(src1->flags & IR3_REG_R);
+
+       /* src2 has no relative form here, only register or const: */
+       cat3->src2     = reg(src2, info, instr->repeat,
+                       IR3_REG_CONST | IR3_REG_NEGATE |
+                       IR3_REG_R | IR3_REG_HALF);
+       cat3->src2_c   = !!(src2->flags & IR3_REG_CONST);
+       cat3->src2_neg = !!(src2->flags & IR3_REG_NEGATE);
+       cat3->src2_r   = !!(src2->flags & IR3_REG_R);
+
+
+       if (src3->flags & IR3_REG_RELATIV) {
+               iassert(src3->num < (1 << 10));
+               cat3->rel2.src3      = reg(src3, info, instr->repeat,
+                               IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE |
+                               IR3_REG_R | IR3_REG_HALF);
+               cat3->rel2.src3_c    = !!(src3->flags & IR3_REG_CONST);
+               cat3->rel2.src3_rel  = 1;
+       } else if (src3->flags & IR3_REG_CONST) {
+               iassert(src3->num < (1 << 12));
+               cat3->c2.src3   = reg(src3, info, instr->repeat,
+                               IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_R |
+                               IR3_REG_HALF);
+               cat3->c2.src3_c = 1;
+       } else {
+               iassert(src3->num < (1 << 11));
+               cat3->src3 = reg(src3, info, instr->repeat,
+                               IR3_REG_NEGATE | IR3_REG_R | IR3_REG_HALF);
+       }
+
+       cat3->src3_neg = !!(src3->flags & IR3_REG_NEGATE);
+       cat3->src3_r   = !!(src3->flags & IR3_REG_R);
+
+       cat3->dst      = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+       cat3->repeat   = instr->repeat;
+       cat3->ss       = !!(instr->flags & IR3_INSTR_SS);
+       cat3->ul       = !!(instr->flags & IR3_INSTR_UL);
+       /* set when the destination width differs from the source width
+        * implied by the opcode:
+        */
+       cat3->dst_half = !!((src_flags ^ dst->flags) & IR3_REG_HALF);
+       cat3->opc      = instr->opc;
+       cat3->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
+       cat3->sync     = !!(instr->flags & IR3_INSTR_SY);
+       cat3->opc_cat  = 3;
+
+       return 0;
+}
+
+/* Encode a category 4 instruction (one destination, one source) into the
+ * instruction slot at 'ptr'.  Returns 0 once every field has been written.
+ *
+ * The source is stored through the relative, const or plain layout of the
+ * instruction word, picked from its flags.
+ */
+static int emit_cat4(struct ir3_instruction *instr, void *ptr,
+               struct ir3_info *info)
+{
+       struct ir3_register *dst = instr->regs[0];
+       struct ir3_register *src = instr->regs[1];
+       instr_cat4_t *cat4 = ptr;
+
+       iassert(instr->regs_count == 2);
+
+       if (src->flags & IR3_REG_RELATIV) {
+               iassert(src->num < (1 << 10));
+               cat4->rel.src      = reg(src, info, instr->repeat,
+                               IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE |
+                               IR3_REG_ABS | IR3_REG_R | IR3_REG_HALF);
+               cat4->rel.src_c    = !!(src->flags & IR3_REG_CONST);
+               cat4->rel.src_rel  = 1;
+       } else if (src->flags & IR3_REG_CONST) {
+               iassert(src->num < (1 << 12));
+               cat4->c.src   = reg(src, info, instr->repeat,
+                               IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_ABS |
+                               IR3_REG_R | IR3_REG_HALF);
+               cat4->c.src_c = 1;
+       } else {
+               iassert(src->num < (1 << 11));
+               cat4->src = reg(src, info, instr->repeat,
+                               IR3_REG_IMMED | IR3_REG_NEGATE | IR3_REG_ABS |
+                               IR3_REG_R | IR3_REG_HALF);
+       }
+
+       cat4->src_im   = !!(src->flags & IR3_REG_IMMED);
+       cat4->src_neg  = !!(src->flags & IR3_REG_NEGATE);
+       cat4->src_abs  = !!(src->flags & IR3_REG_ABS);
+       cat4->src_r    = !!(src->flags & IR3_REG_R);
+
+       cat4->dst      = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+       cat4->repeat   = instr->repeat;
+       cat4->ss       = !!(instr->flags & IR3_INSTR_SS);
+       cat4->ul       = !!(instr->flags & IR3_INSTR_UL);
+       /* dst_half is set when dst and src differ in width; 'full' tracks
+        * the width of src:
+        */
+       cat4->dst_half = !!((src->flags ^ dst->flags) & IR3_REG_HALF);
+       cat4->full     = ! (src->flags & IR3_REG_HALF);
+       cat4->opc      = instr->opc;
+       cat4->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
+       cat4->sync     = !!(instr->flags & IR3_INSTR_SY);
+       cat4->opc_cat  = 4;
+
+       return 0;
+}
+
+/* Encode a category 5 instruction into the instruction slot at 'ptr'.
+ * Returns 0 once every field has been written.
+ *
+ * Up to three sources are accepted, each optional (tested against NULL).
+ * With IR3_INSTR_S2EN set, src2/src3 go into the s2en layout and
+ * cat5.samp/cat5.tex must both be zero; otherwise src3 must be absent and
+ * samp/tex are written into the norm layout.
+ *
+ * NOTE(review): both 'if (src2)' branches dereference src1 without
+ * checking it, so a present src2 with an absent src1 would crash --
+ * confirm callers never build that combination.
+ */
+static int emit_cat5(struct ir3_instruction *instr, void *ptr,
+               struct ir3_info *info)
+{
+       struct ir3_register *dst = instr->regs[0];
+       struct ir3_register *src1 = instr->regs[1];
+       struct ir3_register *src2 = instr->regs[2];
+       struct ir3_register *src3 = instr->regs[3];
+       instr_cat5_t *cat5 = ptr;
+
+       /* destination width must agree with the instruction's type: */
+       iassert(!((dst->flags ^ type_flags(instr->cat5.type)) & IR3_REG_HALF));
+
+       if (src1) {
+               cat5->full = ! (src1->flags & IR3_REG_HALF);
+               cat5->src1 = reg(src1, info, instr->repeat, IR3_REG_HALF);
+       }
+
+
+       if (instr->flags & IR3_INSTR_S2EN) {
+               if (src2) {
+                       iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF));
+                       cat5->s2en.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF);
+               }
+               if (src3) {
+                       /* src3 is required to be a half register: */
+                       iassert(src3->flags & IR3_REG_HALF);
+                       cat5->s2en.src3 = reg(src3, info, instr->repeat, IR3_REG_HALF);
+               }
+               iassert(!(instr->cat5.samp | instr->cat5.tex));
+       } else {
+               iassert(!src3);
+               if (src2) {
+                       iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF));
+                       cat5->norm.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF);
+               }
+               cat5->norm.samp = instr->cat5.samp;
+               cat5->norm.tex  = instr->cat5.tex;
+       }
+
+       cat5->dst      = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+       cat5->wrmask   = dst->wrmask;
+       cat5->type     = instr->cat5.type;
+       cat5->is_3d    = !!(instr->flags & IR3_INSTR_3D);
+       cat5->is_a     = !!(instr->flags & IR3_INSTR_A);
+       cat5->is_s     = !!(instr->flags & IR3_INSTR_S);
+       cat5->is_s2en  = !!(instr->flags & IR3_INSTR_S2EN);
+       cat5->is_o     = !!(instr->flags & IR3_INSTR_O);
+       cat5->is_p     = !!(instr->flags & IR3_INSTR_P);
+       cat5->opc      = instr->opc;
+       cat5->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
+       cat5->sync     = !!(instr->flags & IR3_INSTR_SY);
+       cat5->opc_cat  = 5;
+
+       return 0;
+}
+
+/* Encode a category 6 (load/store) instruction into the instruction slot
+ * at 'ptr'.  Returns 0 once every field has been written.
+ *
+ * The same slot is viewed through three layouts: instr_cat6a_t for the
+ * load opcodes, instr_cat6b_t for the store opcodes, and instr_cat6_t for
+ * the fields common to both.  Opcodes outside the two lists only get the
+ * common fields (see the TODO in the default case).
+ */
+static int emit_cat6(struct ir3_instruction *instr, void *ptr,
+               struct ir3_info *info)
+{
+       struct ir3_register *dst = instr->regs[0];
+       struct ir3_register *src = instr->regs[1];
+       instr_cat6_t *cat6 = ptr;
+
+       iassert(instr->regs_count == 2);
+
+       switch (instr->opc) {
+       /* load instructions: */
+       case OPC_LDG:
+       case OPC_LDP:
+       case OPC_LDL:
+       case OPC_LDLW:
+       case OPC_LDLV:
+       case OPC_PREFETCH: {
+               instr_cat6a_t *cat6a = ptr;
+
+               /* for loads the destination width follows cat6.type: */
+               iassert(!((dst->flags ^ type_flags(instr->cat6.type)) & IR3_REG_HALF));
+
+               cat6a->must_be_one1  = 1;
+               cat6a->must_be_one2  = 1;
+               cat6a->off = instr->cat6.offset;
+               cat6a->src = reg(src, info, instr->repeat, 0);
+               cat6a->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+               break;
+       }
+       /* store instructions: */
+       case OPC_STG:
+       case OPC_STP:
+       case OPC_STL:
+       case OPC_STLW:
+       case OPC_STI: {
+               instr_cat6b_t *cat6b = ptr;
+               /* for stores the source width follows cat6.type; only STI
+                * passes the half flag for its destination:
+                */
+               uint32_t src_flags = type_flags(instr->cat6.type);
+               uint32_t dst_flags = (instr->opc == OPC_STI) ? IR3_REG_HALF : 0;
+
+               iassert(!((src->flags ^ src_flags) & IR3_REG_HALF));
+
+               cat6b->must_be_one1  = 1;
+               cat6b->must_be_one2  = 1;
+               cat6b->src    = reg(src, info, instr->repeat, src_flags);
+               /* NOTE(review): the offset is split over off_hi/off;
+                * presumably 'off' is an 8 bit field that keeps only the
+                * low bits -- confirm against instr_cat6b_t:
+                */
+               cat6b->off_hi = instr->cat6.offset >> 8;
+               cat6b->off    = instr->cat6.offset;
+               cat6b->dst    = reg(dst, info, instr->repeat, IR3_REG_R | dst_flags);
+
+               break;
+       }
+       default:
+               // TODO
+               break;
+       }
+
+       cat6->iim_val  = instr->cat6.iim_val;
+       cat6->type     = instr->cat6.type;
+       cat6->opc      = instr->opc;
+       cat6->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
+       cat6->sync     = !!(instr->flags & IR3_INSTR_SY);
+       cat6->opc_cat  = 6;
+
+       return 0;
+}
+
+/* Per-category encoders, indexed by ir3_instruction::category (0..6).
+ * Each one writes a single instruction into 'ptr' and returns an int
+ * status; ir3_assemble() treats a non-zero return as failure.
+ */
+static int (*emit[])(struct ir3_instruction *instr, void *ptr,
+               struct ir3_info *info) = {
+       emit_cat0, emit_cat1, emit_cat2, emit_cat3, emit_cat4, emit_cat5, emit_cat6,
+};
+
+/* Assemble every instruction of 'shader' into a freshly allocated buffer
+ * and fill in 'info' (buffer size, expanded instruction count, and the
+ * max_* fields, which start at -1 and are handed to the encoders).
+ *
+ * Returns the buffer, which the caller releases with free(), or NULL when
+ * the buffer cannot be allocated, an instruction's category has no
+ * encoder, or an encoder reports failure.
+ */
+void * ir3_assemble(struct ir3 *shader, struct ir3_info *info)
+{
+       uint32_t *ptr, *dwords;
+       uint32_t i;
+
+       info->max_reg       = -1;
+       info->max_half_reg  = -1;
+       info->max_const     = -1;
+       info->instrs_count  = 0;
+
+       /* need an integer number of instruction "groups" (sets of four
+        * instructions), so pad out w/ NOPs if needed:
+        * (each instruction is 64bits)
+        */
+       info->sizedwords = 2 * align(shader->instrs_count, 4);
+
+       ptr = dwords = calloc(1, 4 * info->sizedwords);
+       if (!ptr)
+               return NULL;
+
+       for (i = 0; i < shader->instrs_count; i++) {
+               struct ir3_instruction *instr = shader->instrs[i];
+               int ret;
+
+               /* only categories 0..6 have an encoder; anything else (eg.
+                * a meta instruction, category -1) would index outside the
+                * emit[] table:
+                */
+               if ((instr->category < 0) ||
+                               (instr->category >= (int)ARRAY_SIZE(emit)))
+                       goto fail;
+
+               ret = emit[instr->category](instr, dwords, info);
+               if (ret)
+                       goto fail;
+               /* a repeated instruction counts once per repetition: */
+               info->instrs_count += 1 + instr->repeat;
+               dwords += 2;
+       }
+
+       return ptr;
+
+fail:
+       free(ptr);
+       return NULL;
+}
+
+/* Allocate a register object from the shader's allocator and initialise
+ * it with the given number and flags; wrmask starts out as 1.
+ */
+static struct ir3_register * reg_create(struct ir3 *shader,
+               int num, int flags)
+{
+       struct ir3_register *r = ir3_alloc(shader, sizeof(*r));
+
+       r->num    = num;
+       r->flags  = flags;
+       r->wrmask = 1;
+
+       return r;
+}
+
+/* Append 'instr' to the shader's instruction array.  When the array is
+ * full its capacity is doubled, with a minimum of 16 entries.  In DEBUG
+ * builds each inserted instruction also gets the next serial number.
+ */
+static void insert_instr(struct ir3 *shader,
+               struct ir3_instruction *instr)
+{
+#ifdef DEBUG
+       static uint32_t serialno = 0;
+       instr->serialno = ++serialno;
+#endif
+       if (shader->instrs_sz == shader->instrs_count) {
+               unsigned grown = MAX2(2 * shader->instrs_sz, 16);
+
+               shader->instrs = realloc(shader->instrs,
+                               grown * sizeof(shader->instrs[0]));
+               shader->instrs_sz = grown;
+       }
+
+       shader->instrs[shader->instrs_count] = instr;
+       shader->instrs_count++;
+}
+
+/* Create a block with room for 'ntmp' temporaries, 'nin' inputs and
+ * 'nout' outputs.  The block header and its three pointer arrays come from
+ * one ir3_alloc() allocation, laid out in that order.
+ */
+struct ir3_block * ir3_block_create(struct ir3 *shader,
+               unsigned ntmp, unsigned nin, unsigned nout)
+{
+       struct ir3_block *block;
+       size_t tmp_bytes = sizeof(block->temporaries[0]) * ntmp;
+       size_t in_bytes  = sizeof(block->inputs[0]) * nin;
+       size_t out_bytes = sizeof(block->outputs[0]) * nout;
+       unsigned total = sizeof(*block);
+       char *cursor;
+
+       total += tmp_bytes;
+       total += in_bytes;
+       total += out_bytes;
+
+       cursor = ir3_alloc(shader, total);
+
+       /* header first, then the arrays back to back behind it: */
+       block = (void *)cursor;
+       cursor += sizeof(*block);
+
+       block->temporaries = (void *)cursor;
+       cursor += tmp_bytes;
+
+       block->inputs = (void *)cursor;
+       cursor += in_bytes;
+
+       block->outputs = (void *)cursor;
+
+       block->ntemporaries = ntmp;
+       block->ninputs = nin;
+       block->noutputs = nout;
+       block->shader = shader;
+
+       return block;
+}
+
+/* Allocate an instruction of the given category/opcode for 'block' and
+ * append it to the instruction array of the block's shader.
+ */
+struct ir3_instruction * ir3_instr_create(struct ir3_block *block,
+               int category, opc_t opc)
+{
+       struct ir3 *shader = block->shader;
+       struct ir3_instruction *instr = ir3_alloc(shader, sizeof(*instr));
+
+       instr->opc = opc;
+       instr->category = category;
+       instr->block = block;
+
+       insert_instr(shader, instr);
+
+       return instr;
+}
+
+/* Create a copy of 'instr' in the same shader.  The copy is appended to
+ * the shader's instruction array and gets its own register objects, so
+ * changing a register of the clone does not touch the original.
+ */
+struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr)
+{
+       struct ir3_instruction *new_instr =
+                       ir3_alloc(instr->block->shader, sizeof(struct ir3_instruction));
+       unsigned i;
+
+       /* copy first, then insert: in DEBUG builds insert_instr() assigns
+        * a new serialno, which the struct copy would otherwise overwrite
+        */
+       *new_instr = *instr;
+       insert_instr(instr->block->shader, new_instr);
+
+       /* clone registers: */
+       /* regs_count is reset because ir3_reg_create() appends at
+        * regs[regs_count]; each slot copied from the original is then
+        * replaced, in order, by a register owned by the clone
+        */
+       new_instr->regs_count = 0;
+       for (i = 0; i < instr->regs_count; i++) {
+               struct ir3_register *reg = instr->regs[i];
+               struct ir3_register *new_reg =
+                               ir3_reg_create(new_instr, reg->num, reg->flags);
+               /* copy the remaining fields (eg. wrmask) as well: */
+               *new_reg = *reg;
+       }
+
+       return new_instr;
+}
+
+/* Create a register with the given number/flags and append it to the
+ * register list of 'instr'.  The capacity of instr->regs[] is only checked
+ * by assert().
+ */
+struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
+               int num, int flags)
+{
+       struct ir3_register *reg;
+
+       reg = reg_create(instr->block->shader, num, flags);
+
+       assert(instr->regs_count < ARRAY_SIZE(instr->regs));
+       instr->regs[instr->regs_count] = reg;
+       instr->regs_count++;
+
+       return reg;
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
new file mode 100644 (file)
index 0000000..9ed914b
--- /dev/null
@@ -0,0 +1,480 @@
+/*
+ * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IR3_H_
+#define IR3_H_
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "instr-a3xx.h"
+#include "disasm.h"  /* TODO move 'enum shader_t' somewhere else.. */
+
+/* low level intermediate representation of an adreno shader program */
+
+struct ir3;
+struct ir3_instruction;
+struct ir3_block;
+
+struct ir3 * fd_asm_parse(const char *src);
+
+/* Summary of an assembled shader, filled in by ir3_assemble(). */
+struct ir3_info {
+       /* size of the assembled buffer in dwords: two per instruction,
+        * with the instruction count rounded up to a multiple of four
+        */
+       uint16_t sizedwords;
+       uint16_t instrs_count;   /* expanded to account for rpt's */
+       /* NOTE: max_reg, etc, does not include registers not touched
+        * by the shader (ie. vertex fetched via VFD_DECODE but not
+        * touched by shader)
+        */
+       /* ir3_assemble() resets all three to -1 before encoding starts */
+       int8_t   max_reg;   /* highest GPR # used by shader */
+       int8_t   max_half_reg;
+       int8_t   max_const;
+};
+
+/* One operand (destination or source) of an ir3_instruction.  'flags'
+ * selects which member of the anonymous union is meaningful.
+ */
+struct ir3_register {
+       enum {
+               IR3_REG_CONST  = 0x001,
+               IR3_REG_IMMED  = 0x002,
+               IR3_REG_HALF   = 0x004,
+               IR3_REG_RELATIV= 0x008,
+               IR3_REG_R      = 0x010,
+               IR3_REG_NEGATE = 0x020,
+               IR3_REG_ABS    = 0x040,
+               IR3_REG_EVEN   = 0x080,
+               IR3_REG_POS_INF= 0x100,
+               /* (ei) flag, end-input?  Set on last bary, presumably to signal
+                * that the shader needs no more input:
+                */
+               IR3_REG_EI     = 0x200,
+               /* meta-flags, for intermediate stages of IR, ie.
+                * before register assignment is done:
+                */
+               IR3_REG_SSA    = 0x1000,   /* 'instr' is ptr to assigning instr */
+               IR3_REG_IA     = 0x2000,   /* meta-input dst is "assigned" */
+               IR3_REG_ADDR   = 0x4000,   /* register is a0.x */
+       } flags;
+       union {
+               /* normal registers:
+                * the component is in the low two bits of the reg #, so
+                * rN.x becomes: (N << 2) | x
+                */
+               int num;
+               /* immediate: */
+               int     iim_val;
+               float   fim_val;
+               /* relative: */
+               int offset;
+               /* for IR3_REG_SSA, src registers contain ptr back to
+                * assigning instruction.
+                */
+               struct ir3_instruction *instr;
+       };
+
+       /* used for cat5 instructions, but also for internal/IR level
+        * tracking of what registers are read/written by an instruction.
+        * wrmask may be a bad name since it is used to represent both
+        * src and dst that touch multiple adjacent registers.
+        */
+       int wrmask;
+};
+
+/* A single IR instruction: category and opcode, flags, up to five
+ * registers (the encoders treat regs[0] as the destination), and a
+ * per-category payload in the first anonymous union.
+ */
+struct ir3_instruction {
+       struct ir3_block *block;
+       /* instruction category; -1 marks a meta instruction (see is_meta()) */
+       int category;
+       opc_t opc;
+       enum {
+               /* (sy) flag is set on first instruction, and after sample
+                * instructions (probably just on RAW hazard).
+                */
+               IR3_INSTR_SY    = 0x001,
+               /* (ss) flag is set on first instruction, and first instruction
+                * to depend on the result of "long" instructions (RAW hazard):
+                *
+                *   rcp, rsq, log2, exp2, sin, cos, sqrt
+                *
+                * It seems to synchronize until all in-flight instructions are
+                * completed, for example:
+                *
+                *   rsq hr1.w, hr1.w
+                *   add.f hr2.z, (neg)hr2.z, hc0.y
+                *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
+                *   rsq hr2.x, hr2.x
+                *   (rpt1)nop
+                *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
+                *   nop
+                *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
+                *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
+                *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
+                *
+                * The last mul.f does not have (ss) set, presumably because the
+                * (ss) on the previous instruction does the job.
+                *
+                * The blob driver also seems to set it on WAR hazards, although
+                * not really clear if this is needed or just blob compiler being
+                * sloppy.  So far I haven't found a case where removing the (ss)
+                * causes problems for WAR hazard, but I could just be getting
+                * lucky:
+                *
+                *   rcp r1.y, r3.y
+                *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
+                *
+                */
+               IR3_INSTR_SS    = 0x002,
+               /* (jp) flag is set on jump targets:
+                */
+               IR3_INSTR_JP    = 0x004,
+               IR3_INSTR_UL    = 0x008,
+               IR3_INSTR_3D    = 0x010,
+               IR3_INSTR_A     = 0x020,
+               IR3_INSTR_O     = 0x040,
+               IR3_INSTR_P     = 0x080,
+               IR3_INSTR_S     = 0x100,
+               IR3_INSTR_S2EN  = 0x200,
+               /* meta-flags, for intermediate stages of IR, ie.
+                * before register assignment is done:
+                */
+               IR3_INSTR_MARK  = 0x1000,
+       } flags;
+       int repeat;
+       /* number of valid entries in regs[] */
+       unsigned regs_count;
+       struct ir3_register *regs[5];
+       /* per-category payload: */
+       union {
+               struct {
+                       char inv;
+                       char comp;
+                       int  immed;
+               } cat0;
+               struct {
+                       type_t src_type, dst_type;
+               } cat1;
+               struct {
+                       enum {
+                               IR3_COND_LT = 0,
+                               IR3_COND_LE = 1,
+                               IR3_COND_GT = 2,
+                               IR3_COND_GE = 3,
+                               IR3_COND_EQ = 4,
+                               IR3_COND_NE = 5,
+                       } condition;
+               } cat2;
+               struct {
+                       unsigned samp, tex;
+                       type_t type;
+               } cat5;
+               struct {
+                       type_t type;
+                       int offset;
+                       int iim_val;
+               } cat6;
+               /* for meta-instructions, just used to hold extra data
+                * before instruction scheduling, etc
+                */
+               struct {
+                       int off;              /* component/offset */
+               } fo;
+               struct {
+                       struct ir3_block *if_block, *else_block;
+               } flow;
+               struct {
+                       struct ir3_block *block;
+               } inout;
+       };
+
+       /* transient values used during various algorithms: */
+       union {
+               /* The instruction depth is the max dependency distance to output.
+                *
+                * You can also think of it as the "cost", if we did any sort of
+                * optimization for register footprint.  Ie. a value that is just
+                * the result of moving a const to a reg would have a low cost,
+                * so it could make sense to duplicate the instruction at various
+                * points where the result is needed to reduce register footprint.
+                */
+               unsigned depth;
+       };
+       struct ir3_instruction *next;
+#ifdef DEBUG
+       /* assigned by insert_instr(), DEBUG builds only */
+       uint32_t serialno;
+#endif
+};
+
+struct ir3_heap_chunk;
+
+/* Top level container of a shader's IR. */
+struct ir3 {
+       /* instrs[] holds every instruction created for the shader:
+        * instrs_count entries are in use, instrs_sz is the allocated
+        * capacity
+        */
+       unsigned instrs_count, instrs_sz;
+       struct ir3_instruction **instrs;
+       /* NOTE(review): presumably bookkeeping for ir3_alloc() -- confirm
+        * against its definition
+        */
+       unsigned heap_idx;
+       struct ir3_heap_chunk *chunk;
+};
+
+/* A block of instructions.  ir3_block_create() allocates the three
+ * pointer arrays together with the block and sets their sizes; it does
+ * not write address, parent or head.
+ */
+struct ir3_block {
+       struct ir3 *shader;
+       /* number of entries in temporaries[], inputs[] and outputs[] */
+       unsigned ntemporaries, ninputs, noutputs;
+       /* maps TGSI_FILE_TEMPORARY index back to the assigning instruction: */
+       struct ir3_instruction **temporaries;
+       struct ir3_instruction **inputs;
+       struct ir3_instruction **outputs;
+       /* only a single address register: */
+       struct ir3_instruction *address;
+       struct ir3_block *parent;
+       /* NOTE(review): presumably the start of the list linked through
+        * ir3_instruction::next -- confirm against the passes using it
+        */
+       struct ir3_instruction *head;
+};
+
+struct ir3 * ir3_create(void);
+void ir3_destroy(struct ir3 *shader);
+void * ir3_assemble(struct ir3 *shader,
+               struct ir3_info *info);
+void * ir3_alloc(struct ir3 *shader, int sz);
+
+struct ir3_block * ir3_block_create(struct ir3 *shader,
+               unsigned ntmp, unsigned nin, unsigned nout);
+
+struct ir3_instruction * ir3_instr_create(struct ir3_block *block,
+               int category, opc_t opc);
+struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr);
+const char *ir3_instr_name(struct ir3_instruction *instr);
+
+struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
+               int num, int flags);
+
+
+/* Test-and-set of the visited mark: returns true if 'instr' was already
+ * marked, otherwise marks it and returns false.
+ */
+static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
+{
+       bool visited = !!(instr->flags & IR3_INSTR_MARK);
+
+       if (!visited)
+               instr->flags |= IR3_INSTR_MARK;
+
+       return visited;
+}
+
+/* Clear the visited mark on every instruction of the shader. */
+static inline void ir3_clear_mark(struct ir3 *shader)
+{
+       /* TODO would be nice to drop the instruction array.. for
+        * new compiler, _clear_mark() is all we use it for, and
+        * we could probably manage a linked list instead..
+        */
+       unsigned remaining = shader->instrs_count;
+
+       while (remaining--)
+               shader->instrs[remaining]->flags &= ~IR3_INSTR_MARK;
+}
+
+/* Return the index of 'reg' within instr->regs[] (compared by pointer),
+ * or -1 if the instruction does not hold that register.
+ */
+static inline int ir3_instr_regno(struct ir3_instruction *instr,
+               struct ir3_register *reg)
+{
+       unsigned idx = 0;
+
+       while (idx < instr->regs_count) {
+               if (instr->regs[idx] == reg)
+                       return idx;
+               idx++;
+       }
+
+       return -1;
+}
+
+
+/* Pack a register number and a component into one register id, with the
+ * component in the low two bits.
+ *
+ * comp:
+ *   0 - x
+ *   1 - y
+ *   2 - z
+ *   3 - w
+ */
+static inline uint32_t regid(int num, int comp)
+{
+       int base = num << 2;
+       int component = comp & 0x3;
+
+       return base | component;
+}
+
+/* Register number of 'reg', ie. reg->num without the two component bits. */
+static inline uint32_t reg_num(struct ir3_register *reg)
+{
+       int id = reg->num;
+
+       return id >> 2;
+}
+
+/* Component (0..3) of 'reg', ie. the low two bits of reg->num. */
+static inline uint32_t reg_comp(struct ir3_register *reg)
+{
+       int id = reg->num;
+
+       return id & 0x3;
+}
+
+/* True for category 0 instructions. */
+static inline bool is_flow(struct ir3_instruction *instr)
+{
+       return !instr->category;
+}
+
+/* True for a category 0 instruction whose opcode is OPC_KILL. */
+static inline bool is_kill(struct ir3_instruction *instr)
+{
+       if (!is_flow(instr))
+               return false;
+
+       return instr->opc == OPC_KILL;
+}
+
+/* True for a category 0 instruction whose opcode is OPC_NOP. */
+static inline bool is_nop(struct ir3_instruction *instr)
+{
+       if (!is_flow(instr))
+               return false;
+
+       return instr->opc == OPC_NOP;
+}
+
+/* True for category 1, 2 and 3 instructions. */
+static inline bool is_alu(struct ir3_instruction *instr)
+{
+       switch (instr->category) {
+       case 1:
+       case 2:
+       case 3:
+               return true;
+       default:
+               return false;
+       }
+}
+
+/* True for category 4 instructions. */
+static inline bool is_sfu(struct ir3_instruction *instr)
+{
+       const int cat = instr->category;
+
+       return cat == 4;
+}
+
+/* True for category 5 instructions. */
+static inline bool is_tex(struct ir3_instruction *instr)
+{
+       const int cat = instr->category;
+
+       return cat == 5;
+}
+
+/* True for a category 2 instruction whose opcode is OPC_BARY_F. */
+static inline bool is_input(struct ir3_instruction *instr)
+{
+       if (instr->category != 2)
+               return false;
+
+       return instr->opc == OPC_BARY_F;
+}
+
+/* True for meta instructions, which are marked by category -1. */
+static inline bool is_meta(struct ir3_instruction *instr)
+{
+       /* TODO how should we count PHI (and maybe fan-in/out) which
+        * might actually contribute some instructions to the final
+        * result?
+        */
+       const int cat = instr->category;
+
+       return cat == -1;
+}
+
+/* True for a meta instruction whose opcode is OPC_META_DEREF. */
+static inline bool is_addr(struct ir3_instruction *instr)
+{
+       if (!is_meta(instr))
+               return false;
+
+       return instr->opc == OPC_META_DEREF;
+}
+
+/* True if the instruction has a destination (regs[0]) and that register
+ * carries IR3_REG_ADDR.
+ */
+static inline bool writes_addr(struct ir3_instruction *instr)
+{
+       if (instr->regs_count == 0)
+               return false;
+
+       return !!(instr->regs[0]->flags & IR3_REG_ADDR);
+}
+
+/* True if the instruction has a destination (regs[0]) and its register
+ * number is REG_P0.
+ */
+static inline bool writes_pred(struct ir3_instruction *instr)
+{
+       if (instr->regs_count == 0)
+               return false;
+
+       return reg_num(instr->regs[0]) == REG_P0;
+}
+
+/* True if 'r' names a general purpose register: none of the const,
+ * immediate, relative, SSA or address flags set, and a register number
+ * other than REG_A0 and REG_P0.
+ */
+static inline bool reg_gpr(struct ir3_register *r)
+{
+       uint32_t n;
+
+       if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_SSA | IR3_REG_ADDR))
+               return false;
+
+       n = reg_num(r);
+
+       return (n != REG_A0) && (n != REG_P0);
+}
+
+/* dump: */
+#include <stdio.h>
+void ir3_dump(struct ir3 *shader, const char *name,
+               struct ir3_block *block /* XXX maybe 'block' ptr should move to ir3? */,
+               FILE *f);
+void ir3_dump_instr_single(struct ir3_instruction *instr);
+void ir3_dump_instr_list(struct ir3_instruction *instr);
+
+/* flatten if/else: */
+int ir3_block_flatten(struct ir3_block *block);
+
+/* depth calculation: */
+int ir3_delayslots(struct ir3_instruction *assigner,
+               struct ir3_instruction *consumer, unsigned n);
+void ir3_block_depth(struct ir3_block *block);
+
+/* copy-propagate: */
+void ir3_block_cp(struct ir3_block *block);
+
+/* scheduling: */
+void ir3_block_sched(struct ir3_block *block);
+
+/* register assignment: */
+int ir3_block_ra(struct ir3_block *block, enum shader_t type,
+               bool half_precision, bool frag_coord, bool frag_face,
+               bool *has_samp);
+
+/* Number of elements of a true array (gives a wrong result for a
+ * pointer); only defined here if no earlier header provided it.
+ */
+#ifndef ARRAY_SIZE
+#  define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+#endif
+
+/* ************************************************************************* */
+/* split this out or find some helper to use.. like main/bitset.h.. */
+
+#include <string.h>
+
+/* exclusive upper bound for ir3_register::num accepted by regmask_idx() */
+#define MAX_REG 256
+
+/* bitmask with one bit per register id: the first MAX_REG bits are for
+ * full registers, the next MAX_REG bits for half registers
+ */
+typedef uint8_t regmask_t[2 * MAX_REG / 8];
+
+/* Bit index of 'reg' within a regmask_t: reg->num for a full register,
+ * reg->num + MAX_REG for a half register.
+ */
+static inline unsigned regmask_idx(struct ir3_register *reg)
+{
+       unsigned idx = reg->num;
+
+       assert(idx < MAX_REG);
+
+       return (reg->flags & IR3_REG_HALF) ? (idx + MAX_REG) : idx;
+}
+
+/* Clear every bit of the mask. */
+static inline void regmask_init(regmask_t *regmask)
+{
+       memset(*regmask, 0, sizeof(regmask_t));
+}
+
+/* Set the bits of up to four consecutive register ids, starting at the
+ * index of 'reg': id (base + i) is set when bit i of reg->wrmask is set.
+ */
+static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg)
+{
+       unsigned base = regmask_idx(reg);
+       unsigned comp;
+
+       for (comp = 0; comp < 4; comp++) {
+               unsigned bit = base + comp;
+
+               if (!(reg->wrmask & (1 << comp)))
+                       continue;
+
+               (*regmask)[bit / 8] |= 1 << (bit % 8);
+       }
+}
+
+/* For each register id selected by reg->wrmask, set its bit in 'a' unless
+ * the same bit is already set in 'b'; conceptually:
+ *   a |= (reg & ~b)
+ */
+static inline void regmask_set_if_not(regmask_t *a,
+               struct ir3_register *reg, regmask_t *b)
+{
+       unsigned base = regmask_idx(reg);
+       unsigned comp;
+
+       for (comp = 0; comp < 4; comp++) {
+               unsigned bit = base + comp;
+
+               if (!(reg->wrmask & (1 << comp)))
+                       continue;
+               if ((*b)[bit / 8] & (1 << (bit % 8)))
+                       continue;
+
+               (*a)[bit / 8] |= 1 << (bit % 8);
+       }
+}
+
+/* Returns non-zero if any register id selected by reg->wrmask has its bit
+ * set in the mask, zero otherwise.
+ */
+static inline unsigned regmask_get(regmask_t *regmask,
+               struct ir3_register *reg)
+{
+       unsigned base = regmask_idx(reg);
+       unsigned comp;
+
+       for (comp = 0; comp < 4; comp++) {
+               unsigned bit = base + comp;
+
+               if (!(reg->wrmask & (1 << comp)))
+                       continue;
+               if ((*regmask)[bit / 8] & (1 << (bit % 8)))
+                       return true;
+       }
+
+       return false;
+}
+
+/* ************************************************************************* */
+
+#endif /* IR3_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
new file mode 100644 (file)
index 0000000..1fa2fd4
--- /dev/null
@@ -0,0 +1,2639 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width: 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2013 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include <stdarg.h>
+
+#include "pipe/p_state.h"
+#include "util/u_string.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_ureg.h"
+#include "tgsi/tgsi_info.h"
+#include "tgsi/tgsi_strings.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_scan.h"
+
+#include "freedreno_lowering.h"
+#include "freedreno_util.h"
+
+#include "ir3_compiler.h"
+#include "ir3_shader.h"
+
+#include "instr-a3xx.h"
+#include "ir3.h"
+
+/* State carried through the translation of one TGSI shader into ir3. */
+struct ir3_compile_context {
+       /* TGSI tokens being compiled.  compile_init() points this either at
+        * the caller's tokens or at the result of fd_transform_lowering();
+        * free_tokens records the latter case so compile_free() releases it:
+        */
+       const struct tgsi_token *tokens;
+       bool free_tokens;
+       struct ir3 *ir;
+       struct ir3_shader_variant *so;
+
+       /* block currently being filled, and the instruction most recently
+        * returned by instr_create()/instr_clone():
+        */
+       struct ir3_block *block;
+       struct ir3_instruction *current_instr;
+
+       /* we need to defer updates to block->outputs[] until the end
+        * of an instruction (so we don't see new value until *after*
+        * the src registers are processed)
+        */
+       struct {
+               struct ir3_instruction *instr, **instrp;
+       } output_updates[16];
+       unsigned num_output_updates;
+
+       /* are we in a sequence of "atomic" instructions?
+        */
+       bool atomic;
+
+       /* For fragment shaders, from the hw perspective the only
+        * actual input is r0.xy position register passed to bary.f.
+        * But TGSI doesn't know that, it still declares things as
+        * IN[] registers.  So we do all the input tracking normally
+        * and fix things up after compile_instructions()
+        *
+        * NOTE that frag_pos is the hardware position (possibly it
+        * is actually an index or tag or some such.. it is *not*
+        * values that can be directly used for gl_FragCoord..)
+        */
+       struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4];
+
+       /* TGSI parser state, and the processor type read from the token
+        * header by compile_init():
+        */
+       struct tgsi_parse_context parser;
+       unsigned type;
+
+       struct tgsi_shader_info info;
+
+       /* for calculating input/output positions/linkages: */
+       unsigned next_inloc;
+
+       /* internal temporaries handed out by get_internal_temp(): */
+       unsigned num_internal_temps;
+       struct tgsi_src_register internal_temps[6];
+
+       /* idx/slot for last compiler generated immediate */
+       unsigned immediate_idx;
+
+       /* stack of branch instructions that mark (potentially nested)
+        * branch if/else/loop/etc
+        */
+       struct {
+               struct ir3_instruction *instr, *cond;
+               bool inv;   /* true iff in else leg of branch */
+       } branch[16];
+       unsigned int branch_count;
+
+       /* list of kill instructions: */
+       struct ir3_instruction *kill[16];
+       unsigned int kill_count;
+
+       /* used when dst is same as one of the src, to avoid overwriting a
+        * src element before the remaining scalar instructions that make
+        * up the vector operation
+        */
+       struct tgsi_dst_register tmp_dst;
+       struct tgsi_src_register *tmp_src;
+};
+
+
+/* forward declarations for helpers used before they are defined: */
+static void vectorize(struct ir3_compile_context *ctx,
+               struct ir3_instruction *instr, struct tgsi_dst_register *dst,
+               int nsrcs, ...);
+static void create_mov(struct ir3_compile_context *ctx,
+               struct tgsi_dst_register *dst, struct tgsi_src_register *src);
+static type_t get_ftype(struct ir3_compile_context *ctx);
+
+static unsigned
+compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
+               const struct tgsi_token *tokens)
+{
+       unsigned ret;
+       struct tgsi_shader_info *info = &ctx->info;
+       const struct fd_lowering_config lconfig = {
+                       .color_two_side = so->key.color_two_side,
+                       .lower_DST  = true,
+                       .lower_XPD  = true,
+                       .lower_SCS  = true,
+                       .lower_LRP  = true,
+                       .lower_FRC  = true,
+                       .lower_POW  = true,
+                       .lower_LIT  = true,
+                       .lower_EXP  = true,
+                       .lower_LOG  = true,
+                       .lower_DP4  = true,
+                       .lower_DP3  = true,
+                       .lower_DPH  = true,
+                       .lower_DP2  = true,
+                       .lower_DP2A = true,
+       };
+
+       ctx->tokens = fd_transform_lowering(&lconfig, tokens, &ctx->info);
+       ctx->free_tokens = !!ctx->tokens;
+       if (!ctx->tokens) {
+               /* no lowering */
+               ctx->tokens = tokens;
+       }
+       ctx->ir = so->ir;
+       ctx->so = so;
+       ctx->next_inloc = 8;
+       ctx->num_internal_temps = 0;
+       ctx->branch_count = 0;
+       ctx->kill_count = 0;
+       ctx->block = NULL;
+       ctx->current_instr = NULL;
+       ctx->num_output_updates = 0;
+       ctx->atomic = false;
+       ctx->frag_pos = NULL;
+       ctx->frag_face = NULL;
+
+       memset(ctx->frag_coord, 0, sizeof(ctx->frag_coord));
+
+#define FM(x) (1 << TGSI_FILE_##x)
+       /* optimize can't deal with relative addressing: */
+       if (info->indirect_files & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT)))
+               return TGSI_PARSE_ERROR;
+
+       /* Immediates go after constants: */
+       so->first_immediate = info->file_max[TGSI_FILE_CONSTANT] + 1;
+       ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1);
+
+       ret = tgsi_parse_init(&ctx->parser, ctx->tokens);
+       if (ret != TGSI_PARSE_OK)
+               return ret;
+
+       ctx->type = ctx->parser.FullHeader.Processor.Processor;
+
+       return ret;
+}
+
+static void
+compile_error(struct ir3_compile_context *ctx, const char *format, ...)
+{
+       va_list ap;
+       va_start(ap, format);
+       _debug_vprintf(format, ap);
+       va_end(ap);
+       tgsi_dump(ctx->tokens, 0);
+       debug_assert(0);
+}
+
+/* Check 'cond', reporting through compile_error() when it does not hold: */
+#define compile_assert(ctx, cond) do { \
+               if (!(cond)) { \
+                       compile_error((ctx), "failed assert: "#cond"\n"); \
+               } \
+       } while (0)
+
+/* Release what compile_init() set up: the lowered token stream (only if
+ * the context owns one) and the TGSI parser.
+ */
+static void
+compile_free(struct ir3_compile_context *ctx)
+{
+       if (ctx->free_tokens) {
+               void *lowered = (void *)ctx->tokens;
+               free(lowered);
+       }
+
+       tgsi_parse_free(&ctx->parser);
+}
+
+/* Describes how one TGSI opcode is translated: the handler function plus
+ * the parameters which are passed to it via 't'.
+ */
+struct instr_translater {
+       /* handler invoked with this entry, the compile context and the
+        * TGSI instruction:
+        */
+       void (*fxn)(const struct instr_translater *t,
+                       struct ir3_compile_context *ctx,
+                       struct tgsi_full_instruction *inst);
+       unsigned tgsi_opc;
+       opc_t opc;
+       opc_t hopc;    /* opc to use for half_precision mode, if different */
+       unsigned arg;
+};
+
+/* Apply the deferred dst updates queued by ssa_dst(), then empty the
+ * queue.  Does nothing while inside an "atomic" group.
+ */
+static void
+instr_finish(struct ir3_compile_context *ctx)
+{
+       unsigned idx;
+
+       /* inside an atomic group the updates stay queued: */
+       if (ctx->atomic)
+               return;
+
+       idx = 0;
+       while (idx < ctx->num_output_updates) {
+               struct ir3_instruction **slot = ctx->output_updates[idx].instrp;
+
+               *slot = ctx->output_updates[idx].instr;
+               idx++;
+       }
+
+       ctx->num_output_updates = 0;
+}
+
+/* Begin an "atomic" group of instructions, for example the four scalar
+ * instructions which together perform a vec4 operation.  While the group
+ * is open, instr_finish() leaves the queued output_updates alone, so each
+ * following scalar instruction still sees the values from before the
+ * group started.
+ *
+ * NOTE: when used properly, this could probably replace get/put_dst()
+ * stuff.
+ */
+static void
+instr_atomic_start(struct ir3_compile_context *ctx)
+{
+       ctx->atomic = (bool)true;
+}
+
+/* Close an "atomic" group and flush the dst updates queued during it. */
+static void
+instr_atomic_end(struct ir3_compile_context *ctx)
+{
+       /* must clear the flag first, otherwise instr_finish() is a no-op: */
+       ctx->atomic = (bool)false;
+
+       instr_finish(ctx);
+}
+
+/* Finish the previous instruction, then create a new one in the current
+ * block and remember it as ctx->current_instr.
+ */
+static struct ir3_instruction *
+instr_create(struct ir3_compile_context *ctx, int category, opc_t opc)
+{
+       struct ir3_instruction *created;
+
+       instr_finish(ctx);
+
+       created = ir3_instr_create(ctx->block, category, opc);
+       ctx->current_instr = created;
+
+       return created;
+}
+
+/* Finish the previous instruction, then clone 'instr' and remember the
+ * clone as ctx->current_instr.
+ */
+static struct ir3_instruction *
+instr_clone(struct ir3_compile_context *ctx, struct ir3_instruction *instr)
+{
+       struct ir3_instruction *copy;
+
+       instr_finish(ctx);
+
+       copy = ir3_instr_clone(instr);
+       ctx->current_instr = copy;
+
+       return copy;
+}
+
+/* Create a new block nested inside the current one (or the outermost
+ * block if there is none yet) and make it the current block.
+ */
+static struct ir3_block *
+push_block(struct ir3_compile_context *ctx)
+{
+       const bool outermost = !ctx->block;
+       const bool fragment = (ctx->type == TGSI_PROCESSOR_FRAGMENT);
+       struct ir3_block *block;
+       unsigned ntmp, nin, nout;
+
+#define SCALAR_REGS(file) (4 * (ctx->info.file_max[TGSI_FILE_ ## file] + 1))
+
+       /* hmm, give ourselves room to create 4 extra temporaries (vec4):
+        */
+       ntmp = SCALAR_REGS(TEMPORARY) + (4 * 4);
+       nout = SCALAR_REGS(OUTPUT);
+
+       /* for outermost block, 'inputs' are the actual shader INPUT
+        * register file.  Reads from INPUT registers always go back to
+        * top block.  For nested blocks, 'inputs' is used to track any
+        * TEMPORARY file register from one of the enclosing blocks that
+        * is read in this block.
+        */
+       if (!outermost) {
+               nin = ntmp;
+       } else {
+               nin = SCALAR_REGS(INPUT);
+
+               /* NOTE: fragment shaders actually have two inputs (r0.xy, the
+                * position)
+                */
+               if (fragment) {
+                       int n = 2;
+
+                       if (ctx->info.reads_position)
+                               n += 4;
+                       if (ctx->info.uses_frontface)
+                               n += 4;
+
+                       nin = MAX2(n, nin);
+
+                       /* allocate extra output slots, one per kill[] entry: */
+                       nout += ARRAY_SIZE(ctx->kill);
+               }
+       }
+
+       block = ir3_block_create(ctx->ir, ntmp, nin, nout);
+
+       /* ..but do not count those extra slots in noutputs: */
+       if (fragment && outermost)
+               block->noutputs -= ARRAY_SIZE(ctx->kill);
+
+       block->parent = ctx->block;
+       ctx->block = block;
+
+       return block;
+}
+
+/* Leave the current block, making its parent the current block again. */
+static void
+pop_block(struct ir3_compile_context *ctx)
+{
+       struct ir3_block *parent = ctx->block->parent;
+
+       ctx->block = parent;
+
+       /* the outermost block should never be popped: */
+       compile_assert(ctx, ctx->block);
+}
+
+/* Create an OPC_META_OUTPUT instruction in 'block' with register number
+ * 'n', which (if 'instr' is not NULL) references 'instr' as its SSA src.
+ */
+static struct ir3_instruction *
+create_output(struct ir3_block *block, struct ir3_instruction *instr,
+               unsigned n)
+{
+       struct ir3_instruction *meta =
+                       ir3_instr_create(block, -1, OPC_META_OUTPUT);
+
+       meta->inout.block = block;
+
+       /* first register carries 'n', the optional second is the src: */
+       ir3_reg_create(meta, n, 0);
+       if (instr) {
+               struct ir3_register *src = ir3_reg_create(meta, 0, IR3_REG_SSA);
+
+               src->instr = instr;
+       }
+
+       return meta;
+}
+
+/* Create an OPC_META_INPUT instruction in 'block' with register number
+ * 'n', which (if 'instr' is not NULL) references 'instr' as its SSA src.
+ */
+static struct ir3_instruction *
+create_input(struct ir3_block *block, struct ir3_instruction *instr,
+               unsigned n)
+{
+       struct ir3_instruction *meta =
+                       ir3_instr_create(block, -1, OPC_META_INPUT);
+
+       meta->inout.block = block;
+
+       /* first register carries 'n', the optional second is the src: */
+       ir3_reg_create(meta, n, 0);
+       if (instr) {
+               struct ir3_register *src = ir3_reg_create(meta, 0, IR3_REG_SSA);
+
+               src->instr = instr;
+       }
+
+       return meta;
+}
+
+/* Look up INPUT register 'n'.  References to the INPUT register file
+ * always resolve in the top level block, so walk up the parent chain
+ * first.
+ */
+static struct ir3_instruction *
+block_input(struct ir3_block *block, unsigned n)
+{
+       struct ir3_block *top = block;
+
+       while (top->parent)
+               top = top->parent;
+
+       return top->inputs[n];
+}
+
+/* Return the instruction currently providing TEMPORARY 'n' in 'block'.
+ *
+ * If a nested block has not assigned the temporary itself, the value is
+ * looked up in the enclosing blocks, and a meta-input instruction is
+ * created so the block's inputs are tracked.
+ */
+static struct ir3_instruction *
+block_temporary(struct ir3_block *block, unsigned n)
+{
+       /* top level block, or already assigned in this block: */
+       if (!block->parent || block->temporaries[n])
+               return block->temporaries[n];
+
+       /* otherwise resolve it from the nearest enclosing block which has
+        * assigned it, unless an earlier read already did that:
+        */
+       if (!block->inputs[n])
+               block->inputs[n] = block_temporary(block->parent, n);
+
+       /* and create new input to return: */
+       return create_input(block, block->inputs[n], n);
+}
+
+/* Create a category 1 instruction in the current block whose src is the
+ * float immediate 'val'.
+ */
+static struct ir3_instruction *
+create_immed(struct ir3_compile_context *ctx, float val)
+{
+       const type_t type = get_ftype(ctx);
+       struct ir3_instruction *mov;
+       struct ir3_register *src;
+
+       /* NOTE: *don't* use instr_create() here!
+        */
+       mov = ir3_instr_create(ctx->block, 1, 0);
+       mov->cat1.src_type = type;
+       mov->cat1.dst_type = type;
+
+       ir3_reg_create(mov, 0, 0);
+
+       src = ir3_reg_create(mov, 0, IR3_REG_IMMED);
+       src->fim_val = val;
+
+       return mov;
+}
+
+/* Queue 'instr' as the new value of component 'chan' of the TGSI dst.
+ *
+ * The slot in the current block (outputs[], temporaries[] or address) is
+ * not written here; the update is recorded in ctx->output_updates[] and
+ * applied by instr_finish().  Other register files are ignored.
+ */
+static void
+ssa_dst(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
+               const struct tgsi_dst_register *dst, unsigned chan)
+{
+       unsigned n = regid(dst->Index, chan);
+       unsigned idx = ctx->num_output_updates;
+       struct ir3_instruction **slot;
+
+       compile_assert(ctx, idx < ARRAY_SIZE(ctx->output_updates));
+
+       /* NOTE: defer update of temporaries[idx] or output[idx]
+        * until instr_finish(), so that if the current instruction
+        * reads the same TEMP/OUT[] it gets the old value:
+        *
+        * bleh.. this might be a bit easier to just figure out
+        * in instr_finish().  But at that point we've already
+        * lost information about OUTPUT vs TEMPORARY register
+        * file..
+        */
+
+       switch (dst->File) {
+       case TGSI_FILE_OUTPUT:
+               compile_assert(ctx, n < ctx->block->noutputs);
+               slot = &ctx->block->outputs[n];
+               break;
+       case TGSI_FILE_TEMPORARY:
+               compile_assert(ctx, n < ctx->block->ntemporaries);
+               slot = &ctx->block->temporaries[n];
+               break;
+       case TGSI_FILE_ADDRESS:
+               compile_assert(ctx, n < 1);
+               slot = &ctx->block->address;
+               break;
+       default:
+               /* nothing to track for other register files: */
+               return;
+       }
+
+       ctx->output_updates[idx].instrp = slot;
+       ctx->output_updates[idx].instr = instr;
+       ctx->num_output_updates++;
+}
+
+/* Resolve component 'chan' of a TGSI src to the instruction which
+ * currently provides it, and attach that to 'reg' as an SSA src.  Only the
+ * INPUT, OUTPUT and TEMPORARY files are resolved here.
+ */
+static void
+ssa_src(struct ir3_compile_context *ctx, struct ir3_register *reg,
+               const struct tgsi_src_register *src, unsigned chan)
+{
+       const unsigned n = regid(src->Index, chan);
+
+       if (src->File == TGSI_FILE_INPUT) {
+               reg->flags |= IR3_REG_SSA;
+               reg->instr = block_input(ctx->block, n);
+       } else if (src->File == TGSI_FILE_OUTPUT) {
+               /* really this should just happen in case of 'MOV_SAT OUT[n], ..',
+                * for the following clamp instructions:
+                */
+               reg->flags |= IR3_REG_SSA;
+               reg->instr = ctx->block->outputs[n];
+               /* we don't have to worry about read from an OUTPUT that was
+                * assigned outside of the current block, because the _SAT
+                * clamp instructions will always be in the same block as
+                * the original instruction which wrote the OUTPUT
+                */
+               compile_assert(ctx, reg->instr);
+       } else if (src->File == TGSI_FILE_TEMPORARY) {
+               reg->flags |= IR3_REG_SSA;
+               reg->instr = block_temporary(ctx->block, n);
+       }
+
+       if (!(reg->flags & IR3_REG_SSA))
+               return;
+
+       if (!reg->instr) {
+               /* this can happen when registers (or components of a TGSI
+                * register) are used as src before they have been assigned
+                * (undefined contents).  To avoid confusing the rest of the
+                * compiler, and to generally keep things peachy, substitute
+                * an instruction that sets the src to 0.0.  Or to keep
+                * things undefined, I could plug in a random number? :-P
+                *
+                * NOTE: *don't* use instr_create() here!
+                */
+               reg->instr = create_immed(ctx, 0.0);
+       }
+}
+
+/* Append the dst register for component(s) of a TGSI dst to 'instr'.
+ *
+ * 'chan' is the (first) component written, and 'wrmask' selects which of
+ * the components starting at 'chan' are written.  With the usual wrmask
+ * of 0x1, 'instr' itself becomes the new value of that component.  With
+ * any other wrmask, one OPC_META_FO place-holder instruction is created
+ * per written component, and the place-holder becomes the new value.
+ *
+ * Returns the register which was added to 'instr'.
+ */
+static struct ir3_register *
+add_dst_reg_wrmask(struct ir3_compile_context *ctx,
+               struct ir3_instruction *instr, const struct tgsi_dst_register *dst,
+               unsigned chan, unsigned wrmask)
+{
+       unsigned flags = 0, num = 0;
+       struct ir3_register *reg;
+
+       switch (dst->File) {
+       case TGSI_FILE_OUTPUT:
+       case TGSI_FILE_TEMPORARY:
+               /* uses SSA */
+               break;
+       case TGSI_FILE_ADDRESS:
+               flags |= IR3_REG_ADDR;
+               /* uses SSA */
+               break;
+       default:
+               compile_error(ctx, "unsupported dst register file: %s\n",
+                       tgsi_file_name(dst->File));
+               break;
+       }
+
+       if (dst->Indirect)
+               flags |= IR3_REG_RELATIV;
+
+       /* num is always zero here, so only the component is encoded: */
+       reg = ir3_reg_create(instr, regid(num, chan), flags);
+
+       /* NOTE: do not call ssa_dst() if atomic.. vectorize()
+        * itself will call ssa_dst().  This is to filter out
+        * the (initially bogus) .x component dst which is
+        * created (but not necessarily used, ie. if the net
+        * result of the vector operation does not write to
+        * the .x component)
+        */
+
+       reg->wrmask = wrmask;
+       if (wrmask == 0x1) {
+               /* normal case */
+               if (!ctx->atomic)
+                       ssa_dst(ctx, instr, dst, chan);
+       } else if ((dst->File == TGSI_FILE_TEMPORARY) ||
+                       (dst->File == TGSI_FILE_OUTPUT) ||
+                       (dst->File == TGSI_FILE_ADDRESS)) {
+               unsigned i;
+
+               /* if instruction writes multiple, we need to create
+                * some place-holders to collect the registers:
+                */
+               for (i = 0; i < 4; i++) {
+                       if (wrmask & (1 << i)) {
+                               struct ir3_instruction *collect =
+                                               ir3_instr_create(ctx->block, -1, OPC_META_FO);
+                               collect->fo.off = i;
+                               /* unused dst reg: */
+                               ir3_reg_create(collect, 0, 0);
+                               /* and src reg used to hold original instr */
+                               ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = instr;
+                               if (!ctx->atomic)
+                                       ssa_dst(ctx, collect, dst, chan+i);
+                       }
+               }
+       }
+
+       return reg;
+}
+
+/* Append a dst register writing the single component 'chan'. */
+static struct ir3_register *
+add_dst_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
+               const struct tgsi_dst_register *dst, unsigned chan)
+{
+       const unsigned single_component = 0x1;
+
+       return add_dst_reg_wrmask(ctx, instr, dst, chan, single_component);
+}
+
+/* Append a src register for component(s) of a TGSI src to 'instr'.
+ *
+ * 'chan' is the (first) component read, and 'wrmask' selects which of the
+ * components starting at 'chan' are read.  CONSTANT and IMMEDIATE files
+ * map to const registers (immediates are placed after the constants, at
+ * so->first_immediate).  INPUT, OUTPUT and TEMPORARY files are resolved to
+ * SSA srcs; with a wrmask other than 0x1 the components are gathered by
+ * an OPC_META_FI place-holder instruction.  An indirect src is wrapped in
+ * an OPC_META_DEREF which also references the block's address
+ * instruction.
+ *
+ * Returns the register which was added to 'instr'.
+ */
+static struct ir3_register *
+add_src_reg_wrmask(struct ir3_compile_context *ctx,
+               struct ir3_instruction *instr, const struct tgsi_src_register *src,
+               unsigned chan, unsigned wrmask)
+{
+       unsigned flags = 0, num = 0;
+       struct ir3_register *reg;
+       struct ir3_instruction *orig = NULL;
+
+       /* TODO we need to use a mov to temp for const >= 64.. or maybe
+        * we could use relative addressing..
+        */
+       compile_assert(ctx, src->Index < 64);
+
+       switch (src->File) {
+       case TGSI_FILE_IMMEDIATE:
+               /* TODO if possible, use actual immediate instead of const.. but
+                * TGSI has vec4 immediates, we can only embed scalar (of limited
+                * size, depending on instruction..)
+                */
+               flags |= IR3_REG_CONST;
+               num = src->Index + ctx->so->first_immediate;
+               break;
+       case TGSI_FILE_CONSTANT:
+               flags |= IR3_REG_CONST;
+               num = src->Index;
+               break;
+       case TGSI_FILE_OUTPUT:
+               /* NOTE: we should only end up w/ OUTPUT file for things like
+                * clamp()'ing saturated dst instructions
+                */
+       case TGSI_FILE_INPUT:
+       case TGSI_FILE_TEMPORARY:
+               /* uses SSA */
+               break;
+       default:
+               compile_error(ctx, "unsupported src register file: %s\n",
+                       tgsi_file_name(src->File));
+               break;
+       }
+
+       if (src->Absolute)
+               flags |= IR3_REG_ABS;
+       if (src->Negate)
+               flags |= IR3_REG_NEGATE;
+
+       if (src->Indirect) {
+               flags |= IR3_REG_RELATIV;
+
+               /* shouldn't happen, and we can't cope with it below: */
+               compile_assert(ctx, wrmask == 0x1);
+
+               /* wrap in a meta-deref to track both the src and address: */
+               orig = instr;
+
+               instr = ir3_instr_create(ctx->block, -1, OPC_META_DEREF);
+               ir3_reg_create(instr, 0, 0);
+               ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->block->address;
+       }
+
+       reg = ir3_reg_create(instr, regid(num, chan), flags);
+
+       reg->wrmask = wrmask;
+       if (wrmask == 0x1) {
+               /* normal case */
+               ssa_src(ctx, reg, src, chan);
+       } else if ((src->File == TGSI_FILE_TEMPORARY) ||
+                       (src->File == TGSI_FILE_OUTPUT) ||
+                       (src->File == TGSI_FILE_INPUT)) {
+               struct ir3_instruction *collect;
+               unsigned i;
+
+               compile_assert(ctx, !src->Indirect);
+
+               /* if instruction reads multiple, we need to create
+                * a place-holder to collect the registers:
+                */
+               collect = ir3_instr_create(ctx->block, -1, OPC_META_FI);
+               ir3_reg_create(collect, 0, 0);   /* unused dst reg */
+
+               for (i = 0; i < 4; i++) {
+                       if (wrmask & (1 << i)) {
+                               /* and src reg used to point to the original instr */
+                               ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
+                                               src, chan + i);
+                       } else if (wrmask & ~((1 << i) - 1)) {
+                               /* if any remaining components, then dummy
+                                * placeholder src reg to fill in the blanks.
+                                * The mask keeps bits i and above; since bit i
+                                * is not set here, this tests for any later
+                                * component (including when i == 0, so the
+                                * srcs stay aligned with their component):
+                                */
+                               ir3_reg_create(collect, 0, 0);
+                       }
+               }
+
+               reg->flags |= IR3_REG_SSA;
+               reg->instr = collect;
+       }
+
+       if (src->Indirect) {
+               /* the original instruction reads the result of the deref: */
+               reg = ir3_reg_create(orig, 0, flags | IR3_REG_SSA);
+               reg->instr = instr;
+       }
+       return reg;
+}
+
+/* Append a src register reading the single component 'chan'. */
+static struct ir3_register *
+add_src_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
+               const struct tgsi_src_register *src, unsigned chan)
+{
+       const unsigned single_component = 0x1;
+
+       return add_src_reg_wrmask(ctx, instr, src, chan, single_component);
+}
+
+/* Fill in 'src' so that it reads the register 'dst' refers to, with an
+ * identity swizzle and no abs/neg modifiers.
+ */
+static void
+src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst)
+{
+       /* same register.. */
+       src->File      = dst->File;
+       src->Index     = dst->Index;
+       src->Indirect  = dst->Indirect;
+       src->Dimension = dst->Dimension;
+
+       /* ..read unmodified.. */
+       src->Negate    = 0;
+       src->Absolute  = 0;
+
+       /* ..and unswizzled: */
+       src->SwizzleX  = TGSI_SWIZZLE_X;
+       src->SwizzleY  = TGSI_SWIZZLE_Y;
+       src->SwizzleZ  = TGSI_SWIZZLE_Z;
+       src->SwizzleW  = TGSI_SWIZZLE_W;
+}
+
+/* Allocate the next internal temporary, for use by the sequence of
+ * instructions generated for a single TGSI op.  'tmp_dst' is filled in as
+ * the dst for the temporary (all components writable), and the matching
+ * src is returned.
+ */
+static struct tgsi_src_register *
+get_internal_temp(struct ir3_compile_context *ctx,
+               struct tgsi_dst_register *tmp_dst)
+{
+       int n;
+
+       /* assign next temporary: */
+       n = ctx->num_internal_temps++;
+       compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps));
+
+       /* internal temporaries are numbered after the shader's own: */
+       tmp_dst->File      = TGSI_FILE_TEMPORARY;
+       tmp_dst->Index     = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1;
+       tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
+       tmp_dst->Indirect  = 0;
+       tmp_dst->Dimension = 0;
+
+       src_from_dst(&ctx->internal_temps[n], tmp_dst);
+
+       return &ctx->internal_temps[n];
+}
+
+/* True if the src is in the CONSTANT or IMMEDIATE register file. */
+static inline bool
+is_const(struct tgsi_src_register *src)
+{
+       switch (src->File) {
+       case TGSI_FILE_CONSTANT:
+       case TGSI_FILE_IMMEDIATE:
+               return true;
+       default:
+               return false;
+       }
+}
+
+/* True if the src is indirectly addressed. */
+static inline bool
+is_relative(struct tgsi_src_register *src)
+{
+       return src->Indirect != 0;
+}
+
+/* True if the src is indirectly addressed, or is a const/immediate. */
+static inline bool
+is_rel_or_const(struct tgsi_src_register *src)
+{
+       if (is_relative(src))
+               return true;
+
+       return is_const(src);
+}
+
+/* Float type used for generated instructions; currently always 32 bit,
+ * regardless of 'ctx'.
+ */
+static type_t
+get_ftype(struct ir3_compile_context *ctx)
+{
+       const type_t float_type = TYPE_F32;
+
+       return float_type;
+}
+
+/* Unsigned integer type used for generated instructions; currently
+ * always 32 bit, regardless of 'ctx'.
+ */
+static type_t
+get_utype(struct ir3_compile_context *ctx)
+{
+       const type_t uint_type = TYPE_U32;
+
+       return uint_type;
+}
+
+/* Return the swizzle which the src applies to component 'chan' (0..3).
+ * Any other 'chan' trips an assert and yields 0.
+ */
+static unsigned
+src_swiz(struct tgsi_src_register *src, int chan)
+{
+       const unsigned swizzles[4] = {
+                       src->SwizzleX, src->SwizzleY, src->SwizzleZ, src->SwizzleW,
+       };
+
+       if ((chan < 0) || (chan > 3)) {
+               assert(0);
+               return 0;
+       }
+
+       return swizzles[chan];
+}
+
+/* For instructions which cannot take a const (or relative) register as
+ * src: copy 'src' into a fresh internal temporary with a mov, and return
+ * the temporary as the src to use instead.
+ */
+static struct tgsi_src_register *
+get_unconst(struct ir3_compile_context *ctx, struct tgsi_src_register *src)
+{
+       struct tgsi_dst_register copy_dst;
+       struct tgsi_src_register *copy_src;
+
+       compile_assert(ctx, is_rel_or_const(src));
+
+       copy_src = get_internal_temp(ctx, &copy_dst);
+       create_mov(ctx, &copy_dst, src);
+
+       return copy_src;
+}
+
+/* Fill in 'reg' as a TGSI_FILE_IMMEDIATE src which replicates the scalar
+ * value 'val' to all four components.
+ *
+ * The shader variant's immediates are searched for a slot already holding
+ * 'val', or holding 'val' with the sign flipped (in which case the slot
+ * is reused with the Negate modifier set).  If neither is found, 'val' is
+ * appended as a new compiler generated immediate.
+ *
+ * NOTE(review): 'val' is treated as the raw bits of a 32 bit float, which
+ * matches create_clamp_imm() feeding these to float min/max.  Negating a
+ * float flips only the sign bit; the two's complement '-val' used before
+ * matched unrelated values (the negation of the bits of 1.0 is the bit
+ * pattern of -4.0).  Confirm no caller passes integer values expecting
+ * integer negation.
+ */
+static void
+get_immediate(struct ir3_compile_context *ctx,
+               struct tgsi_src_register *reg, uint32_t val)
+{
+       const uint32_t negval = val ^ 0x80000000;
+       unsigned neg = 0, swiz = 0, idx = 0, i;
+       /* actually maps 1:1 currently.. not sure if that is safe to rely on: */
+       static const unsigned swiz2tgsi[] = {
+                       TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W,
+       };
+
+       /* immediate_idx counts scalar slots, four per immediates[] entry: */
+       for (i = 0; i < ctx->immediate_idx; i++) {
+               swiz = i % 4;
+               idx  = i / 4;
+
+               if (ctx->so->immediates[idx].val[swiz] == val) {
+                       neg = 0;
+                       break;
+               }
+
+               if (ctx->so->immediates[idx].val[swiz] == negval) {
+                       neg = 1;
+                       break;
+               }
+       }
+
+       if (i == ctx->immediate_idx) {
+               /* need to generate a new immediate: */
+               swiz = i % 4;
+               idx  = i / 4;
+               neg  = 0;
+               ctx->so->immediates[idx].val[swiz] = val;
+               ctx->so->immediates_count = idx + 1;
+               ctx->immediate_idx++;
+       }
+
+       reg->File      = TGSI_FILE_IMMEDIATE;
+       reg->Indirect  = 0;
+       reg->Dimension = 0;
+       reg->Index     = idx;
+       reg->Absolute  = 0;
+       reg->Negate    = neg;
+       reg->SwizzleX  = swiz2tgsi[swiz];
+       reg->SwizzleY  = swiz2tgsi[swiz];
+       reg->SwizzleZ  = swiz2tgsi[swiz];
+       reg->SwizzleW  = swiz2tgsi[swiz];
+}
+
+/* Emit one scalar move per component enabled in dst->WriteMask, copying
+ * the (swizzled) src component into the dst component.
+ */
+static void
+create_mov(struct ir3_compile_context *ctx, struct tgsi_dst_register *dst,
+               struct tgsi_src_register *src)
+{
+       const type_t type_mov = get_ftype(ctx);
+       unsigned chan;
+
+       for (chan = 0; chan < 4; chan++) {
+               struct ir3_instruction *mov;
+
+               /* skip components which are not written: */
+               if (!(dst->WriteMask & (1 << chan)))
+                       continue;
+
+               if (!src->Absolute && !src->Negate) {
+                       mov = instr_create(ctx, 1, 0);
+                       mov->cat1.src_type = type_mov;
+                       mov->cat1.dst_type = type_mov;
+               } else {
+                       /* can't have abs or neg on a mov instr, so use
+                        * absneg.f instead to handle these cases:
+                        */
+                       mov = instr_create(ctx, 2, OPC_ABSNEG_F);
+               }
+
+               /* move to destination: */
+               add_dst_reg(ctx, mov, dst, chan);
+               add_src_reg(ctx, mov, src, src_swiz(src, chan));
+       }
+}
+
+/* Emit dst = min(max(val, minval), maxval), component-wise.
+ *
+ * The second step reads the result of the first back from 'dst' rather
+ * than reading 'val' again.  Reading 'val' is only correct when 'val' is
+ * the same register as 'dst' (as in create_clamp_imm()); for any other
+ * 'val' it would discard the lower bound.
+ */
+static void
+create_clamp(struct ir3_compile_context *ctx,
+               struct tgsi_dst_register *dst, struct tgsi_src_register *val,
+               struct tgsi_src_register *minval, struct tgsi_src_register *maxval)
+{
+       struct ir3_instruction *instr;
+       struct tgsi_src_register clamped;
+
+       /* dst = max(val, minval) */
+       instr = instr_create(ctx, 2, OPC_MAX_F);
+       vectorize(ctx, instr, dst, 2, val, 0, minval, 0);
+
+       /* dst = min(dst, maxval) */
+       src_from_dst(&clamped, dst);
+       instr = instr_create(ctx, 2, OPC_MIN_F);
+       vectorize(ctx, instr, dst, 2, &clamped, 0, maxval, 0);
+}
+
+/* Clamp 'dst' in place to the range given by the two immediate values. */
+static void
+create_clamp_imm(struct ir3_compile_context *ctx,
+               struct tgsi_dst_register *dst,
+               uint32_t minval, uint32_t maxval)
+{
+       struct tgsi_src_register cur_val;
+       struct tgsi_src_register lower, upper;
+
+       /* the value to clamp is whatever dst currently holds: */
+       src_from_dst(&cur_val, dst);
+
+       get_immediate(ctx, &lower, minval);
+       get_immediate(ctx, &upper, maxval);
+
+       create_clamp(ctx, dst, &cur_val, &lower, &upper);
+}
+
+/* Pick the dst to use while translating 'inst'.
+ *
+ * Normally that is the instruction's own dst.  If one of the srcs reads
+ * the same register (other than the case of all four components being
+ * written from an unswizzled src), an internal temporary is returned
+ * instead, to be copied back by put_dst().
+ */
+static struct tgsi_dst_register *
+get_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst)
+{
+       struct tgsi_dst_register *dst = &inst->Dst[0].Register;
+       unsigned i;
+
+       for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
+               struct tgsi_src_register *src = &inst->Src[i].Register;
+
+               /* src is a different register, no conflict: */
+               if ((src->File != dst->File) || (src->Index != dst->Index))
+                       continue;
+
+               /* each component only reads itself, no conflict: */
+               if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) &&
+                               (src->SwizzleX == TGSI_SWIZZLE_X) &&
+                               (src->SwizzleY == TGSI_SWIZZLE_Y) &&
+                               (src->SwizzleZ == TGSI_SWIZZLE_Z) &&
+                               (src->SwizzleW == TGSI_SWIZZLE_W))
+                       continue;
+
+               ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst);
+               ctx->tmp_dst.WriteMask = dst->WriteMask;
+
+               return &ctx->tmp_dst;
+       }
+
+       return dst;
+}
+
+/* Counterpart of get_dst(): if 'dst' is not the instruction's own dst,
+ * move the result from ctx->tmp_src into the real dst.
+ */
+static void
+put_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst,
+               struct tgsi_dst_register *dst)
+{
+       struct tgsi_dst_register *real_dst = &inst->Dst[0].Register;
+
+       if (dst == real_dst)
+               return;
+
+       /* add mov back into original dst: */
+       create_mov(ctx, real_dst, ctx->tmp_src);
+}
+
+/* helper to generate the necessary repeat and/or additional instructions
+ * to turn a scalar instruction into a vector operation:
+ *
+ * 'instr' is the scalar instruction to replicate; the regs[0] / regs[j+1]
+ * indexing below relies on no registers having been attached to it yet
+ * (callers in this file pass it straight from instr_create()).
+ *
+ * The variadic tail is 'nsrcs' pairs of:
+ *     struct tgsi_src_register *src, unsigned flags
+ * If 'flags' contains IR3_REG_IMMED, the 'src' argument is not a real
+ * pointer: the immediate integer value is read back out of the storage
+ * of that pointer argument.
+ *
+ * One instruction is produced per component enabled in dst->WriteMask:
+ * the first one is 'instr' itself, each further one is a clone of it.
+ */
+static void
+vectorize(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
+               struct tgsi_dst_register *dst, int nsrcs, ...)
+{
+       va_list ap;
+       int i, j, n = 0;
+
+       instr_atomic_start(ctx);
+
+       /* dst goes in first (regs[0]); component is patched up per-clone below: */
+       add_dst_reg(ctx, instr, dst, TGSI_SWIZZLE_X);
+
+       /* first pass over the varargs: attach one src register per pair */
+       va_start(ap, nsrcs);
+       for (j = 0; j < nsrcs; j++) {
+               struct tgsi_src_register *src =
+                               va_arg(ap, struct tgsi_src_register *);
+               unsigned flags = va_arg(ap, unsigned);
+               struct ir3_register *reg;
+               if (flags & IR3_REG_IMMED) {
+                       reg = ir3_reg_create(instr, 0, IR3_REG_IMMED);
+                       /* this is an ugly cast.. should have put flags first! */
+                       reg->iim_val = *(int *)&src;
+               } else {
+                       reg = add_src_reg(ctx, instr, src, TGSI_SWIZZLE_X);
+               }
+               /* NEGATE is toggled rather than set, so a caller-requested
+                * negate cancels one already present on the register
+                * (presumably from the TGSI src modifier -- TODO confirm
+                * against add_src_reg()):
+                */
+               reg->flags |= flags & ~IR3_REG_NEGATE;
+               if (flags & IR3_REG_NEGATE)
+                       reg->flags ^= IR3_REG_NEGATE;
+       }
+       va_end(ap);
+
+       /* one instruction per enabled dst component: */
+       for (i = 0; i < 4; i++) {
+               if (dst->WriteMask & (1 << i)) {
+                       struct ir3_instruction *cur;
+
+                       /* first enabled component re-uses the template itself: */
+                       if (n++ == 0) {
+                               cur = instr;
+                       } else {
+                               cur = instr_clone(ctx, instr);
+                       }
+
+                       ssa_dst(ctx, cur, dst, i);
+
+                       /* fix-up dst register component: */
+                       cur->regs[0]->num = regid(cur->regs[0]->num >> 2, i);
+
+                       /* fix-up src register component: */
+                       /* (second pass over the same varargs, same pair layout) */
+                       va_start(ap, nsrcs);
+                       for (j = 0; j < nsrcs; j++) {
+                               struct ir3_register *reg = cur->regs[j+1];
+                               struct tgsi_src_register *src =
+                                               va_arg(ap, struct tgsi_src_register *);
+                               unsigned flags = va_arg(ap, unsigned);
+                               if (reg->flags & IR3_REG_SSA) {
+                                       ssa_src(ctx, reg, src, src_swiz(src, i));
+                               } else if (!(flags & IR3_REG_IMMED)) {
+                                       /* immediates have no component to patch */
+                                       reg->num = regid(reg->num >> 2, src_swiz(src, i));
+                               }
+                       }
+                       va_end(ap);
+               }
+       }
+
+       instr_atomic_end(ctx);
+}
+
+/*
+ * Handlers for TGSI instructions which do not have a 1:1 mapping to
+ * native instructions:
+ */
+
+/* CLAMP: hand Src[0], Src[1] and Src[2] to create_clamp(), going
+ * through get_dst()/put_dst() in case the destination overlaps one of
+ * the sources.
+ */
+static void
+trans_clamp(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       struct tgsi_dst_register *dst = get_dst(ctx, inst);
+
+       create_clamp(ctx, dst, &inst->Src[0].Register,
+                       &inst->Src[1].Register, &inst->Src[2].Register);
+       put_dst(ctx, inst, dst);
+}
+
+/* ARL(x) = x, but mova from hrN.x to a0..
+ *
+ * Emits three instructions: a float -> s16 conversion of the selected
+ * source channel into a half-precision temporary, a shift left by 2 of
+ * that temporary, and finally the move of the temporary into the
+ * address register destination.  Only the .x swizzle of the source is
+ * consulted.
+ */
+static void
+trans_arl(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       struct ir3_instruction *instr;
+       struct tgsi_dst_register tmp_dst;
+       struct tgsi_src_register *tmp_src;
+       struct tgsi_dst_register *dst = &inst->Dst[0].Register;
+       struct tgsi_src_register *src = &inst->Src[0].Register;
+       unsigned chan = src->SwizzleX;
+
+       /* destination must be the address register file: */
+       compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS);
+
+       /* NOTE: we allocate a temporary from a flat register
+        * namespace (ignoring half vs full).  It turns out
+        * not to really matter since registers get reassigned
+        * later in ir3_ra which (hopefully!) can deal a bit
+        * better with mixed half and full precision.
+        */
+       tmp_src = get_internal_temp(ctx, &tmp_dst);
+
+       /* cov.f{32,16}s16 Rtmp, Rsrc */
+       instr = instr_create(ctx, 1, 0);
+       instr->cat1.src_type = get_ftype(ctx);
+       instr->cat1.dst_type = TYPE_S16;
+       add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
+       add_src_reg(ctx, instr, src, chan);
+
+       /* shl.b Rtmp, Rtmp, 2 */
+       instr = instr_create(ctx, 2, OPC_SHL_B);
+       add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
+       add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
+       ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;
+
+       /* mova a0, Rtmp */
+       instr = instr_create(ctx, 1, 0);
+       instr->cat1.src_type = TYPE_S16;
+       instr->cat1.dst_type = TYPE_S16;
+       add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF;
+       add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
+}
+
+/*
+ * texture fetch/sample instructions:
+ */
+
+/* Per texture-target description of how the coordinate operand of a
+ * sample instruction is laid out (see get_tex_info() for the table).
+ */
+struct tex_info {
+       /* order[j] is the coord component that must end up in slot j of
+        * the sample instruction's source; a negative value marks a slot
+        * that is not copied from the coord by get_tex_coord().
+        */
+       int8_t order[4];
+       /* src_wrmask: mask handed to add_src_reg_wrmask() for the coord;
+        * flags: IR3_INSTR_* bits or'd into the sample instruction's flags.
+        */
+       unsigned src_wrmask, flags;
+};
+
+/* Look up the coordinate layout for a TEX or TXP instruction based on
+ * its texture target (inst->Texture.Texture).
+ *
+ * Returns a pointer to one of the static tables below, or NULL (after
+ * flagging a compile error/assert) for an opcode or texture target
+ * that has no entry.  Note there is no TXP entry for SHADOWCUBE.
+ */
+static const struct tex_info *
+get_tex_info(struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       /* naming: tex/txp = opcode, 1d/2d/3d = dimensionality, trailing
+        * 's' = shadow variant:
+        */
+       static const struct tex_info tex1d = {
+               .order = { 0, -1, -1, -1 },  /* coord.x */
+               .src_wrmask = TGSI_WRITEMASK_XY,
+               .flags = 0,
+       };
+       static const struct tex_info tex1ds = {
+               .order = { 0, -1,  2, -1 },  /* coord.xz */
+               .src_wrmask = TGSI_WRITEMASK_XYZ,
+               .flags = IR3_INSTR_S,
+       };
+       static const struct tex_info tex2d = {
+               .order = { 0,  1, -1, -1 },  /* coord.xy */
+               .src_wrmask = TGSI_WRITEMASK_XY,
+               .flags = 0,
+       };
+       static const struct tex_info tex2ds = {
+               .order = { 0,  1,  2, -1 },  /* coord.xyz */
+               .src_wrmask = TGSI_WRITEMASK_XYZ,
+               .flags = IR3_INSTR_S,
+       };
+       static const struct tex_info tex3d = {
+               .order = { 0,  1,  2, -1 },  /* coord.xyz */
+               .src_wrmask = TGSI_WRITEMASK_XYZ,
+               .flags = IR3_INSTR_3D,
+       };
+       static const struct tex_info tex3ds = {
+               .order = { 0,  1,  2,  3 },  /* coord.xyzw */
+               .src_wrmask = TGSI_WRITEMASK_XYZW,
+               .flags = IR3_INSTR_S | IR3_INSTR_3D,
+       };
+       static const struct tex_info txp1d = {
+               .order = { 0, -1,  3, -1 },  /* coord.xw */
+               .src_wrmask = TGSI_WRITEMASK_XYZ,
+               .flags = IR3_INSTR_P,
+       };
+       static const struct tex_info txp1ds = {
+               .order = { 0, -1,  2,  3 },  /* coord.xzw */
+               .src_wrmask = TGSI_WRITEMASK_XYZW,
+               .flags = IR3_INSTR_P | IR3_INSTR_S,
+       };
+       static const struct tex_info txp2d = {
+               .order = { 0,  1,  3, -1 },  /* coord.xyw */
+               .src_wrmask = TGSI_WRITEMASK_XYZ,
+               .flags = IR3_INSTR_P,
+       };
+       static const struct tex_info txp2ds = {
+               .order = { 0,  1,  2,  3 },  /* coord.xyzw */
+               .src_wrmask = TGSI_WRITEMASK_XYZW,
+               .flags = IR3_INSTR_P | IR3_INSTR_S,
+       };
+       static const struct tex_info txp3d = {
+               .order = { 0,  1,  2,  3 },  /* coord.xyzw */
+               .src_wrmask = TGSI_WRITEMASK_XYZW,
+               .flags = IR3_INSTR_P | IR3_INSTR_3D,
+       };
+
+       unsigned tex = inst->Texture.Texture;
+
+       switch (inst->Instruction.Opcode) {
+       case TGSI_OPCODE_TEX:
+               switch (tex) {
+               case TGSI_TEXTURE_1D:
+                       return &tex1d;
+               case TGSI_TEXTURE_SHADOW1D:
+                       return &tex1ds;
+               case TGSI_TEXTURE_2D:
+               case TGSI_TEXTURE_RECT:
+                       return &tex2d;
+               case TGSI_TEXTURE_SHADOW2D:
+               case TGSI_TEXTURE_SHADOWRECT:
+                       return &tex2ds;
+               case TGSI_TEXTURE_3D:
+               case TGSI_TEXTURE_CUBE:
+                       /* cube maps share the 3D layout */
+                       return &tex3d;
+               case TGSI_TEXTURE_SHADOWCUBE:
+                       return &tex3ds;
+               default:
+                       compile_error(ctx, "unknown texture type: %s\n",
+                                       tgsi_texture_names[tex]);
+                       return NULL;
+               }
+               break;
+       case TGSI_OPCODE_TXP:
+               switch (tex) {
+               case TGSI_TEXTURE_1D:
+                       return &txp1d;
+               case TGSI_TEXTURE_SHADOW1D:
+                       return &txp1ds;
+               case TGSI_TEXTURE_2D:
+               case TGSI_TEXTURE_RECT:
+                       return &txp2d;
+               case TGSI_TEXTURE_SHADOW2D:
+               case TGSI_TEXTURE_SHADOWRECT:
+                       return &txp2ds;
+               case TGSI_TEXTURE_3D:
+               case TGSI_TEXTURE_CUBE:
+                       return &txp3d;
+               default:
+                       compile_error(ctx, "unknown texture type: %s\n",
+                                       tgsi_texture_names[tex]);
+                       break;
+               }
+               break;
+       }
+       /* reached for an unhandled opcode, or an unknown TXP target: */
+       compile_assert(ctx, 0);
+       return NULL;
+}
+
+/* Return a source register holding the texture coordinate in the
+ * layout described by 'tinf'.
+ *
+ * If Src[0] can be used as-is it is returned unchanged.  Otherwise the
+ * needed components are copied, one mov per component, into an
+ * internal temporary and a pointer to that temporary is returned.
+ * 'tinf' is dereferenced unconditionally, so it must not be NULL.
+ */
+static struct tgsi_src_register *
+get_tex_coord(struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst,
+               const struct tex_info *tinf)
+{
+       struct tgsi_src_register *coord = &inst->Src[0].Register;
+       struct ir3_instruction *instr;
+       unsigned tex = inst->Texture.Texture;
+       bool needs_mov = false;
+       unsigned i;
+
+       /* cat5 instruction cannot seem to handle const or relative: */
+       if (is_rel_or_const(coord))
+               needs_mov = true;
+
+       /* 1D textures we fix up w/ 0.5 as 2nd coord: */
+       if ((tex == TGSI_TEXTURE_1D) || (tex == TGSI_TEXTURE_SHADOW1D))
+               needs_mov = true;
+
+       /* The texture sample instructions need to coord in successive
+        * registers/components (ie. src.xy but not src.yx).  And TXP
+        * needs the .w component in .z for 2D..  so in some cases we
+        * might need to emit some mov instructions to shuffle things
+        * around:
+        */
+       /* (the scan stops at the first unused slot, or as soon as a
+        * mismatch has been found)
+        */
+       for (i = 1; (i < 4) && (tinf->order[i] >= 0) && !needs_mov; i++)
+               if (src_swiz(coord, i) != (src_swiz(coord, 0) + tinf->order[i]))
+                       needs_mov = true;
+
+       if (needs_mov) {
+               struct tgsi_dst_register tmp_dst;
+               struct tgsi_src_register *tmp_src;
+               unsigned j;
+
+               type_t type_mov = get_ftype(ctx);
+
+               /* need to move things around: */
+               tmp_src = get_internal_temp(ctx, &tmp_dst);
+
+               /* one mov per used slot: tmp.<j> = coord.<swizzled order[j]> */
+               for (j = 0; j < 4; j++) {
+                       if (tinf->order[j] < 0)
+                               continue;
+                       instr = instr_create(ctx, 1, 0);  /* mov */
+                       instr->cat1.src_type = type_mov;
+                       instr->cat1.dst_type = type_mov;
+                       add_dst_reg(ctx, instr, &tmp_dst, j);
+                       add_src_reg(ctx, instr, coord,
+                                       src_swiz(coord, tinf->order[j]));
+               }
+
+               /* fix up .y coord: */
+               if ((tex == TGSI_TEXTURE_1D) ||
+                               (tex == TGSI_TEXTURE_SHADOW1D)) {
+                       instr = instr_create(ctx, 1, 0);  /* mov */
+                       instr->cat1.src_type = type_mov;
+                       instr->cat1.dst_type = type_mov;
+                       add_dst_reg(ctx, instr, &tmp_dst, 1);  /* .y */
+                       ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = 0.5;
+               }
+
+               coord = tmp_src;
+       }
+
+       return coord;
+}
+
+/* Texture sample: emit a single cat5 instruction using the
+ * translator's opcode.  Src[1] supplies the index used for both the
+ * sampler and the texture slot, and the coordinate comes from
+ * get_tex_coord().
+ */
+static void
+trans_samp(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       struct ir3_instruction *instr;
+       struct tgsi_dst_register *dst = &inst->Dst[0].Register;
+       struct tgsi_src_register *coord;
+       struct tgsi_src_register *samp  = &inst->Src[1].Register;
+       const struct tex_info *tinf;
+
+       /* NOTE(review): get_tex_info() can return NULL for an unknown
+        * texture target, and tinf is dereferenced below without a
+        * check -- confirm compile_error()/compile_assert() never
+        * return in that case.
+        */
+       tinf = get_tex_info(ctx, inst);
+       coord = get_tex_coord(ctx, inst, tinf);
+
+       instr = instr_create(ctx, 5, t->opc);
+       instr->cat5.type = get_ftype(ctx);
+       /* same index is used for sampler and texture: */
+       instr->cat5.samp = samp->Index;
+       instr->cat5.tex  = samp->Index;
+       instr->flags |= tinf->flags;
+
+       add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask);
+       add_src_reg_wrmask(ctx, instr, coord, coord->SwizzleX, tinf->src_wrmask);
+}
+
+/*
+ * Float compare-and-set / compare-and-select.  Each opcode is lowered
+ * to a cmps.f into an internal temporary, followed by either a
+ * conversion of the result into the float type of the destination or
+ * (for CMP) a select:
+ *
+ * SEQ(a,b) = (a == b) ? 1.0 : 0.0
+ *   cmps.f.eq tmp0, a, b
+ *   cov.u16f16 dst, tmp0
+ *
+ * SNE(a,b) = (a != b) ? 1.0 : 0.0
+ *   cmps.f.ne tmp0, a, b
+ *   cov.u16f16 dst, tmp0
+ *
+ * SGE(a,b) = (a >= b) ? 1.0 : 0.0
+ *   cmps.f.ge tmp0, a, b
+ *   cov.u16f16 dst, tmp0
+ *
+ * SLE(a,b) = (a <= b) ? 1.0 : 0.0
+ *   cmps.f.le tmp0, a, b
+ *   cov.u16f16 dst, tmp0
+ *
+ * SGT(a,b) = (a > b)  ? 1.0 : 0.0
+ *   cmps.f.gt tmp0, a, b
+ *   cov.u16f16 dst, tmp0
+ *
+ * SLT(a,b) = (a < b)  ? 1.0 : 0.0
+ *   cmps.f.lt tmp0, a, b
+ *   cov.u16f16 dst, tmp0
+ *
+ * CMP(a,b,c) = (a < 0.0) ? b : c
+ *   cmps.f.lt tmp0, a, {0.0}
+ *   sel.b16 dst, b, tmp0, c
+ *
+ * The FSEQ/FSNE/FSGE/FSLT opcodes take the same path as their
+ * SEQ/SNE/SGE/SLT counterparts.
+ */
+static void
+trans_cmp(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       struct ir3_instruction *instr;
+       struct tgsi_dst_register tmp_dst;
+       struct tgsi_src_register *tmp_src;
+       struct tgsi_src_register constval0;
+       /* final instruction for CMP() uses orig src1 and src2: */
+       struct tgsi_dst_register *dst = get_dst(ctx, inst);
+       struct tgsi_src_register *a0, *a1, *a2;
+       unsigned condition;
+
+       /* temporary that receives the raw compare result: */
+       tmp_src = get_internal_temp(ctx, &tmp_dst);
+
+       a0 = &inst->Src[0].Register;  /* a */
+       a1 = &inst->Src[1].Register;  /* b */
+
+       /* pick the compare condition (and for CMP, compare against 0.0): */
+       switch (t->tgsi_opc) {
+       case TGSI_OPCODE_SEQ:
+       case TGSI_OPCODE_FSEQ:
+               condition = IR3_COND_EQ;
+               break;
+       case TGSI_OPCODE_SNE:
+       case TGSI_OPCODE_FSNE:
+               condition = IR3_COND_NE;
+               break;
+       case TGSI_OPCODE_SGE:
+       case TGSI_OPCODE_FSGE:
+               condition = IR3_COND_GE;
+               break;
+       case TGSI_OPCODE_SLT:
+       case TGSI_OPCODE_FSLT:
+               condition = IR3_COND_LT;
+               break;
+       case TGSI_OPCODE_SLE:
+               condition = IR3_COND_LE;
+               break;
+       case TGSI_OPCODE_SGT:
+               condition = IR3_COND_GT;
+               break;
+       case TGSI_OPCODE_CMP:
+               get_immediate(ctx, &constval0, fui(0.0));
+               a0 = &inst->Src[0].Register;  /* a */
+               a1 = &constval0;              /* {0.0} */
+               condition = IR3_COND_LT;
+               break;
+       default:
+               compile_assert(ctx, 0);
+               return;
+       }
+
+       /* both operands being const is avoided by moving a0 out of the
+        * const file first:
+        */
+       if (is_const(a0) && is_const(a1))
+               a0 = get_unconst(ctx, a0);
+
+       /* cmps.f.<cond> tmp, a0, a1 */
+       instr = instr_create(ctx, 2, OPC_CMPS_F);
+       instr->cat2.condition = condition;
+       vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);
+
+       /* second step: turn the compare result into the final value: */
+       switch (t->tgsi_opc) {
+       case TGSI_OPCODE_SEQ:
+       case TGSI_OPCODE_FSEQ:
+       case TGSI_OPCODE_SGE:
+       case TGSI_OPCODE_FSGE:
+       case TGSI_OPCODE_SLE:
+       case TGSI_OPCODE_SNE:
+       case TGSI_OPCODE_FSNE:
+       case TGSI_OPCODE_SGT:
+       case TGSI_OPCODE_SLT:
+       case TGSI_OPCODE_FSLT:
+               /* cov.u16f16 dst, tmp0 */
+               instr = instr_create(ctx, 1, 0);
+               instr->cat1.src_type = get_utype(ctx);
+               instr->cat1.dst_type = get_ftype(ctx);
+               vectorize(ctx, instr, dst, 1, tmp_src, 0);
+               break;
+       case TGSI_OPCODE_CMP:
+               a1 = &inst->Src[1].Register;
+               a2 = &inst->Src[2].Register;
+               /* sel.{b32,b16} dst, src1, tmp, src2 */
+               instr = instr_create(ctx, 3, OPC_SEL_B32);
+               vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0);
+
+               break;
+       }
+
+       put_dst(ctx, inst, dst);
+}
+
+/*
+ * Integer compare-and-set / compare-and-select.  The compare opcode
+ * (signed vs unsigned cmps) comes from the translator table entry
+ * (t->opc); only the condition is chosen here:
+ *
+ * USNE(a,b) = (a != b) ? 1 : 0
+ *   cmps.u32.ne dst, a, b
+ *
+ * USEQ(a,b) = (a == b) ? 1 : 0
+ *   cmps.u32.eq dst, a, b
+ *
+ * ISGE(a,b) = (a >= b) ? 1 : 0
+ *   cmps.s32.ge dst, a, b
+ *
+ * USGE(a,b) = (a >= b) ? 1 : 0
+ *   cmps.u32.ge dst, a, b
+ *
+ * ISLT(a,b) = (a < b) ? 1 : 0
+ *   cmps.s32.lt dst, a, b
+ *
+ * USLT(a,b) = (a < b) ? 1 : 0
+ *   cmps.u32.lt dst, a, b
+ *
+ * UCMP(a,b,c) = (a != 0) ? b : c
+ *   cmps.u32.ne tmp0, a, {0}
+ *   sel.b16 dst, b, tmp0, c
+ */
+static void
+trans_icmp(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       struct ir3_instruction *instr;
+       struct tgsi_dst_register *dst = get_dst(ctx, inst);
+       struct tgsi_src_register constval0;
+       struct tgsi_src_register *a0, *a1, *a2;
+       unsigned condition;
+
+       a0 = &inst->Src[0].Register;  /* a */
+       a1 = &inst->Src[1].Register;  /* b */
+
+       switch (t->tgsi_opc) {
+       case TGSI_OPCODE_USNE:
+               condition = IR3_COND_NE;
+               break;
+       case TGSI_OPCODE_USEQ:
+               condition = IR3_COND_EQ;
+               break;
+       case TGSI_OPCODE_ISGE:
+       case TGSI_OPCODE_USGE:
+               condition = IR3_COND_GE;
+               break;
+       case TGSI_OPCODE_ISLT:
+       case TGSI_OPCODE_USLT:
+               condition = IR3_COND_LT;
+               break;
+       case TGSI_OPCODE_UCMP:
+               /* UCMP selects on "a is non-zero".  An unsigned "a < 0"
+                * test can never be true and would always pick c, so
+                * compare for inequality against zero instead:
+                */
+               get_immediate(ctx, &constval0, 0);
+               a0 = &inst->Src[0].Register;  /* a */
+               a1 = &constval0;              /* {0} */
+               condition = IR3_COND_NE;
+               break;
+
+       default:
+               compile_assert(ctx, 0);
+               return;
+       }
+
+       /* both operands being const is avoided by moving a0 out of the
+        * const file first:
+        */
+       if (is_const(a0) && is_const(a1))
+               a0 = get_unconst(ctx, a0);
+
+       if (t->tgsi_opc == TGSI_OPCODE_UCMP) {
+               struct tgsi_dst_register tmp_dst;
+               struct tgsi_src_register *tmp_src;
+               tmp_src = get_internal_temp(ctx, &tmp_dst);
+               /* cmps.u32.ne tmp, a0, a1 */
+               instr = instr_create(ctx, 2, t->opc);
+               instr->cat2.condition = condition;
+               vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);
+
+               a1 = &inst->Src[1].Register;
+               a2 = &inst->Src[2].Register;
+               /* sel.{b32,b16} dst, src1, tmp, src2 */
+               instr = instr_create(ctx, 3, OPC_SEL_B32);
+               vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0);
+       } else {
+               /* cmps.{u32,s32}.<cond> dst, a0, a1 */
+               instr = instr_create(ctx, 2, t->opc);
+               instr->cat2.condition = condition;
+               vectorize(ctx, instr, dst, 2, a0, 0, a1, 0);
+       }
+       put_dst(ctx, inst, dst);
+}
+
+/*
+ * Conditional / Flow control
+ */
+
+/* Record a new innermost branch on the ctx->branch stack.
+ *
+ * 'instr' is the flow instruction of the branch and 'inv' marks the
+ * else side.  'cond' is only stored for the if side (inv == false);
+ * for the else side the slot keeps whatever condition it already
+ * holds.
+ */
+static void
+push_branch(struct ir3_compile_context *ctx, bool inv,
+               struct ir3_instruction *instr, struct ir3_instruction *cond)
+{
+       unsigned int slot = ctx->branch_count++;
+
+       compile_assert(ctx, slot < ARRAY_SIZE(ctx->branch));
+
+       ctx->branch[slot].instr = instr;
+       ctx->branch[slot].inv = inv;
+
+       /* else side of branch has same condition, leave it untouched: */
+       if (inv)
+               return;
+
+       ctx->branch[slot].cond = cond;
+}
+
+/* Drop the innermost entry from the ctx->branch stack and return the
+ * flow instruction that was recorded for it.
+ */
+static struct ir3_instruction *
+pop_branch(struct ir3_compile_context *ctx)
+{
+       unsigned int top;
+
+       top = --ctx->branch_count;
+
+       return ctx->branch[top].instr;
+}
+
+/* IF: compare Src[0].x against 0.0 into an internal temporary, emit a
+ * meta:flow instruction consuming that temporary, push the branch and
+ * open a new block for the if side.
+ *
+ * The instruction feeding the compare's first source is remembered as
+ * the branch condition (see push_branch()) so that trans_kill() can
+ * reuse it.
+ */
+static void
+trans_if(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       struct ir3_instruction *instr, *cond;
+       struct tgsi_src_register *src = &inst->Src[0].Register;
+       struct tgsi_dst_register tmp_dst;
+       struct tgsi_src_register *tmp_src;
+       struct tgsi_src_register constval;
+
+       get_immediate(ctx, &constval, fui(0.0));
+       tmp_src = get_internal_temp(ctx, &tmp_dst);
+
+       /* the compare already has one const operand ({0.0}): */
+       if (is_const(src))
+               src = get_unconst(ctx, src);
+
+       /* cmps.f.ne tmp0, b, {0.0} */
+       instr = instr_create(ctx, 2, OPC_CMPS_F);
+       add_dst_reg(ctx, instr, &tmp_dst, 0);
+       add_src_reg(ctx, instr, src, src->SwizzleX);
+       add_src_reg(ctx, instr, &constval, constval.SwizzleX);
+       instr->cat2.condition = IR3_COND_NE;
+
+       compile_assert(ctx, instr->regs[1]->flags & IR3_REG_SSA); /* because get_unconst() */
+       /* condition = instruction that produced the value being tested: */
+       cond = instr->regs[1]->instr;
+
+       /* meta:flow tmp0 */
+       instr = instr_create(ctx, -1, OPC_META_FLOW);
+       ir3_reg_create(instr, 0, 0);  /* dummy dst */
+       add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X);
+
+       push_branch(ctx, false, instr, cond);
+       instr->flow.if_block = push_block(ctx);
+}
+
+/* ELSE: close the if-side block, then re-push the same flow
+ * instruction as the inverted (else) side of the branch and open a
+ * new block for it.
+ */
+static void
+trans_else(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       struct ir3_instruction *instr;
+
+       pop_block(ctx);
+
+       instr = pop_branch(ctx);
+
+       /* the popped entry must be the meta:flow pushed by trans_if(): */
+       compile_assert(ctx, (instr->category == -1) &&
+                       (instr->opc == OPC_META_FLOW));
+
+       /* cond is passed as NULL: push_branch() ignores it when inv is set */
+       push_branch(ctx, true, instr, NULL);
+       instr->flow.else_block = push_block(ctx);
+}
+
+/* Return the value of temporary 'n' from the closest block, starting
+ * at 'block' and walking up the parent chain, that has one.  If none
+ * does, the outermost block's (possibly NULL) entry is returned.
+ */
+static struct ir3_instruction *
+find_temporary(struct ir3_block *block, unsigned n)
+{
+       /* climb while there is a parent and this level has no value: */
+       while (block->parent && !block->temporaries[n])
+               block = block->parent;
+
+       return block->temporaries[n];
+}
+
+/* Return the value of output 'n' from the closest block, starting at
+ * 'block' and walking up the parent chain, that has one.  If none
+ * does, the outermost block's (possibly NULL) entry is returned.
+ */
+static struct ir3_instruction *
+find_output(struct ir3_block *block, unsigned n)
+{
+       /* climb while there is a parent and this level has no value: */
+       while (block->parent && !block->outputs[n])
+               block = block->parent;
+
+       return block->outputs[n];
+}
+
+/* Build a meta:phi instruction selecting between 'a' and 'b'.
+ *
+ * The phi gets a dummy dst followed by three SSA sources, in this
+ * order: cond, a, b.  'cond' must be non-NULL; a NULL 'a' or 'b' is
+ * replaced by an immediate 0.0.  Returns the new phi instruction.
+ */
+static struct ir3_instruction *
+create_phi(struct ir3_compile_context *ctx, struct ir3_instruction *cond,
+               struct ir3_instruction *a, struct ir3_instruction *b)
+{
+       struct ir3_instruction *phi;
+
+       compile_assert(ctx, cond);
+
+       /* Either side of the condition could be null..  which
+        * indicates a variable written on only one side of the
+        * branch.  Normally this should only be variables not
+        * used outside of that side of the branch.  So we could
+        * just 'return a ? a : b;' in that case.  But for better
+        * defined undefined behavior we just stick in imm{0.0}.
+        * In the common case of a value only used within the
+        * one side of the branch, the PHI instruction will not
+        * get scheduled
+        */
+       if (!a)
+               a = create_immed(ctx, 0.0);
+       if (!b)
+               b = create_immed(ctx, 0.0);
+
+       phi = instr_create(ctx, -1, OPC_META_PHI);
+       ir3_reg_create(phi, 0, 0);  /* dummy dst */
+       ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = cond;
+       ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = a;
+       ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = b;
+
+       return phi;
+}
+
+/* ENDIF: close the current block, pop the branch, and merge the two
+ * sides back into the enclosing block.
+ *
+ * For every temporary and every output written on either side a PHI
+ * is created (conditioned on the branch's flow instruction) and
+ * installed in the enclosing block (ctx->block).  The values flowing
+ * into each PHI are routed through create_output() on the if-block
+ * (and on the else-block when one exists), and the blocks' outputs
+ * arrays are replaced by the newly built, compacted lists.
+ */
+static void
+trans_endif(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       struct ir3_instruction *instr;
+       struct ir3_block *ifb, *elseb;
+       /* elseout is only allocated/used when a real else block exists: */
+       struct ir3_instruction **ifout, **elseout;
+       unsigned i, ifnout = 0, elsenout = 0;
+
+       pop_block(ctx);
+
+       instr = pop_branch(ctx);
+
+       compile_assert(ctx, (instr->category == -1) &&
+                       (instr->opc == OPC_META_FLOW));
+
+       ifb = instr->flow.if_block;
+       elseb = instr->flow.else_block;
+       /* if there is no else block, the parent block is used for the
+        * branch-not-taken src of the PHI instructions:
+        */
+       if (!elseb)
+               elseb = ifb->parent;
+
+       /* worst case sizes: */
+       ifnout = ifb->ntemporaries + ifb->noutputs;
+       elsenout = elseb->ntemporaries + elseb->noutputs;
+
+       ifout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * ifnout);
+       if (elseb != ifb->parent)
+               elseout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * elsenout);
+
+       /* from here on the counters track how many slots are filled: */
+       ifnout = 0;
+       elsenout = 0;
+
+       /* generate PHI instructions for any temporaries written: */
+       /* (elseb->temporaries is indexed with the if-block's count --
+        * presumably all blocks share ntemporaries; TODO confirm)
+        */
+       for (i = 0; i < ifb->ntemporaries; i++) {
+               struct ir3_instruction *a = ifb->temporaries[i];
+               struct ir3_instruction *b = elseb->temporaries[i];
+
+               /* if temporary written in if-block, or if else block
+                * is present and temporary written in else-block:
+                */
+               if (a || ((elseb != ifb->parent) && b)) {
+                       struct ir3_instruction *phi;
+
+                       /* if only written on one side, find the closest
+                        * enclosing update on other side:
+                        */
+                       if (!a)
+                               a = find_temporary(ifb, i);
+                       if (!b)
+                               b = find_temporary(elseb, i);
+
+                       ifout[ifnout] = a;
+                       a = create_output(ifb, a, ifnout++);
+
+                       if (elseb != ifb->parent) {
+                               elseout[elsenout] = b;
+                               b = create_output(elseb, b, elsenout++);
+                       }
+
+                       phi = create_phi(ctx, instr, a, b);
+                       ctx->block->temporaries[i] = phi;
+               }
+       }
+
+       compile_assert(ctx, ifb->noutputs == elseb->noutputs);
+
+       /* .. and any outputs written: */
+       for (i = 0; i < ifb->noutputs; i++) {
+               struct ir3_instruction *a = ifb->outputs[i];
+               struct ir3_instruction *b = elseb->outputs[i];
+
+               /* if output written in if-block, or if else block
+                * is present and output written in else-block:
+                */
+               if (a || ((elseb != ifb->parent) && b)) {
+                       struct ir3_instruction *phi;
+
+                       /* if only written on one side, find the closest
+                        * enclosing update on other side:
+                        */
+                       if (!a)
+                               a = find_output(ifb, i);
+                       if (!b)
+                               b = find_output(elseb, i);
+
+                       ifout[ifnout] = a;
+                       a = create_output(ifb, a, ifnout++);
+
+                       if (elseb != ifb->parent) {
+                               elseout[elsenout] = b;
+                               b = create_output(elseb, b, elsenout++);
+                       }
+
+                       phi = create_phi(ctx, instr, a, b);
+                       ctx->block->outputs[i] = phi;
+               }
+       }
+
+       /* swap in the compacted output lists built above: */
+       ifb->noutputs = ifnout;
+       ifb->outputs = ifout;
+
+       if (elseb != ifb->parent) {
+               elseb->noutputs = elsenout;
+               elseb->outputs = elseout;
+       }
+
+       // TODO maybe we want to compact block->inputs?
+}
+
+/*
+ * Kill
+ */
+
+/* KILL: unconditional discard.
+ *
+ * Inside an if/else the condition (and inversion flag) of the
+ * innermost branch is reused, otherwise an immediate 1.0 stands in as
+ * an always-true condition.  That value is compared != 0.0 into p0.x
+ * and a kill instruction consuming the compare is emitted and
+ * appended to ctx->kill[].
+ */
+static void
+trans_kill(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       struct ir3_instruction *instr, *immed, *cond = NULL;
+       bool inv = false;
+
+       /* only KILL is handled; for any other opcode cond stays NULL and
+        * the assert below fires:
+        */
+       switch (t->tgsi_opc) {
+       case TGSI_OPCODE_KILL:
+               /* unconditional kill, use enclosing if condition: */
+               if (ctx->branch_count > 0) {
+                       unsigned int idx = ctx->branch_count - 1;
+                       cond = ctx->branch[idx].cond;
+                       inv = ctx->branch[idx].inv;
+               } else {
+                       cond = create_immed(ctx, 1.0);
+               }
+
+               break;
+       }
+
+       compile_assert(ctx, cond);
+
+       immed = create_immed(ctx, 0.0);
+
+       /* cmps.f.ne p0.x, cond, {0.0} */
+       instr = instr_create(ctx, 2, OPC_CMPS_F);
+       instr->cat2.condition = IR3_COND_NE;
+       ir3_reg_create(instr, regid(REG_P0, 0), 0);
+       ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
+       ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
+       cond = instr;
+
+       /* kill p0.x */
+       instr = instr_create(ctx, 0, OPC_KILL);
+       instr->cat0.inv = inv;
+       ir3_reg_create(instr, 0, 0);  /* dummy dst */
+       ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
+
+       /* NOTE(review): no bounds check on ctx->kill[] here -- confirm
+        * its capacity is enforced elsewhere.
+        */
+       ctx->kill[ctx->kill_count++] = instr;
+}
+
+/*
+ * Kill-If
+ */
+
+/* KILL_IF: conditional discard based on Src[0].
+ *
+ * Emits a cmps.f.ne of an immediate 0.0 against the .x swizzle of the
+ * source into p0.x, followed by a (non-inverted) kill consuming that
+ * compare, and appends the kill to ctx->kill[].
+ *
+ * NOTE(review): only the first component is tested, and for
+ * inequality with zero; the TGSI definition of KILL_IF discards when
+ * any component is negative -- confirm this is intended.
+ */
+static void
+trans_killif(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       struct tgsi_src_register *src = &inst->Src[0].Register;
+       struct ir3_instruction *instr, *immed, *cond = NULL;
+       bool inv = false;
+
+       immed = create_immed(ctx, 0.0);
+
+       /* cmps.f.ne p0.x, {0.0}, src */
+       instr = instr_create(ctx, 2, OPC_CMPS_F);
+       instr->cat2.condition = IR3_COND_NE;
+       ir3_reg_create(instr, regid(REG_P0, 0), 0);
+       ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
+       add_src_reg(ctx, instr, src, src->SwizzleX);
+
+       cond = instr;
+
+       /* kill p0.x */
+       instr = instr_create(ctx, 0, OPC_KILL);
+       instr->cat0.inv = inv;  /* always false here */
+       ir3_reg_create(instr, 0, 0);  /* dummy dst */
+       ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
+
+       /* NOTE(review): no bounds check on ctx->kill[] here -- confirm
+        * its capacity is enforced elsewhere.
+        */
+       ctx->kill[ctx->kill_count++] = instr;
+
+}
+/*
+ * I2F / U2F / F2I / F2U
+ */
+
+/* Integer <-> float conversions: emit one cat1 (cov) instruction per
+ * enabled dst component, with the source/destination types selected
+ * by the TGSI opcode:
+ *
+ *   U2F: u32 -> f32      I2F: s32 -> f32
+ *   F2U: f32 -> u32      F2I: f32 -> s32
+ *
+ * The destination goes through get_dst()/put_dst() so that a result
+ * redirected to the internal temporary (dst overlapping the source)
+ * is copied back into the real destination.
+ */
+static void
+trans_cov(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       struct ir3_instruction *instr;
+       struct tgsi_dst_register *dst = get_dst(ctx, inst);
+       struct tgsi_src_register *src = &inst->Src[0].Register;
+
+       /* cov.<src_type><dst_type> dst, src */
+       instr = instr_create(ctx, 1, 0);
+       switch (t->tgsi_opc) {
+       case TGSI_OPCODE_U2F:
+               instr->cat1.src_type = TYPE_U32;
+               instr->cat1.dst_type = TYPE_F32;
+               break;
+       case TGSI_OPCODE_I2F:
+               instr->cat1.src_type = TYPE_S32;
+               instr->cat1.dst_type = TYPE_F32;
+               break;
+       case TGSI_OPCODE_F2U:
+               instr->cat1.src_type = TYPE_F32;
+               instr->cat1.dst_type = TYPE_U32;
+               break;
+       case TGSI_OPCODE_F2I:
+               instr->cat1.src_type = TYPE_F32;
+               instr->cat1.dst_type = TYPE_S32;
+               break;
+
+       }
+       vectorize(ctx, instr, dst, 1, src, 0);
+
+       /* without this, a result written to the get_dst() temporary
+        * would never reach the instruction's real destination:
+        */
+       put_dst(ctx, inst, dst);
+}
+
+/*
+ * Handlers for TGSI instructions which do have 1:1 mapping to native
+ * instructions:
+ */
+
+/* Category 0: create an instruction with the translator's native
+ * opcode.  No registers are attached and the TGSI instruction itself
+ * is not inspected.
+ */
+static void
+instr_cat0(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       instr_create(ctx, 0, t->opc);
+}
+
+/* Category 1: plain move of Src[0] into the destination, going through
+ * get_dst()/put_dst() in case the destination overlaps the source.
+ */
+static void
+instr_cat1(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       struct tgsi_dst_register *dst = get_dst(ctx, inst);
+
+       create_mov(ctx, dst, &inst->Src[0].Register);
+       put_dst(ctx, inst, dst);
+}
+
+/* Category 2 (ALU) instructions with a 1:1 native mapping.
+ *
+ * The native opcode comes from the translator entry (t->opc).  A few
+ * TGSI opcodes are expressed through source modifiers on an otherwise
+ * generic native instruction:
+ *
+ *   ABS / IABS:  (abs) on src0
+ *   INEG:        (neg) on src0  (unary, there is no src1)
+ *   SUB:         (neg) on src1
+ *
+ * Native opcodes listed in the second switch take a single source;
+ * everything else takes two.
+ */
+static void
+instr_cat2(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       struct tgsi_dst_register *dst = get_dst(ctx, inst);
+       struct tgsi_src_register *src0 = &inst->Src[0].Register;
+       struct tgsi_src_register *src1 = &inst->Src[1].Register;
+       struct ir3_instruction *instr;
+       unsigned src0_flags = 0, src1_flags = 0;
+
+       switch (t->tgsi_opc) {
+       case TGSI_OPCODE_ABS:
+       case TGSI_OPCODE_IABS:
+               src0_flags = IR3_REG_ABS;
+               break;
+       case TGSI_OPCODE_INEG:
+               /* INEG has a single operand, so the negate has to go on
+                * src0; a flag on src1 would never be applied to it:
+                */
+               src0_flags = IR3_REG_NEGATE;
+               break;
+       case TGSI_OPCODE_SUB:
+               src1_flags = IR3_REG_NEGATE;
+               break;
+       }
+
+       switch (t->opc) {
+       case OPC_ABSNEG_F:
+       case OPC_ABSNEG_S:
+       case OPC_CLZ_B:
+       case OPC_CLZ_S:
+       case OPC_SIGN_F:
+       case OPC_FLOOR_F:
+       case OPC_CEIL_F:
+       case OPC_RNDNE_F:
+       case OPC_RNDAZ_F:
+       case OPC_TRUNC_F:
+       case OPC_NOT_B:
+       case OPC_BFREV_B:
+       case OPC_SETRM:
+       case OPC_CBITS_B:
+               /* these only have one src reg */
+               instr = instr_create(ctx, 2, t->opc);
+               vectorize(ctx, instr, dst, 1, src0, src0_flags);
+               break;
+       default:
+               /* both operands being const is avoided by moving src0
+                * out of the const file first:
+                */
+               if (is_const(src0) && is_const(src1))
+                       src0 = get_unconst(ctx, src0);
+
+               instr = instr_create(ctx, 2, t->opc);
+               vectorize(ctx, instr, dst, 2, src0, src0_flags,
+                               src1, src1_flags);
+               break;
+       }
+
+       put_dst(ctx, inst, dst);
+}
+
+/*
+ * Category 3 (three source) instructions.  The second source must not
+ * be a relative or const register; see below for how that is resolved.
+ */
+static void
+instr_cat3(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       struct tgsi_dst_register *dst = get_dst(ctx, inst);
+       struct tgsi_src_register *first = &inst->Src[0].Register;
+       struct tgsi_src_register *second = &inst->Src[1].Register;
+       struct tgsi_src_register *third = &inst->Src[2].Register;
+       struct ir3_instruction *instr;
+
+       if (is_rel_or_const(second)) {
+               /* for mad the first two srcs can simply trade places, as
+                * long as that does not just move the problem around:
+                */
+               bool swappable = is_mad(t->opc) && !is_rel_or_const(first);
+
+               if (swappable) {
+                       struct tgsi_src_register *old_first = first;
+
+                       first = second;
+                       second = old_first;
+               } else {
+                       second = get_unconst(ctx, second);
+               }
+       }
+
+       instr = instr_create(ctx, 3, t->opc);
+       vectorize(ctx, instr, dst, 3, first, 0, second, 0, third, 0);
+       put_dst(ctx, inst, dst);
+}
+
+/*
+ * Category 4 instructions: one scalar instruction is emitted per
+ * enabled destination channel, each reading the X swizzle of Src[0].
+ */
+static void
+instr_cat4(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       struct tgsi_dst_register *dst = get_dst(ctx, inst);
+       struct tgsi_src_register *src = &inst->Src[0].Register;
+       unsigned chan;
+
+       /* the blob compiler appears to avoid a const src here, do likewise */
+       if (is_const(src))
+               src = get_unconst(ctx, src);
+
+       for (chan = 0; chan < 4; chan++) {
+               struct ir3_instruction *scalar;
+
+               /* skip channels that are not written */
+               if (!(dst->WriteMask & (1 << chan)))
+                       continue;
+
+               scalar = instr_create(ctx, 4, t->opc);
+               add_dst_reg(ctx, scalar, dst, chan);
+               add_src_reg(ctx, scalar, src, src->SwizzleX);
+       }
+
+       put_dst(ctx, inst, dst);
+}
+
+/*
+ * TGSI opcode -> handler table, indexed by TGSI opcode.  Each entry
+ * records the handler function, the TGSI opcode itself and, where the
+ * handler needs them, the native opcode(s) and an extra argument.
+ * Opcodes without an entry have a NULL .fxn.
+ */
+static const struct instr_translater translaters[TGSI_OPCODE_LAST] = {
+#define INSTR(n, f, ...) \
+       [TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ }
+
+       INSTR(MOV,          instr_cat1),
+       INSTR(RCP,          instr_cat4, .opc = OPC_RCP),
+       INSTR(RSQ,          instr_cat4, .opc = OPC_RSQ),
+       INSTR(SQRT,         instr_cat4, .opc = OPC_SQRT),
+       INSTR(MUL,          instr_cat2, .opc = OPC_MUL_F),
+       INSTR(ADD,          instr_cat2, .opc = OPC_ADD_F),
+       INSTR(SUB,          instr_cat2, .opc = OPC_ADD_F),
+       INSTR(MIN,          instr_cat2, .opc = OPC_MIN_F),
+       INSTR(MAX,          instr_cat2, .opc = OPC_MAX_F),
+       INSTR(UADD,         instr_cat2, .opc = OPC_ADD_U),
+       INSTR(IMIN,         instr_cat2, .opc = OPC_MIN_S),
+       INSTR(UMIN,         instr_cat2, .opc = OPC_MIN_U),
+       INSTR(IMAX,         instr_cat2, .opc = OPC_MAX_S),
+       INSTR(UMAX,         instr_cat2, .opc = OPC_MAX_U),
+       INSTR(AND,          instr_cat2, .opc = OPC_AND_B),
+       INSTR(OR,           instr_cat2, .opc = OPC_OR_B),
+       INSTR(NOT,          instr_cat2, .opc = OPC_NOT_B),
+       INSTR(XOR,          instr_cat2, .opc = OPC_XOR_B),
+       INSTR(UMUL,         instr_cat2, .opc = OPC_MUL_U),
+       INSTR(SHL,          instr_cat2, .opc = OPC_SHL_B),
+       INSTR(USHR,         instr_cat2, .opc = OPC_SHR_B),
+       INSTR(ISHR,         instr_cat2, .opc = OPC_ASHR_B),
+       INSTR(IABS,         instr_cat2, .opc = OPC_ABSNEG_S),
+       INSTR(INEG,         instr_cat2, .opc = OPC_ABSNEG_S),
+       INSTR(MAD,          instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16),
+       INSTR(TRUNC,        instr_cat2, .opc = OPC_TRUNC_F),
+       INSTR(CLAMP,        trans_clamp),
+       INSTR(FLR,          instr_cat2, .opc = OPC_FLOOR_F),
+       INSTR(ROUND,        instr_cat2, .opc = OPC_RNDNE_F),
+       INSTR(SSG,          instr_cat2, .opc = OPC_SIGN_F),
+       INSTR(CEIL,         instr_cat2, .opc = OPC_CEIL_F),
+       INSTR(ARL,          trans_arl),
+       INSTR(EX2,          instr_cat4, .opc = OPC_EXP2),
+       INSTR(LG2,          instr_cat4, .opc = OPC_LOG2),
+       INSTR(ABS,          instr_cat2, .opc = OPC_ABSNEG_F),
+       INSTR(COS,          instr_cat4, .opc = OPC_COS),
+       INSTR(SIN,          instr_cat4, .opc = OPC_SIN),
+       INSTR(TEX,          trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TEX),
+       INSTR(TXP,          trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TXP),
+       INSTR(SGT,          trans_cmp),
+       INSTR(SLT,          trans_cmp),
+       INSTR(FSLT,         trans_cmp),
+       INSTR(SGE,          trans_cmp),
+       INSTR(FSGE,         trans_cmp),
+       INSTR(SLE,          trans_cmp),
+       INSTR(SNE,          trans_cmp),
+       INSTR(FSNE,         trans_cmp),
+       INSTR(SEQ,          trans_cmp),
+       INSTR(FSEQ,         trans_cmp),
+       INSTR(CMP,          trans_cmp),
+       INSTR(USNE,         trans_icmp, .opc = OPC_CMPS_U),
+       INSTR(USEQ,         trans_icmp, .opc = OPC_CMPS_U),
+       INSTR(ISGE,         trans_icmp, .opc = OPC_CMPS_S),
+       INSTR(USGE,         trans_icmp, .opc = OPC_CMPS_U),
+       INSTR(ISLT,         trans_icmp, .opc = OPC_CMPS_S),
+       INSTR(USLT,         trans_icmp, .opc = OPC_CMPS_U),
+       INSTR(UCMP,         trans_icmp, .opc = OPC_CMPS_U),
+       INSTR(IF,           trans_if),
+       INSTR(UIF,          trans_if),
+       INSTR(ELSE,         trans_else),
+       INSTR(ENDIF,        trans_endif),
+       INSTR(END,          instr_cat0, .opc = OPC_END),
+       INSTR(KILL,         trans_kill, .opc = OPC_KILL),
+       INSTR(KILL_IF,      trans_killif, .opc = OPC_KILL),
+       INSTR(I2F,          trans_cov),
+       INSTR(U2F,          trans_cov),
+       INSTR(F2I,          trans_cov),
+       INSTR(F2U,          trans_cov),
+};
+
+/* Convert a TGSI declaration's semantic name/index pair to an
+ * ir3_semantic via ir3_semantic_name().
+ */
+static ir3_semantic
+decl_semantic(const struct tgsi_declaration_semantic *sem)
+{
+       const ir3_semantic result = ir3_semantic_name(sem->Name, sem->Index);
+
+       return result;
+}
+
+/*
+ * Create the bary.f instruction that fetches one component of a
+ * fragment shader varying:
+ *
+ *    bary.f dst, #inloc, r0.x
+ *
+ * The last source is a two-component SSA reference to ctx->frag_pos.
+ * The channel index 'j' is not used here.
+ */
+static struct ir3_instruction *
+decl_in_frag_bary(struct ir3_compile_context *ctx, unsigned regid,
+               unsigned j, unsigned inloc)
+{
+       struct ir3_instruction *bary;
+       struct ir3_register *loc, *pos;
+
+       bary = instr_create(ctx, 2, OPC_BARY_F);
+
+       /* dummy dst */
+       ir3_reg_create(bary, regid, 0);
+
+       loc = ir3_reg_create(bary, 0, IR3_REG_IMMED);
+       loc->iim_val = inloc;
+
+       pos = ir3_reg_create(bary, 0, IR3_REG_SSA);
+       pos->instr = ctx->frag_pos;
+       pos->wrmask = 0x3;
+
+       return bary;
+}
+
+/* TGSI_SEMANTIC_POSITION
+ * """"""""""""""""""""""
+ *
+ * For fragment shaders, TGSI_SEMANTIC_POSITION is used to indicate that
+ * fragment shader input contains the fragment's window position.  The X
+ * component starts at zero and always increases from left to right.
+ * The Y component starts at zero and always increases but Y=0 may either
+ * indicate the top of the window or the bottom depending on the fragment
+ * coordinate origin convention (see TGSI_PROPERTY_FS_COORD_ORIGIN).
+ * The Z coordinate ranges from 0 to 1 to represent depth from the front
+ * to the back of the Z buffer.  The W component contains the reciprocal
+ * of the interpolated vertex position W component.
+ *
+ * Returns the instruction producing channel 'j' of the position input.
+ * The raw input for the channel is also recorded in ctx->frag_coord[j].
+ */
+static struct ir3_instruction *
+decl_in_frag_coord(struct ir3_compile_context *ctx, unsigned regid,
+               unsigned j)
+{
+       struct ir3_instruction *raw, *biased, *shifted, *conv;
+
+       compile_assert(ctx, !ctx->frag_coord[j]);
+
+       raw = create_input(ctx->block, NULL, 0);
+       ctx->frag_coord[j] = raw;
+
+       switch (j) {
+       case 0: /* .x */
+       case 1: /* .y */
+               /* needs the fixup sequence below */
+               break;
+       case 2: /* .z */
+       case 3: /* .w */
+               /* seems that we can use these as-is: */
+               return raw;
+       default:
+               compile_error(ctx, "invalid channel\n");
+               return create_immed(ctx, 0.0);
+       }
+
+       /* x/y arrive as unsigned values: subtract (integer) 8, divide
+        * by 16 (right-shift by 4), and then convert to float.
+        */
+
+       /* add.s tmp, src, -8 */
+       biased = instr_create(ctx, 2, OPC_ADD_S);
+       ir3_reg_create(biased, regid, 0);    /* dummy dst */
+       ir3_reg_create(biased, 0, IR3_REG_SSA)->instr = raw;
+       ir3_reg_create(biased, 0, IR3_REG_IMMED)->iim_val = -8;
+
+       /* shr.b tmp, tmp, 4 */
+       shifted = instr_create(ctx, 2, OPC_SHR_B);
+       ir3_reg_create(shifted, regid, 0);    /* dummy dst */
+       ir3_reg_create(shifted, 0, IR3_REG_SSA)->instr = biased;
+       ir3_reg_create(shifted, 0, IR3_REG_IMMED)->iim_val = 4;
+
+       /* mov.u32f32 dst, tmp */
+       conv = instr_create(ctx, 1, 0);
+       conv->cat1.src_type = TYPE_U32;
+       conv->cat1.dst_type = TYPE_F32;
+       ir3_reg_create(conv, regid, 0);    /* dummy dst */
+       ir3_reg_create(conv, 0, IR3_REG_SSA)->instr = shifted;
+
+       return conv;
+}
+
+/* TGSI_SEMANTIC_FACE
+ * """"""""""""""""""
+ *
+ * This label applies to fragment shader inputs only and indicates that
+ * the register contains front/back-face information of the form (F, 0,
+ * 0, 1).  The first component will be positive when the fragment belongs
+ * to a front-facing polygon, and negative when the fragment belongs to a
+ * back-facing polygon.
+ *
+ * Returns the instruction producing channel 'j' of the face input.  Only
+ * .x reads the actual input (recorded in ctx->frag_face); the remaining
+ * channels are immediates.
+ */
+static struct ir3_instruction *
+decl_in_frag_face(struct ir3_compile_context *ctx, unsigned regid,
+               unsigned j)
+{
+       struct ir3_instruction *doubled, *offset, *conv;
+
+       switch (j) {
+       case 0: /* .x */
+               /* handled below */
+               break;
+       case 1: /* .y */
+       case 2: /* .z */
+               return create_immed(ctx, 0.0);
+       case 3: /* .w */
+               return create_immed(ctx, 1.0);
+       default:
+               compile_error(ctx, "invalid channel\n");
+               return create_immed(ctx, 0.0);
+       }
+
+       compile_assert(ctx, !ctx->frag_face);
+
+       ctx->frag_face = create_input(ctx->block, NULL, 0);
+
+       /* faceness arrives as integer -1 or 0, but TGSI wants a negative
+        * vs positive float, and piglit further seems to expect exactly
+        * -1.0 or 1.0.  So compute (face * 2) + 1 and convert:
+        *
+        *    mul.s tmp, hr0.x, 2
+        *    add.s tmp, tmp, 1
+        *    mov (s32 -> f32) dst, tmp
+        */
+
+       doubled = instr_create(ctx, 2, OPC_MUL_S);
+       ir3_reg_create(doubled, regid, 0);    /* dummy dst */
+       ir3_reg_create(doubled, 0, IR3_REG_SSA)->instr = ctx->frag_face;
+       ir3_reg_create(doubled, 0, IR3_REG_IMMED)->iim_val = 2;
+
+       offset = instr_create(ctx, 2, OPC_ADD_S);
+       ir3_reg_create(offset, regid, 0);    /* dummy dst */
+       ir3_reg_create(offset, 0, IR3_REG_SSA)->instr = doubled;
+       ir3_reg_create(offset, 0, IR3_REG_IMMED)->iim_val = 1;
+
+       conv = instr_create(ctx, 1, 0); /* mov */
+       conv->cat1.src_type = TYPE_S32;
+       conv->cat1.dst_type = TYPE_F32;
+       ir3_reg_create(conv, regid, 0);    /* dummy dst */
+       ir3_reg_create(conv, 0, IR3_REG_SSA)->instr = offset;
+
+       return conv;
+}
+
+/*
+ * Handle a TGSI input declaration.
+ *
+ * For every register in the declared range this appends one entry to
+ * so->inputs (semantic, component mask, regid, inloc) and creates one
+ * instruction per component, which is stored in ctx->block->inputs at
+ * index (register * 4) + component.
+ *
+ * For fragment shaders POSITION and FACE inputs are built by
+ * decl_in_frag_coord() / decl_in_frag_face(); every other input is a
+ * bary.f varying fetch.  For other shader types a plain input is created.
+ */
+static void
+decl_in(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
+{
+       struct ir3_shader_variant *so = ctx->so;
+       unsigned name = decl->Semantic.Name;
+       unsigned i;
+
+       /* I don't think we should get frag shader input without
+        * semantic info?  Otherwise how do inputs get linked to
+        * vert outputs?
+        */
+       compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) ||
+                       decl->Declaration.Semantic);
+
+       for (i = decl->Range.First; i <= decl->Range.Last; i++) {
+               unsigned n = so->inputs_count++;
+               unsigned r = regid(i, 0);
+               unsigned ncomp, j;
+
+               /* we'll figure out the actual components used after scheduling */
+               ncomp = 4;
+
+               DBG("decl in -> r%d", i);
+
+               compile_assert(ctx, n < ARRAY_SIZE(so->inputs));
+
+               so->inputs[n].semantic = decl_semantic(&decl->Semantic);
+               so->inputs[n].compmask = (1 << ncomp) - 1;
+               so->inputs[n].regid = r;
+               so->inputs[n].inloc = ctx->next_inloc;
+
+               for (j = 0; j < ncomp; j++) {
+                       struct ir3_instruction *instr = NULL;
+
+                       if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
+                               /* for fragment shaders, POSITION and FACE are handled
+                                * specially, not using normal varying / bary.f
+                                */
+                               if (name == TGSI_SEMANTIC_POSITION) {
+                                       so->inputs[n].bary = false;
+                                       so->frag_coord = true;
+                                       instr = decl_in_frag_coord(ctx, r + j, j);
+                               } else if (name == TGSI_SEMANTIC_FACE) {
+                                       so->inputs[n].bary = false;
+                                       so->frag_face = true;
+                                       instr = decl_in_frag_face(ctx, r + j, j);
+                               } else {
+                                       /* NOTE(review): the "- 8" bias on inloc is not
+                                        * explained here; presumably next_inloc starts
+                                        * at 8 -- confirm against compile_init()
+                                        */
+                                       so->inputs[n].bary = true;
+                                       instr = decl_in_frag_bary(ctx, r + j, j,
+                                                       so->inputs[n].inloc + j - 8);
+                               }
+                       } else {
+                               instr = create_input(ctx->block, NULL, (i * 4) + j);
+                       }
+
+                       ctx->block->inputs[(i * 4) + j] = instr;
+               }
+
+               /* only varyings (bary) and vertex inputs consume an input
+                * location and count towards total_in:
+                */
+               if (so->inputs[n].bary || (ctx->type == TGSI_PROCESSOR_VERTEX)) {
+                       ctx->next_inloc += ncomp;
+                       so->total_in += ncomp;
+               }
+       }
+}
+
+/*
+ * Handle a TGSI output declaration.
+ *
+ * Checks the semantic against what the shader stage supports, records
+ * position/psize writes in the variant, and then appends one so->outputs
+ * entry per declared register.  All four components of each output are
+ * pre-filled with an immediate 0.0 so that nothing is left undefined.
+ */
+static void
+decl_out(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
+{
+       struct ir3_shader_variant *so = ctx->so;
+       const unsigned name = decl->Semantic.Name;
+       unsigned first_comp = 0;
+       unsigned i;
+
+       compile_assert(ctx, decl->Declaration.Semantic);
+
+       DBG("decl out[%d] -> r%d", name, decl->Range.First);
+
+       if (ctx->type != TGSI_PROCESSOR_VERTEX) {
+               if (name == TGSI_SEMANTIC_POSITION) {
+                       first_comp = 2;  /* tgsi will write to .z component */
+                       so->writes_pos = true;
+               } else if (name != TGSI_SEMANTIC_COLOR) {
+                       compile_error(ctx, "unknown FS semantic name: %s\n",
+                                       tgsi_semantic_names[name]);
+               }
+       } else {
+               switch (name) {
+               case TGSI_SEMANTIC_POSITION:
+                       so->writes_pos = true;
+                       break;
+               case TGSI_SEMANTIC_PSIZE:
+                       so->writes_psize = true;
+                       break;
+               case TGSI_SEMANTIC_COLOR:
+               case TGSI_SEMANTIC_BCOLOR:
+               case TGSI_SEMANTIC_GENERIC:
+               case TGSI_SEMANTIC_FOG:
+               case TGSI_SEMANTIC_TEXCOORD:
+                       /* accepted, nothing to record */
+                       break;
+               default:
+                       compile_error(ctx, "unknown VS semantic name: %s\n",
+                                       tgsi_semantic_names[name]);
+               }
+       }
+
+       for (i = decl->Range.First; i <= decl->Range.Last; i++) {
+               const unsigned ncomp = 4;
+               unsigned slot = so->outputs_count++;
+               unsigned c;
+
+               compile_assert(ctx, slot < ARRAY_SIZE(so->outputs));
+
+               so->outputs[slot].semantic = decl_semantic(&decl->Semantic);
+               so->outputs[slot].regid = regid(i, first_comp);
+
+               /* dummy mov from imm{0.0} for each component; it gets
+                * overwritten if the shader actually assigns the output
+                */
+               for (c = 0; c < ncomp; c++)
+                       ctx->block->outputs[(i * 4) + c] = create_immed(ctx, 0.0);
+       }
+}
+
+/* from TGSI perspective, we actually have inputs.  But most of the "inputs"
+ * for a fragment shader are just bary.f instructions.  The *actual* inputs
+ * from the hw perspective are the frag_pos and optionally frag_coord and
+ * frag_face.
+ *
+ * This replaces block->inputs with a freshly allocated array holding only
+ * those hw inputs (in the order frag_face, frag_coord, frag_pos), assigns
+ * their register numbers, and records the frag_pos register in
+ * so->pos_regid.
+ */
+static void
+fixup_frag_inputs(struct ir3_compile_context *ctx)
+{
+       struct ir3_shader_variant *so = ctx->so;
+       struct ir3_block *block = ctx->block;
+       struct ir3_instruction **inputs;
+       struct ir3_instruction *instr;
+       int n, regid = 0;
+
+       block->ninputs = 0;
+
+       /* size of the new array; note four slots are reserved for
+        * frag_pos although only two entries are filled in below
+        */
+       n  = 4;  /* always have frag_pos */
+       n += COND(so->frag_face, 4);
+       n += COND(so->frag_coord, 4);
+
+       inputs = ir3_alloc(ctx->ir, n * (sizeof(struct ir3_instruction *)));
+
+       if (so->frag_face) {
+               /* this ultimately gets assigned to hr0.x so doesn't conflict
+                * with frag_coord/frag_pos..
+                */
+               inputs[block->ninputs++] = ctx->frag_face;
+               ctx->frag_face->regs[0]->num = 0;
+
+               /* remaining channels not used, but let's avoid confusing
+                * other parts that expect inputs to come in groups of vec4
+                */
+               inputs[block->ninputs++] = NULL;
+               inputs[block->ninputs++] = NULL;
+               inputs[block->ninputs++] = NULL;
+       }
+
+       /* since we don't know where to set the regid for frag_coord,
+        * we have to use r0.x for it.  But we don't want to *always*
+        * use r1.x for frag_pos as that could increase the register
+        * footprint on simple shaders:
+        */
+       if (so->frag_coord) {
+               ctx->frag_coord[0]->regs[0]->num = regid++;
+               ctx->frag_coord[1]->regs[0]->num = regid++;
+               ctx->frag_coord[2]->regs[0]->num = regid++;
+               ctx->frag_coord[3]->regs[0]->num = regid++;
+
+               inputs[block->ninputs++] = ctx->frag_coord[0];
+               inputs[block->ninputs++] = ctx->frag_coord[1];
+               inputs[block->ninputs++] = ctx->frag_coord[2];
+               inputs[block->ninputs++] = ctx->frag_coord[3];
+       }
+
+       /* we always have frag_pos: */
+       so->pos_regid = regid;
+
+       /* the two new inputs below become the sources (regs[1] and
+        * regs[2]) of the frag_pos meta instruction:
+        */
+
+       /* r0.x */
+       instr = create_input(block, NULL, block->ninputs);
+       instr->regs[0]->num = regid++;
+       inputs[block->ninputs++] = instr;
+       ctx->frag_pos->regs[1]->instr = instr;
+
+       /* r0.y */
+       instr = create_input(block, NULL, block->ninputs);
+       instr->regs[0]->num = regid++;
+       inputs[block->ninputs++] = instr;
+       ctx->frag_pos->regs[2]->instr = instr;
+
+       block->inputs = inputs;
+}
+
+/*
+ * Walk the TGSI token stream and build the ir for it: declarations are
+ * passed to decl_in()/decl_out(), immediates are copied into the variant,
+ * and instructions are dispatched through the translaters[] table.
+ */
+static void
+compile_instructions(struct ir3_compile_context *ctx)
+{
+       push_block(ctx);
+
+       /* for fragment shader, we have a single input register (usually
+        * r0.xy) which is used as the base for bary.f varying fetch instrs:
+        */
+       if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
+               struct ir3_instruction *instr;
+               instr = ir3_instr_create(ctx->block, -1, OPC_META_FI);
+               ir3_reg_create(instr, 0, 0);
+               ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.x */
+               ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.y */
+               ctx->frag_pos = instr;
+       }
+
+       while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
+               tgsi_parse_token(&ctx->parser);
+
+               switch (ctx->parser.FullToken.Token.Type) {
+               case TGSI_TOKEN_TYPE_DECLARATION: {
+                       /* only input and output declarations are handled here */
+                       struct tgsi_full_declaration *decl =
+                                       &ctx->parser.FullToken.FullDeclaration;
+                       if (decl->Declaration.File == TGSI_FILE_OUTPUT) {
+                               decl_out(ctx, decl);
+                       } else if (decl->Declaration.File == TGSI_FILE_INPUT) {
+                               decl_in(ctx, decl);
+                       }
+                       break;
+               }
+               case TGSI_TOKEN_TYPE_IMMEDIATE: {
+                       /* TODO: if we know the immediate is small enough, and only
+                        * used with instructions that can embed an immediate, we
+                        * can skip this:
+                        */
+                       struct tgsi_full_immediate *imm =
+                                       &ctx->parser.FullToken.FullImmediate;
+                       unsigned n = ctx->so->immediates_count++;
+                       compile_assert(ctx, n < ARRAY_SIZE(ctx->so->immediates));
+                       /* 16 bytes are copied per immediate */
+                       memcpy(ctx->so->immediates[n].val, imm->u, 16);
+                       break;
+               }
+               case TGSI_TOKEN_TYPE_INSTRUCTION: {
+                       struct tgsi_full_instruction *inst =
+                                       &ctx->parser.FullToken.FullInstruction;
+                       unsigned opc = inst->Instruction.Opcode;
+                       const struct instr_translater *t = &translaters[opc];
+
+                       /* opcodes without a table entry have a NULL handler */
+                       if (t->fxn) {
+                               t->fxn(t, ctx, inst);
+                               ctx->num_internal_temps = 0;
+                       } else {
+                               compile_error(ctx, "unknown TGSI opc: %s\n",
+                                               tgsi_get_opcode_name(opc));
+                       }
+
+                       /* saturate modifiers become an explicit clamp on the dst */
+                       switch (inst->Instruction.Saturate) {
+                       case TGSI_SAT_ZERO_ONE:
+                               create_clamp_imm(ctx, &inst->Dst[0].Register,
+                                               fui(0.0), fui(1.0));
+                               break;
+                       case TGSI_SAT_MINUS_PLUS_ONE:
+                               create_clamp_imm(ctx, &inst->Dst[0].Register,
+                                               fui(-1.0), fui(1.0));
+                               break;
+                       }
+
+                       instr_finish(ctx);
+
+                       break;
+               }
+               default:
+                       break;
+               }
+       }
+}
+
+/*
+ * Debug helper: dump the current block to "<vert|frag>-NNNN.dot" in the
+ * working directory.  A function-local counter provides NNNN, so each
+ * call writes a new file.  Nothing happens if the file can't be opened.
+ */
+static void
+compile_dump(struct ir3_compile_context *ctx)
+{
+       static unsigned seq = 0;
+       const char *name;
+       char fname[16];
+       FILE *out;
+
+       if (ctx->so->type == SHADER_VERTEX)
+               name = "vert";
+       else
+               name = "frag";
+
+       snprintf(fname, sizeof(fname), "%s-%04u.dot", name, seq++);
+
+       out = fopen(fname, "w");
+       if (out) {
+               ir3_block_depth(ctx->block);
+               ir3_dump(ctx->ir, name, ctx->block, out);
+               fclose(out);
+       }
+}
+
+/*
+ * Compile a TGSI token stream into ir3 for one shader variant.
+ *
+ * so     - the variant being compiled; so->ir must be NULL on entry and
+ *          holds the compiled ir on success
+ * tokens - the TGSI shader
+ * key    - variant key; binning_pass and half_precision are used here
+ *
+ * Runs the TGSI -> ir translation followed by the flatten, cp, depth,
+ * sched and ra passes, then updates the variant's input/output tables
+ * with the final register assignment.
+ *
+ * Returns 0 on success and non-zero on failure, in which case so->ir is
+ * destroyed and reset to NULL.
+ */
+int
+ir3_compile_shader(struct ir3_shader_variant *so,
+               const struct tgsi_token *tokens, struct ir3_shader_key key)
+{
+       struct ir3_compile_context ctx;
+       struct ir3_block *block;
+       struct ir3_instruction **inputs;
+       unsigned i, j, actual_in;
+       int ret = 0;
+
+       assert(!so->ir);
+
+       so->ir = ir3_create();
+
+       assert(so->ir);
+
+       if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) {
+               ret = -1;
+               goto out;
+       }
+
+       compile_instructions(&ctx);
+
+       block = ctx.block;
+
+       /* keep track of the inputs from TGSI perspective.. */
+       inputs = block->inputs;
+
+       /* but fixup actual inputs for frag shader: */
+       if (ctx.type == TGSI_PROCESSOR_FRAGMENT)
+               fixup_frag_inputs(&ctx);
+
+       /* at this point, for binning pass, throw away unneeded outputs: */
+       if (key.binning_pass) {
+               for (i = 0, j = 0; i < so->outputs_count; i++) {
+                       unsigned name = sem2name(so->outputs[i].semantic);
+                       /* the semantic *index*, not the name again: testing the
+                        * name against zero would let at most one of
+                        * POSITION/PSIZE through
+                        */
+                       unsigned idx = sem2idx(so->outputs[i].semantic);
+
+                       /* throw away everything but first position/psize */
+                       if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) ||
+                                       (name == TGSI_SEMANTIC_PSIZE))) {
+                               /* compact the surviving outputs to the front: */
+                               if (i != j) {
+                                       so->outputs[j] = so->outputs[i];
+                                       block->outputs[(j*4)+0] = block->outputs[(i*4)+0];
+                                       block->outputs[(j*4)+1] = block->outputs[(i*4)+1];
+                                       block->outputs[(j*4)+2] = block->outputs[(i*4)+2];
+                                       block->outputs[(j*4)+3] = block->outputs[(i*4)+3];
+                               }
+                               j++;
+                       }
+               }
+               so->outputs_count = j;
+               block->noutputs = j * 4;
+       }
+
+       /* at this point, we want the kill's in the outputs array too,
+        * so that they get scheduled (since they have no dst).. we've
+        * already ensured that the array is big enough in push_block():
+        */
+       if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
+               for (i = 0; i < ctx.kill_count; i++)
+                       block->outputs[block->noutputs++] = ctx.kill[i];
+       }
+
+       if (fd_mesa_debug & FD_DBG_OPTDUMP)
+               compile_dump(&ctx);
+
+       /* a negative return is an error, a positive one only triggers
+        * another dump:
+        */
+       ret = ir3_block_flatten(block);
+       if (ret < 0)
+               goto out;
+       if ((ret > 0) && (fd_mesa_debug & FD_DBG_OPTDUMP))
+               compile_dump(&ctx);
+
+       ir3_block_cp(block);
+
+       if (fd_mesa_debug & FD_DBG_OPTDUMP)
+               compile_dump(&ctx);
+
+       ir3_block_depth(block);
+
+       if (fd_mesa_debug & FD_DBG_OPTMSGS) {
+               printf("AFTER DEPTH:\n");
+               ir3_dump_instr_list(block->head);
+       }
+
+       ir3_block_sched(block);
+
+       if (fd_mesa_debug & FD_DBG_OPTMSGS) {
+               printf("AFTER SCHED:\n");
+               ir3_dump_instr_list(block->head);
+       }
+
+       ret = ir3_block_ra(block, so->type, key.half_precision,
+                       so->frag_coord, so->frag_face, &so->has_samp);
+       if (ret)
+               goto out;
+
+       if (fd_mesa_debug & FD_DBG_OPTMSGS) {
+               printf("AFTER RA:\n");
+               ir3_dump_instr_list(block->head);
+       }
+
+       /* fixup input/outputs: */
+       for (i = 0; i < so->outputs_count; i++) {
+               so->outputs[i].regid = block->outputs[i*4]->regs[0]->num;
+               /* preserve hack for depth output.. tgsi writes depth to .z,
+                * but what we give the hw is the scalar register:
+                */
+               if ((ctx.type == TGSI_PROCESSOR_FRAGMENT) &&
+                       (sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION))
+                       so->outputs[i].regid += 2;
+       }
+       /* Note that some or all channels of an input may be unused: */
+       actual_in = 0;
+       for (i = 0; i < so->inputs_count; i++) {
+               unsigned regid = ~0, compmask = 0;
+               so->inputs[i].ncomp = 0;
+               for (j = 0; j < 4; j++) {
+                       struct ir3_instruction *in = inputs[(i*4) + j];
+                       if (in) {
+                               compmask |= (1 << j);
+                               /* base register of the vec4 this channel is in */
+                               regid = in->regs[0]->num - j;
+                               actual_in++;
+                               so->inputs[i].ncomp++;
+                       }
+               }
+               so->inputs[i].regid = regid;
+               so->inputs[i].compmask = compmask;
+       }
+
+       /* fragment shader always gets full vec4's even if it doesn't
+        * fetch all components, but for vertex shader we need to update
+        * with the actual number of components fetched, otherwise things
+        * will hang due to mismatch between VFD_DECODE's and
+        * TOTALATTRTOVS
+        */
+       if (so->type == SHADER_VERTEX)
+               so->total_in = actual_in;
+
+out:
+       if (ret) {
+               ir3_destroy(so->ir);
+               so->ir = NULL;
+       }
+       compile_free(&ctx);
+
+       return ret;
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
new file mode 100644 (file)
index 0000000..9b11b3d
--- /dev/null
@@ -0,0 +1,42 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2013 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#ifndef IR3_COMPILER_H_
+#define IR3_COMPILER_H_
+
+#include "ir3_shader.h"
+
+
+/* Entry points for compiling a TGSI token stream into the shader
+ * variant 'so', specialized by 'key'.  Two implementations are
+ * declared: ir3_compile_shader() and ir3_compile_shader_old().
+ *
+ * NOTE(review): the int result is presumably zero on success and
+ * non-zero on failure -- confirm against the definitions.
+ */
+int ir3_compile_shader(struct ir3_shader_variant *so,
+               const struct tgsi_token *tokens,
+               struct ir3_shader_key key);
+int ir3_compile_shader_old(struct ir3_shader_variant *so,
+               const struct tgsi_token *tokens,
+               struct ir3_shader_key key);
+
+#endif /* IR3_COMPILER_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_old.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_old.c
new file mode 100644 (file)
index 0000000..1e1ca7a
--- /dev/null
@@ -0,0 +1,1524 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2013 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include <stdarg.h>
+
+#include "pipe/p_state.h"
+#include "util/u_string.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_ureg.h"
+#include "tgsi/tgsi_info.h"
+#include "tgsi/tgsi_strings.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_scan.h"
+
+#include "freedreno_lowering.h"
+#include "freedreno_util.h"
+
+#include "ir3_compiler.h"
+#include "ir3_shader.h"
+
+#include "instr-a3xx.h"
+#include "ir3.h"
+
+
+/* State carried through the translation of one TGSI shader into ir3
+ * instructions.  Set up by compile_init() and released by
+ * compile_free().
+ */
+struct ir3_compile_context {
+       /* token stream being compiled; free_tokens is set when the stream
+        * was produced by the lowering pass (rather than being the
+        * caller's tokens), in which case compile_free() frees it:
+        */
+       const struct tgsi_token *tokens;
+       bool free_tokens;
+       struct ir3 *ir;
+       struct ir3_block *block;
+       struct ir3_shader_variant *so;
+
+       struct tgsi_parse_context parser;
+       /* TGSI processor type, taken from the parsed header: */
+       unsigned type;
+
+       struct tgsi_shader_info info;
+
+       /* last input dst (for setting (ei) flag): */
+       struct ir3_register *last_input;
+
+       /* last instruction with relative addressing: */
+       struct ir3_instruction *last_rel;
+
+       /* for calculating input/output positions/linkages: */
+       unsigned next_inloc;
+
+       /* internal temporaries handed out by get_internal_temp(): */
+       unsigned num_internal_temps;
+       struct tgsi_src_register internal_temps[6];
+
+       /* track registers which need to synchronize w/ "complex alu" cat3
+        * instruction pipeline:
+        */
+       regmask_t needs_ss;
+
+       /* track registers which need to synchronize with texture fetch
+        * pipeline:
+        */
+       regmask_t needs_sy;
+
+       /* inputs start at r0, temporaries start after last input, and
+        * outputs start after last temporary.
+        *
+        * We could be more clever, because this is not a hw restriction,
+        * but probably best just to implement an optimizing pass to
+        * reduce the # of registers used and get rid of redundant mov's
+        * (to output register).
+        *
+        * NOTE(review): compile_init() actually places outputs after
+        * inputs and temporaries after outputs -- the ordering described
+        * above looks stale.
+        */
+       unsigned base_reg[TGSI_FILE_COUNT];
+
+       /* idx/slot for last compiler generated immediate */
+       unsigned immediate_idx;
+
+       /* stack of branch instructions that start (potentially nested)
+        * branches, so that we can fix up the branch target on the
+        * corresponding END instruction
+        */
+       struct ir3_instruction *branch[16];
+       unsigned int branch_count;
+
+       /* used when dst is same as one of the src, to avoid overwriting a
+        * src element before the remaining scalar instructions that make
+        * up the vector operation
+        */
+       struct tgsi_dst_register tmp_dst;
+       struct tgsi_src_register *tmp_src;
+};
+
+
+/* Forward declarations: both helpers are called by code that appears
+ * before their definitions further down in this file.
+ */
+static void vectorize(struct ir3_compile_context *ctx,
+               struct ir3_instruction *instr, struct tgsi_dst_register *dst,
+               int nsrcs, ...);
+static void create_mov(struct ir3_compile_context *ctx,
+               struct tgsi_dst_register *dst, struct tgsi_src_register *src);
+
+/* Prepare 'ctx' for compiling 'tokens' into the variant 'so': run the
+ * TGSI lowering pass, create the initial block, reset the per-compile
+ * tracking state, lay out the register files and start the TGSI
+ * parser.
+ *
+ * Returns the tgsi_parse_init() status (TGSI_PARSE_OK on success).
+ */
+static unsigned
+compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
+               const struct tgsi_token *tokens)
+{
+       const struct fd_lowering_config lconfig = {
+                       .color_two_side = so->key.color_two_side,
+                       .lower_DST  = true,
+                       .lower_XPD  = true,
+                       .lower_SCS  = true,
+                       .lower_LRP  = true,
+                       .lower_FRC  = true,
+                       .lower_POW  = true,
+                       .lower_LIT  = true,
+                       .lower_EXP  = true,
+                       .lower_LOG  = true,
+                       .lower_DP4  = true,
+                       .lower_DP3  = true,
+                       .lower_DPH  = true,
+                       .lower_DP2  = true,
+                       .lower_DP2A = true,
+       };
+       struct tgsi_shader_info *info = &ctx->info;
+       const struct tgsi_token *lowered;
+       unsigned status, first_gpr, num_inputs, num_outputs;
+
+       /* a NULL result means nothing was lowered, in which case we
+        * compile (and must not free) the caller's tokens:
+        */
+       lowered = fd_transform_lowering(&lconfig, tokens, info);
+       ctx->free_tokens = (lowered != NULL);
+       ctx->tokens = lowered ? lowered : tokens;
+
+       ctx->so = so;
+       ctx->ir = so->ir;
+       ctx->block = ir3_block_create(ctx->ir, 0, 0, 0);
+       ctx->last_input = NULL;
+       ctx->last_rel = NULL;
+       ctx->next_inloc = 8;
+       ctx->num_internal_temps = 0;
+       ctx->branch_count = 0;
+
+       regmask_init(&ctx->needs_ss);
+       regmask_init(&ctx->needs_sy);
+       memset(ctx->base_reg, 0, sizeof(ctx->base_reg));
+
+       /* Immediates go after constants: */
+       ctx->base_reg[TGSI_FILE_CONSTANT]  = 0;
+       ctx->base_reg[TGSI_FILE_IMMEDIATE] =
+                       info->file_max[TGSI_FILE_CONSTANT] + 1;
+
+       /* if full precision and fragment shader, don't clobber
+        * r0.x w/ bary fetch:
+        */
+       if ((so->type == SHADER_FRAGMENT) && !so->key.half_precision)
+               first_gpr = 1;
+       else
+               first_gpr = 0;
+
+       num_inputs  = info->file_max[TGSI_FILE_INPUT] + 1;
+       num_outputs = info->file_max[TGSI_FILE_OUTPUT] + 1;
+
+       /* Temporaries after outputs after inputs: */
+       ctx->base_reg[TGSI_FILE_INPUT]     = first_gpr;
+       ctx->base_reg[TGSI_FILE_OUTPUT]    = first_gpr + num_inputs;
+       ctx->base_reg[TGSI_FILE_TEMPORARY] = first_gpr + num_inputs +
+                       num_outputs;
+
+       so->first_immediate = ctx->base_reg[TGSI_FILE_IMMEDIATE];
+       ctx->immediate_idx = 4 * (info->file_max[TGSI_FILE_IMMEDIATE] + 1);
+
+       status = tgsi_parse_init(&ctx->parser, ctx->tokens);
+       if (status == TGSI_PARSE_OK)
+               ctx->type = ctx->parser.FullHeader.Processor.Processor;
+
+       return status;
+}
+
+/* Report a compile failure: print the printf-style message, dump the
+ * TGSI tokens being compiled, then trip debug_assert().
+ */
+static void
+compile_error(struct ir3_compile_context *ctx, const char *format, ...)
+{
+       va_list args;
+
+       va_start(args, format);
+       _debug_vprintf(format, args);
+       va_end(args);
+
+       tgsi_dump(ctx->tokens, 0);
+       debug_assert(0);
+}
+
+/* Assert-like check for compiler invariants: when 'cond' is false the
+ * failure is reported through compile_error(), with the text of the
+ * condition included in the message.
+ */
+#define compile_assert(ctx, cond) do { \
+               if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
+       } while (0)
+
+/* Release what compile_init() acquired: the token stream (only when
+ * ctx->free_tokens says it is ours) and the TGSI parser.
+ */
+static void
+compile_free(struct ir3_compile_context *ctx)
+{
+       if (ctx->free_tokens) {
+               void *owned_tokens = (void *)ctx->tokens;
+
+               free(owned_tokens);
+       }
+
+       tgsi_parse_free(&ctx->parser);
+}
+
+/* Describes how one TGSI opcode gets translated: the handler to call
+ * plus the parameters that handler reads back through 't'.
+ */
+struct instr_translater {
+       /* handler invoked with this entry, the context and the TGSI
+        * instruction to translate:
+        */
+       void (*fxn)(const struct instr_translater *t,
+                       struct ir3_compile_context *ctx,
+                       struct tgsi_full_instruction *inst);
+       /* TGSI opcode this entry is for (handlers may switch on it): */
+       unsigned tgsi_opc;
+       /* native opcode to emit: */
+       opc_t opc;
+       opc_t hopc;    /* opc to use for half_precision mode, if different */
+       /* extra handler-specific argument: */
+       unsigned arg;
+};
+
+/* If an instruction using relative addressing is being tracked, set
+ * its IR3_INSTR_UL flag and stop tracking it.
+ */
+static void
+handle_last_rel(struct ir3_compile_context *ctx)
+{
+       struct ir3_instruction *rel = ctx->last_rel;
+
+       if (!rel)
+               return;
+
+       rel->flags |= IR3_INSTR_UL;
+       ctx->last_rel = NULL;
+}
+
+/* Create a new instruction of the given category/opcode in the
+ * context's current block.
+ */
+static struct ir3_instruction *
+instr_create(struct ir3_compile_context *ctx, int category, opc_t opc)
+{
+       struct ir3_block *cur_block = ctx->block;
+
+       return ir3_instr_create(cur_block, category, opc);
+}
+
+/* Emit 'count' nop instructions into the current block. */
+static void
+add_nop(struct ir3_compile_context *ctx, unsigned count)
+{
+       unsigned emitted;
+
+       for (emitted = 0; emitted < count; emitted++)
+               instr_create(ctx, 0, OPC_NOP);
+}
+
+/* Work out which sync flags an instruction needs in order to read
+ * 'reg'.  Const/immediate registers never need any.  Otherwise, if the
+ * register is marked in needs_ss and/or needs_sy, the corresponding
+ * IR3_INSTR_SS / IR3_INSTR_SY flag is returned and that whole mask is
+ * reset.
+ */
+static unsigned
+src_flags(struct ir3_compile_context *ctx, struct ir3_register *reg)
+{
+       unsigned sync = 0;
+
+       if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
+               return 0;
+
+       if (regmask_get(&ctx->needs_ss, reg)) {
+               regmask_init(&ctx->needs_ss);
+               sync |= IR3_INSTR_SS;
+       }
+
+       if (regmask_get(&ctx->needs_sy, reg)) {
+               regmask_init(&ctx->needs_sy);
+               sync |= IR3_INSTR_SY;
+       }
+
+       return sync;
+}
+
+/* Append the destination register for channel 'chan' of the TGSI
+ * register 'dst' to 'instr'.  OUTPUT and TEMPORARY registers are offset
+ * by the base of their file, ADDRESS maps to REG_A0, anything else is
+ * reported as a compile error.  An indirect dst also records 'instr'
+ * as the last instruction using relative addressing.
+ */
+static struct ir3_register *
+add_dst_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
+               const struct tgsi_dst_register *dst, unsigned chan)
+{
+       struct ir3_register *reg;
+       unsigned regnum = 0;
+       unsigned regflags = 0;
+
+       if ((dst->File == TGSI_FILE_OUTPUT) ||
+                       (dst->File == TGSI_FILE_TEMPORARY)) {
+               regnum = dst->Index + ctx->base_reg[dst->File];
+       } else if (dst->File == TGSI_FILE_ADDRESS) {
+               regnum = REG_A0;
+       } else {
+               compile_error(ctx, "unsupported dst register file: %s\n",
+                       tgsi_file_name(dst->File));
+       }
+
+       if (dst->Indirect)
+               regflags |= IR3_REG_RELATIV;
+       if (ctx->so->key.half_precision)
+               regflags |= IR3_REG_HALF;
+
+       reg = ir3_reg_create(instr, regid(regnum, chan), regflags);
+
+       if (dst->Indirect)
+               ctx->last_rel = instr;
+
+       return reg;
+}
+
+/* Append the source register for channel 'chan' of the TGSI register
+ * 'src' to 'instr', carrying over abs/negate/indirect modifiers, and
+ * OR any sync flags needed to read it into the instruction.
+ * IMMEDIATE and CONSTANT files become const registers; INPUT, OUTPUT
+ * and TEMPORARY become regular registers; each is offset by the base
+ * of its file.  Other files are reported as a compile error.
+ */
+static struct ir3_register *
+add_src_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
+               const struct tgsi_src_register *src, unsigned chan)
+{
+       struct ir3_register *reg;
+       unsigned regnum = 0;
+       unsigned regflags = 0;
+
+       /* TODO we need to use a mov to temp for const >= 64.. or maybe
+        * we could use relative addressing..
+        */
+       compile_assert(ctx, src->Index < 64);
+
+       if ((src->File == TGSI_FILE_IMMEDIATE) ||
+                       (src->File == TGSI_FILE_CONSTANT)) {
+               /* TODO if possible, use actual immediate instead of const..
+                * but TGSI has vec4 immediates, we can only embed scalar (of
+                * limited size, depending on instruction..)
+                */
+               regflags |= IR3_REG_CONST;
+               regnum = src->Index + ctx->base_reg[src->File];
+       } else if ((src->File == TGSI_FILE_OUTPUT) ||
+                       (src->File == TGSI_FILE_INPUT) ||
+                       (src->File == TGSI_FILE_TEMPORARY)) {
+               /* NOTE: we should only end up w/ OUTPUT file for things like
+                * clamp()'ing saturated dst instructions
+                */
+               regnum = src->Index + ctx->base_reg[src->File];
+       } else {
+               compile_error(ctx, "unsupported src register file: %s\n",
+                       tgsi_file_name(src->File));
+       }
+
+       if (src->Absolute)
+               regflags |= IR3_REG_ABS;
+       if (src->Negate)
+               regflags |= IR3_REG_NEGATE;
+       if (src->Indirect)
+               regflags |= IR3_REG_RELATIV;
+       if (ctx->so->key.half_precision)
+               regflags |= IR3_REG_HALF;
+
+       reg = ir3_reg_create(instr, regid(regnum, chan), regflags);
+
+       if (src->Indirect)
+               ctx->last_rel = instr;
+
+       instr->flags |= src_flags(ctx, reg);
+
+       return reg;
+}
+
+/* Fill in 'src' so that it reads back the register 'dst' refers to:
+ * same file/index/addressing, no abs/negate, identity swizzle.
+ */
+static void
+src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst)
+{
+       /* which register: */
+       src->File      = dst->File;
+       src->Index     = dst->Index;
+       src->Indirect  = dst->Indirect;
+       src->Dimension = dst->Dimension;
+
+       /* no modifiers: */
+       src->Negate    = 0;
+       src->Absolute  = 0;
+
+       /* .xyzw: */
+       src->SwizzleX  = TGSI_SWIZZLE_X;
+       src->SwizzleY  = TGSI_SWIZZLE_Y;
+       src->SwizzleZ  = TGSI_SWIZZLE_Z;
+       src->SwizzleW  = TGSI_SWIZZLE_W;
+}
+
+/* Allocate the next internal temporary for use within the instruction
+ * sequence generated for a single TGSI op.  'tmp_dst' is filled in as
+ * the (full writemask) destination, and the returned src register
+ * reads the same temporary back.  Internal temps are numbered after
+ * the last TGSI temporary.
+ */
+static struct tgsi_src_register *
+get_internal_temp(struct ir3_compile_context *ctx,
+               struct tgsi_dst_register *tmp_dst)
+{
+       struct tgsi_src_register *tmp_src;
+       int n;
+
+       /* assign next temporary: */
+       n = ctx->num_internal_temps++;
+       compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps));
+       tmp_src = &ctx->internal_temps[n];
+
+       tmp_dst->File      = TGSI_FILE_TEMPORARY;
+       tmp_dst->Index     = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1;
+       tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
+       tmp_dst->Indirect  = 0;
+       tmp_dst->Dimension = 0;
+
+       src_from_dst(tmp_src, tmp_dst);
+
+       return tmp_src;
+}
+
+/* Half-precision flavour of get_internal_temp().  For half-precision
+ * shaders this simply defers to get_internal_temp(); otherwise an
+ * internal temp slot is still consumed, but the register handed out is
+ * always index 0.
+ */
+static struct tgsi_src_register *
+get_internal_temp_hr(struct ir3_compile_context *ctx,
+               struct tgsi_dst_register *tmp_dst)
+{
+       struct tgsi_src_register *tmp_src;
+       int n;
+
+       if (ctx->so->key.half_precision)
+               return get_internal_temp(ctx, tmp_dst);
+
+       /* assign next temporary: */
+       n = ctx->num_internal_temps++;
+       compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps));
+       tmp_src = &ctx->internal_temps[n];
+
+       tmp_dst->File      = TGSI_FILE_TEMPORARY;
+       /* just use hr0 because no one else should be using half-
+        * precision regs:
+        */
+       tmp_dst->Index     = 0;
+       tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
+       tmp_dst->Indirect  = 0;
+       tmp_dst->Dimension = 0;
+
+       src_from_dst(tmp_src, tmp_dst);
+
+       return tmp_src;
+}
+
+/* True if 'src' lives in the CONSTANT or IMMEDIATE register file. */
+static inline bool
+is_const(struct tgsi_src_register *src)
+{
+       switch (src->File) {
+       case TGSI_FILE_CONSTANT:
+       case TGSI_FILE_IMMEDIATE:
+               return true;
+       default:
+               return false;
+       }
+}
+
+/* True if 'src' is indirectly (relative) addressed. */
+static inline bool
+is_relative(struct tgsi_src_register *src)
+{
+       return src->Indirect ? true : false;
+}
+
+/* True if 'src' is relative-addressed or comes from a const/immediate
+ * file.
+ */
+static inline bool
+is_rel_or_const(struct tgsi_src_register *src)
+{
+       if (is_relative(src))
+               return true;
+
+       return is_const(src);
+}
+
+/* Float type matching the shader's precision: TYPE_F16 when compiling
+ * with the half_precision key, TYPE_F32 otherwise.
+ */
+static type_t
+get_ftype(struct ir3_compile_context *ctx)
+{
+       if (ctx->so->key.half_precision)
+               return TYPE_F16;
+
+       return TYPE_F32;
+}
+
+/* Unsigned type matching the shader's precision: TYPE_U16 when
+ * compiling with the half_precision key, TYPE_U32 otherwise.
+ */
+static type_t
+get_utype(struct ir3_compile_context *ctx)
+{
+       if (ctx->so->key.half_precision)
+               return TYPE_U16;
+
+       return TYPE_U32;
+}
+
+/* Return the swizzle 'src' applies to channel 'chan' (0..3 for
+ * x/y/z/w).  Any other channel trips assert() and yields 0.
+ */
+static unsigned
+src_swiz(struct tgsi_src_register *src, int chan)
+{
+       if (chan == 0)
+               return src->SwizzleX;
+       if (chan == 1)
+               return src->SwizzleY;
+       if (chan == 2)
+               return src->SwizzleZ;
+       if (chan == 3)
+               return src->SwizzleW;
+
+       assert(0);
+       return 0;
+}
+
+/* For instructions that cannot take a const (or relative) register as
+ * src: copy 'src' into a fresh internal temporary and return a src
+ * register reading that temporary.  Asserts that 'src' really is
+ * relative or const.
+ */
+static struct tgsi_src_register *
+get_unconst(struct ir3_compile_context *ctx, struct tgsi_src_register *src)
+{
+       struct tgsi_dst_register copy_dst;
+       struct tgsi_src_register *copy_src;
+
+       compile_assert(ctx, is_rel_or_const(src));
+
+       copy_src = get_internal_temp(ctx, &copy_dst);
+       create_mov(ctx, &copy_dst, src);
+
+       return copy_src;
+}
+
+/* Point 'reg' at a scalar immediate holding 'val' (raw bits of a float,
+ * as produced by fui()).  Existing immediate slots are searched first,
+ * for either 'val' itself or its negation (in which case the Negate
+ * modifier is set on 'reg'); if neither is found a new slot is
+ * appended.  All four swizzle components of 'reg' select that slot.
+ *
+ * NOTE(review): there is no bounds check against the size of
+ * so->immediates[] when appending -- confirm the array is large enough
+ * for the worst case.
+ */
+static void
+get_immediate(struct ir3_compile_context *ctx,
+               struct tgsi_src_register *reg, uint32_t val)
+{
+       unsigned neg, swiz, idx, i;
+       /* The Negate modifier is applied as a float negate, so the slot
+        * that can be reused w/ negate is the one holding 'val' with the
+        * sign bit flipped.  (An integer '-val' on the raw bits would
+        * match an unrelated value, ie. -fui(1.0) is the bit pattern of
+        * -4.0)
+        */
+       const uint32_t negval = val ^ 0x80000000;
+       /* actually maps 1:1 currently.. not sure if that is safe to rely on: */
+       static const unsigned swiz2tgsi[] = {
+                       TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W,
+       };
+
+       for (i = 0; i < ctx->immediate_idx; i++) {
+               swiz = i % 4;
+               idx  = i / 4;
+
+               if (ctx->so->immediates[idx].val[swiz] == val) {
+                       neg = 0;
+                       break;
+               }
+
+               if (ctx->so->immediates[idx].val[swiz] == negval) {
+                       neg = 1;
+                       break;
+               }
+       }
+
+       if (i == ctx->immediate_idx) {
+               /* need to generate a new immediate: */
+               swiz = i % 4;
+               idx  = i / 4;
+               neg  = 0;
+               ctx->so->immediates[idx].val[swiz] = val;
+               ctx->so->immediates_count = idx + 1;
+               ctx->immediate_idx++;
+       }
+
+       reg->File      = TGSI_FILE_IMMEDIATE;
+       reg->Indirect  = 0;
+       reg->Dimension = 0;
+       reg->Index     = idx;
+       reg->Absolute  = 0;
+       reg->Negate    = neg;
+       reg->SwizzleX  = swiz2tgsi[swiz];
+       reg->SwizzleY  = swiz2tgsi[swiz];
+       reg->SwizzleZ  = swiz2tgsi[swiz];
+       reg->SwizzleW  = swiz2tgsi[swiz];
+}
+
+/* Copy 'src' to 'dst', one scalar instruction per channel.  Channels
+ * not selected by dst's writemask get a nop instead, so four
+ * instructions are always emitted.
+ */
+static void
+create_mov(struct ir3_compile_context *ctx, struct tgsi_dst_register *dst,
+               struct tgsi_src_register *src)
+{
+       const type_t type_mov = get_ftype(ctx);
+       unsigned chan;
+
+       for (chan = 0; chan < 4; chan++) {
+               struct ir3_instruction *instr;
+
+               if (!(dst->WriteMask & (1 << chan))) {
+                       add_nop(ctx, 1);
+                       continue;
+               }
+
+               /* move to destination: */
+               if (src->Absolute || src->Negate) {
+                       /* can't have abs or neg on a mov instr, so use
+                        * absneg.f instead to handle these cases:
+                        */
+                       instr = instr_create(ctx, 2, OPC_ABSNEG_F);
+               } else {
+                       instr = instr_create(ctx, 1, 0);
+                       instr->cat1.src_type = type_mov;
+                       instr->cat1.dst_type = type_mov;
+               }
+
+               add_dst_reg(ctx, instr, dst, chan);
+               add_src_reg(ctx, instr, src, src_swiz(src, chan));
+       }
+}
+
+/* Emit dst = min(max(val, minval), maxval) as a max.f followed by a
+ * min.f, each expanded to the channels in dst's writemask.
+ */
+static void
+create_clamp(struct ir3_compile_context *ctx,
+               struct tgsi_dst_register *dst, struct tgsi_src_register *val,
+               struct tgsi_src_register *minval, struct tgsi_src_register *maxval)
+{
+       struct ir3_instruction *instr;
+       struct tgsi_src_register lower_clamped;
+
+       /* dst = max(val, minval) */
+       instr = instr_create(ctx, 2, OPC_MAX_F);
+       vectorize(ctx, instr, dst, 2, val, 0, minval, 0);
+
+       /* dst = min(dst, maxval): the second step has to read back the
+        * result of the first one from dst.  Reading 'val' again would
+        * discard the lower bound whenever dst is not the same register
+        * as val.
+        */
+       src_from_dst(&lower_clamped, dst);
+
+       instr = instr_create(ctx, 2, OPC_MIN_F);
+       vectorize(ctx, instr, dst, 2, &lower_clamped, 0, maxval, 0);
+}
+
+/* Clamp 'dst' in place between two immediate values (given as raw
+ * bits); the immediates are looked up/allocated via get_immediate().
+ */
+static void
+create_clamp_imm(struct ir3_compile_context *ctx,
+               struct tgsi_dst_register *dst,
+               uint32_t minval, uint32_t maxval)
+{
+       struct tgsi_src_register value;
+       struct tgsi_src_register lo, hi;
+
+       /* the value being clamped is dst itself: */
+       src_from_dst(&value, dst);
+
+       get_immediate(ctx, &lo, minval);
+       get_immediate(ctx, &hi, maxval);
+
+       create_clamp(ctx, dst, &value, &lo, &hi);
+}
+
+/* Pick the destination to use while translating 'inst'.  Normally that
+ * is the instruction's own dst.  But if some src reads the same
+ * register as dst -- other than the harmless case of a full writemask
+ * with identity swizzle -- an internal temporary (ctx->tmp_dst, with
+ * dst's writemask) is returned instead, and ctx->tmp_src is set up to
+ * read it back.  Pair with put_dst().
+ */
+static struct tgsi_dst_register *
+get_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst)
+{
+       struct tgsi_dst_register *dst = &inst->Dst[0].Register;
+       unsigned i;
+
+       for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
+               struct tgsi_src_register *src = &inst->Src[i].Register;
+
+               /* different register, no hazard: */
+               if ((src->File != dst->File) || (src->Index != dst->Index))
+                       continue;
+
+               /* each channel only reads what it is about to overwrite: */
+               if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) &&
+                               (src->SwizzleX == TGSI_SWIZZLE_X) &&
+                               (src->SwizzleY == TGSI_SWIZZLE_Y) &&
+                               (src->SwizzleZ == TGSI_SWIZZLE_Z) &&
+                               (src->SwizzleW == TGSI_SWIZZLE_W))
+                       continue;
+
+               ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst);
+               ctx->tmp_dst.WriteMask = dst->WriteMask;
+               return &ctx->tmp_dst;
+       }
+
+       return dst;
+}
+
+/* Counterpart of get_dst(): if 'dst' is not the instruction's own
+ * destination, copy the result from ctx->tmp_src into the real one.
+ */
+static void
+put_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst,
+               struct tgsi_dst_register *dst)
+{
+       struct tgsi_dst_register *real_dst = &inst->Dst[0].Register;
+
+       if (dst == real_dst)
+               return;
+
+       /* add mov back into original dst: */
+       create_mov(ctx, real_dst, ctx->tmp_src);
+}
+
+/* helper to generate the necessary repeat and/or additional instructions
+ * to turn a scalar instruction into a vector operation:
+ *
+ * 'instr' is the already-created scalar instruction; it is used for the
+ * first channel in dst's writemask and cloned for each further channel.
+ * The varargs are 'nsrcs' pairs of (struct tgsi_src_register *src,
+ * unsigned flags).  When flags contains IR3_REG_IMMED the first slot of
+ * the pair is not dereferenced; its bits are taken as the immediate
+ * value instead.
+ */
+static void
+vectorize(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
+               struct tgsi_dst_register *dst, int nsrcs, ...)
+{
+       va_list ap;
+       int i, j, n = 0;
+       bool indirect = dst->Indirect;
+
+       /* first pass: attach dst and all srcs to the template instruction
+        * using the .x component; components get fixed up per channel below
+        */
+       add_dst_reg(ctx, instr, dst, TGSI_SWIZZLE_X);
+
+       va_start(ap, nsrcs);
+       for (j = 0; j < nsrcs; j++) {
+               struct tgsi_src_register *src =
+                               va_arg(ap, struct tgsi_src_register *);
+               unsigned flags = va_arg(ap, unsigned);
+               struct ir3_register *reg;
+               if (flags & IR3_REG_IMMED) {
+                       reg = ir3_reg_create(instr, 0, IR3_REG_IMMED);
+                       /* this is an ugly cast.. should have put flags first! */
+                       reg->iim_val = *(int *)&src;
+               } else {
+                       reg = add_src_reg(ctx, instr, src, TGSI_SWIZZLE_X);
+                       indirect |= src->Indirect;
+               }
+               /* caller's flags are OR'd in, except that a caller-supplied
+                * negate toggles (rather than sets) the register's negate:
+                */
+               reg->flags |= flags & ~IR3_REG_NEGATE;
+               if (flags & IR3_REG_NEGATE)
+                       reg->flags ^= IR3_REG_NEGATE;
+       }
+       va_end(ap);
+
+       /* second pass: one instruction per written channel */
+       for (i = 0; i < 4; i++) {
+               if (dst->WriteMask & (1 << i)) {
+                       struct ir3_instruction *cur;
+
+                       if (n++ == 0) {
+                               cur = instr;
+                       } else {
+                               /* clones start without the sync/jump flags of the
+                                * template; src_flags() below re-adds what is needed
+                                */
+                               cur = ir3_instr_clone(instr);
+                               cur->flags &= ~(IR3_INSTR_SY | IR3_INSTR_SS | IR3_INSTR_JP);
+                       }
+
+                       /* fix-up dst register component: */
+                       cur->regs[0]->num = regid(cur->regs[0]->num >> 2, i);
+
+                       /* fix-up src register component: */
+                       va_start(ap, nsrcs);
+                       for (j = 0; j < nsrcs; j++) {
+                               struct tgsi_src_register *src =
+                                               va_arg(ap, struct tgsi_src_register *);
+                               unsigned flags = va_arg(ap, unsigned);
+                               if (!(flags & IR3_REG_IMMED)) {
+                                       cur->regs[j+1]->num =
+                                                       regid(cur->regs[j+1]->num >> 2,
+                                                                       src_swiz(src, i));
+                                       cur->flags |= src_flags(ctx, cur->regs[j+1]);
+                               }
+                       }
+                       va_end(ap);
+
+                       /* keep last_rel pointing at the most recently emitted
+                        * instruction of the group:
+                        */
+                       if (indirect)
+                               ctx->last_rel = cur;
+               }
+       }
+
+       /* pad w/ nop's.. at least until we are clever enough to
+        * figure out if we really need to..
+        */
+       add_nop(ctx, 4 - n);
+}
+
+/*
+ * Handlers for TGSI instructions which do not have a 1:1 mapping to
+ * native instructions:
+ */
+
+/* CLAMP: clamp Src[0] between Src[1] (min) and Src[2] (max), going
+ * through get_dst()/put_dst() in case dst overlaps one of the srcs.
+ */
+static void
+trans_clamp(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       struct tgsi_dst_register *dst = get_dst(ctx, inst);
+
+       create_clamp(ctx, dst,
+                       &inst->Src[0].Register,    /* value */
+                       &inst->Src[1].Register,    /* min */
+                       &inst->Src[2].Register);   /* max */
+
+       put_dst(ctx, inst, dst);
+}
+
+/* ARL(x) = x, but mova from hrN.x to a0..
+ *
+ * Emitted sequence: convert the selected src channel to s16 in a
+ * half-precision temp, shift it left by 2, then move the result into
+ * the address register.  Nops are inserted between the steps.
+ */
+static void
+trans_arl(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       struct ir3_instruction *instr;
+       struct tgsi_dst_register tmp_dst;
+       struct tgsi_src_register *tmp_src;
+       struct tgsi_dst_register *dst = &inst->Dst[0].Register;
+       struct tgsi_src_register *src = &inst->Src[0].Register;
+       /* only the channel selected by the .x swizzle is used: */
+       unsigned chan = src->SwizzleX;
+       compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS);
+
+       /* flag the previous user of relative addressing before a0 gets
+        * overwritten:
+        */
+       handle_last_rel(ctx);
+
+       tmp_src = get_internal_temp_hr(ctx, &tmp_dst);
+
+       /* cov.{f32,f16}s16 Rtmp, Rsrc */
+       instr = instr_create(ctx, 1, 0);
+       instr->cat1.src_type = get_ftype(ctx);
+       instr->cat1.dst_type = TYPE_S16;
+       add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
+       add_src_reg(ctx, instr, src, chan);
+
+       add_nop(ctx, 3);
+
+       /* shl.b Rtmp, Rtmp, 2 */
+       instr = instr_create(ctx, 2, OPC_SHL_B);
+       add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
+       add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
+       ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;
+
+       add_nop(ctx, 3);
+
+       /* mova a0, Rtmp */
+       instr = instr_create(ctx, 1, 0);
+       instr->cat1.src_type = TYPE_S16;
+       instr->cat1.dst_type = TYPE_S16;
+       add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF;
+       add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
+
+       /* need to ensure 5 instr slots before a0 is used: */
+       /* NOTE(review): the comment above says 5 slots but 6 nops are
+        * emitted -- confirm which one is intended.
+        */
+       add_nop(ctx, 6);
+}
+
+/* texture fetch/sample instructions:
+ *
+ * Handles TEX and TXP (selected by t->arg).  The coordinate must be in
+ * successive components of a non-const, non-relative register; when it
+ * is not, it is first shuffled into an internal temporary.  The dst of
+ * the sample instruction is marked in needs_sy so that the first
+ * reader gets the (sy) flag.
+ */
+static void
+trans_samp(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       /* Which coord component goes in each successive slot (-1 for an
+        * unused slot).  These are static tables rather than compound
+        * literals, because a compound literal created inside a nested
+        * block does not outlive that block.
+        */
+       static const int8_t order_tex_2d[4] = { 0,  1, -1, -1 };
+       static const int8_t order_tex[4]    = { 0,  1,  2, -1 };
+       static const int8_t order_txp_2d[4] = { 0,  1,  3, -1 };
+       static const int8_t order_txp[4]    = { 0,  1,  2,  3 };
+       struct ir3_register *r;
+       struct ir3_instruction *instr;
+       struct tgsi_src_register *coord = &inst->Src[0].Register;
+       struct tgsi_src_register *samp  = &inst->Src[1].Register;
+       unsigned tex = inst->Texture.Texture;
+       const int8_t *order;
+       unsigned i, flags = 0, src_wrmask;
+       bool needs_mov = false;
+
+       switch (t->arg) {
+       case TGSI_OPCODE_TEX:
+               if (tex == TGSI_TEXTURE_2D) {
+                       order = order_tex_2d;
+                       src_wrmask = TGSI_WRITEMASK_XY;
+               } else {
+                       order = order_tex;
+                       src_wrmask = TGSI_WRITEMASK_XYZ;
+               }
+               break;
+       case TGSI_OPCODE_TXP:
+               if (tex == TGSI_TEXTURE_2D) {
+                       order = order_txp_2d;
+                       src_wrmask = TGSI_WRITEMASK_XYZ;
+               } else {
+                       order = order_txp;
+                       src_wrmask = TGSI_WRITEMASK_XYZW;
+               }
+               flags |= IR3_INSTR_P;
+               break;
+       default:
+               compile_assert(ctx, 0);
+               /* if asserts are compiled out, don't carry on with
+                * order/src_wrmask uninitialized:
+                */
+               return;
+       }
+
+       if ((tex == TGSI_TEXTURE_3D) || (tex == TGSI_TEXTURE_CUBE)) {
+               add_nop(ctx, 3);
+               flags |= IR3_INSTR_3D;
+       }
+
+       /* cat5 instruction cannot seem to handle const or relative: */
+       if (is_rel_or_const(coord))
+               needs_mov = true;
+
+       /* The texture sample instructions need to coord in successive
+        * registers/components (ie. src.xy but not src.yx).  And TXP
+        * needs the .w component in .z for 2D..  so in some cases we
+        * might need to emit some mov instructions to shuffle things
+        * around:
+        */
+       for (i = 1; (i < 4) && (order[i] >= 0) && !needs_mov; i++)
+               if (src_swiz(coord, i) != (src_swiz(coord, 0) + order[i]))
+                       needs_mov = true;
+
+       if (needs_mov) {
+               struct tgsi_dst_register tmp_dst;
+               struct tgsi_src_register *tmp_src;
+               unsigned j;
+
+               type_t type_mov = get_ftype(ctx);
+
+               /* need to move things around: */
+               tmp_src = get_internal_temp(ctx, &tmp_dst);
+
+               for (j = 0; (j < 4) && (order[j] >= 0); j++) {
+                       instr = instr_create(ctx, 1, 0);
+                       instr->cat1.src_type = type_mov;
+                       instr->cat1.dst_type = type_mov;
+                       add_dst_reg(ctx, instr, &tmp_dst, j);
+                       add_src_reg(ctx, instr, coord,
+                                       src_swiz(coord, order[j]));
+               }
+
+               coord = tmp_src;
+
+               /* pad the mov sequence out to 4 instructions: */
+               add_nop(ctx, 4 - j);
+       }
+
+       instr = instr_create(ctx, 5, t->opc);
+       instr->cat5.type = get_ftype(ctx);
+       instr->cat5.samp = samp->Index;
+       instr->cat5.tex  = samp->Index;
+       instr->flags |= flags;
+
+       r = add_dst_reg(ctx, instr, &inst->Dst[0].Register, 0);
+       r->wrmask = inst->Dst[0].Register.WriteMask;
+
+       add_src_reg(ctx, instr, coord, coord->SwizzleX)->wrmask = src_wrmask;
+
+       /* after add_src_reg() so we don't set (sy) on sam instr itself! */
+       regmask_set(&ctx->needs_sy, r);
+}
+
+/*
+ * Translate the TGSI set-on-compare opcodes (SEQ/SNE/SGE/SLE/SGT/SLT) and
+ * CMP.  Each is emitted as a cmps.f into an internal temporary followed by
+ * either a type-converting mov or an add.s/sel pair, as listed here:
+ *
+ * SEQ(a,b) = (a == b) ? 1.0 : 0.0
+ *   cmps.f.eq tmp0, b, a
+ *   cov.u16f16 dst, tmp0
+ *
+ * SNE(a,b) = (a != b) ? 1.0 : 0.0
+ *   cmps.f.eq tmp0, b, a
+ *   add.s tmp0, tmp0, -1
+ *   sel.f16 dst, {0.0}, tmp0, {1.0}
+ *
+ * SGE(a,b) = (a >= b) ? 1.0 : 0.0
+ *   cmps.f.ge tmp0, a, b
+ *   cov.u16f16 dst, tmp0
+ *
+ * SLE(a,b) = (a <= b) ? 1.0 : 0.0
+ *   cmps.f.ge tmp0, b, a
+ *   cov.u16f16 dst, tmp0
+ *
+ * SGT(a,b) = (a > b)  ? 1.0 : 0.0
+ *   cmps.f.ge tmp0, b, a
+ *   add.s tmp0, tmp0, -1
+ *   sel.f16 dst, {0.0}, tmp0, {1.0}
+ *
+ * SLT(a,b) = (a < b)  ? 1.0 : 0.0
+ *   cmps.f.ge tmp0, a, b
+ *   add.s tmp0, tmp0, -1
+ *   sel.f16 dst, {0.0}, tmp0, {1.0}
+ *
+ * CMP(a,b,c) = (a < 0.0) ? b : c
+ *   cmps.f.ge tmp0, a, {0.0}
+ *   add.s tmp0, tmp0, -1
+ *   sel.f16 dst, c, tmp0, b
+ */
+static void
+trans_cmp(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       struct ir3_instruction *instr;
+       struct tgsi_dst_register tmp_dst;
+       struct tgsi_src_register *tmp_src;
+       struct tgsi_src_register constval0, constval1;
+       /* final instruction for CMP() uses orig src1 and src2: */
+       struct tgsi_dst_register *dst = get_dst(ctx, inst);
+       struct tgsi_src_register *a0, *a1;
+       unsigned condition;
+
+       /* tmp_dst/tmp_src name the same internal temporary (write/read side) */
+       tmp_src = get_internal_temp(ctx, &tmp_dst);
+
+       /* pick the operand order and the condition for the initial cmps.f: */
+       switch (t->tgsi_opc) {
+       case TGSI_OPCODE_SEQ:
+       case TGSI_OPCODE_SNE:
+               a0 = &inst->Src[1].Register;  /* b */
+               a1 = &inst->Src[0].Register;  /* a */
+               condition = IR3_COND_EQ;
+               break;
+       case TGSI_OPCODE_SGE:
+       case TGSI_OPCODE_SLT:
+               a0 = &inst->Src[0].Register;  /* a */
+               a1 = &inst->Src[1].Register;  /* b */
+               condition = IR3_COND_GE;
+               break;
+       case TGSI_OPCODE_SLE:
+       case TGSI_OPCODE_SGT:
+               a0 = &inst->Src[1].Register;  /* b */
+               a1 = &inst->Src[0].Register;  /* a */
+               condition = IR3_COND_GE;
+               break;
+       case TGSI_OPCODE_CMP:
+               get_immediate(ctx, &constval0, fui(0.0));
+               a0 = &inst->Src[0].Register;  /* a */
+               a1 = &constval0;              /* {0.0} */
+               condition = IR3_COND_GE;
+               break;
+       default:
+               /* only reachable if the translater table maps some other opcode here */
+               compile_assert(ctx, 0);
+               return;
+       }
+
+       /* if both operands are consts, first operand goes through get_unconst() */
+       if (is_const(a0) && is_const(a1))
+               a0 = get_unconst(ctx, a0);
+
+       /* cmps.f.ge tmp, a0, a1 */
+       instr = instr_create(ctx, 2, OPC_CMPS_F);
+       instr->cat2.condition = condition;
+       vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);
+
+       /* second stage: turn the compare result in tmp into the final value */
+       switch (t->tgsi_opc) {
+       case TGSI_OPCODE_SEQ:
+       case TGSI_OPCODE_SGE:
+       case TGSI_OPCODE_SLE:
+               /* cov.u16f16 dst, tmp0 */
+               instr = instr_create(ctx, 1, 0);
+               instr->cat1.src_type = get_utype(ctx);
+               instr->cat1.dst_type = get_ftype(ctx);
+               vectorize(ctx, instr, dst, 1, tmp_src, 0);
+               break;
+       case TGSI_OPCODE_SNE:
+       case TGSI_OPCODE_SGT:
+       case TGSI_OPCODE_SLT:
+       case TGSI_OPCODE_CMP:
+               /* add.s tmp, tmp, -1 */
+               /* NOTE(review): the literal -1 is passed in the slot where the other
+                * calls pass a src register pointer; confirm vectorize() reads it
+                * back correctly for IR3_REG_IMMED on 64-bit builds.
+                */
+               instr = instr_create(ctx, 2, OPC_ADD_S);
+               vectorize(ctx, instr, &tmp_dst, 2, tmp_src, 0, -1, IR3_REG_IMMED);
+
+               if (t->tgsi_opc == TGSI_OPCODE_CMP) {
+                       /* sel.{f32,f16} dst, src2, tmp, src1 */
+                       instr = instr_create(ctx, 3,
+                                       ctx->so->key.half_precision ? OPC_SEL_F16 : OPC_SEL_F32);
+                       vectorize(ctx, instr, dst, 3,
+                                       &inst->Src[2].Register, 0,
+                                       tmp_src, 0,
+                                       &inst->Src[1].Register, 0);
+               } else {
+                       get_immediate(ctx, &constval0, fui(0.0));
+                       get_immediate(ctx, &constval1, fui(1.0));
+                       /* sel.{f32,f16} dst, {0.0}, tmp0, {1.0} */
+                       instr = instr_create(ctx, 3,
+                                       ctx->so->key.half_precision ? OPC_SEL_F16 : OPC_SEL_F32);
+                       vectorize(ctx, instr, dst, 3,
+                                       &constval0, 0, tmp_src, 0, &constval1, 0);
+               }
+
+               break;
+       }
+
+       put_dst(ctx, inst, dst);
+}
+
+/*
+ * Conditional / Flow control
+ */
+
+/* Linear search of the shader's instruction list.  Returns the position of
+ * 'instr' within ctx->ir->instrs, or ~0 when it is not in the list.
+ */
+static unsigned
+find_instruction(struct ir3_compile_context *ctx, struct ir3_instruction *instr)
+{
+       unsigned idx = 0;
+
+       while (idx < ctx->ir->instrs_count) {
+               if (ctx->ir->instrs[idx] == instr)
+                       return idx;
+               idx++;
+       }
+
+       /* not found */
+       return ~0;
+}
+
+/* Remember a branch/jump instruction whose target is not yet known; the
+ * matching pop_branch() patches in the offset once the target is emitted.
+ */
+static void
+push_branch(struct ir3_compile_context *ctx, struct ir3_instruction *instr)
+{
+       /* deeply nested IFs must not write past the end of the branch stack
+        * (assumes ctx->branch is a fixed-size array -- TODO confirm):
+        */
+       if (ctx->branch_count >= ARRAY_SIZE(ctx->branch)) {
+               compile_error(ctx, "branch stack overflow\n");
+               return;
+       }
+
+       ctx->branch[ctx->branch_count++] = instr;
+}
+
+/* Close the innermost open branch: emit the (jp) landing nop and patch the
+ * pending branch/jump instruction so that it targets that nop.
+ */
+static void
+pop_branch(struct ir3_compile_context *ctx)
+{
+       struct ir3_instruction *instr;
+       unsigned idx;
+
+       /* an ELSE/ENDIF without a matching IF would underflow the stack: */
+       if (ctx->branch_count == 0) {
+               compile_error(ctx, "unbalanced branch stack\n");
+               return;
+       }
+
+       /* if we were clever enough, we'd patch this up after the fact,
+        * and set (jp) flag on whatever the next instruction was, rather
+        * than inserting an extra nop..
+        */
+       instr = instr_create(ctx, 0, OPC_NOP);
+       instr->flags |= IR3_INSTR_JP;
+
+       /* pop the branch instruction from the stack and fix up branch target: */
+       instr = ctx->branch[--ctx->branch_count];
+       idx = find_instruction(ctx, instr);
+
+       /* find_instruction() returns ~0 on a miss; don't fold that into the
+        * relative offset:
+        */
+       if (idx == (unsigned)~0) {
+               compile_error(ctx, "branch instruction not found\n");
+               return;
+       }
+
+       instr->cat0.immed = ctx->ir->instrs_count - idx - 1;
+}
+
+/* We probably don't really want to translate if/else/endif into branches..
+ * the blob driver evaluates both legs of the if and then uses the sel
+ * instruction to pick which sides of the branch to "keep".. but figuring
+ * that out will take somewhat more compiler smarts.  So hopefully branches
+ * don't kill performance too badly.
+ */
+/* IF: compare the condition against 0.0 into the predicate register, then
+ * emit a br whose target is filled in later by pop_branch().
+ */
+static void
+trans_if(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       struct tgsi_src_register *cond = &inst->Src[0].Register;
+       struct tgsi_src_register zero;
+       struct ir3_instruction *cmp;
+
+       get_immediate(ctx, &zero, fui(0.0));
+
+       if (is_const(cond))
+               cond = get_unconst(ctx, cond);
+
+       /* cmps.f.eq p0, cond, {0.0} */
+       cmp = instr_create(ctx, 2, OPC_CMPS_F);
+       cmp->cat2.condition = IR3_COND_EQ;
+       ir3_reg_create(cmp, regid(REG_P0, 0), 0);
+       add_src_reg(ctx, cmp, cond, cond->SwizzleX);
+       add_src_reg(ctx, cmp, &zero, zero.SwizzleX);
+
+       /* branch target gets patched at the matching ELSE/ENDIF: */
+       push_branch(ctx, instr_create(ctx, 0, OPC_BR));
+}
+
+/* ELSE: terminate the "then" leg with a jump over the "else" leg, resolve
+ * the IF's pending branch to land here, and leave the jump pending for
+ * the ENDIF.
+ */
+static void
+trans_else(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       /* the jump has to be emitted before pop_branch() adds its nop: */
+       struct ir3_instruction *jump = instr_create(ctx, 0, OPC_JUMP);
+
+       pop_branch(ctx);
+       push_branch(ctx, jump);
+}
+
+/* ENDIF: resolve whichever branch/jump is still pending (the br from IF,
+ * or the jump from ELSE) so that it lands on the instruction emitted here.
+ */
+static void
+trans_endif(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       pop_branch(ctx);
+}
+
+/*
+ * Handlers for TGSI instructions which have a 1:1 mapping to native
+ * instructions:
+ */
+
+/* Emit a single category 0 instruction using the opcode from the
+ * translater table entry; no registers are attached.
+ */
+static void
+instr_cat0(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       instr_create(ctx, 0, t->opc);
+}
+
+/* MOV: a plain copy goes through create_mov(); a negated source is
+ * emitted as "add.f dst, -src, {0.0}" instead.
+ */
+static void
+instr_cat1(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       struct tgsi_dst_register *dst = get_dst(ctx, inst);
+       struct tgsi_src_register *src = &inst->Src[0].Register;
+
+       if (!src->Negate) {
+               /* create_mov() generates vector sequence, so no vectorize() */
+               create_mov(ctx, dst, src);
+       } else {
+               struct ir3_instruction *add;
+               struct tgsi_src_register zero;
+
+               /* mov instructions can't handle a negate on src.
+                *
+                * Since right now we are using uniformly either TYPE_F16 or
+                * TYPE_F32, and we don't utilize the conversion possibilities
+                * of mov instructions, we can get away with substituting an
+                * add.f which can handle negate.  Might need to revisit this
+                * in the future if we start supporting widening/narrowing or
+                * conversion to/from integer..
+                */
+               add = instr_create(ctx, 2, OPC_ADD_F);
+               get_immediate(ctx, &zero, fui(0.0));
+               vectorize(ctx, add, dst, 2, src, 0, &zero, 0);
+       }
+
+       put_dst(ctx, inst, dst);
+}
+
+/* Generic handler for TGSI opcodes that map onto one category 2
+ * instruction.  ABS and SUB are expressed through source modifiers, and
+ * opcodes with a single source are emitted with one operand only.
+ */
+static void
+instr_cat2(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       struct tgsi_dst_register *dst = get_dst(ctx, inst);
+       struct tgsi_src_register *src0 = &inst->Src[0].Register;
+       struct tgsi_src_register *src1 = &inst->Src[1].Register;
+       unsigned src0_flags = 0, src1_flags = 0;
+       struct ir3_instruction *instr;
+       bool single_src;
+
+       /* source modifiers implied by the TGSI opcode: */
+       if (t->tgsi_opc == TGSI_OPCODE_ABS)
+               src0_flags = IR3_REG_ABS;
+       else if (t->tgsi_opc == TGSI_OPCODE_SUB)
+               src1_flags = IR3_REG_NEGATE;
+
+       switch (t->opc) {
+       case OPC_ABSNEG_F:
+       case OPC_ABSNEG_S:
+       case OPC_CLZ_B:
+       case OPC_CLZ_S:
+       case OPC_SIGN_F:
+       case OPC_FLOOR_F:
+       case OPC_CEIL_F:
+       case OPC_RNDNE_F:
+       case OPC_RNDAZ_F:
+       case OPC_TRUNC_F:
+       case OPC_NOT_B:
+       case OPC_BFREV_B:
+       case OPC_SETRM:
+       case OPC_CBITS_B:
+               /* these only have one src reg */
+               single_src = true;
+               break;
+       default:
+               single_src = false;
+               break;
+       }
+
+       if (single_src) {
+               instr = instr_create(ctx, 2, t->opc);
+               vectorize(ctx, instr, dst, 1, src0, src0_flags);
+       } else {
+               /* when both srcs are const, src0 goes through get_unconst()
+                * before the instruction itself is created:
+                */
+               if (is_const(src0) && is_const(src1))
+                       src0 = get_unconst(ctx, src0);
+
+               instr = instr_create(ctx, 2, t->opc);
+               vectorize(ctx, instr, dst, 2, src0, src0_flags,
+                               src1, src1_flags);
+       }
+
+       put_dst(ctx, inst, dst);
+}
+
+/* Generic handler for TGSI opcodes that map onto one category 3
+ * (three source) instruction; picks t->hopc instead of t->opc when the
+ * shader key requests half precision.
+ */
+static void
+instr_cat3(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       struct tgsi_dst_register *dst = get_dst(ctx, inst);
+       struct tgsi_src_register *src0 = &inst->Src[0].Register;
+       struct tgsi_src_register *src1 = &inst->Src[1].Register;
+       struct tgsi_src_register *src2 = &inst->Src[2].Register;
+       struct ir3_instruction *instr;
+
+       /* in particular, can't handle const for src1 for cat3..
+        * for mad, we can swap first two src's if needed:
+        */
+       if (is_rel_or_const(src1)) {
+               bool swappable = is_mad(t->opc) && !is_rel_or_const(src0);
+
+               if (swappable) {
+                       src1 = src0;
+                       src0 = &inst->Src[1].Register;
+               } else {
+                       src1 = get_unconst(ctx, src1);
+               }
+       }
+
+       if (ctx->so->key.half_precision)
+               instr = instr_create(ctx, 3, t->hopc);
+       else
+               instr = instr_create(ctx, 3, t->opc);
+
+       vectorize(ctx, instr, dst, 3, src0, 0, src1, 0, src2, 0);
+       put_dst(ctx, inst, dst);
+}
+
+/* Generic handler for TGSI opcodes that map onto a category 4 instruction
+ * (rcp, rsq, sqrt, exp2, log2, sin, cos per the translater table).  These
+ * are scalar: the source's X swizzle is evaluated once per enabled dst
+ * component.
+ */
+static void
+instr_cat4(const struct instr_translater *t,
+               struct ir3_compile_context *ctx,
+               struct tgsi_full_instruction *inst)
+{
+       struct tgsi_dst_register *dst = get_dst(ctx, inst);
+       struct tgsi_src_register *src = &inst->Src[0].Register;
+       /* stays NULL when the write mask selects no component: */
+       struct ir3_instruction *instr = NULL;
+       unsigned i, n;
+
+       /* seems like blob compiler avoids const as src.. */
+       if (is_const(src))
+               src = get_unconst(ctx, src);
+
+       /* worst case: */
+       add_nop(ctx, 6);
+
+       /* we need to replicate into each component: */
+       for (i = 0, n = 0; i < 4; i++) {
+               if (dst->WriteMask & (1 << i)) {
+                       /* one nop between consecutive cat4 instructions: */
+                       if (n++)
+                               add_nop(ctx, 1);
+                       instr = instr_create(ctx, 4, t->opc);
+                       add_dst_reg(ctx, instr, dst, i);
+                       add_src_reg(ctx, instr, src, src->SwizzleX);
+               }
+       }
+
+       /* with an empty write mask nothing was emitted, so there is no dst
+        * register to track (and 'instr' must not be dereferenced):
+        */
+       if (instr)
+               regmask_set(&ctx->needs_ss, instr->regs[0]);
+
+       put_dst(ctx, inst, dst);
+}
+
+/* Dispatch table indexed by TGSI opcode.  Each populated entry names the
+ * handler (.fxn) plus the native opcode(s) and optional argument it uses;
+ * entries left unset have a NULL .fxn, which compile_instructions()
+ * reports as an unknown TGSI opcode.
+ */
+static const struct instr_translater translaters[TGSI_OPCODE_LAST] = {
+/* INSTR(n, f, ...): entry for TGSI_OPCODE_n handled by f, with any extra
+ * designated initializers (.opc, .hopc, .arg) appended.
+ */
+#define INSTR(n, f, ...) \
+       [TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ }
+
+       INSTR(MOV,          instr_cat1),
+       INSTR(RCP,          instr_cat4, .opc = OPC_RCP),
+       INSTR(RSQ,          instr_cat4, .opc = OPC_RSQ),
+       INSTR(SQRT,         instr_cat4, .opc = OPC_SQRT),
+       INSTR(MUL,          instr_cat2, .opc = OPC_MUL_F),
+       INSTR(ADD,          instr_cat2, .opc = OPC_ADD_F),
+       INSTR(SUB,          instr_cat2, .opc = OPC_ADD_F),
+       INSTR(MIN,          instr_cat2, .opc = OPC_MIN_F),
+       INSTR(MAX,          instr_cat2, .opc = OPC_MAX_F),
+       INSTR(MAD,          instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16),
+       INSTR(TRUNC,        instr_cat2, .opc = OPC_TRUNC_F),
+       INSTR(CLAMP,        trans_clamp),
+       INSTR(FLR,          instr_cat2, .opc = OPC_FLOOR_F),
+       INSTR(ROUND,        instr_cat2, .opc = OPC_RNDNE_F),
+       INSTR(SSG,          instr_cat2, .opc = OPC_SIGN_F),
+       INSTR(ARL,          trans_arl),
+       INSTR(EX2,          instr_cat4, .opc = OPC_EXP2),
+       INSTR(LG2,          instr_cat4, .opc = OPC_LOG2),
+       INSTR(ABS,          instr_cat2, .opc = OPC_ABSNEG_F),
+       INSTR(COS,          instr_cat4, .opc = OPC_COS),
+       INSTR(SIN,          instr_cat4, .opc = OPC_SIN),
+       INSTR(TEX,          trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TEX),
+       INSTR(TXP,          trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TXP),
+       INSTR(SGT,          trans_cmp),
+       INSTR(SLT,          trans_cmp),
+       INSTR(SGE,          trans_cmp),
+       INSTR(SLE,          trans_cmp),
+       INSTR(SNE,          trans_cmp),
+       INSTR(SEQ,          trans_cmp),
+       INSTR(CMP,          trans_cmp),
+       INSTR(IF,           trans_if),
+       INSTR(ELSE,         trans_else),
+       INSTR(ENDIF,        trans_endif),
+       INSTR(END,          instr_cat0, .opc = OPC_END),
+       INSTR(KILL,         instr_cat0, .opc = OPC_KILL),
+};
+
+/* Pack a TGSI declaration's semantic name and index into an ir3_semantic. */
+static ir3_semantic
+decl_semantic(const struct tgsi_declaration_semantic *sem)
+{
+       return ir3_semantic_name(sem->Name, sem->Index);
+}
+
+/* Handle a TGSI input declaration: record every register in the declared
+ * range in so->inputs[], and for fragment shaders emit one bary.f per
+ * component to fetch the value.
+ *
+ * Returns the number of nops the caller should insert before the next
+ * instruction: 6 if any bary.f was emitted, otherwise 0.
+ */
+static int
+decl_in(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
+{
+       struct ir3_shader_variant *so = ctx->so;
+       unsigned base = ctx->base_reg[TGSI_FILE_INPUT];
+       unsigned i, flags = 0;
+       int nop = 0;
+
+       /* I don't think we should get frag shader input without
+        * semantic info?  Otherwise how do inputs get linked to
+        * vert outputs?
+        */
+       compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) ||
+                       decl->Declaration.Semantic);
+
+       /* bary.f destinations are half registers in half-precision variants: */
+       if (ctx->so->key.half_precision)
+               flags |= IR3_REG_HALF;
+
+       for (i = decl->Range.First; i <= decl->Range.Last; i++) {
+               unsigned n = so->inputs_count++;
+               unsigned r = regid(i + base, 0);
+               unsigned ncomp;
+
+               /* TODO use ctx->info.input_usage_mask[decl->Range.n] to figure out ncomp: */
+               ncomp = 4;
+
+               DBG("decl in -> r%d", i + base);   // XXX
+
+               compile_assert(ctx, n < ARRAY_SIZE(so->inputs));
+
+               so->inputs[n].semantic = decl_semantic(&decl->Semantic);
+               so->inputs[n].compmask = (1 << ncomp) - 1;
+               so->inputs[n].ncomp = ncomp;
+               so->inputs[n].regid = r;
+               so->inputs[n].inloc = ctx->next_inloc;
+               so->inputs[n].bary = true;   /* all that is supported */
+               ctx->next_inloc += ncomp;
+
+               so->total_in += ncomp;
+
+               /* for frag shaders, we need to generate the corresponding bary instr: */
+               if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
+                       unsigned j;
+
+                       for (j = 0; j < ncomp; j++) {
+                               struct ir3_instruction *instr;
+                               struct ir3_register *dst;
+
+                               instr = instr_create(ctx, 2, OPC_BARY_F);
+
+                               /* dst register: */
+                               dst = ir3_reg_create(instr, r + j, flags);
+                               /* remembered so that compile_instructions() can flag
+                                * the final input's dst with IR3_REG_EI:
+                                */
+                               ctx->last_input = dst;
+
+                               /* input position: */
+                               /* NOTE(review): the reason for the -8 bias on inloc is
+                                * not visible here -- confirm against where next_inloc
+                                * is initialized.
+                                */
+                               ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val =
+                                               so->inputs[n].inloc + j - 8;
+
+                               /* input base (always r0.xy): */
+                               ir3_reg_create(instr, regid(0,0), 0)->wrmask = 0x3;
+                       }
+
+                       nop = 6;
+               }
+       }
+
+       return nop;
+}
+
+/* Handle a TGSI output declaration: validate the semantic for the current
+ * shader stage, note position/psize writes on the variant, and record
+ * every register in the declared range in so->outputs[].
+ */
+static void
+decl_out(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
+{
+       struct ir3_shader_variant *so = ctx->so;
+       unsigned base = ctx->base_reg[TGSI_FILE_OUTPUT];
+       unsigned comp = 0;   /* component of the register holding the output */
+       unsigned name = decl->Semantic.Name;
+       unsigned i;
+
+       compile_assert(ctx, decl->Declaration.Semantic);  // TODO is this ever not true?
+
+       DBG("decl out[%d] -> r%d", name, decl->Range.First + base);   // XXX
+
+       if (ctx->type == TGSI_PROCESSOR_VERTEX) {
+               switch (name) {
+               case TGSI_SEMANTIC_POSITION:
+                       so->writes_pos = true;
+                       break;
+               case TGSI_SEMANTIC_PSIZE:
+                       so->writes_psize = true;
+                       break;
+               case TGSI_SEMANTIC_COLOR:
+               case TGSI_SEMANTIC_BCOLOR:
+               case TGSI_SEMANTIC_GENERIC:
+               case TGSI_SEMANTIC_FOG:
+               case TGSI_SEMANTIC_TEXCOORD:
+                       /* accepted, nothing extra to record */
+                       break;
+               default:
+                       compile_error(ctx, "unknown VS semantic name: %s\n",
+                                       tgsi_semantic_names[name]);
+               }
+       } else {
+               /* any non-vertex stage is handled with the FS rules */
+               switch (name) {
+               case TGSI_SEMANTIC_POSITION:
+                       comp = 2;  /* tgsi will write to .z component */
+                       so->writes_pos = true;
+                       break;
+               case TGSI_SEMANTIC_COLOR:
+                       break;
+               default:
+                       compile_error(ctx, "unknown FS semantic name: %s\n",
+                                       tgsi_semantic_names[name]);
+               }
+       }
+
+       /* one so->outputs[] entry per register in the declared range: */
+       for (i = decl->Range.First; i <= decl->Range.Last; i++) {
+               unsigned n = so->outputs_count++;
+               compile_assert(ctx, n < ARRAY_SIZE(so->outputs));
+               so->outputs[n].semantic = decl_semantic(&decl->Semantic);
+               so->outputs[n].regid = regid(i + base, comp);
+       }
+}
+
+/* Handle a TGSI sampler declaration: just flag on the variant that the
+ * shader declares at least one sampler.
+ */
+static void
+decl_samp(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
+{
+       ctx->so->has_samp = true;
+}
+
+/* Main translation loop: walk the TGSI token stream, recording
+ * declarations and immediates on the shader variant and dispatching each
+ * instruction through the translaters[] table.  Afterwards the first
+ * instruction is flagged (ss)(sy), the last input register gets
+ * IR3_REG_EI, and handle_last_rel() is run.
+ */
+static void
+compile_instructions(struct ir3_compile_context *ctx)
+{
+       struct ir3 *ir = ctx->ir;
+       /* nops requested by decl_in(), flushed before the next instruction: */
+       int nop = 0;
+
+       while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
+               tgsi_parse_token(&ctx->parser);
+
+               switch (ctx->parser.FullToken.Token.Type) {
+               case TGSI_TOKEN_TYPE_DECLARATION: {
+                       struct tgsi_full_declaration *decl =
+                                       &ctx->parser.FullToken.FullDeclaration;
+                       if (decl->Declaration.File == TGSI_FILE_OUTPUT) {
+                               decl_out(ctx, decl);
+                       } else if (decl->Declaration.File == TGSI_FILE_INPUT) {
+                               nop = decl_in(ctx, decl);
+                       } else if (decl->Declaration.File == TGSI_FILE_SAMPLER) {
+                               decl_samp(ctx, decl);
+                       }
+                       break;
+               }
+               case TGSI_TOKEN_TYPE_IMMEDIATE: {
+                       /* TODO: if we know the immediate is small enough, and only
+                        * used with instructions that can embed an immediate, we
+                        * can skip this:
+                        */
+                       struct tgsi_full_immediate *imm =
+                                       &ctx->parser.FullToken.FullImmediate;
+                       unsigned n;
+
+                       /* don't write past the end of the immediates table
+                        * (assumes so->immediates is a fixed-size array, like
+                        * so->inputs and so->outputs -- TODO confirm):
+                        */
+                       if (ctx->so->immediates_count >=
+                                       ARRAY_SIZE(ctx->so->immediates)) {
+                               compile_error(ctx, "too many immediates\n");
+                               break;
+                       }
+
+                       n = ctx->so->immediates_count++;
+                       memcpy(ctx->so->immediates[n].val, imm->u, 16);
+                       break;
+               }
+               case TGSI_TOKEN_TYPE_INSTRUCTION: {
+                       struct tgsi_full_instruction *inst =
+                                       &ctx->parser.FullToken.FullInstruction;
+                       unsigned opc = inst->Instruction.Opcode;
+                       const struct instr_translater *t = &translaters[opc];
+
+                       add_nop(ctx, nop);
+                       nop = 0;
+
+                       if (t->fxn) {
+                               t->fxn(t, ctx, inst);
+                               /* internal temps only live for one TGSI instruction: */
+                               ctx->num_internal_temps = 0;
+                       } else {
+                               compile_error(ctx, "unknown TGSI opc: %s\n",
+                                               tgsi_get_opcode_name(opc));
+                       }
+
+                       /* saturate modifiers become an explicit clamp on the dst: */
+                       switch (inst->Instruction.Saturate) {
+                       case TGSI_SAT_ZERO_ONE:
+                               create_clamp_imm(ctx, &inst->Dst[0].Register,
+                                               fui(0.0), fui(1.0));
+                               break;
+                       case TGSI_SAT_MINUS_PLUS_ONE:
+                               create_clamp_imm(ctx, &inst->Dst[0].Register,
+                                               fui(-1.0), fui(1.0));
+                               break;
+                       }
+
+                       break;
+               }
+               default:
+                       break;
+               }
+       }
+
+       if (ir->instrs_count > 0)
+               ir->instrs[0]->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
+
+       if (ctx->last_input)
+               ctx->last_input->flags |= IR3_REG_EI;
+
+       handle_last_rel(ctx);
+}
+
+/* Entry point of the old TGSI->ir3 compiler: creates so->ir and fills it
+ * from 'tokens'.  Returns 0 on success, or -1 if compile_init() does not
+ * report TGSI_PARSE_OK.  'key' is not referenced in this function.
+ */
+int
+ir3_compile_shader_old(struct ir3_shader_variant *so,
+               const struct tgsi_token *tokens, struct ir3_shader_key key)
+{
+       struct ir3_compile_context ctx;
+
+       /* the variant must not have been compiled already: */
+       assert(!so->ir);
+       so->ir = ir3_create();
+       assert(so->ir);
+
+       if (compile_init(&ctx, so, tokens) == TGSI_PARSE_OK) {
+               compile_instructions(&ctx);
+               compile_free(&ctx);
+               return 0;
+       }
+
+       return -1;
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
new file mode 100644 (file)
index 0000000..73c2a27
--- /dev/null
@@ -0,0 +1,158 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width: 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include "ir3.h"
+
+/*
+ * Copy Propagate:
+ *
+ * TODO probably want some sort of visitor sort of interface to
+ * avoid duplicating the same graph traversal logic everywhere..
+ *
+ */
+
+static void block_cp(struct ir3_block *block);
+static struct ir3_instruction * instr_cp(struct ir3_instruction *instr, bool keep);
+
+/* A mov can be propagated through when it performs no type conversion,
+ * does not write an address register, and reads an SSA source without
+ * abs/neg/relative modifiers.
+ */
+static bool is_eligible_mov(struct ir3_instruction *instr)
+{
+       struct ir3_register *dst, *src;
+
+       if (instr->category != 1)
+               return false;
+
+       if (instr->cat1.src_type != instr->cat1.dst_type)
+               return false;
+
+       dst = instr->regs[0];
+       src = instr->regs[1];
+
+       if (dst->flags & IR3_REG_ADDR)
+               return false;
+
+       if (!(src->flags & IR3_REG_SSA))
+               return false;
+
+       /* TODO: propagate abs/neg modifiers if possible */
+       if (src->flags & (IR3_REG_ABS | IR3_REG_NEGATE | IR3_REG_RELATIV))
+               return false;
+
+       return true;
+}
+
+/* Run copy-propagation on every SSA source of 'instr' (regs[0] is the
+ * dst and is skipped), re-pointing each source at whatever instr_cp()
+ * returns for it.
+ */
+static void walk_children(struct ir3_instruction *instr, bool keep)
+{
+       unsigned idx;
+
+       for (idx = 1; idx < instr->regs_count; idx++) {
+               struct ir3_register *reg = instr->regs[idx];
+
+               if (!(reg->flags & IR3_REG_SSA))
+                       continue;
+
+               /* walk down the graph from this src: */
+               reg->instr = instr_cp(reg->instr, keep);
+       }
+}
+
+/* Copy-propagate through the sources of a fan-in meta instruction.  Unlike
+ * the generic path, a source whose propagated candidate turns out to be a
+ * fan-out meta instruction is re-evaluated with keep=true.  Always returns
+ * 'instr' itself.
+ */
+static struct ir3_instruction *
+instr_cp_fanin(struct ir3_instruction *instr)
+{
+       unsigned i;
+
+       /* we need to handle fanin specially, to detect cases
+        * when we need to keep a mov
+        */
+
+       for (i = 1; i < instr->regs_count; i++) {
+               struct ir3_register *src = instr->regs[i];
+               if (src->flags & IR3_REG_SSA) {
+                       struct ir3_instruction *cand =
+                                       instr_cp(src->instr, false);
+
+                       /* if the candidate is a fanout, then keep
+                        * the move.
+                        *
+                        * This is a bit, um, fragile, but it should
+                        * catch the extra mov's that the front-end
+                        * puts in for us already in these cases.
+                        */
+                       /* NOTE(review): src->instr was presumably marked as visited
+                        * by the first instr_cp() call, in which case this second
+                        * call returns src->instr unchanged -- confirm against
+                        * ir3_instr_check_mark().
+                        */
+                       if (is_meta(cand) && (cand->opc == OPC_META_FO))
+                               cand = instr_cp(src->instr, true);
+
+                       src->instr = cand;
+               }
+       }
+
+       walk_children(instr, false);
+
+       return instr;
+
+}
+
+/* Copy-propagate starting at 'instr' and return the instruction that
+ * consumers should reference instead: for an eligible mov (unless 'keep'
+ * is set) that is the propagated source of the mov, otherwise 'instr'
+ * itself after its own sources have been processed.
+ */
+static struct ir3_instruction *
+instr_cp(struct ir3_instruction *instr, bool keep)
+{
+       /* if we've already visited this instruction, bail now: */
+       if (ir3_instr_check_mark(instr))
+               return instr;
+
+       /* fan-in meta instructions have their own handling: */
+       if (is_meta(instr) && (instr->opc == OPC_META_FI))
+               return instr_cp_fanin(instr);
+
+       /* skip over the mov and continue from what it copies: */
+       if (is_eligible_mov(instr) && !keep) {
+               struct ir3_register *src = instr->regs[1];
+               return instr_cp(src->instr, false);
+       }
+
+       walk_children(instr, false);
+
+       return instr;
+}
+
+/* Copy-propagate every output of 'block', replacing each output with the
+ * instruction instr_cp() resolves it to, while making sure two outputs
+ * never end up referencing the same instruction.
+ */
+static void block_cp(struct ir3_block *block)
+{
+       unsigned i, j;
+
+       for (i = 0; i < block->noutputs; i++) {
+               if (block->outputs[i]) {
+                       struct ir3_instruction *out =
+                                       instr_cp(block->outputs[i], false);
+
+                       /* To deal with things like this:
+                        *
+                        *   43: MOV OUT[2], TEMP[5]
+                        *   44: MOV OUT[0], TEMP[5]
+                        *
+                        * we need to ensure that no two outputs point to
+                        * the same instruction
+                        */
+                       /* only outputs already rewritten (j < i) are compared: */
+                       for (j = 0; j < i; j++) {
+                               if (block->outputs[j] == out) {
+                                       out = instr_cp(block->outputs[i], true);
+                                       break;
+                               }
+                       }
+
+                       block->outputs[i] = out;
+               }
+       }
+}
+
+/* Public entry point of the copy-propagation pass: clear the visit marks
+ * on the block's shader, then propagate from the block's outputs.
+ */
+void ir3_block_cp(struct ir3_block *block)
+{
+       ir3_clear_mark(block->shader);
+       block_cp(block);
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
new file mode 100644 (file)
index 0000000..dcc0362
--- /dev/null
@@ -0,0 +1,159 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include "util/u_math.h"
+
+#include "ir3.h"
+
+/*
+ * Instruction Depth:
+ *
+ * Calculates weighted instruction depth, ie. the sum of # of needed
+ * instructions plus delay slots back to original input (ie INPUT or
+ * CONST).  That is to say, an instruction's depth is:
+ *
+ *   depth(instr) {
+ *     d = 0;
+ *     // for each src register:
+ *     foreach (src in instr->regs[1..n])
+ *       d = max(d, delayslots(src->instr, n) + depth(src->instr));
+ *     return d + 1;
+ *   }
+ *
+ * After an instruction's depth is calculated, it is inserted into the
+ * block's depth-sorted list, which is used by the scheduling pass.
+ */
+
+/* Calculate the required # of delay slots between the instruction that
+ * assigns a value ('assigner') and the one that consumes it
+ * ('consumer').  'n' is the zero-based index of the source operand of
+ * the consumer that reads the value.
+ *
+ * Worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
+ * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch are
+ * handled with sync bits.
+ */
+int ir3_delayslots(struct ir3_instruction *assigner,
+               struct ir3_instruction *consumer, unsigned n)
+{
+       int slots;
+
+       if (is_meta(assigner)) {
+               slots = 0;
+       } else if (writes_addr(assigner)) {
+               slots = 6;
+       } else if (is_sfu(assigner) || is_tex(assigner)) {
+               /* handled via sync flags: */
+               slots = 0;
+       } else if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer)) {
+               /* from here on the assigner must be alu: */
+               slots = 6;
+       } else if ((consumer->category == 3) &&
+                       is_mad(consumer->opc) && (n == 2)) {
+               /* special case, 3rd src to cat3 not required on first cycle */
+               slots = 1;
+       } else {
+               slots = 3;
+       }
+
+       return slots;
+}
+
+/* Link 'instr' into the list starting at instr->block->head, which is
+ * chained through ->next.  The walk skips entries whose depth is
+ * strictly greater than instr->depth (and stops early if it reaches
+ * 'instr' itself), then splices 'instr' in at that position.
+ */
+static void insert_by_depth(struct ir3_instruction *instr)
+{
+       /* address of the link (head or some ->next) to be rewritten: */
+       struct ir3_instruction **link = &instr->block->head;
+
+       while (*link && (*link != instr) &&
+                       ((*link)->depth > instr->depth))
+               link = &(*link)->next;
+
+       instr->next = *link;
+       *link = instr;
+}
+
+/* Recursively compute instr->depth and insert the instruction into its
+ * block's depth sorted list.  Instructions that are already marked are
+ * left untouched.
+ */
+static void ir3_instr_depth(struct ir3_instruction *instr)
+{
+       unsigned idx;
+
+       /* already visited, nothing more to do: */
+       if (ir3_instr_check_mark(instr))
+               return;
+
+       instr->depth = 0;
+
+       for (idx = 1; idx < instr->regs_count; idx++) {
+               struct ir3_register *src = instr->regs[idx];
+               struct ir3_instruction *producer;
+               unsigned cost;
+
+               if (!(src->flags & IR3_REG_SSA))
+                       continue;
+
+               /* visit the producer first so that its depth is known: */
+               producer = src->instr;
+               ir3_instr_depth(producer);
+
+               cost = ir3_delayslots(producer, instr, idx - 1) +
+                               producer->depth;
+
+               instr->depth = MAX2(instr->depth, cost);
+       }
+
+       /* meta-instructions don't add cycles, other than PHI.. which
+        * might translate to a real instruction..
+        *
+        * well, not entirely true, fan-in/out, etc might need to
+        * generate some extra mov's in edge cases, etc.. probably
+        * we might want to do depth calculation considering the worst
+        * case for these??
+        */
+       if (!is_meta(instr))
+               instr->depth++;
+
+       insert_by_depth(instr);
+}
+
+/* Rebuild the depth sorted instruction list of 'block': the list head
+ * is reset, marks are cleared, and the depth of everything reachable
+ * from the block's assigned outputs is computed.  Inputs that were not
+ * reached are then dropped from block->inputs[].
+ */
+void ir3_block_depth(struct ir3_block *block)
+{
+       unsigned idx;
+
+       block->head = NULL;
+       ir3_clear_mark(block->shader);
+
+       for (idx = 0; idx < block->noutputs; idx++) {
+               struct ir3_instruction *out = block->outputs[idx];
+
+               if (out)
+                       ir3_instr_depth(out);
+       }
+
+       /* at this point, any unvisited input is unused: */
+       for (idx = 0; idx < block->ninputs; idx++) {
+               struct ir3_instruction *in = block->inputs[idx];
+
+               if (!in)
+                       continue;
+
+               if (!ir3_instr_check_mark(in))
+                       block->inputs[idx] = NULL;
+       }
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_dump.c b/src/gallium/drivers/freedreno/ir3/ir3_dump.c
new file mode 100644 (file)
index 0000000..1a6f49d
--- /dev/null
@@ -0,0 +1,425 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include <stdarg.h>
+
+#include "ir3.h"
+
+/* Converts a pointer to an integer so it can be printed with "%lx" and
+ * used as a unique suffix in node names (instr%lx, input%lx, ...).
+ */
+#define PTRID(x) ((unsigned long)(x))
+
+/* State shared by all of the dump helpers in this file. */
+struct ir3_dump_ctx {
+       FILE *f;        /* stream that the dump is written to */
+       bool verbose;   /* print extra detail: instruction depth (and serialno
+                        * with DEBUG) and the producer of SSA registers
+                        */
+};
+
+/* Print the name of 'instr' to ctx->f: an optional verbose prefix, the
+ * (sy)/(ss) flags, and then either a meta-instruction tag, the
+ * mov/cov form with source and destination types for category 1, or
+ * ir3_instr_name() followed by the flag suffixes.
+ */
+static void dump_instr_name(struct ir3_dump_ctx *ctx,
+               struct ir3_instruction *instr)
+{
+       FILE *out = ctx->f;
+
+       /* for debugging: */
+       if (ctx->verbose) {
+#ifdef DEBUG
+               fprintf(out, "%04u:", instr->serialno);
+#endif
+               fprintf(out, "%03u: ", instr->depth);
+       }
+
+       if (instr->flags & IR3_INSTR_SY)
+               fprintf(out, "(sy)");
+       if (instr->flags & IR3_INSTR_SS)
+               fprintf(out, "(ss)");
+
+       if (is_meta(instr)) {
+               switch (instr->opc) {
+               case OPC_META_PHI:
+                       fprintf(out, "&#934;");
+                       break;
+               case OPC_META_DEREF:
+                       fprintf(out, "(*)");
+                       break;
+               /* the remaining ones shouldn't be hit.. just for debugging: */
+               case OPC_META_INPUT:
+                       fprintf(out, "_meta:in");
+                       break;
+               case OPC_META_OUTPUT:
+                       fprintf(out, "_meta:out");
+                       break;
+               case OPC_META_FO:
+                       fprintf(out, "_meta:fo");
+                       break;
+               case OPC_META_FI:
+                       fprintf(out, "_meta:fi");
+                       break;
+               case OPC_META_FLOW:
+                       fprintf(out, "_meta:flow");
+                       break;
+               default:
+                       fprintf(out, "_meta:%d", instr->opc);
+                       break;
+               }
+               return;
+       }
+
+       if (instr->category == 1) {
+               static const char *type[] = {
+                               [TYPE_F16] = "f16",
+                               [TYPE_F32] = "f32",
+                               [TYPE_U16] = "u16",
+                               [TYPE_U32] = "u32",
+                               [TYPE_S16] = "s16",
+                               [TYPE_S32] = "s32",
+                               [TYPE_U8]  = "u8",
+                               [TYPE_S8]  = "s8",
+               };
+
+               /* differing types make it a conversion rather than a move: */
+               if (instr->cat1.src_type != instr->cat1.dst_type)
+                       fprintf(out, "cov");
+               else
+                       fprintf(out, "mov");
+               fprintf(out, ".%s%s", type[instr->cat1.src_type], type[instr->cat1.dst_type]);
+               return;
+       }
+
+       fprintf(out, "%s", ir3_instr_name(instr));
+       if (instr->flags & IR3_INSTR_3D)
+               fprintf(out, ".3d");
+       if (instr->flags & IR3_INSTR_A)
+               fprintf(out, ".a");
+       if (instr->flags & IR3_INSTR_O)
+               fprintf(out, ".o");
+       if (instr->flags & IR3_INSTR_P)
+               fprintf(out, ".p");
+       if (instr->flags & IR3_INSTR_S)
+               fprintf(out, ".s");
+       if (instr->flags & IR3_INSTR_S2EN)
+               fprintf(out, ".s2en");
+}
+
+/* Print a register operand to ctx->f: an optional (abs)/(neg)/(absneg)
+ * modifier followed by the immediate value, the producing instruction
+ * (SSA registers, verbose mode only) or the c<n>.<comp> / r<n>.<comp>
+ * register name with an 'h' prefix for half registers.
+ */
+static void dump_reg_name(struct ir3_dump_ctx *ctx,
+               struct ir3_register *reg)
+{
+       FILE *out = ctx->f;
+       bool is_abs = !!(reg->flags & IR3_REG_ABS);
+       bool is_neg = !!(reg->flags & IR3_REG_NEGATE);
+
+       if (is_abs && is_neg)
+               fprintf(out, "(absneg)");
+       else if (is_neg)
+               fprintf(out, "(neg)");
+       else if (is_abs)
+               fprintf(out, "(abs)");
+
+       if (reg->flags & IR3_REG_IMMED) {
+               /* the same immediate shown as float, decimal and hex: */
+               fprintf(out, "imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val);
+               return;
+       }
+
+       if (reg->flags & IR3_REG_SSA) {
+               /* nothing is printed for SSA sources unless verbose: */
+               if (ctx->verbose) {
+                       fprintf(out, "_[");
+                       dump_instr_name(ctx, reg->instr);
+                       fprintf(out, "]");
+               }
+               return;
+       }
+
+       if (reg->flags & IR3_REG_HALF)
+               fprintf(out, "h");
+
+       if (reg->flags & IR3_REG_CONST)
+               fprintf(out, "c%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]);
+       else
+               fprintf(out, "r%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]);
+}
+
+static void ir3_instr_dump(struct ir3_dump_ctx *ctx,
+               struct ir3_instruction *instr);
+static void ir3_block_dump(struct ir3_dump_ctx *ctx,
+               struct ir3_block *block, const char *name);
+
+/* Dump 'instr' (once; marked instructions are skipped).  Ordinary
+ * instructions, PHI and DEREF get a node via ir3_instr_dump().  Fan-in
+ * and fan-out emit no node of their own, only their SSA sources are
+ * dumped.  FLOW dumps its if/else blocks and then its first source.
+ * Other meta instructions produce no output here.
+ */
+static void dump_instr(struct ir3_dump_ctx *ctx,
+               struct ir3_instruction *instr)
+{
+       unsigned i;
+
+       /* if we've already visited this instruction, bail now: */
+       if (ir3_instr_check_mark(instr))
+               return;
+
+       if (!is_meta(instr)) {
+               ir3_instr_dump(ctx, instr);
+               return;
+       }
+
+       /* some meta-instructions need to be handled specially: */
+       switch (instr->opc) {
+       case OPC_META_FO:
+       case OPC_META_FI:
+               for (i = 1; i < instr->regs_count; i++) {
+                       struct ir3_register *src = instr->regs[i];
+
+                       if (src->flags & IR3_REG_SSA)
+                               dump_instr(ctx, src->instr);
+               }
+               break;
+       case OPC_META_FLOW: {
+               struct ir3_register *src = instr->regs[1];
+
+               ir3_block_dump(ctx, instr->flow.if_block, "if");
+               if (instr->flow.else_block)
+                       ir3_block_dump(ctx, instr->flow.else_block, "else");
+               if (src->flags & IR3_REG_SSA)
+                       dump_instr(ctx, src->instr);
+               break;
+       }
+       case OPC_META_PHI:
+       case OPC_META_DEREF:
+               /* treat like a normal instruction: */
+               ir3_instr_dump(ctx, instr);
+               break;
+       default:
+               break;
+       }
+}
+
+/* arrarraggh!  if link is to something outside of the current block, we
+ * need to defer emitting the link until the end of the block, since the
+ * edge triggers pre-creation of the node it links to inside the cluster,
+ * even though it is meant to be outside..
+ *
+ * Deferred edges are accumulated as text in this file-scope buffer by
+ * printdef() and written out by ir3_block_dump() after the block's
+ * cluster has been closed.
+ */
+static struct {
+       char buf[40960];   /* text of the deferred edges */
+       unsigned n;        /* number of bytes of buf[] currently in use */
+} edge_buf;
+
+/* helper to print or defer:
+ *
+ * Formats 'fmt' and its arguments printf-style.  With defer == false the
+ * text goes straight to ctx->f; with defer == true it is appended to
+ * edge_buf instead.
+ *
+ * If the deferred text does not fit into the remaining space of
+ * edge_buf it is truncated, and edge_buf.n never grows beyond the last
+ * usable byte of the buffer.
+ */
+static void printdef(struct ir3_dump_ctx *ctx,
+               bool defer, const char *fmt, ...)
+{
+       va_list ap;
+       va_start(ap, fmt);
+       if (defer) {
+               if (edge_buf.n < sizeof(edge_buf.buf)) {
+                       unsigned avail = sizeof(edge_buf.buf) - edge_buf.n;
+                       int ret = vsnprintf(&edge_buf.buf[edge_buf.n], avail,
+                                       fmt, ap);
+
+                       /* vsnprintf() returns the length the complete output
+                        * would have had (or a negative value on error), not
+                        * the number of bytes stored.  Adding that blindly
+                        * would push edge_buf.n past the end of the buffer
+                        * and make the next size computation wrap around.
+                        */
+                       if (ret < 0) {
+                               edge_buf.buf[edge_buf.n] = '\0';
+                               ret = 0;
+                       } else if ((unsigned)ret >= avail) {
+                               ret = avail - 1;
+                       }
+
+                       edge_buf.n += ret;
+               }
+       } else {
+               vfprintf(ctx->f, fmt, ap);
+       }
+       va_end(ap);
+}
+
+/* Emit (or defer, see printdef()) the edge from the value produced by
+ * 'instr' to the node/port named by 'target'.  No trailing newline is
+ * printed.  Meta instructions are resolved to the node that actually
+ * appears in the graph: the block's input/output record, or the
+ * source(s) of a fan-out/fan-in, which additionally get a component
+ * label.
+ */
+static void dump_link2(struct ir3_dump_ctx *ctx,
+               struct ir3_instruction *instr, const char *target, bool defer)
+{
+       unsigned i;
+
+       if (!is_meta(instr)) {
+               printdef(ctx, defer, "instr%lx:<dst0> -> %s", PTRID(instr), target);
+               return;
+       }
+
+       /* some meta-instructions need to be handled specially: */
+       switch (instr->opc) {
+       case OPC_META_INPUT:
+               printdef(ctx, defer, "input%lx:<in%u>:w -> %s",
+                               PTRID(instr->inout.block),
+                               instr->regs[0]->num, target);
+               break;
+       case OPC_META_FO:
+               dump_link2(ctx, instr->regs[1]->instr, target, defer);
+               printdef(ctx, defer, "[label=\".%c\"]",
+                               "xyzw"[instr->fo.off & 0x3]);
+               break;
+       case OPC_META_FI:
+               /* recursively dump all parents and links */
+               for (i = 1; i < instr->regs_count; i++) {
+                       struct ir3_register *src = instr->regs[i];
+
+                       if (!(src->flags & IR3_REG_SSA))
+                               continue;
+
+                       dump_link2(ctx, src->instr, target, defer);
+                       printdef(ctx, defer, "[label=\".%c\"]",
+                                       "xyzw"[(i - 1) & 0x3]);
+               }
+               break;
+       case OPC_META_OUTPUT:
+               printdef(ctx, defer, "output%lx:<out%u>:w -> %s",
+                               PTRID(instr->inout.block),
+                               instr->regs[0]->num, target);
+               break;
+       case OPC_META_PHI:
+       case OPC_META_DEREF:
+               /* treat like a normal instruction: */
+               printdef(ctx, defer, "instr%lx:<dst0> -> %s", PTRID(instr), target);
+               break;
+       default:
+               break;
+       }
+}
+
+/* Emit one complete edge line from 'instr' to 'target'.  'block' is the
+ * block currently being dumped: if 'instr' belongs to a different block
+ * the edge is deferred (see printdef()) rather than printed right away.
+ */
+static void dump_link(struct ir3_dump_ctx *ctx,
+               struct ir3_instruction *instr,
+               struct ir3_block *block, const char *target)
+{
+       /* links that leave the current block must be deferred: */
+       bool defer = instr->block != block;
+       dump_link2(ctx, instr, target, defer);
+       /* terminate the line in the same place the edge text went: */
+       printdef(ctx, defer, "\n");
+}
+
+/* If 'reg' is an SSA register produced by a meta:flow instruction,
+ * return that instruction's first source register instead; any other
+ * register is returned unchanged.
+ */
+static struct ir3_register *follow_flow(struct ir3_register *reg)
+{
+       struct ir3_instruction *producer;
+
+       if (!(reg->flags & IR3_REG_SSA))
+               return reg;
+
+       producer = reg->instr;
+       if (!is_meta(producer) || (producer->opc != OPC_META_FLOW))
+               return reg;
+
+       /* go with the flow.. */
+       return producer->regs[1];
+}
+
+/* Emit the record node for 'instr' (name, <dst0> port and one field per
+ * source register), then dump the instructions producing its SSA
+ * sources and the edges from them to the matching <srcN> ports.
+ */
+static void ir3_instr_dump(struct ir3_dump_ctx *ctx,
+               struct ir3_instruction *instr)
+{
+       FILE *out = ctx->f;
+       unsigned src_idx;   /* zero-based source index, ie. regs[] index - 1 */
+
+       fprintf(out, "instr%lx [shape=record,style=filled,fillcolor=lightgrey,label=\"{",
+                       PTRID(instr));
+       dump_instr_name(ctx, instr);
+
+       /* destination register: */
+       fprintf(out, "|<dst0>");
+
+       /* source register(s): */
+       for (src_idx = 0; src_idx + 1 < instr->regs_count; src_idx++) {
+               struct ir3_register *src = follow_flow(instr->regs[src_idx + 1]);
+
+               fprintf(out, "|");
+
+               /* only SSA sources get a port that edges can attach to: */
+               if (src->flags & IR3_REG_SSA)
+                       fprintf(out, "<src%u> ", src_idx);
+
+               dump_reg_name(ctx, src);
+       }
+
+       fprintf(out, "}\"];\n");
+
+       /* and recursively dump dependent instructions: */
+       for (src_idx = 0; src_idx + 1 < instr->regs_count; src_idx++) {
+               struct ir3_register *src = instr->regs[src_idx + 1];
+               char target[32];  /* link target */
+
+               if (src->flags & IR3_REG_SSA) {
+                       snprintf(target, sizeof(target), "instr%lx:<src%u>",
+                                       PTRID(instr), src_idx);
+
+                       dump_instr(ctx, src->instr);
+                       dump_link(ctx, follow_flow(src)->instr, instr->block, target);
+               }
+       }
+}
+
+/* Dump 'block' as a "subgraph cluster" labelled 'name': a record node
+ * for the block's inputs, the instruction graph reachable from its
+ * outputs, a record node for the outputs and the edges into it.  For a
+ * block that has a parent, the producers of its inputs and the edges
+ * from them are emitted after the cluster has been closed.  Finally the
+ * edges that were deferred while dumping this block are written out.
+ */
+static void ir3_block_dump(struct ir3_dump_ctx *ctx,
+               struct ir3_block *block, const char *name)
+{
+       unsigned i, n;
+
+       /* remember where this block's deferred edges start: */
+       n = edge_buf.n;
+
+       fprintf(ctx->f, "subgraph cluster%lx {\n", PTRID(block));
+       fprintf(ctx->f, "label=\"%s\";\n", name);
+
+       /* draw inputs: */
+       fprintf(ctx->f, "input%lx [shape=record,label=\"inputs", PTRID(block));
+       for (i = 0; i < block->ninputs; i++)
+               if (block->inputs[i])
+                       fprintf(ctx->f, "|<in%u> i%u.%c", i, (i >> 2), "xyzw"[i & 0x3]);
+       fprintf(ctx->f, "\"];\n");
+
+       /* draw instruction graph:
+        *
+        * NOTE: there could be outputs that are never assigned, those
+        * must be skipped here as well
+        */
+       for (i = 0; i < block->noutputs; i++)
+               if (block->outputs[i])
+                       dump_instr(ctx, block->outputs[i]);
+
+       /* draw outputs: */
+       fprintf(ctx->f, "output%lx [shape=record,label=\"outputs", PTRID(block));
+       for (i = 0; i < block->noutputs; i++)
+               fprintf(ctx->f, "|<out%u> o%u.%c", i, (i >> 2), "xyzw"[i & 0x3]);
+       fprintf(ctx->f, "\"];\n");
+
+       /* and links to outputs: */
+       for (i = 0; i < block->noutputs; i++) {
+               char target[32];  /* link target */
+
+               /* NOTE: there could be outputs that are never assigned,
+                * so skip them
+                */
+               if (!block->outputs[i])
+                       continue;
+
+               snprintf(target, sizeof(target), "output%lx:<out%u>:e",
+                               PTRID(block), i);
+
+               dump_link(ctx, block->outputs[i], block, target);
+       }
+
+       fprintf(ctx->f, "}\n");
+
+       /* and links to inputs: */
+       if (block->parent) {
+               for (i = 0; i < block->ninputs; i++) {
+                       char target[32];  /* link target */
+
+                       if (!block->inputs[i])
+                               continue;
+
+                       dump_instr(ctx, block->inputs[i]);
+
+                       snprintf(target, sizeof(target), "input%lx:<in%u>:e",
+                                       PTRID(block), i);
+
+                       dump_link(ctx, block->inputs[i], block, target);
+               }
+       }
+
+       /* dump deferred edges:
+        *
+        * the precision (".*") limits the output to exactly the bytes that
+        * were added since this function was entered; stale text left
+        * behind the current end of the buffer is not printed
+        */
+       if (edge_buf.n > n) {
+               fprintf(ctx->f, "%.*s", (int)(edge_buf.n - n), &edge_buf.buf[n]);
+               edge_buf.n = n;
+       }
+}
+
+/* Write a complete "digraph" for 'block' (labelled 'name') to 'f'.  The
+ * marks of 'shader' are cleared first so that every instruction is
+ * dumped.  Verbose output is not enabled.
+ */
+void ir3_dump(struct ir3 *shader, const char *name,
+               struct ir3_block *block /* XXX maybe 'block' ptr should move to ir3? */,
+               FILE *f)
+{
+       struct ir3_dump_ctx ctx = {
+                       .f = f,
+                       .verbose = false,
+       };
+
+       ir3_clear_mark(shader);
+
+       /* graph header: */
+       fprintf(f, "digraph G {\n"
+                       "rankdir=RL;\n"
+                       "nodesep=0.25;\n"
+                       "ranksep=1.5;\n");
+
+       ir3_block_dump(&ctx, block, name);
+
+       fprintf(f, "}\n");
+}
+
+/*
+ * For Debugging:
+ */
+
+/* Print 'instr' on a single line to stdout in verbose form: its name
+ * followed by all of its registers (destination included), separated
+ * by commas.
+ */
+void
+ir3_dump_instr_single(struct ir3_instruction *instr)
+{
+       struct ir3_dump_ctx ctx = {
+                       .f = stdout,
+                       .verbose = true,
+       };
+       unsigned idx;
+
+       dump_instr_name(&ctx, instr);
+
+       for (idx = 0; idx < instr->regs_count; idx++) {
+               /* a blank before the first register, commas after that: */
+               if (idx == 0)
+                       printf(" ");
+               else
+                       printf(", ");
+               dump_reg_name(&ctx, instr->regs[idx]);
+       }
+
+       printf("\n");
+}
+
+/* Print every instruction of the ->next chained list starting at
+ * 'instr', followed by the number of non-meta instructions in it.
+ */
+void
+ir3_dump_instr_list(struct ir3_instruction *instr)
+{
+       struct ir3_instruction *cur;
+       unsigned count = 0;
+
+       for (cur = instr; cur; cur = cur->next) {
+               ir3_dump_instr_single(cur);
+
+               /* meta instructions are shown but not counted: */
+               if (is_meta(cur))
+                       continue;
+
+               count++;
+       }
+
+       printf("%u instructions\n", count);
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_flatten.c b/src/gallium/drivers/freedreno/ir3/ir3_flatten.c
new file mode 100644 (file)
index 0000000..9389227
--- /dev/null
@@ -0,0 +1,155 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include <stdarg.h>
+
+#include "ir3.h"
+
+/*
+ * Flatten: flatten out legs of if/else, etc
+ *
+ * TODO probably should use some heuristic to decide to not flatten
+ * if one side or the other is too large / deeply nested / whatever?
+ */
+
+/* State of one flatten pass. */
+struct ir3_flatten_ctx {
+       struct ir3_block *block;   /* block that every visited instruction is
+                                   * assigned to
+                                   */
+       unsigned cnt;              /* number of PHI instructions converted */
+};
+
+/* Look through meta:output and meta:flow: if 'reg' is an SSA register
+ * produced by one of these, return that instruction's first source
+ * register, or NULL if it has no source.  Any other register is
+ * returned as-is.
+ */
+static struct ir3_register *unwrap(struct ir3_register *reg)
+{
+       struct ir3_instruction *producer;
+
+       if (!(reg->flags & IR3_REG_SSA))
+               return reg;
+
+       producer = reg->instr;
+       if (!is_meta(producer))
+               return reg;
+
+       if ((producer->opc != OPC_META_OUTPUT) &&
+                       (producer->opc != OPC_META_FLOW))
+               return reg;
+
+       return (producer->regs_count > 1) ? producer->regs[1] : NULL;
+}
+
+/* Flatten 'instr' and, recursively, everything it depends on.
+ *
+ * Each visited instruction is re-assigned to ctx->block.  A PHI is
+ * rewritten in place into a sel.{b16,b32} (both values present) or a
+ * plain mov (only one value present), and ctx->cnt is incremented.  A
+ * meta:input with exactly two registers is rewritten into a mov.
+ * Instructions that are already marked are skipped.
+ */
+static void ir3_instr_flatten(struct ir3_flatten_ctx *ctx,
+               struct ir3_instruction *instr)
+{
+       unsigned i;
+
+       /* if we've already visited this instruction, bail now: */
+       if (ir3_instr_check_mark(instr))
+               return;
+
+       instr->block = ctx->block;
+
+       /* TODO: maybe some threshold to decide whether to
+        * flatten or not??
+        */
+       if (is_meta(instr)) {
+               if (instr->opc == OPC_META_PHI) {
+                       struct ir3_register *cond, *t, *f;
+
+                       /* NOTE(review): regs[1..3] are read without looking at
+                        * regs_count -- presumably a PHI always carries three
+                        * sources; confirm against the code that creates them.
+                        */
+                       cond = unwrap(instr->regs[1]);
+                       t    = unwrap(instr->regs[2]);  /* true val */
+                       f    = unwrap(instr->regs[3]);  /* false val */
+
+                       /* must have cond, but t or f may be null if only written
+                        * on one side of the if/else (in which case we can just
+                        * convert the PHI to a simple move).
+                        */
+                       assert(cond);
+                       assert(t || f);
+
+                       if (t && f) {
+                               /* convert the PHI instruction to sel.{b16,b32} */
+                               instr->category = 3;
+
+                               /* instruction type based on dst size: */
+                               if (instr->regs[0]->flags & IR3_REG_HALF)
+                                       instr->opc = OPC_SEL_B16;
+                               else
+                                       instr->opc = OPC_SEL_B32;
+
+                               /* the sources are reordered: the condition moves
+                                * from the first to the second source slot
+                                */
+                               instr->regs[1] = t;
+                               instr->regs[2] = cond;
+                               instr->regs[3] = f;
+                       } else {
+                               /* convert to simple mov: */
+                               /* NOTE(review): the type is F32 unconditionally
+                                * here, while the meta:input case below picks F16
+                                * for a half destination -- confirm this is
+                                * intended.
+                                */
+                               instr->category = 1;
+                               instr->cat1.dst_type = TYPE_F32;
+                               instr->cat1.src_type = TYPE_F32;
+                               instr->regs_count = 2;
+                               instr->regs[1] = t ? t : f;
+                       }
+
+                       ctx->cnt++;
+               } else if ((instr->opc == OPC_META_INPUT) &&
+                               (instr->regs_count == 2)) {
+                       type_t ftype;
+
+                       /* type based on dst size: */
+                       if (instr->regs[0]->flags & IR3_REG_HALF)
+                               ftype = TYPE_F16;
+                       else
+                               ftype = TYPE_F32;
+
+                       /* convert meta:input to mov: */
+                       instr->category = 1;
+                       instr->cat1.src_type = ftype;
+                       instr->cat1.dst_type = ftype;
+               }
+       }
+
+       /* recursively visit children: */
+       /* (this uses the sources as rewritten above, so for a former PHI
+        * the unwrapped registers are the ones that get followed)
+        */
+       for (i = 1; i < instr->regs_count; i++) {
+               struct ir3_register *src = instr->regs[i];
+               if (src->flags & IR3_REG_SSA)
+                       ir3_instr_flatten(ctx, src->instr);
+       }
+}
+
+/* Flatten everything reachable from the assigned outputs of 'block'.
+ *
+ * return >= 0 is # of phi's flattened, < 0 is error
+ */
+int ir3_block_flatten(struct ir3_block *block)
+{
+       struct ir3_flatten_ctx ctx = {
+                       .block = block,
+                       .cnt = 0,
+       };
+       unsigned idx;
+
+       ir3_clear_mark(block->shader);
+
+       for (idx = 0; idx < block->noutputs; idx++) {
+               struct ir3_instruction *out = block->outputs[idx];
+
+               /* outputs that were never assigned are skipped: */
+               if (!out)
+                       continue;
+
+               ir3_instr_flatten(&ctx, out);
+       }
+
+       return ctx.cnt;
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
new file mode 100644 (file)
index 0000000..b916dd5
--- /dev/null
@@ -0,0 +1,790 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+
+#include "ir3.h"
+#include "ir3_visitor.h"
+
+/*
+ * Register Assignment:
+ *
+ * NOTE: currently only works on a single basic block.. need to think
+ * about how multiple basic blocks are going to get scheduled.  But
+ * I think I want to re-arrange how blocks work, ie. get rid of the
+ * block nesting thing..
+ *
+ * NOTE: we could do register coalescing (eliminate moves) as part of
+ * the RA step.. OTOH I think we need to do scheduling before register
+ * assignment.  And removing a mov affects scheduling (unless we leave
+ * a placeholder nop, which seems lame), so I'm not really sure how
+ * practical it is to do both in a single stage.  But OTOH I'm not
+ * really sure of a sane way for the CP stage to realize when it
+ * cannot remove a mov due to multi-register constraints..
+ *
+ */
+
+/* State shared by the whole register assignment pass; one instance is
+ * set up on the stack by ir3_block_ra() for each call.
+ */
+struct ir3_ra_ctx {
+       struct ir3_block *block;    /* block being processed */
+       enum shader_t type;         /* compared against SHADER_FRAGMENT */
+       bool half_precision;        /* outputs get REG_HALF, see block_ra() */
+       bool frag_coord;            /* shifts output_base() (full precision) */
+       bool frag_face;             /* shifts output_base() (half precision);
+                                    * first input is placed in hr0.x */
+       bool has_samp;              /* set by legalize() when a tex instruction
+                                    * is emitted */
+       int cnt;                    /* NOTE(review): not referenced in this file */
+       bool error;                 /* set by ra_assign() on a conflicting
+                                    * assignment; makes block_ra() fail */
+};
+
+/* sorta ugly way to retrofit half-precision support.. rather than
+ * passing extra param around, just OR in a high bit.  All the low
+ * value arithmetic (ie. +/- offset within a contiguous vec4, etc)
+ * will continue to work as long as you don't underflow (and that
+ * would go badly anyways).
+ *
+ * The bit only travels with the 'num' values handed to ra_assign();
+ * ra_assign_reg() masks it back off and sets IR3_REG_HALF on the
+ * register instead.
+ */
+#define REG_HALF  0x8000
+
+/* Result of ra_calc(): where an instruction's dst sits within the
+ * contiguous register range that has to be allocated for it.
+ */
+struct ir3_ra_assignment {
+       int8_t  off;        /* offset of instruction dst within range */
+       uint8_t num;        /* number of components for the range */
+};
+
+static void ra_assign(struct ir3_ra_ctx *ctx,
+               struct ir3_instruction *assigner, int num);
+static struct ir3_ra_assignment ra_calc(struct ir3_instruction *instr);
+
+/*
+ * Register Allocation:
+ */
+
+/* Build a temporary ir3_register (compound literal) naming register
+ * 'n' with flags 'f'; 'wm' is the suffix of a TGSI_WRITEMASK_*
+ * constant (eg. X).  Used below to query/update a regmask_t without
+ * needing a register that belongs to a real instruction.
+ */
+#define REG(n, wm, f) (struct ir3_register){ \
+               .flags  = (f), \
+               .num    = (n), \
+               .wrmask = TGSI_WRITEMASK_ ## wm, \
+       }
+
+/* Return instr->regs[n] if that slot exists and reg_gpr() accepts it
+ * (ie. a GPR, not one of the special a0/p0 registers), else NULL.
+ */
+static struct ir3_register * reg_check(struct ir3_instruction *instr, unsigned n)
+{
+       if (n >= instr->regs_count)
+               return NULL;
+
+       if (!reg_gpr(instr->regs[n]))
+               return NULL;
+
+       return instr->regs[n];
+}
+
+/* First register number used for shader outputs.
+ *
+ * ugg, for fragment shader we need to have input at r0.x (or at least
+ * if there is a way to configure it, I can't see how because the blob
+ * driver always uses r0.x (ie. all zeros)), so the outputs are pushed
+ * back to leave room for the inputs.  Everything else starts at 0.
+ */
+static int output_base(struct ir3_ra_ctx *ctx)
+{
+       if (ctx->type != SHADER_FRAGMENT)
+               return 0;
+
+       if (ctx->half_precision)
+               return ctx->frag_face ? 4 : 3;
+
+       return ctx->frag_coord ? 8 : 4;
+}
+
+/* Compute the set of registers that are live after 'instr', where
+ * live means read before written.
+ *
+ * Walks the instructions following 'instr' in list order.  A src
+ * register is added to 'liveregs' unless an earlier (already assigned)
+ * instruction in that walk wrote it.  Finally the registers reserved
+ * for the block's outputs are added on the same terms.
+ *
+ * 'liveregs' is (re)initialized here; any previous contents are lost.
+ */
+static void compute_liveregs(struct ir3_ra_ctx *ctx,
+               struct ir3_instruction *instr, regmask_t *liveregs)
+{
+       struct ir3_block *block = instr->block;
+       regmask_t written;
+       unsigned i, j;
+
+       regmask_init(liveregs);
+       regmask_init(&written);
+
+       for (instr = instr->next; instr; instr = instr->next) {
+               struct ir3_register *r;
+
+               /* meta instructions are not considered here: */
+               if (is_meta(instr))
+                       continue;
+
+               /* check first src's read: */
+               for (j = 1; j < instr->regs_count; j++) {
+                       r = reg_check(instr, j);
+                       if (r)
+                               regmask_set_if_not(liveregs, r, &written);
+               }
+
+               /* then dst written (if assigned already): */
+               if (instr->flags & IR3_INSTR_MARK) {
+                       r = reg_check(instr, 0);
+                       if (r)
+                               regmask_set(&written, r);
+               }
+       }
+
+       /* be sure to account for output registers too: */
+       /* NOTE(review): these are marked with flags 0 (not IR3_REG_HALF)
+        * even though block_ra() assigns the outputs with REG_HALF when
+        * ctx->half_precision is set -- confirm whether the half-precision
+        * registers need to be marked here in that case.
+        */
+       for (i = 0; i < block->noutputs; i++) {
+               struct ir3_register reg = REG(output_base(ctx) + i, X, 0);
+               regmask_set_if_not(liveregs, &reg, &written);
+       }
+}
+
+/* calculate registers that are clobbered before last use of 'assigner'.
+ * This needs to be done backwards, although it could possibly be
+ * combined into compute_liveregs().  (Ie. compute_liveregs() could
+ * reverse the list, then do this part backwards reversing the list
+ * again back to original order.)  Otoh, probably I should try to
+ * construct a proper interference graph instead.
+ *
+ * The "backwards" walk is done by recursing to the end of the list
+ * first and doing the work while the recursion unwinds.  The return
+ * value is true if 'assigner' is used by 'instr' or by anything after
+ * it (including the block outputs); while that is the case, the dst
+ * of each already-assigned, non-meta instruction passed on the way
+ * back is added to 'liveregs'.
+ *
+ * XXX this needs to follow the same recursion path that is used to
+ * rename/assign registers (ie. ra_assign_src()).. this is a bit
+ * ugly right now, maybe refactor into node iterator sort of things
+ * that iterates nodes in the correct order?
+ */
+static bool compute_clobbers(struct ir3_ra_ctx *ctx,
+               struct ir3_instruction *instr, struct ir3_instruction *assigner,
+               regmask_t *liveregs)
+{
+       unsigned i;
+       bool live = false, was_live = false;
+
+       if (instr == NULL) {
+               struct ir3_block *block = ctx->block;
+
+               /* if at the end, check outputs: */
+               for (i = 0; i < block->noutputs; i++)
+                       if (block->outputs[i] == assigner)
+                               return true;
+               return false;
+       }
+
+       /* does this instruction consume the value produced by assigner? */
+       for (i = 1; i < instr->regs_count; i++) {
+               struct ir3_register *reg = instr->regs[i];
+               if ((reg->flags & IR3_REG_SSA) && (reg->instr == assigner)) {
+                       if (is_meta(instr)) {
+                               switch (instr->opc) {
+                               case OPC_META_INPUT:
+                                       // TODO
+                                       assert(0);
+                                       break;
+                               case OPC_META_FO:
+                               case OPC_META_FI:
+                                       /* the value lives on through the fanout/
+                                        * fanin, so follow the uses of the meta
+                                        * instruction itself as well:
+                                        */
+                                       was_live |= compute_clobbers(ctx, instr->next,
+                                                       instr, liveregs);
+                                       break;
+                               default:
+                                       break;
+                               }
+                       }
+                       live = true;
+                       break;
+               }
+       }
+
+       /* recurse to the end of the list before touching liveregs: */
+       was_live |= compute_clobbers(ctx, instr->next, assigner, liveregs);
+
+       /* assigner is still needed later on, so whatever this (already
+        * assigned) instruction writes must not be handed out:
+        */
+       if (was_live && (instr->regs_count > 0) &&
+                       (instr->flags & IR3_INSTR_MARK) &&
+                       !is_meta(instr))
+               regmask_set(liveregs, instr->regs[0]);
+
+       return live || was_live;
+}
+
+/* First-fit search: return the lowest register number that starts a
+ * run of 'size' consecutive registers not set in 'liveregs' (looking
+ * at the half-precision registers if 'half').  Asserts, and returns
+ * -1, if there is no such run.
+ */
+static int find_available(regmask_t *liveregs, int size, bool half)
+{
+       const unsigned flags = half ? IR3_REG_HALF : 0;
+       unsigned first = 0;
+
+       while (first < MAX_REG - size) {
+               unsigned next;
+
+               if (regmask_get(liveregs, &REG(first, X, flags))) {
+                       first++;
+                       continue;
+               }
+
+               /* 'first' is free, see how far the free run extends: */
+               next = first + 1;
+               while ((next < MAX_REG) && ((next - first) < size) &&
+                               !regmask_get(liveregs, &REG(next, X, flags)))
+                       next++;
+
+               if ((next - first) >= size)
+                       return first;
+
+               /* 'next' itself is known to be unusable, resume after it: */
+               first = next + 1;
+       }
+
+       assert(0);
+       return -1;
+}
+
+/* Pick the first register of a free range of 'size' consecutive
+ * registers for the dst of 'instr'.
+ *
+ * instr == NULL is the special case of allocating the shader outputs
+ * and always yields 0.  Otherwise the registers that are live after
+ * 'instr', plus those clobbered before its last use, are collected and
+ * find_available() searches what is left (in the half-precision
+ * registers if the dst has IR3_REG_HALF).
+ *
+ * Returns the register number, or the (negative) find_available()
+ * result if no range is free.
+ */
+static int alloc_block(struct ir3_ra_ctx *ctx,
+               struct ir3_instruction *instr, int size)
+{
+       struct ir3_register *dst;
+       regmask_t liveregs;
+
+       if (!instr) {
+               /* special case, allocating shader outputs.  At this
+                * point, nothing is allocated, just start the shader
+                * outputs at r0.x and let compute_liveregs() take
+                * care of the rest from here:
+                */
+               return 0;
+       }
+
+       dst = instr->regs[0];
+
+       compute_liveregs(ctx, instr, &liveregs);
+
+       // XXX XXX XXX XXX XXX XXX XXX XXX XXX
+       // XXX hack.. maybe ra_calc should give us a list of
+       // instrs to compute_clobbers() on?
+       if (is_meta(instr) && (instr->opc == OPC_META_INPUT) &&
+                       (instr->regs_count == 1)) {
+               unsigned i, base = instr->regs[0]->num & ~0x3;
+               for (i = 0; i < 4; i++) {
+                       struct ir3_instruction *in = NULL;
+
+                       /* inputs[] only has ninputs entries, so do not read
+                        * past it (same guard as ra_calc_dst_shader_input()):
+                        */
+                       if ((base + i) < ctx->block->ninputs)
+                               in = ctx->block->inputs[base + i];
+
+                       if (in)
+                               compute_clobbers(ctx, in->next, in, &liveregs);
+               }
+       } else {
+               // XXX XXX XXX XXX XXX XXX XXX XXX XXX
+               compute_clobbers(ctx, instr->next, instr, &liveregs);
+       }
+
+       return find_available(&liveregs, size,
+                       !!(dst->flags & IR3_REG_HALF));
+}
+
+/*
+ * Constraint Calculation:
+ */
+
+/* Visitor used by ra_calc() to accumulate the range constraint in 'a'.
+ * 'base' has to stay the first member: the helper below recovers the
+ * containing struct by casting the ir3_visitor pointer.
+ */
+struct ra_calc_visitor {
+       struct ir3_visitor base;
+       struct ir3_ra_assignment a;
+};
+
+/* downcast from the embedded base visitor to the containing struct: */
+static inline struct ra_calc_visitor *ra_calc_visitor(struct ir3_visitor *v)
+{
+       return (struct ra_calc_visitor *)v;
+}
+
+/* calculate register assignment for the instruction.  If the register
+ * written by this instruction is required to be part of a range, to
+ * handle other (input/output/sam/bary.f/etc) contiguous register range
+ * constraints, that is calculated here: a tex instruction's dst needs
+ * a range of four registers, anything else just one, and in both cases
+ * the dst is at the start of the range.
+ */
+static void ra_calc_dst(struct ir3_visitor *v,
+               struct ir3_instruction *instr, struct ir3_register *reg)
+{
+       struct ra_calc_visitor *calc = ra_calc_visitor(v);
+
+       calc->a.off = 0;
+       calc->a.num = is_tex(instr) ? 4 : 1;
+}
+
+/* dst of a shader input: the range covers the input's vec4 up to and
+ * including the highest component that is actually present in
+ * block->inputs[], and the dst sits at its own component offset.
+ */
+static void
+ra_calc_dst_shader_input(struct ir3_visitor *v,
+               struct ir3_instruction *instr, struct ir3_register *reg)
+{
+       struct ra_calc_visitor *calc = ra_calc_visitor(v);
+       struct ir3_block *blk = instr->block;
+       struct ir3_register *dst = instr->regs[0];
+       unsigned base = dst->num & ~0x3;
+       unsigned comp, num = 0;
+
+       assert(!(dst->flags & IR3_REG_IA));
+
+       /* check what input components we need, highest one first: */
+       for (comp = 4; comp > 0; comp--) {
+               unsigned idx = base + comp - 1;
+               if ((idx < blk->ninputs) && blk->inputs[idx]) {
+                       num = comp;
+                       break;
+               }
+       }
+
+       calc->a.off = dst->num - base;
+       calc->a.num = num;
+}
+
+/* The value is consumed as a src of a fanin: shift the range computed
+ * so far by the position of that src within the fanin, and make sure
+ * the range is at least as big as the fanin's number of srcs.
+ *
+ * NOTE(review): ir3_instr_regno() presumably returns the index of
+ * 'reg' within instr->regs[] (dst being index 0) -- confirm.
+ */
+static void ra_calc_src_fanin(struct ir3_visitor *v,
+               struct ir3_instruction *instr, struct ir3_register *reg)
+{
+       struct ra_calc_visitor *c = ra_calc_visitor(v);
+       unsigned srcn = ir3_instr_regno(instr, reg) - 1;
+       c->a.off += srcn;
+       c->a.num += srcn;
+       c->a.num = MAX2(c->a.num, instr->regs_count - 1);
+}
+
+/* Callbacks used by ra_calc().  Only shader-input dsts and fanin srcs
+ * get special treatment; the fanout/plain src callbacks are the
+ * generic ir3_visit_reg.
+ */
+static const struct ir3_visitor_funcs calc_visitor_funcs = {
+               .instr = ir3_visit_instr,
+               .dst_shader_input = ra_calc_dst_shader_input,
+               .dst_fanout = ra_calc_dst,
+               .dst_fanin = ra_calc_dst,
+               .dst = ra_calc_dst,
+               .src_fanout = ir3_visit_reg,
+               .src_fanin = ra_calc_src_fanin,
+               .src = ir3_visit_reg,
+};
+
+/* Run the constraint-calculation visitor over 'assigner' and return
+ * the resulting range (offset of the dst and number of components).
+ */
+static struct ir3_ra_assignment ra_calc(struct ir3_instruction *assigner)
+{
+       /* 'a' starts out zeroed, the callbacks fill it in: */
+       struct ra_calc_visitor calc = {
+                       .base = { .funcs = &calc_visitor_funcs },
+       };
+
+       ir3_visit_instr(&calc.base, assigner);
+
+       return calc.a;
+}
+
+/*
+ * Register Assignment:
+ */
+
+/* Visitor used by ra_assign().  'base' has to stay the first member
+ * since the helper below recovers the containing struct with a cast.
+ */
+struct ra_assign_visitor {
+       struct ir3_visitor base;
+       struct ir3_ra_ctx *ctx;
+       int num;        /* register number to assign, possibly | REG_HALF */
+};
+
+/* downcast from the embedded base visitor to the containing struct: */
+static inline struct ra_assign_visitor *ra_assign_visitor(struct ir3_visitor *v)
+{
+       return (struct ra_assign_visitor *)v;
+}
+
+/* Map a 32-bit type to its 16-bit counterpart.  16-bit types are
+ * passed through unchanged, since instructions may already be fixed
+ * up.  Any other type asserts (and yields ~0).
+ */
+static type_t half_type(type_t type)
+{
+       if (type == TYPE_F32)
+               return TYPE_F16;
+       if (type == TYPE_U32)
+               return TYPE_U16;
+       if (type == TYPE_S32)
+               return TYPE_S16;
+
+       if ((type == TYPE_F16) || (type == TYPE_U16) || (type == TYPE_S16))
+               return type;
+
+       assert(0);
+       return ~0;
+}
+
+/* some instructions encode the dst precision in the instruction
+ * itself, so they need fix-up if dst register is half precision.
+ * Categories other than 1, 3 and 5 are left untouched.
+ */
+static void fixup_half_instr_dst(struct ir3_instruction *instr)
+{
+       if (instr->category == 1) {
+               /* move instructions */
+               instr->cat1.dst_type = half_type(instr->cat1.dst_type);
+               return;
+       }
+
+       if (instr->category == 5) {
+               instr->cat5.type = half_type(instr->cat5.type);
+               return;
+       }
+
+       if (instr->category != 3)
+               return;
+
+       /* cat3: swap in the 16-bit flavour of the opcode */
+       switch (instr->opc) {
+       case OPC_MAD_F32: instr->opc = OPC_MAD_F16; break;
+       case OPC_SEL_B32: instr->opc = OPC_SEL_B16; break;
+       case OPC_SEL_S32: instr->opc = OPC_SEL_S16; break;
+       case OPC_SEL_F32: instr->opc = OPC_SEL_F16; break;
+       case OPC_SAD_S32: instr->opc = OPC_SAD_S16; break;
+       /* instructions may already be fixed up: */
+       case OPC_MAD_F16:
+       case OPC_SEL_B16:
+       case OPC_SEL_S16:
+       case OPC_SEL_F16:
+       case OPC_SAD_S16:
+               break;
+       default:
+               assert(0);
+               break;
+       }
+}
+/* some instructions need fix-up if src register is half precision;
+ * currently that is only the move instructions (category 1):
+ */
+static void fixup_half_instr_src(struct ir3_instruction *instr)
+{
+       if (instr->category == 1)
+               instr->cat1.src_type = half_type(instr->cat1.src_type);
+}
+
+/* Write the visitor's register number into 'reg': the SSA flag is
+ * dropped, and if the number carries REG_HALF the register is flagged
+ * IR3_REG_HALF and the instruction is patched up to match.  Registers
+ * of a kill instruction are left alone.
+ */
+static void ra_assign_reg(struct ir3_visitor *v,
+               struct ir3_instruction *instr, struct ir3_register *reg)
+{
+       struct ra_assign_visitor *a = ra_assign_visitor(v);
+       const bool half = !!(a->num & REG_HALF);
+
+       if (is_flow(instr) && (instr->opc == OPC_KILL))
+               return;
+
+       reg->flags &= ~IR3_REG_SSA;
+       reg->num = a->num & ~REG_HALF;
+
+       assert(reg->num >= 0);
+
+       if (!half)
+               return;
+
+       reg->flags |= IR3_REG_HALF;
+
+       /* if dst reg being assigned, patch up the instr: */
+       if (instr->regs[0] == reg)
+               fixup_half_instr_dst(instr);
+       else
+               fixup_half_instr_src(instr);
+}
+
+/* Assign the dst of a shader input, and then give the other inputs
+ * that share its vec4 the neighbouring registers so that the vec4
+ * ends up contiguous.
+ *
+ * 'reg->num' still holds the input index on entry; it is used to find
+ * the vec4 ('base') and this component's offset in it before
+ * ra_assign_reg() overwrites it with the assigned register.
+ */
+static void ra_assign_dst_shader_input(struct ir3_visitor *v,
+               struct ir3_instruction *instr, struct ir3_register *reg)
+{
+       struct ra_assign_visitor *a = ra_assign_visitor(v);
+       unsigned i, base = reg->num & ~0x3;
+       int off = base - reg->num;
+
+       ra_assign_reg(v, instr, reg);
+       reg->flags |= IR3_REG_IA;
+
+       /* trigger assignment of all our companion input components: */
+       for (i = 0; i < 4; i++) {
+               struct ir3_instruction *in = NULL;
+
+               /* inputs[] only has ninputs entries, so do not read past
+                * it (same guard as ra_calc_dst_shader_input()):
+                */
+               if ((base + i) < instr->block->ninputs)
+                       in = instr->block->inputs[base + i];
+
+               if (in && is_meta(in) && (in->opc == OPC_META_INPUT))
+                       ra_assign(a->ctx, in, a->num + off + i);
+       }
+}
+
+/* dst of a fanout: assign it, then propagate back to the instruction
+ * producing the fanout's src, which gets the register at the start of
+ * the range (ie. ours minus the fanout offset).
+ */
+static void ra_assign_dst_fanout(struct ir3_visitor *v,
+               struct ir3_instruction *instr, struct ir3_register *reg)
+{
+       struct ra_assign_visitor *a = ra_assign_visitor(v);
+       struct ir3_register *producer = instr->regs[1];
+
+       ra_assign_reg(v, instr, reg);
+
+       /* nothing to propagate to if the src is not (or no longer) SSA: */
+       if (!(producer->flags & IR3_REG_SSA))
+               return;
+
+       ra_assign(a->ctx, producer->instr, a->num - instr->fo.off);
+}
+
+/* src of a fanout: assign it, then assign the fanout instruction
+ * itself the register 'fo.off' further into the range.
+ */
+static void ra_assign_src_fanout(struct ir3_visitor *v,
+               struct ir3_instruction *instr, struct ir3_register *reg)
+{
+       struct ra_assign_visitor *a = ra_assign_visitor(v);
+       int fanout_num;
+
+       ra_assign_reg(v, instr, reg);
+
+       fanout_num = a->num + instr->fo.off;
+       ra_assign(a->ctx, instr, fanout_num);
+}
+
+
+/* src of a fanin: assign it, give the fanin instruction itself the
+ * first register of the range, and then hand every src that is still
+ * in SSA form its slot within that range.
+ */
+static void ra_assign_src_fanin(struct ir3_visitor *v,
+               struct ir3_instruction *instr, struct ir3_register *reg)
+{
+       struct ra_assign_visitor *a = ra_assign_visitor(v);
+       unsigned srcn = ir3_instr_regno(instr, reg) - 1;
+       unsigned idx;
+
+       ra_assign_reg(v, instr, reg);
+       ra_assign(a->ctx, instr, a->num - srcn);
+
+       for (idx = 1; idx < instr->regs_count; idx++) {
+               struct ir3_register *sibling = instr->regs[idx];
+
+               /* could be renamed already */
+               if (!(sibling->flags & IR3_REG_SSA))
+                       continue;
+
+               ra_assign(a->ctx, sibling->instr, a->num - srcn + idx - 1);
+       }
+}
+
+/* Callbacks used by ra_assign().  Plain registers and fanin dsts are
+ * simply assigned; the other cases additionally propagate the
+ * assignment to related instructions.
+ */
+static const struct ir3_visitor_funcs assign_visitor_funcs = {
+               .instr = ir3_visit_instr,
+               .dst_shader_input = ra_assign_dst_shader_input,
+               .dst_fanout = ra_assign_dst_fanout,
+               .dst_fanin = ra_assign_reg,
+               .dst = ra_assign_reg,
+               .src_fanout = ra_assign_src_fanout,
+               .src_fanin = ra_assign_src_fanin,
+               .src = ra_assign_reg,
+};
+
+/* Assign register 'num' (optionally OR'd with REG_HALF) to 'assigner'
+ * by running the assignment visitor over it.  An instruction is only
+ * assigned once; asking again with a different register is flagged in
+ * ctx->error.
+ */
+static void ra_assign(struct ir3_ra_ctx *ctx,
+               struct ir3_instruction *assigner, int num)
+{
+       struct ra_assign_visitor v = {
+                       .base.funcs = &assign_visitor_funcs,
+                       .ctx = ctx,
+                       .num = num,
+       };
+
+       if (!ir3_instr_check_mark(assigner)) {
+               /* first visit, go ahead and assign: */
+               ir3_visit_instr(&v.base, assigner);
+               return;
+       }
+
+       /* we've already visited this instruction, just cross-check: */
+       debug_assert(assigner->regs[0]->num == (num & ~REG_HALF));
+       if (assigner->regs[0]->num != (num & ~REG_HALF)) {
+               /* impossible situation, should have been resolved
+                * at an earlier stage by inserting extra mov's:
+                */
+               ctx->error = true;
+       }
+}
+
+/*
+ * Per-instruction entry point: work out which register the dst of
+ * 'instr' should get and kick off the (recursive) assignment.
+ *
+ * Nop's (no registers), already assigned instructions and instructions
+ * writing something other than a GPR or the address register are
+ * skipped.  If no register range can be found, ctx->error is set and
+ * nothing is assigned.
+ */
+
+static void ir3_instr_ra(struct ir3_ra_ctx *ctx,
+               struct ir3_instruction *instr)
+{
+       struct ir3_register *dst;
+       unsigned num;
+
+       /* skip over nop's */
+       if (instr->regs_count == 0)
+               return;
+
+       dst = instr->regs[0];
+
+       /* if we've already visited this instruction, bail now: */
+       if (instr->flags & IR3_INSTR_MARK)
+               return;
+
+       /* allocate register(s): */
+       if (is_addr(instr)) {
+               num = instr->regs[2]->num;
+       } else if (reg_gpr(dst)) {
+               struct ir3_ra_assignment a;
+               int base;
+
+               a = ra_calc(instr);
+               base = alloc_block(ctx, instr, a.num);
+               if (base < 0) {
+                       /* no free range; when asserts are compiled out
+                        * the -1 would otherwise be assigned as if it
+                        * were a register number:
+                        */
+                       ctx->error = true;
+                       return;
+               }
+               num = base + a.off;
+       } else if (dst->flags & IR3_REG_ADDR) {
+               dst->flags &= ~IR3_REG_ADDR;
+               num = regid(REG_A0, 0) | REG_HALF;
+       } else {
+               /* predicate register (p0).. etc */
+               return;
+       }
+
+       ra_assign(ctx, instr, num);
+}
+
+/* flatten into shader: */
+// XXX this should probably be somewhere else:
+/* Walks the block's instruction list in order, rebuilding
+ * shader->instrs[] without the meta instructions, and sets the
+ * synchronization flags as it goes:
+ *
+ *  - IR3_INSTR_SS on an instruction reading a register written by an
+ *    earlier sfu instruction, or writing a register that an earlier
+ *    tex/sfu instruction has as a src
+ *  - IR3_INSTR_SY on an instruction reading a register written by an
+ *    earlier tex instruction
+ *  - IR3_INSTR_UL on the last instruction with a relative src before
+ *    the address register is written again (or before the end)
+ *  - IR3_REG_EI on the dst of the last input instruction
+ *
+ * Consecutive nop's are merged using the repeat field, an end
+ * instruction is appended, and the first instruction always gets both
+ * IR3_INSTR_SS and IR3_INSTR_SY.  ctx->has_samp is set if any tex
+ * instruction was emitted.
+ */
+static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+{
+       struct ir3_instruction *n;
+       struct ir3 *shader = block->shader;
+       struct ir3_instruction *end =
+                       ir3_instr_create(block, 0, OPC_END);
+       struct ir3_instruction *last_input = NULL;
+       struct ir3_instruction *last_rel = NULL;
+       regmask_t needs_ss_war;       /* write after read */
+       regmask_t needs_ss;
+       regmask_t needs_sy;
+
+       regmask_init(&needs_ss_war);
+       regmask_init(&needs_ss);
+       regmask_init(&needs_sy);
+
+       /* start over; the array is refilled in list order below: */
+       shader->instrs_count = 0;
+
+       for (n = block->head; n; n = n->next) {
+               struct ir3_register *reg;
+               unsigned i;
+
+               if (is_meta(n))
+                       continue;
+
+               for (i = 1; i < n->regs_count; i++) {
+                       reg = n->regs[i];
+
+                       if (reg_gpr(reg)) {
+
+                               /* TODO: we probably only need (ss) for alu
+                                * instr consuming sfu result.. need to make
+                                * some tests for both this and (sy)..
+                                */
+                               if (regmask_get(&needs_ss, reg)) {
+                                       n->flags |= IR3_INSTR_SS;
+                                       regmask_init(&needs_ss);
+                               }
+
+                               if (regmask_get(&needs_sy, reg)) {
+                                       n->flags |= IR3_INSTR_SY;
+                                       regmask_init(&needs_sy);
+                               }
+                       }
+
+                       /* TODO: is it valid to have address reg loaded from a
+                        * relative src (ie. mova a0, c<a0.x+4>)?  If so, the
+                        * last_rel check below should be moved ahead of this:
+                        */
+                       if (reg->flags & IR3_REG_RELATIV)
+                               last_rel = n;
+               }
+
+               if (n->regs_count > 0) {
+                       reg = n->regs[0];
+                       if (regmask_get(&needs_ss_war, reg)) {
+                               n->flags |= IR3_INSTR_SS;
+                               regmask_init(&needs_ss_war); // ??? I assume?
+                       }
+
+                       /* a0 is about to be overwritten, so the last user of
+                        * the old value gets flagged:
+                        */
+                       if (last_rel && (reg->num == regid(REG_A0, 0))) {
+                               last_rel->flags |= IR3_INSTR_UL;
+                               last_rel = NULL;
+                       }
+               }
+
+               /* cat5+ does not have an (ss) bit, if needed we need to
+                * insert a nop to carry the sync flag.  Would be kinda
+                * clever if we were aware of this during scheduling, but
+                * this should be a pretty rare case:
+                */
+               /* NOTE(review): the nop is not stored into shader->instrs[]
+                * here, so ir3_instr_create() presumably appends it to that
+                * array itself -- confirm.
+                */
+               if ((n->flags & IR3_INSTR_SS) && (n->category >= 5)) {
+                       struct ir3_instruction *nop;
+                       nop = ir3_instr_create(block, 0, OPC_NOP);
+                       nop->flags |= IR3_INSTR_SS;
+                       n->flags &= ~IR3_INSTR_SS;
+               }
+
+               /* need to be able to set (ss) on first instruction: */
+               if ((shader->instrs_count == 0) && (n->category >= 5))
+                       ir3_instr_create(block, 0, OPC_NOP);
+
+               /* fold a nop into an immediately preceding nop, as long as
+                * that one's repeat count is still below 5:
+                */
+               if (is_nop(n) && shader->instrs_count) {
+                       struct ir3_instruction *last =
+                                       shader->instrs[shader->instrs_count-1];
+                       if (is_nop(last) && (last->repeat < 5)) {
+                               last->repeat++;
+                               last->flags |= n->flags;
+                               continue;
+                       }
+               }
+
+               shader->instrs[shader->instrs_count++] = n;
+
+               /* consumers of an sfu result need to sync: */
+               if (is_sfu(n))
+                       regmask_set(&needs_ss, n->regs[0]);
+
+               if (is_tex(n)) {
+                       /* this ends up being the # of samp instructions.. but that
+                        * is ok, everything else only cares whether it is zero or
+                        * not.  We do this here, rather than when we encounter a
+                        * SAMP decl, because (especially in binning pass shader)
+                        * the samp instruction(s) could get eliminated if the
+                        * result is not used.
+                        */
+                       ctx->has_samp = true;
+                       regmask_set(&needs_sy, n->regs[0]);
+               }
+
+               /* both tex/sfu appear to not always immediately consume
+                * their src register(s):
+                */
+               if (is_tex(n) || is_sfu(n)) {
+                       for (i = 1; i < n->regs_count; i++) {
+                               reg = n->regs[i];
+                               if (reg_gpr(reg))
+                                       regmask_set(&needs_ss_war, reg);
+                       }
+               }
+
+               if (is_input(n))
+                       last_input = n;
+       }
+
+       if (last_input)
+               last_input->regs[0]->flags |= IR3_REG_EI;
+
+       if (last_rel)
+               last_rel->flags |= IR3_INSTR_UL;
+
+       shader->instrs[shader->instrs_count++] = end;
+
+       /* instrs[] always has at least the end instruction at this point: */
+       shader->instrs[0]->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
+}
+
+/* Assign registers for everything in 'block' and then legalize it.
+ *
+ * For the top-level block (no parent) the fixed assignments are made
+ * first: the outputs, starting output_base() registers in, and then
+ * the inputs.  After that every instruction in the list gets its turn.
+ *
+ * Returns 0 on success, or -1 if a conflicting assignment was detected
+ * while walking the instruction list (in which case legalize() is not
+ * run).
+ */
+static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+{
+       struct ir3_instruction *n;
+
+       if (!block->parent) {
+               unsigned i, j;
+               int base, off = output_base(ctx);
+
+               /* with a NULL instr this is always 0, see alloc_block(): */
+               base = alloc_block(ctx, NULL, block->noutputs + off);
+
+               if (ctx->half_precision)
+                       base |= REG_HALF;
+
+               for (i = 0; i < block->noutputs; i++)
+                       if (block->outputs[i] && !is_kill(block->outputs[i]))
+                               ra_assign(ctx, block->outputs[i], base + i + off);
+
+               if (ctx->type == SHADER_FRAGMENT) {
+                       i = 0;
+                       if (ctx->frag_face) {
+                               /* if we have frag_face, it gets hr0.x */
+                               ra_assign(ctx, block->inputs[i], REG_HALF | 0);
+                               i += 4;
+                       }
+                       /* remaining inputs are packed into consecutive full
+                        * precision registers starting at 'base':
+                        */
+                       for (j = 0; i < block->ninputs; i++, j++)
+                               if (block->inputs[i])
+                                       ra_assign(ctx, block->inputs[i], (base & ~REG_HALF) + j);
+               } else {
+                       for (i = 0; i < block->ninputs; i++)
+                               if (block->inputs[i])
+                                       ir3_instr_ra(ctx, block->inputs[i]);
+               }
+       }
+
+       /* then loop over instruction list and assign registers:
+        */
+       n = block->head;
+       while (n) {
+               ir3_instr_ra(ctx, n);
+               if (ctx->error)
+                       return -1;
+               n = n->next;
+       }
+
+       legalize(ctx, block);
+
+       return 0;
+}
+
+/* Entry point of the register assignment pass.
+ *
+ * 'type', 'half_precision', 'frag_coord' and 'frag_face' describe the
+ * shader being compiled and influence where inputs/outputs are placed.
+ * On return '*has_samp' tells whether any tex instruction made it into
+ * the final shader.  Returns 0 on success, < 0 on error.
+ */
+int ir3_block_ra(struct ir3_block *block, enum shader_t type,
+               bool half_precision, bool frag_coord, bool frag_face,
+               bool *has_samp)
+{
+       struct ir3_ra_ctx ctx = {
+                       .block = block,
+                       .type = type,
+                       .half_precision = half_precision,
+                       .frag_coord = frag_coord,
+                       .frag_face = frag_face,
+                       .has_samp = false,
+                       .cnt = 0,
+                       .error = false,
+       };
+       int status;
+
+       /* the pass uses the mark flag to track what is assigned already: */
+       ir3_clear_mark(block->shader);
+
+       status = block_ra(&ctx, block);
+       *has_samp = ctx.has_samp;
+
+       return status;
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
new file mode 100644 (file)
index 0000000..3ef6773
--- /dev/null
@@ -0,0 +1,401 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width: 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+
+#include "util/u_math.h"
+
+#include "ir3.h"
+
+/* Negative status codes returned by trysched().  A positive return value
+ * from trysched() is instead a count of delay slots still required, and
+ * zero means the instruction was already scheduled.
+ */
+enum {
+       SCHEDULED = -1,   /* an instruction was newly scheduled */
+       DELAYED = -2,     /* write to a0.x/p0.x deferred, register still in use */
+};
+
+/*
+ * Instruction Scheduling:
+ *
+ * Using the depth sorted list from depth pass, attempt to recursively
+ * schedule deepest unscheduled path.  The first instruction that cannot
+ * be scheduled, returns the required delay slots it needs, at which
+ * point we return back up to the top and attempt to schedule by next
+ * highest depth.  After a sufficient number of instructions have been
+ * scheduled, return back to beginning of list and start again.  If you
+ * reach the end of depth sorted list without being able to insert any
+ * instruction, insert nop's.  Repeat until no more unscheduled
+ * instructions.
+ *
+ * There are a few special cases that need to be handled, since sched
+ * is currently independent of register allocation.  Usages of address
+ * register (a0.x) or predicate register (p0.x) must be serialized.  Ie.
+ * if you have two pairs of instructions that write the same special
+ * register and then read it, then those pairs cannot be interleaved.
+ * To solve this, when we are in such a scheduling "critical section",
+ * and we encounter a conflicting write to a special register, we try
+ * to schedule any remaining instructions that use that value first.
+ */
+
+/* Scheduler state for one block.
+ *
+ * 'scheduled' is built in reverse order (newest first) and flipped by
+ * reverse() at the end of block_sched().  'addr' and 'pred' are set by
+ * schedule() to the instruction that wrote a0.x / p0.x, and are cleared
+ * in block_sched_undelayed() once no remaining instruction uses them.
+ */
+struct ir3_sched_ctx {
+       struct ir3_instruction *scheduled; /* last scheduled instr */
+       struct ir3_instruction *addr;      /* current a0.x user, if any */
+       struct ir3_instruction *pred;      /* current p0.x user, if any */
+       unsigned cnt;                      /* # of instrs scheduled so far */
+};
+
+/* Pop the deepest remaining source out of srcs[].
+ *
+ * Returns the non-NULL entry with the greatest ->depth (the earliest
+ * such entry on ties) and clears its slot, so that repeated calls hand
+ * back the sources in order of decreasing depth.  Returns NULL once all
+ * slots are empty.
+ */
+static struct ir3_instruction *
+deepest(struct ir3_instruction **srcs, unsigned nsrcs)
+{
+       struct ir3_instruction *best = NULL;
+       unsigned best_idx = 0, n;
+
+       for (n = 0; n < nsrcs; n++) {
+               struct ir3_instruction *cand = srcs[n];
+
+               if (!cand)
+                       continue;
+
+               /* strictly-greater keeps the first entry on equal depth: */
+               if (!best || (cand->depth > best->depth)) {
+                       best = cand;
+                       best_idx = n;
+               }
+       }
+
+       if (best)
+               srcs[best_idx] = NULL;
+
+       return best;
+}
+
+/* Count the alu/flow instructions found while walking the scheduled
+ * list from its head (the most recently scheduled instruction) towards
+ * 'instr'.  The walk stops at instr, at the end of the list, or as soon
+ * as the count reaches 'maxd', so the result never exceeds maxd.
+ */
+static unsigned distance(struct ir3_sched_ctx *ctx,
+               struct ir3_instruction *instr, unsigned maxd)
+{
+       struct ir3_instruction *cur;
+       unsigned slots = 0;
+
+       for (cur = ctx->scheduled;
+                       cur && (cur != instr) && (slots < maxd);
+                       cur = cur->next) {
+               if (is_alu(cur) || is_flow(cur))
+                       slots++;
+       }
+
+       return slots;
+}
+
+/* Find the instruction preceding 'instr' on its block's singly linked
+ * list, by walking from block->head.  Returns NULL when no instruction
+ * on the list has instr as its ->next, ie. when instr is the head or is
+ * not on the list at all.
+ *
+ * TODO maybe we want double linked list?
+ */
+static struct ir3_instruction * prev(struct ir3_instruction *instr)
+{
+       struct ir3_instruction *cur;
+
+       for (cur = instr->block->head; cur; cur = cur->next)
+               if (cur->next == instr)
+                       break;
+
+       return cur;
+}
+
+/* Put 'instr' on the scheduled list.
+ *
+ * The scheduled list is built in reverse: instr becomes the new
+ * ctx->scheduled head and its ->next points at the previously scheduled
+ * instruction.  instr is flagged IR3_INSTR_MARK and ctx->cnt is bumped.
+ *
+ * If 'remove' is true, instr is first unlinked from the block's depth
+ * list (block->head); if it cannot be found there the function returns
+ * without scheduling it.  Callers in this file pass false only for
+ * freshly created nop's.
+ */
+static void schedule(struct ir3_sched_ctx *ctx,
+               struct ir3_instruction *instr, bool remove)
+{
+       struct ir3_block *block = instr->block;
+
+       /* maybe there is a better way to handle this than just stuffing
+        * a nop.. ideally we'd know about this constraint in the
+        * scheduling and depth calculation..
+        */
+       if (ctx->scheduled && is_sfu(ctx->scheduled) && is_sfu(instr))
+               schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);
+
+       /* remove from depth list:
+        */
+       if (remove) {
+               struct ir3_instruction *p = prev(instr);
+
+               /* NOTE: this can happen for inputs which are not
+                * read.. in that case there is no need to schedule
+                * the input, so just bail:
+                */
+               if (instr != (p ? p->next : block->head))
+                       return;
+
+               if (p)
+                       p->next = instr->next;
+               else
+                       block->head = instr->next;
+       }
+
+       /* remember the writer of a0.x so that trysched() can defer any
+        * conflicting write until the current value is no longer needed:
+        */
+       if (writes_addr(instr)) {
+               assert(ctx->addr == NULL);
+               ctx->addr = instr;
+       }
+
+       /* likewise for p0.x: */
+       if (writes_pred(instr)) {
+               assert(ctx->pred == NULL);
+               ctx->pred = instr;
+       }
+
+       instr->flags |= IR3_INSTR_MARK;
+
+       /* push onto the front of the (reversed) scheduled list: */
+       instr->next = ctx->scheduled;
+       ctx->scheduled = instr;
+
+       ctx->cnt++;
+}
+
+/*
+ * Delay-slot calculation.  Follows fanin/fanout.
+ */
+
+/* Number of delay slots still outstanding between 'assigner' and src
+ * #srcn of 'consumer'.
+ *
+ * Meta instructions are looked through: the result is the largest delay
+ * found by recursing into the meta's own SSA sources.  For any other
+ * instruction it is the ir3_delayslots() requirement minus the distance
+ * already covered on the scheduled list (distance() is capped at the
+ * requirement, so the subtraction cannot wrap).
+ */
+static unsigned delay_calc2(struct ir3_sched_ctx *ctx,
+               struct ir3_instruction *assigner,
+               struct ir3_instruction *consumer, unsigned srcn)
+{
+       unsigned worst = 0;
+       unsigned i;
+
+       if (!is_meta(assigner)) {
+               unsigned needed = ir3_delayslots(assigner, consumer, srcn);
+               return needed - distance(ctx, assigner, needed);
+       }
+
+       for (i = 1; i < assigner->regs_count; i++) {
+               struct ir3_register *reg = assigner->regs[i];
+               unsigned d;
+
+               if (!(reg->flags & IR3_REG_SSA))
+                       continue;
+
+               d = delay_calc2(ctx, reg->instr, consumer, srcn);
+               worst = MAX2(worst, d);
+       }
+
+       return worst;
+}
+
+/* Number of delay slots that must still pass before 'instr' can be
+ * scheduled: the maximum of delay_calc2() over all of its SSA sources.
+ * regs[0] is skipped; src register i is passed on as src index i - 1.
+ */
+static unsigned delay_calc(struct ir3_sched_ctx *ctx,
+               struct ir3_instruction *instr)
+{
+       unsigned worst = 0;
+       unsigned i;
+
+       for (i = 1; i < instr->regs_count; i++) {
+               struct ir3_register *reg = instr->regs[i];
+               unsigned d;
+
+               if (!(reg->flags & IR3_REG_SSA))
+                       continue;
+
+               d = delay_calc2(ctx, reg->instr, instr, i - 1);
+               worst = MAX2(worst, d);
+       }
+
+       return worst;
+}
+
+/* Try to schedule 'instr', recursively scheduling its sources first.
+ *
+ * Return value:
+ *   0          instr was already scheduled (IR3_INSTR_MARK set)
+ *   > 0        number of delay slots still needed before the blocking
+ *              instruction can be scheduled
+ *   SCHEDULED  an instruction (instr or one of its sources) was newly
+ *              scheduled
+ *   DELAYED    blocked on a write to a0.x/p0.x while the register is
+ *              still in use
+ *
+ * A negative return value signals that an instruction has been newly
+ * scheduled, return back up to the top of the stack (to block_sched())
+ */
+static int trysched(struct ir3_sched_ctx *ctx,
+               struct ir3_instruction *instr)
+{
+       struct ir3_instruction *srcs[ARRAY_SIZE(instr->regs) - 1];
+       struct ir3_instruction *src;
+       unsigned i, delay, nsrcs = 0;
+
+       /* if already scheduled: */
+       if (instr->flags & IR3_INSTR_MARK)
+               return 0;
+
+       /* figure out our src's: */
+       for (i = 1; i < instr->regs_count; i++) {
+               struct ir3_register *reg = instr->regs[i];
+               if (reg->flags & IR3_REG_SSA)
+                       srcs[nsrcs++] = reg->instr;
+       }
+
+       /* for each src register in sorted order:
+        *
+        * NOTE: 'delay' is unsigned, so a negative status code from the
+        * recursive call round-trips through unsigned before being
+        * returned as int.
+        */
+       delay = 0;
+       while ((src = deepest(srcs, nsrcs))) {
+               delay = trysched(ctx, src);
+               if (delay)
+                       return delay;
+       }
+
+       /* all our dependencies (srcs) are scheduled, figure out if
+        * we have enough delay slots to schedule ourself:
+        */
+       delay = delay_calc(ctx, instr);
+       if (delay)
+               return delay;
+
+       /* if this is a write to address/predicate register, and that
+        * register is currently in use, we need to defer until it is
+        * free:
+        */
+       if (writes_addr(instr) && ctx->addr) {
+               assert(ctx->addr != instr);
+               return DELAYED;
+       }
+       if (writes_pred(instr) && ctx->pred) {
+               assert(ctx->pred != instr);
+               return DELAYED;
+       }
+
+       schedule(ctx, instr, true);
+       return SCHEDULED;
+}
+
+/* Reverse a singly linked instruction list in place and return the new
+ * head (NULL for an empty list).
+ */
+static struct ir3_instruction * reverse(struct ir3_instruction *instr)
+{
+       struct ir3_instruction *head = NULL;
+       struct ir3_instruction *cur, *following;
+
+       for (cur = instr; cur; cur = following) {
+               following = cur->next;
+               cur->next = head;
+               head = cur;
+       }
+
+       return head;
+}
+
+/* Does 'instr' read the address register value written by ctx->addr?
+ *
+ * True if any SSA source of instr is an address instruction (is_addr())
+ * whose regs[1] points at the instruction recorded in ctx->addr.
+ */
+static bool uses_current_addr(struct ir3_sched_ctx *ctx,
+               struct ir3_instruction *instr)
+{
+       unsigned i;
+
+       for (i = 1; i < instr->regs_count; i++) {
+               struct ir3_register *reg = instr->regs[i];
+
+               if (!(reg->flags & IR3_REG_SSA))
+                       continue;
+               if (!is_addr(reg->instr))
+                       continue;
+
+               /* regs[1] of the address instr refers to the mova: */
+               if (reg->instr->regs[1]->instr == ctx->addr)
+                       return true;
+       }
+
+       return false;
+}
+
+/* Does 'instr' read the predicate register value written by ctx->pred?
+ *
+ * True if any SSA source of instr is the instruction recorded in
+ * ctx->pred.
+ */
+static bool uses_current_pred(struct ir3_sched_ctx *ctx,
+               struct ir3_instruction *instr)
+{
+       unsigned i;
+
+       for (i = 1; i < instr->regs_count; i++) {
+               struct ir3_register *reg = instr->regs[i];
+
+               if (!(reg->flags & IR3_REG_SSA))
+                       continue;
+               if (reg->instr == ctx->pred)
+                       return true;
+       }
+
+       return false;
+}
+
+/* when we encounter an instruction that writes to the address (or
+ * predicate) register when it is in use, we delay that instruction and
+ * try to schedule all other instructions using the current address
+ * (or predicate) register value.
+ *
+ * Walks the whole depth list and calls trysched() on every instruction
+ * that uses the current a0.x/p0.x value.  If no remaining instruction
+ * uses a0.x (resp. p0.x), ctx->addr (resp. ctx->pred) is cleared so the
+ * delayed write can be scheduled later.
+ *
+ * Returns 0 if any instruction was scheduled, otherwise the smallest
+ * positive delay reported by trysched().  If neither happened 'cnt'
+ * keeps its initial ~0, which comes back negative through the int
+ * return type and is clamped to 0 by block_sched().
+ */
+static int block_sched_undelayed(struct ir3_sched_ctx *ctx,
+               struct ir3_block *block)
+{
+       struct ir3_instruction *instr = block->head;
+       bool addr_in_use = false;
+       bool pred_in_use = false;
+       unsigned cnt = ~0;
+
+       while (instr) {
+               /* grab next first, trysched() may unlink instr from this list: */
+               struct ir3_instruction *next = instr->next;
+               bool addr = uses_current_addr(ctx, instr);
+               bool pred = uses_current_pred(ctx, instr);
+
+               if (addr || pred) {
+                       int ret = trysched(ctx, instr);
+                       if (ret == SCHEDULED)
+                               cnt = 0;
+                       else if (ret > 0)
+                               cnt = MIN2(cnt, ret);
+                       if (addr)
+                               addr_in_use = true;
+                       if (pred)
+                               pred_in_use = true;
+               }
+
+               instr = next;
+       }
+
+       if (!addr_in_use)
+               ctx->addr = NULL;
+
+       if (!pred_in_use)
+               ctx->pred = NULL;
+
+       return cnt;
+}
+
+/* Schedule all instructions of 'block'.
+ *
+ * Repeatedly takes the head of the depth list (block->head) and tries to
+ * schedule it via trysched().  When that reports outstanding delay
+ * slots, the following (shallower) instructions are tried, and any
+ * slots still unfilled afterwards are padded with nop's.  The loop ends
+ * when the depth list is empty; block->head is then replaced with the
+ * scheduled list, put back into program order.
+ */
+static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+{
+       struct ir3_instruction *instr;
+
+       /* schedule all the shader input's (meta-instr) first so that
+        * the RA step sees that the input registers contain a value
+        * from the start of the shader:
+        */
+       if (!block->parent) {
+               unsigned i;
+               for (i = 0; i < block->ninputs; i++) {
+                       struct ir3_instruction *in = block->inputs[i];
+                       if (in)
+                               schedule(ctx, in, true);
+               }
+       }
+
+       while ((instr = block->head)) {
+               /* NOTE: always grab next *before* trysched(), in case the
+                * instruction is actually scheduled (and therefore moved
+                * from depth list into scheduled list)
+                */
+               struct ir3_instruction *next = instr->next;
+               int cnt = trysched(ctx, instr);
+
+               if (cnt == DELAYED)
+                       cnt = block_sched_undelayed(ctx, block);
+
+               /* -1 is signal to return up stack, but to us means same as 0: */
+               cnt = MAX2(0, cnt);
+               /* from here on cnt is the target value of ctx->cnt, ie. the
+                * total # of scheduled instrs at which the delay is satisfied:
+                */
+               cnt += ctx->cnt;
+               instr = next;
+
+               /* if deepest remaining instruction cannot be scheduled, try
+                * the increasingly more shallow instructions until needed
+                * number of delay slots is filled:
+                */
+               while (instr && (cnt > ctx->cnt)) {
+                       next = instr->next;
+                       trysched(ctx, instr);
+                       instr = next;
+               }
+
+               /* and if we run out of instructions that can be scheduled,
+                * then it is time for nop's:
+                */
+               while (cnt > ctx->cnt)
+                       schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);
+       }
+
+       /* at this point, scheduled list is in reverse order, so fix that: */
+       block->head = reverse(ctx->scheduled);
+}
+
+/* Entry point of the scheduling pass: schedule the instructions of
+ * 'block', starting from an empty scheduler context and after calling
+ * ir3_clear_mark() on the owning shader.
+ */
+void ir3_block_sched(struct ir3_block *block)
+{
+       struct ir3_sched_ctx sched = {
+                       .scheduled = NULL,
+                       .addr = NULL,
+                       .pred = NULL,
+                       .cnt = 0,
+       };
+
+       ir3_clear_mark(block->shader);
+       block_sched(&sched, block);
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
new file mode 100644 (file)
index 0000000..ddf99db
--- /dev/null
@@ -0,0 +1,211 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width: 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include "pipe/p_state.h"
+#include "util/u_string.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_parse.h"
+
+#include "freedreno_context.h"
+#include "freedreno_lowering.h"
+#include "freedreno_util.h"
+
+#include "ir3_shader.h"
+#include "ir3_compiler.h"
+
+
+/* Free a shader variant and everything it owns.
+ *
+ * This is also the cleanup path for create_variant() failures, where the
+ * variant may be only partially constructed: v->ir and/or v->bo can
+ * still be NULL (the struct is calloc'd), so only release what actually
+ * exists.
+ */
+static void
+delete_variant(struct ir3_shader_variant *v)
+{
+       if (v->ir)
+               ir3_destroy(v->ir);
+       if (v->bo)
+               fd_bo_del(v->bo);
+       free(v);
+}
+
+/* Assemble the variant's ir into a binary and upload it into a newly
+ * allocated buffer object.
+ *
+ * On success v->bo holds the shader binary and v->instrlen / v->constlen
+ * are filled in from v->info.  On any failure (assembler returned NULL,
+ * bo allocation failed, or the bo could not be mapped) v->bo is left
+ * NULL, which is what create_variant() checks to detect the error.
+ */
+static void
+assemble_variant(struct ir3_shader_variant *v)
+{
+       struct fd_context *ctx = fd_context(v->shader->pctx);
+       uint32_t sz, *bin;
+       void *map;
+
+       bin = ir3_assemble(v->ir, &v->info);
+       if (!bin)
+               return;
+
+       sz = v->info.sizedwords * 4;
+
+       v->bo = fd_bo_new(ctx->dev, sz,
+                       DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
+                       DRM_FREEDRENO_GEM_TYPE_KMEM);
+       if (!v->bo) {
+               free(bin);
+               return;
+       }
+
+       map = fd_bo_map(v->bo);
+       if (!map) {
+               /* can't upload the binary, so don't keep a useless bo around: */
+               fd_bo_del(v->bo);
+               v->bo = NULL;
+               free(bin);
+               return;
+       }
+
+       memcpy(map, bin, sz);
+       free(bin);
+
+       /* instrlen is in units of instruction groups (8 dwords each): */
+       v->instrlen = v->info.sizedwords / 8;
+       v->constlen = v->info.max_const + 1;
+}
+
+/* Vertex shader inputs are loaded into registers before the shader runs,
+ * so the max_reg computed from the shader's instructions may not cover
+ * every register actually used.  Bump v->info.max_reg so it accounts for
+ * all enabled inputs (non-zero compmask) and all outputs.
+ */
+static void
+fixup_vp_regfootprint(struct ir3_shader_variant *v)
+{
+       unsigned i;
+
+       for (i = 0; i < v->inputs_count; i++) {
+               uint32_t regid;
+
+               if (!v->inputs[i].compmask)
+                       continue;
+
+               regid = (v->inputs[i].regid + 3) >> 2;
+               v->info.max_reg = MAX2(v->info.max_reg, regid);
+       }
+
+       for (i = 0; i < v->outputs_count; i++) {
+               uint32_t regid;
+
+               regid = (v->outputs[i].regid + 3) >> 2;
+               v->info.max_reg = MAX2(v->info.max_reg, regid);
+       }
+}
+
+/* Compile and assemble a new variant of 'shader' for the given key.
+ *
+ * Unless FD_DBG_NOOPT is set, the new compiler (ir3_compile_shader()) is
+ * tried first; if it fails, or if it was skipped, the old compiler
+ * (ir3_compile_shader_old()) is used instead.  The result is assembled
+ * into a buffer object by assemble_variant().
+ *
+ * Returns the new variant, which the caller is expected to link into
+ * shader->variants, or NULL on allocation, compile or assemble failure.
+ */
+static struct ir3_shader_variant *
+create_variant(struct ir3_shader *shader, struct ir3_shader_key key)
+{
+       struct ir3_shader_variant *v = CALLOC_STRUCT(ir3_shader_variant);
+       const struct tgsi_token *tokens = shader->tokens;
+       int ret;
+
+       if (!v)
+               return NULL;
+
+       v->shader = shader;
+       v->key = key;
+       v->type = shader->type;
+
+       if (fd_mesa_debug & FD_DBG_DISASM) {
+               DBG("dump tgsi: type=%d, k={bp=%u,cts=%u,hp=%u}", shader->type,
+                       key.binning_pass, key.color_two_side, key.half_precision);
+               tgsi_dump(tokens, 0);
+       }
+
+       if (!(fd_mesa_debug & FD_DBG_NOOPT)) {
+               ret = ir3_compile_shader(v, tokens, key);
+               if (ret) {
+                       debug_error("new compiler failed, trying fallback!");
+
+                       /* reset state the failed attempt may have left in the
+                        * variant before handing it to the old compiler:
+                        */
+                       v->inputs_count = 0;
+                       v->outputs_count = 0;
+                       v->total_in = 0;
+                       v->has_samp = false;
+                       v->immediates_count = 0;
+               }
+       } else {
+               ret = -1;  /* force fallback to old compiler */
+       }
+
+       if (ret)
+               ret = ir3_compile_shader_old(v, tokens, key);
+
+       if (ret) {
+               debug_error("compile failed!");
+               goto fail;
+       }
+
+       /* assemble_variant() has no return value; failure shows as !v->bo: */
+       assemble_variant(v);
+       if (!v->bo) {
+               debug_error("assemble failed!");
+               goto fail;
+       }
+
+       if (shader->type == SHADER_VERTEX)
+               fixup_vp_regfootprint(v);
+
+       if (fd_mesa_debug & FD_DBG_DISASM) {
+               DBG("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type,
+                       key.binning_pass, key.color_two_side, key.half_precision);
+               disasm_a3xx(fd_bo_map(v->bo), v->info.sizedwords, 0, v->type);
+       }
+
+       return v;
+
+fail:
+       delete_variant(v);
+       return NULL;
+}
+
+/* Look up (or compile on first use) the variant of 'shader' matching
+ * 'key'.
+ *
+ * Key fields that do not apply to the shader's stage are cleared first,
+ * so that otherwise identical variants are not compiled twice.  A newly
+ * created variant is linked at the head of shader->variants.
+ *
+ * Returns NULL if no matching variant exists and a new one could not be
+ * created; in that case the variant list is left untouched.
+ */
+struct ir3_shader_variant *
+ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key)
+{
+       struct ir3_shader_variant *v;
+
+       /* some shader key values only apply to vertex or frag shader,
+        * so normalize the key to avoid constructing multiple identical
+        * variants:
+        */
+       if (shader->type == SHADER_FRAGMENT) {
+               key.binning_pass = false;
+       }
+       if (shader->type == SHADER_VERTEX) {
+               key.color_two_side = false;
+               key.half_precision = false;
+       }
+
+       for (v = shader->variants; v; v = v->next)
+               if (!memcmp(&key, &v->key, sizeof(key)))
+                       return v;
+
+       /* compile new variant if it doesn't exist already: */
+       v = create_variant(shader, key);
+       if (v) {
+               /* create_variant() returns NULL on failure, only link on success: */
+               v->next = shader->variants;
+               shader->variants = v;
+       }
+
+       return v;
+}
+
+
+/* Destroy 'shader': delete every variant on its list, then free its
+ * copy of the TGSI tokens and the shader object itself.
+ */
+void
+ir3_shader_destroy(struct ir3_shader *shader)
+{
+       struct ir3_shader_variant *v = shader->variants;
+
+       while (v) {
+               struct ir3_shader_variant *dead = v;
+
+               /* advance before the node is freed: */
+               v = v->next;
+               delete_variant(dead);
+       }
+
+       free((void *)shader->tokens);
+       free(shader);
+}
+
+/* Create a shader object of the given type for 'pctx'.
+ *
+ * The TGSI tokens are duplicated, so the caller keeps ownership of
+ * 'tokens'.  No variant is compiled here; that happens on demand in
+ * ir3_shader_variant().
+ *
+ * Returns NULL if the shader object or its token copy could not be
+ * allocated.  Free the result with ir3_shader_destroy().
+ */
+struct ir3_shader *
+ir3_shader_create(struct pipe_context *pctx, const struct tgsi_token *tokens,
+               enum shader_t type)
+{
+       struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader);
+
+       if (!shader)
+               return NULL;
+
+       shader->pctx = pctx;
+       shader->type = type;
+       shader->tokens = tgsi_dup_tokens(tokens);
+       if (!shader->tokens) {
+               /* a shader without tokens can never be compiled: */
+               free(shader);
+               return NULL;
+       }
+
+       return shader;
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
new file mode 100644 (file)
index 0000000..1a91fcb
--- /dev/null
@@ -0,0 +1,163 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width: 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#ifndef IR3_SHADER_H_
+#define IR3_SHADER_H_
+
+#include "ir3.h"
+#include "disasm.h"
+
+typedef uint16_t ir3_semantic;  /* semantic name + index */
+
+/* Pack a semantic name (high byte) and index (low byte) into an
+ * ir3_semantic.  Only the low 8 bits of 'index' are kept.
+ */
+static inline ir3_semantic
+ir3_semantic_name(uint8_t name, uint16_t index)
+{
+       const unsigned hi = name << 8;
+       const unsigned lo = index & 0xff;
+
+       return hi | lo;
+}
+
+/* Extract the semantic name (high byte) from a packed ir3_semantic. */
+static inline uint8_t sem2name(ir3_semantic sem)
+{
+       const unsigned name = sem >> 8;
+
+       return name;
+}
+
+/* Extract the semantic index (low byte) from a packed ir3_semantic. */
+static inline uint16_t sem2idx(ir3_semantic sem)
+{
+       const unsigned idx = sem & 0xff;
+
+       return idx;
+}
+
+/* Configuration key used to identify a shader variant.. different
+ * shader variants can be used to implement features not supported
+ * in hw (two sided color), binning-pass vertex shader, etc.
+ *
+ * NOTE: ir3_shader_variant() compares keys with memcmp() over the whole
+ * struct, so the unused bits of the underlying storage take part in the
+ * comparison too.
+ * NOTE(review): callers presumably need to zero-initialize the key
+ * before setting fields -- confirm at the call sites.
+ */
+struct ir3_shader_key {
+       /* vertex shader variant parameters: */
+       unsigned binning_pass : 1;
+
+       /* fragment shader variant parameters: */
+       unsigned color_two_side : 1;
+       unsigned half_precision : 1;
+};
+
+/* One compiled and assembled version of an ir3_shader, specialized for
+ * a particular ir3_shader_key.  Variants are created on demand and kept
+ * on the owning shader's 'variants' list.
+ */
+struct ir3_shader_variant {
+       /* buffer object holding the assembled shader binary: */
+       struct fd_bo *bo;
+
+       /* the (normalized) key this variant was compiled for: */
+       struct ir3_shader_key key;
+
+       /* 'info' is filled in when 'ir' is assembled: */
+       struct ir3_info info;
+       struct ir3 *ir;
+
+       /* the instructions length is in units of instruction groups
+        * (4 instructions, 8 dwords):
+        */
+       unsigned instrlen;
+
+       /* the constants length is in units of vec4's, and is the sum of
+        * the uniforms and the built-in compiler constants
+        */
+       unsigned constlen;
+
+       /* About Linkage:
+        *   + Let the frag shader determine the position/compmask for the
+        *     varyings, since it is the place where we know if the varying
+        *     is actually used, and if so, which components are used.  So
+        *     what the hw calls "outloc" is taken from the "inloc" of the
+        *     frag shader.
+        *   + From the vert shader, we only need the output regid
+        */
+
+       /* for frag shader, pos_regid holds the frag_pos, ie. what is passed
+        * to bary.f instructions
+        */
+       uint8_t pos_regid;
+       bool frag_coord, frag_face;
+
+       /* varyings/outputs: */
+       unsigned outputs_count;
+       struct {
+               ir3_semantic semantic;
+               uint8_t regid;
+       } outputs[16 + 2];  /* +POSITION +PSIZE */
+       bool writes_pos, writes_psize;
+
+       /* vertices/inputs: */
+       unsigned inputs_count;
+       struct {
+               ir3_semantic semantic;
+               uint8_t regid;
+               uint8_t compmask;
+               uint8_t ncomp;
+               /* in theory inloc of fs should match outloc of vs: */
+               uint8_t inloc;
+               uint8_t bary;
+       } inputs[16 + 2];  /* +POSITION +FACE */
+
+       unsigned total_in;       /* sum of inputs (scalar) */
+
+       /* do we have one or more texture sample instructions: */
+       bool has_samp;
+
+       /* const reg # of first immediate, ie. 1 == c1
+        * (not regid, because TGSI thinks in terms of vec4 registers,
+        * not scalar registers)
+        */
+       unsigned first_immediate;
+       unsigned immediates_count;
+       struct {
+               uint32_t val[4];
+       } immediates[64];
+
+       /* shader variants form a linked list: */
+       struct ir3_shader_variant *next;
+
+       /* replicated here to avoid passing extra ptrs everywhere: */
+       enum shader_t type;
+       struct ir3_shader *shader;
+};
+
+/* A shader as created from TGSI, together with the list of variants
+ * that have been compiled for it so far.
+ */
+struct ir3_shader {
+       enum shader_t type;
+
+       struct pipe_context *pctx;
+       /* private copy of the TGSI tokens, freed in ir3_shader_destroy(): */
+       const struct tgsi_token *tokens;
+
+       /* head of the singly linked list of compiled variants: */
+       struct ir3_shader_variant *variants;
+
+       /* so far, only used for blit_prog shader.. values for
+        * VPC_VARYING_INTERP[i].MODE and VPC_VARYING_PS_REPL[i].MODE
+        */
+       uint32_t vinterp[4], vpsrepl[4];
+};
+
+
+/* Create a shader of the given type from TGSI tokens (the tokens are
+ * duplicated), and destroy it again together with all of its variants:
+ */
+struct ir3_shader * ir3_shader_create(struct pipe_context *pctx,
+               const struct tgsi_token *tokens, enum shader_t type);
+void ir3_shader_destroy(struct ir3_shader *shader);
+
+/* Return the variant of 'shader' matching 'key', compiling it first if
+ * it does not exist yet:
+ */
+struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader,
+               struct ir3_shader_key key);
+
+#endif /* IR3_SHADER_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_visitor.h b/src/gallium/drivers/freedreno/ir3/ir3_visitor.h
new file mode 100644 (file)
index 0000000..1c60d16
--- /dev/null
@@ -0,0 +1,154 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width: 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#ifndef IR3_VISITOR_H_
+#define IR3_VISITOR_H_
+
+/**
+ * Visitor which follows dst to src relationships between instructions,
+ * first visiting the dst (writer) instruction, followed by src (reader)
+ * instruction(s).
+ *
+ * TODO maybe we want multiple different visitors to walk the
+ * graph in different ways?
+ */
+
+struct ir3_visitor;
+
+/* callback invoked with an instruction: */
+typedef void (*ir3_visit_instr_func)(struct ir3_visitor *v,
+               struct ir3_instruction *instr);
+
+/* callback invoked with an instruction and one of its registers (the
+ * dst register, or the src register which reads the visited value):
+ */
+typedef void (*ir3_visit_reg_func)(struct ir3_visitor *v,
+               struct ir3_instruction *instr, struct ir3_register *reg);
+
+/* Table of callbacks used by ir3_visit_instr().  Which entry is called
+ * depends on the kind of instruction the register belongs to; the
+ * dispatch code calls the selected entry without a NULL check.
+ */
+struct ir3_visitor_funcs {
+       ir3_visit_instr_func instr;  // TODO do we need??
+
+       /* dst (regs[0]) of the instruction being visited: */
+       ir3_visit_reg_func dst_shader_input;  /* OPC_META_INPUT, regs_count == 1 */
+       ir3_visit_reg_func dst_block_input;   /* OPC_META_INPUT otherwise */
+       ir3_visit_reg_func dst_fanout;        /* OPC_META_FO */
+       ir3_visit_reg_func dst_fanin;         /* OPC_META_FI */
+       ir3_visit_reg_func dst;               /* everything else */
+
+       /* src register of a later instruction reading the visited value: */
+       ir3_visit_reg_func src_block_input;   /* OPC_META_INPUT */
+       ir3_visit_reg_func src_fanout;        /* OPC_META_FO */
+       ir3_visit_reg_func src_fanin;         /* OPC_META_FI */
+       ir3_visit_reg_func src;               /* everything else */
+};
+
+/* Visitor instance: the callback table plus an error flag. */
+struct ir3_visitor {
+       const struct ir3_visitor_funcs *funcs;
+       /* once set, ir3_visit_instr() stops walking further instructions: */
+       bool error;
+};
+
+#include "util/u_debug.h"
+
+/* Dispatch the dst register (regs[0]) of 'instr' to the matching dst_*
+ * callback: input/fanout/fanin meta instructions get their dedicated
+ * callbacks, everything else goes to funcs->dst.
+ */
+static void visit_instr_dst(struct ir3_visitor *v,
+               struct ir3_instruction *instr)
+{
+       struct ir3_register *dst = instr->regs[0];
+       ir3_visit_reg_func cb = v->funcs->dst;
+
+       if (is_meta(instr)) {
+               switch (instr->opc) {
+               case OPC_META_INPUT:
+                       /* an input with no src register is a shader input: */
+                       if (instr->regs_count == 1)
+                               cb = v->funcs->dst_shader_input;
+                       else
+                               cb = v->funcs->dst_block_input;
+                       break;
+               case OPC_META_FO:
+                       cb = v->funcs->dst_fanout;
+                       break;
+               case OPC_META_FI:
+                       cb = v->funcs->dst_fanin;
+                       break;
+               default:
+                       break;
+               }
+       }
+
+       cb(v, instr, dst);
+}
+
+/* Dispatch src register 'reg' of 'instr' to the matching src_* callback:
+ * input/fanout/fanin meta instructions get their dedicated callbacks,
+ * everything else goes to funcs->src.
+ */
+static void visit_instr_src(struct ir3_visitor *v,
+               struct ir3_instruction *instr, struct ir3_register *reg)
+{
+       ir3_visit_reg_func cb = v->funcs->src;
+
+       if (is_meta(instr)) {
+               switch (instr->opc) {
+               case OPC_META_INPUT:
+                       /* shader-input does not have a src, only block input: */
+                       debug_assert(instr->regs_count == 2);
+                       cb = v->funcs->src_block_input;
+                       break;
+               case OPC_META_FO:
+                       cb = v->funcs->src_fanout;
+                       break;
+               case OPC_META_FI:
+                       cb = v->funcs->src_fanin;
+                       break;
+               default:
+                       break;
+               }
+       }
+
+       cb(v, instr, reg);
+}
+
+/* Visit 'instr' as the writer of a value, then every src register of a
+ * following instruction (via ->next) that reads that value through an
+ * SSA reference.  The walk over following instructions stops early once
+ * v->error is set.
+ */
+static void ir3_visit_instr(struct ir3_visitor *v,
+               struct ir3_instruction *instr)
+{
+       struct ir3_instruction *reader;
+
+       /* visit instruction that assigns value: */
+       if (instr->regs_count > 0)
+               visit_instr_dst(v, instr);
+
+       /* and of any following instructions which read that value: */
+       for (reader = instr->next; reader && !v->error; reader = reader->next) {
+               unsigned s;
+
+               for (s = 1; s < reader->regs_count; s++) {
+                       struct ir3_register *reg = reader->regs[s];
+
+                       if (!(reg->flags & IR3_REG_SSA))
+                               continue;
+                       if (reg->instr == instr)
+                               visit_instr_src(v, reader, reg);
+               }
+       }
+}
+
+/* Register callback that does nothing; it has the ir3_visit_reg_func
+ * signature, so it can fill ir3_visitor_funcs slots a visitor does not
+ * care about.
+ */
+static void ir3_visit_reg(struct ir3_visitor *v,
+               struct ir3_instruction *instr, struct ir3_register *reg)
+{
+       /* no-op */
+       (void)v;
+       (void)instr;
+       (void)reg;
+}
+
+#endif /* IR3_VISITOR_H_ */